/*
 * Patch summary (reviewer notes). This text sits before the first
 * "diff --git" header and is not part of any hunk.
 *
 * Purpose: for Linux kernel core dumps, decide whether the NT_PRSTATUS note
 * of a task's CPU belongs to the task being unwound by comparing the PID
 * stored in the note with the task's "pid" member, instead of having the
 * architecture callback check whether the note's stack pointer lies within
 * the task's stack.
 *
 * libdrgn/arch_x86_64.c.in
 *   linux_kernel_set_initial_registers_x86_64() drops its prstatus and
 *   prstatus_size parameters together with the stack-range check. It now
 *   always reads the "sp" member of the task's "thread" member, hands it to
 *   dwfl_thread_state_registers() as DWARF register 7 (rsp), and then looks
 *   up "struct inactive_task_frame *". Local declarations move to first use.
 *
 * libdrgn/platform.h
 *   The linux_kernel_set_initial_registers callback in
 *   struct drgn_architecture_info now takes only
 *   (Dwfl_Thread *, const struct drgn_object *). The comment that described
 *   the old PRSTATUS-by-stack-pointer contract is removed.
 *
 * libdrgn/program.c, libdrgn/program.h
 *   - New static helper get_prstatus_pid(): copies a 32-bit pr_pid from byte
 *     offset 32 of the note when drgn_program_is_64_bit() is true, else from
 *     offset 24, and byte-swaps it when drgn_program_bswap() is true.
 *   - drgn_program_cache_prstatus_entry() takes const char *data and returns
 *     a DRGN_ERROR_OTHER "NT_PRSTATUS is truncated" error when size is below
 *     36 (64-bit) or 28 (otherwise). The removed code only checked the size
 *     in the non-kernel branch, where it returned NULL for a short note.
 *   - drgn_program_find_prstatus_by_cpu() gains a tid_ret out-parameter that
 *     receives get_prstatus_pid() of the note found for the CPU.
 *
 * libdrgn/stack_trace.c
 *   drgn_thread_set_initial_registers(), kernel core dump branch: reads the
 *   task's "cpu" member (0 when the lookup fails with DRGN_ERROR_LOOKUP),
 *   fetches that CPU's note, and if a note exists reads the task's "pid"
 *   member. When the PID equals the note's PID it jumps to the new
 *   "prstatus:" label, which sits just before the check of the architecture's
 *   prstatus_set_initial_registers callback. Otherwise it falls through to
 *   linux_kernel_set_initial_registers(thread, &obj).
 *
 * NOTE(review): the line structure of this patch is collapsed (many patch
 *   lines per physical line), so the counts in the "@@" hunk headers cannot
 *   be checked against the text as stored. Restore the original newlines
 *   before applying.
 * NOTE(review): get_prstatus_pid() does not use its size parameter; it
 *   presumably relies on the length check in
 *   drgn_program_cache_prstatus_entry() having run for every cached note.
 *   Confirm no other path fills prstatus_vector.
 * NOTE(review): in the visible hunk *tid_ret is assigned only when
 *   cpu < prog->prstatus_vector.size. The caller shown reads prstatus_tid
 *   only when prstatus.str is non-NULL; confirm other callers do the same.
 * NOTE(review): value.uvalue is passed as the uint32_t cpu argument and is
 *   later compared with prstatus_tid after drgn_object_read_integer() on
 *   "pid". This looks like it assumes both values are non-negative and fit
 *   in 32 bits. Confirm.
 */
diff --git a/libdrgn/arch_x86_64.c.in b/libdrgn/arch_x86_64.c.in index a64a5d15..c6e4399c 100644 --- a/libdrgn/arch_x86_64.c.in +++ b/libdrgn/arch_x86_64.c.in @@ -257,67 +257,25 @@ out: static struct drgn_error * linux_kernel_set_initial_registers_x86_64(Dwfl_Thread *thread, - const struct drgn_object *task_obj, - const void *prstatus, - size_t prstatus_size) + const struct drgn_object *task_obj) { struct drgn_error *err; struct drgn_program *prog = task_obj->prog; struct drgn_object sp_obj; - struct drgn_qualified_type frame_type; - uint64_t sp; - Dwarf_Word dwarf_reg; drgn_object_init(&sp_obj, prog); - if (prstatus) { - /* - * If the stack pointer in PRSTATUS is within this task's stack, - * then we can use it. Otherwise, the task either wasn't running - * or was in the middle of context switching. Either way, we - * should use the saved registers instead. - */ - uint64_t thread_size; - uint64_t stack; - - err = linux_kernel_get_thread_size(prog, &thread_size); - if (err) - goto out; - err = drgn_object_member_dereference(&sp_obj, task_obj, - "stack"); - if (err) - goto out; - err = drgn_object_read_unsigned(&sp_obj, &stack); - if (err) - goto out; - - if (prstatus_size < 272) { - err = drgn_error_create(DRGN_ERROR_INVALID_ARGUMENT, - "registers are truncated"); - goto out; - } - memcpy(&sp, (char *)prstatus + 264, sizeof(sp)); - if (drgn_program_bswap(prog)) - sp = bswap_64(sp); - if (sp > stack && sp <= stack + thread_size) { - err = prstatus_set_initial_registers_x86_64(prog, - thread, - prstatus, - prstatus_size); - goto out; - } - } - err = drgn_object_member_dereference(&sp_obj, task_obj, "thread"); if (err) goto out; err = drgn_object_member(&sp_obj, &sp_obj, "sp"); if (err) goto out; + uint64_t sp; err = drgn_object_read_unsigned(&sp_obj, &sp); if (err) goto out; - dwarf_reg = sp; + Dwarf_Word dwarf_reg = sp; /* rsp is register 7. 
*/ if (!dwfl_thread_state_registers(thread, 7, 1, &dwarf_reg)) { err = drgn_error_libdwfl(); @@ -330,6 +288,7 @@ linux_kernel_set_initial_registers_x86_64(Dwfl_Thread *thread, * inactive_task_frame, which we can use to get most registers. Before * that, it points to bp. */ + struct drgn_qualified_type frame_type; err = drgn_program_find_type(prog, "struct inactive_task_frame *", NULL, &frame_type); if (!err) { diff --git a/libdrgn/platform.h b/libdrgn/platform.h index c97608ec..8510cc45 100644 --- a/libdrgn/platform.h +++ b/libdrgn/platform.h @@ -65,35 +65,8 @@ struct drgn_architecture_info { Dwfl_Thread *, const void *, size_t); - /* - * Get a task's registers from the task_struct or PRSTATUS note as - * appropriate. - * - * The given PRSTATUS note is for the CPU that the task is assigned to, - * which may or may not be for the given task. This callback must - * determine that (typically by checking whether the stack pointer in - * PRSTATUS lies within the task's stack). - * - * We find the PRSTATUS note by CPU rather than by PID for two reasons: - * - * 1. The PID is populated by the kernel from "current" (the current - * task) via a non-maskable interrupt (NMI). During a context switch, - * the stack pointer and current are not updated atomically, so if - * the NMI arrives in the middle of a context switch, the stack - * pointer may not actually be that of current. Therefore, the stack - * pointer in PRSTATUS may not actually be for the PID in PRSTATUS. - * - * We go through all of this trouble because blindly trusting the PID - * could result in a stack trace for the wrong task, which we want to - * avoid at all costs. - * - * 2. There is an idle task with PID 0 for each CPU, so for an idle task - * we have no choice but to find the note by CPU. 
- */ struct drgn_error *(*linux_kernel_set_initial_registers)(Dwfl_Thread *, - const struct drgn_object *, - const void *prstatus, - size_t prstatus_size); + const struct drgn_object *); struct drgn_error *(*linux_kernel_get_page_offset)(struct drgn_program *, uint64_t *); struct drgn_error *(*linux_kernel_get_vmemmap)(struct drgn_program *, diff --git a/libdrgn/program.c b/libdrgn/program.c index 3038bb1b..bbd45f36 100644 --- a/libdrgn/program.c +++ b/libdrgn/program.c @@ -738,33 +738,37 @@ drgn_program_load_debug_info(struct drgn_program *prog, const char **paths, return err; } -struct drgn_error *drgn_program_cache_prstatus_entry(struct drgn_program *prog, - char *data, size_t size) +static uint32_t get_prstatus_pid(struct drgn_program *prog, const char *data, + size_t size) { - if (prog->flags & DRGN_PROGRAM_IS_LINUX_KERNEL) { - struct string *entry; + uint32_t pr_pid; + memcpy(&pr_pid, data + (drgn_program_is_64_bit(prog) ? 32 : 24), + sizeof(pr_pid)); + if (drgn_program_bswap(prog)) + pr_pid = bswap_32(pr_pid); + return pr_pid; +} - entry = drgn_prstatus_vector_append_entry(&prog->prstatus_vector); +struct drgn_error *drgn_program_cache_prstatus_entry(struct drgn_program *prog, + const char *data, + size_t size) +{ + if (size < (drgn_program_is_64_bit(prog) ? 36 : 28)) { + return drgn_error_create(DRGN_ERROR_OTHER, + "NT_PRSTATUS is truncated"); + } + if (prog->flags & DRGN_PROGRAM_IS_LINUX_KERNEL) { + struct string *entry = + drgn_prstatus_vector_append_entry(&prog->prstatus_vector); if (!entry) return &drgn_enomem; entry->str = data; entry->len = size; } else { - struct drgn_prstatus_map_entry entry; - size_t pr_pid_offset; - uint32_t pr_pid; - - pr_pid_offset = drgn_program_is_64_bit(prog) ? 
32 : 24; - if (size < pr_pid_offset + sizeof(pr_pid)) - return NULL; - - memcpy(&pr_pid, data + pr_pid_offset, sizeof(pr_pid)); - if (drgn_program_bswap(prog)) - pr_pid = bswap_32(pr_pid); - - entry.key = pr_pid; - entry.value.str = data; - entry.value.len = size; + struct drgn_prstatus_map_entry entry = { + .key = get_prstatus_pid(prog, data, size), + .value = { data, size }, + }; if (drgn_prstatus_map_insert(&prog->prstatus_map, &entry, NULL) == -1) return &drgn_enomem; @@ -856,7 +860,8 @@ out: struct drgn_error *drgn_program_find_prstatus_by_cpu(struct drgn_program *prog, uint32_t cpu, - struct string *ret) + struct string *ret, + uint32_t *tid_ret) { struct drgn_error *err; @@ -867,6 +872,7 @@ struct drgn_error *drgn_program_find_prstatus_by_cpu(struct drgn_program *prog, if (cpu < prog->prstatus_vector.size) { *ret = prog->prstatus_vector.data[cpu]; + *tid_ret = get_prstatus_pid(prog, ret->str, ret->len); } else { ret->str = NULL; ret->len = 0; diff --git a/libdrgn/program.h b/libdrgn/program.h index f4db36c6..7efbdcf2 100644 --- a/libdrgn/program.h +++ b/libdrgn/program.h @@ -190,10 +190,12 @@ struct drgn_error *drgn_program_get_dwfl(struct drgn_program *prog, Dwfl **ret); * * @param[out] ret Returned note data. If not found, ret->str is set to * @c NULL and ret->len is set to zero. + * @param[out] tid_ret Returned thread ID of note. */ struct drgn_error *drgn_program_find_prstatus_by_cpu(struct drgn_program *prog, uint32_t cpu, - struct string *ret); + struct string *ret, + uint32_t *tid_ret); /** * Find the @c NT_PRSTATUS note for the given thread ID. @@ -214,7 +216,8 @@ struct drgn_error *drgn_program_find_prstatus_by_tid(struct drgn_program *prog, * @param[in] size Size of data in note. 
*/ struct drgn_error *drgn_program_cache_prstatus_entry(struct drgn_program *prog, - char *data, size_t size); + const char *data, + size_t size); /* * Like @ref drgn_program_find_symbol_by_address(), but @p ret is already diff --git a/libdrgn/stack_trace.c b/libdrgn/stack_trace.c index d3c344df..4224e9a0 100644 --- a/libdrgn/stack_trace.c +++ b/libdrgn/stack_trace.c @@ -256,7 +256,6 @@ static bool drgn_thread_set_initial_registers(Dwfl_Thread *thread, /* First, try pt_regs. */ if (prog->stack_trace_obj) { bool is_pt_regs; - err = drgn_get_stack_trace_obj(&obj, prog, &is_pt_regs); if (err) goto out; @@ -275,8 +274,6 @@ static bool drgn_thread_set_initial_registers(Dwfl_Thread *thread, goto out; } } else if (prog->flags & DRGN_PROGRAM_IS_LINUX_KERNEL) { - bool found; - err = drgn_program_find_object(prog, "init_pid_ns", NULL, DRGN_FIND_OBJECT_ANY, &tmp); if (err) @@ -287,6 +284,7 @@ static bool drgn_thread_set_initial_registers(Dwfl_Thread *thread, err = linux_helper_find_task(&obj, &tmp, prog->stack_trace_tid); if (err) goto out; + bool found; err = drgn_object_bool(&obj, &found); if (err) goto out; @@ -319,29 +317,62 @@ static bool drgn_thread_set_initial_registers(Dwfl_Thread *thread, } else { goto out; } - prstatus.str = NULL; - prstatus.len = 0; } else { + /* + * For kernel core dumps, we look up the PRSTATUS note + * by CPU rather than by PID. This is because there is + * an idle task with PID 0 for each CPU, so we must find + * the idle task by CPU. Rather than making PID 0 a + * special case, we handle all tasks this way. + */ union drgn_value value; - uint32_t cpu; - err = drgn_object_member_dereference(&tmp, &obj, "cpu"); if (!err) { err = drgn_object_read_integer(&tmp, &value); if (err) goto out; - cpu = value.uvalue; } else if (err->code == DRGN_ERROR_LOOKUP) { /* !SMP. Must be CPU 0. 
*/ drgn_error_destroy(err); - cpu = 0; + value.uvalue = 0; } else { goto out; } - err = drgn_program_find_prstatus_by_cpu(prog, cpu, - &prstatus); + uint32_t prstatus_tid; + err = drgn_program_find_prstatus_by_cpu(prog, + value.uvalue, + &prstatus, + &prstatus_tid); if (err) goto out; + if (prstatus.str) { + /* + * The PRSTATUS note is for the CPU that the + * task is assigned to, but it is not + * necessarily for this task. Only use it if the + * PID matches. + * + * Note that this isn't perfect: the PID is + * populated by the kernel from "current" (the + * current task) via a non-maskable interrupt + * (NMI). During a context switch, the stack + * pointer and current are not updated + * atomically, so if the NMI arrives in the + * middle of a context switch, the stack pointer + * may not actually be that of current. + * Therefore, the stack pointer in PRSTATUS may + * not actually be for the PID in PRSTATUS. + * Unfortunately, we can't easily fix this. + */ + err = drgn_object_member_dereference(&tmp, &obj, "pid"); + if (err) + goto out; + err = drgn_object_read_integer(&tmp, &value); + if (err) + goto out; + if (prstatus_tid == value.uvalue) + goto prstatus; + } } if (!prog->platform.arch->linux_kernel_set_initial_registers) { err = drgn_error_format(DRGN_ERROR_INVALID_ARGUMENT, @@ -350,9 +381,7 @@ static bool drgn_thread_set_initial_registers(Dwfl_Thread *thread, goto out; } err = prog->platform.arch->linux_kernel_set_initial_registers(thread, - &obj, - prstatus.str, - prstatus.len); + &obj); } else { err = drgn_program_find_prstatus_by_tid(prog, prog->stack_trace_tid, @@ -363,6 +392,7 @@ static bool drgn_thread_set_initial_registers(Dwfl_Thread *thread, err = drgn_error_create(DRGN_ERROR_LOOKUP, "thread not found"); goto out; } +prstatus: if (!prog->platform.arch->prstatus_set_initial_registers) { err = drgn_error_format(DRGN_ERROR_INVALID_ARGUMENT, "core dump stack unwinding is not supported for %s architecture",