libdrgn: go back to trusting PRSTATUS PID

Commit eea5422546 ("libdrgn: make Linux kernel stack unwinding more
robust") overlooked that if the task is running in userspace, the stack
pointer in PRSTATUS obviously won't match the kernel stack pointer.
Let's bite the bullet and use the PID. If the race shows up in practice,
we can try to come up with another workaround.
This commit is contained in:
Omar Sandoval 2020-07-08 17:50:33 -07:00
parent 4de147e478
commit 1b47b866b4
5 changed files with 81 additions and 110 deletions

View File

@ -257,67 +257,25 @@ out:
static struct drgn_error *
linux_kernel_set_initial_registers_x86_64(Dwfl_Thread *thread,
const struct drgn_object *task_obj,
const void *prstatus,
size_t prstatus_size)
const struct drgn_object *task_obj)
{
struct drgn_error *err;
struct drgn_program *prog = task_obj->prog;
struct drgn_object sp_obj;
struct drgn_qualified_type frame_type;
uint64_t sp;
Dwarf_Word dwarf_reg;
drgn_object_init(&sp_obj, prog);
if (prstatus) {
/*
* If the stack pointer in PRSTATUS is within this task's stack,
* then we can use it. Otherwise, the task either wasn't running
* or was in the middle of context switching. Either way, we
* should use the saved registers instead.
*/
uint64_t thread_size;
uint64_t stack;
err = linux_kernel_get_thread_size(prog, &thread_size);
if (err)
goto out;
err = drgn_object_member_dereference(&sp_obj, task_obj,
"stack");
if (err)
goto out;
err = drgn_object_read_unsigned(&sp_obj, &stack);
if (err)
goto out;
if (prstatus_size < 272) {
err = drgn_error_create(DRGN_ERROR_INVALID_ARGUMENT,
"registers are truncated");
goto out;
}
memcpy(&sp, (char *)prstatus + 264, sizeof(sp));
if (drgn_program_bswap(prog))
sp = bswap_64(sp);
if (sp > stack && sp <= stack + thread_size) {
err = prstatus_set_initial_registers_x86_64(prog,
thread,
prstatus,
prstatus_size);
goto out;
}
}
err = drgn_object_member_dereference(&sp_obj, task_obj, "thread");
if (err)
goto out;
err = drgn_object_member(&sp_obj, &sp_obj, "sp");
if (err)
goto out;
uint64_t sp;
err = drgn_object_read_unsigned(&sp_obj, &sp);
if (err)
goto out;
dwarf_reg = sp;
Dwarf_Word dwarf_reg = sp;
/* rsp is register 7. */
if (!dwfl_thread_state_registers(thread, 7, 1, &dwarf_reg)) {
err = drgn_error_libdwfl();
@ -330,6 +288,7 @@ linux_kernel_set_initial_registers_x86_64(Dwfl_Thread *thread,
* inactive_task_frame, which we can use to get most registers. Before
* that, it points to bp.
*/
struct drgn_qualified_type frame_type;
err = drgn_program_find_type(prog, "struct inactive_task_frame *", NULL,
&frame_type);
if (!err) {

View File

@ -65,35 +65,8 @@ struct drgn_architecture_info {
Dwfl_Thread *,
const void *,
size_t);
/*
* Get a task's registers from the task_struct or PRSTATUS note as
* appropriate.
*
* The given PRSTATUS note is for the CPU that the task is assigned to,
* which may or may not be for the given task. This callback must
* determine that (typically by checking whether the stack pointer in
* PRSTATUS lies within the task's stack).
*
* We find the PRSTATUS note by CPU rather than by PID for two reasons:
*
* 1. The PID is populated by the kernel from "current" (the current
* task) via a non-maskable interrupt (NMI). During a context switch,
* the stack pointer and current are not updated atomically, so if
* the NMI arrives in the middle of a context switch, the stack
* pointer may not actually be that of current. Therefore, the stack
* pointer in PRSTATUS may not actually be for the PID in PRSTATUS.
*
* We go through all of this trouble because blindly trusting the PID
* could result in a stack trace for the wrong task, which we want to
* avoid at all costs.
*
* 2. There is an idle task with PID 0 for each CPU, so for an idle task
* we have no choice but to find the note by CPU.
*/
struct drgn_error *(*linux_kernel_set_initial_registers)(Dwfl_Thread *,
const struct drgn_object *,
const void *prstatus,
size_t prstatus_size);
const struct drgn_object *);
struct drgn_error *(*linux_kernel_get_page_offset)(struct drgn_program *,
uint64_t *);
struct drgn_error *(*linux_kernel_get_vmemmap)(struct drgn_program *,

View File

@ -738,33 +738,37 @@ drgn_program_load_debug_info(struct drgn_program *prog, const char **paths,
return err;
}
struct drgn_error *drgn_program_cache_prstatus_entry(struct drgn_program *prog,
char *data, size_t size)
static uint32_t get_prstatus_pid(struct drgn_program *prog, const char *data,
size_t size)
{
if (prog->flags & DRGN_PROGRAM_IS_LINUX_KERNEL) {
struct string *entry;
uint32_t pr_pid;
memcpy(&pr_pid, data + (drgn_program_is_64_bit(prog) ? 32 : 24),
sizeof(pr_pid));
if (drgn_program_bswap(prog))
pr_pid = bswap_32(pr_pid);
return pr_pid;
}
entry = drgn_prstatus_vector_append_entry(&prog->prstatus_vector);
struct drgn_error *drgn_program_cache_prstatus_entry(struct drgn_program *prog,
const char *data,
size_t size)
{
if (size < (drgn_program_is_64_bit(prog) ? 36 : 28)) {
return drgn_error_create(DRGN_ERROR_OTHER,
"NT_PRSTATUS is truncated");
}
if (prog->flags & DRGN_PROGRAM_IS_LINUX_KERNEL) {
struct string *entry =
drgn_prstatus_vector_append_entry(&prog->prstatus_vector);
if (!entry)
return &drgn_enomem;
entry->str = data;
entry->len = size;
} else {
struct drgn_prstatus_map_entry entry;
size_t pr_pid_offset;
uint32_t pr_pid;
pr_pid_offset = drgn_program_is_64_bit(prog) ? 32 : 24;
if (size < pr_pid_offset + sizeof(pr_pid))
return NULL;
memcpy(&pr_pid, data + pr_pid_offset, sizeof(pr_pid));
if (drgn_program_bswap(prog))
pr_pid = bswap_32(pr_pid);
entry.key = pr_pid;
entry.value.str = data;
entry.value.len = size;
struct drgn_prstatus_map_entry entry = {
.key = get_prstatus_pid(prog, data, size),
.value = { data, size },
};
if (drgn_prstatus_map_insert(&prog->prstatus_map, &entry,
NULL) == -1)
return &drgn_enomem;
@ -856,7 +860,8 @@ out:
struct drgn_error *drgn_program_find_prstatus_by_cpu(struct drgn_program *prog,
uint32_t cpu,
struct string *ret)
struct string *ret,
uint32_t *tid_ret)
{
struct drgn_error *err;
@ -867,6 +872,7 @@ struct drgn_error *drgn_program_find_prstatus_by_cpu(struct drgn_program *prog,
if (cpu < prog->prstatus_vector.size) {
*ret = prog->prstatus_vector.data[cpu];
*tid_ret = get_prstatus_pid(prog, ret->str, ret->len);
} else {
ret->str = NULL;
ret->len = 0;

View File

@ -190,10 +190,12 @@ struct drgn_error *drgn_program_get_dwfl(struct drgn_program *prog, Dwfl **ret);
*
* @param[out] ret Returned note data. If not found, <tt>ret->str</tt> is set to
* @c NULL and <tt>ret->len</tt> is set to zero.
* @param[out] tid_ret Returned thread ID of note.
*/
struct drgn_error *drgn_program_find_prstatus_by_cpu(struct drgn_program *prog,
uint32_t cpu,
struct string *ret);
struct string *ret,
uint32_t *tid_ret);
/**
* Find the @c NT_PRSTATUS note for the given thread ID.
@ -214,7 +216,8 @@ struct drgn_error *drgn_program_find_prstatus_by_tid(struct drgn_program *prog,
* @param[in] size Size of data in note.
*/
struct drgn_error *drgn_program_cache_prstatus_entry(struct drgn_program *prog,
char *data, size_t size);
const char *data,
size_t size);
/*
* Like @ref drgn_program_find_symbol_by_address(), but @p ret is already

View File

@ -256,7 +256,6 @@ static bool drgn_thread_set_initial_registers(Dwfl_Thread *thread,
/* First, try pt_regs. */
if (prog->stack_trace_obj) {
bool is_pt_regs;
err = drgn_get_stack_trace_obj(&obj, prog, &is_pt_regs);
if (err)
goto out;
@ -275,8 +274,6 @@ static bool drgn_thread_set_initial_registers(Dwfl_Thread *thread,
goto out;
}
} else if (prog->flags & DRGN_PROGRAM_IS_LINUX_KERNEL) {
bool found;
err = drgn_program_find_object(prog, "init_pid_ns", NULL,
DRGN_FIND_OBJECT_ANY, &tmp);
if (err)
@ -287,6 +284,7 @@ static bool drgn_thread_set_initial_registers(Dwfl_Thread *thread,
err = linux_helper_find_task(&obj, &tmp, prog->stack_trace_tid);
if (err)
goto out;
bool found;
err = drgn_object_bool(&obj, &found);
if (err)
goto out;
@ -319,29 +317,62 @@ static bool drgn_thread_set_initial_registers(Dwfl_Thread *thread,
} else {
goto out;
}
prstatus.str = NULL;
prstatus.len = 0;
} else {
/*
* For kernel core dumps, we look up the PRSTATUS note
* by CPU rather than by PID. This is because there is
* an idle task with PID 0 for each CPU, so we must find
* the idle task by CPU. Rather than making PID 0 a
* special case, we handle all tasks this way.
*/
union drgn_value value;
uint32_t cpu;
err = drgn_object_member_dereference(&tmp, &obj, "cpu");
if (!err) {
err = drgn_object_read_integer(&tmp, &value);
if (err)
goto out;
cpu = value.uvalue;
} else if (err->code == DRGN_ERROR_LOOKUP) {
/* !SMP. Must be CPU 0. */
drgn_error_destroy(err);
cpu = 0;
value.uvalue = 0;
} else {
goto out;
}
err = drgn_program_find_prstatus_by_cpu(prog, cpu,
&prstatus);
uint32_t prstatus_tid;
err = drgn_program_find_prstatus_by_cpu(prog,
value.uvalue,
&prstatus,
&prstatus_tid);
if (err)
goto out;
if (prstatus.str) {
/*
* The PRSTATUS note is for the CPU that the
* task is assigned to, but it is not
* necessarily for this task. Only use it if the
* PID matches.
*
* Note that this isn't perfect: the PID is
* populated by the kernel from "current" (the
* current task) via a non-maskable interrupt
* (NMI). During a context switch, the stack
* pointer and current are not updated
* atomically, so if the NMI arrives in the
* middle of a context switch, the stack pointer
* may not actually be that of current.
* Therefore, the stack pointer in PRSTATUS may
* not actually be for the PID in PRSTATUS.
* Unfortunately, we can't easily fix this.
*/
err = drgn_object_member_dereference(&tmp, &obj, "pid");
if (err)
goto out;
err = drgn_object_read_integer(&tmp, &value);
if (err)
goto out;
if (prstatus_tid == value.uvalue)
goto prstatus;
}
}
if (!prog->platform.arch->linux_kernel_set_initial_registers) {
err = drgn_error_format(DRGN_ERROR_INVALID_ARGUMENT,
@ -350,9 +381,7 @@ static bool drgn_thread_set_initial_registers(Dwfl_Thread *thread,
goto out;
}
err = prog->platform.arch->linux_kernel_set_initial_registers(thread,
&obj,
prstatus.str,
prstatus.len);
&obj);
} else {
err = drgn_program_find_prstatus_by_tid(prog,
prog->stack_trace_tid,
@ -363,6 +392,7 @@ static bool drgn_thread_set_initial_registers(Dwfl_Thread *thread,
err = drgn_error_create(DRGN_ERROR_LOOKUP, "thread not found");
goto out;
}
prstatus:
if (!prog->platform.arch->prstatus_set_initial_registers) {
err = drgn_error_format(DRGN_ERROR_INVALID_ARGUMENT,
"core dump stack unwinding is not supported for %s architecture",