scx_mitosis: add RCU-like synchronization

scx_mitosis relied on the implicit assumption that after a sched tick,
all outstanding scheduling events had completed, but this might not
actually be correct. This feels like a natural use case for RCU, but
there is no way to directly make use of RCU in BPF. Instead, this commit
implements an RCU-like synchronization mechanism.

Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Dan Schatzberg 2024-11-06 08:33:29 -08:00
parent 0f29854e51
commit af2cb1abbe
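
The mechanism added below amounts to a per-CPU event counter whose low bit
means "currently inside a critical section": the scheduling paths bump the
counter on entry and again on exit, while the writer snapshots every CPU's
counter after publishing a change and then polls until each CPU that was
mid-section at snapshot time has moved past it. The following plain-C sketch
models that protocol outside of BPF as a simplified, single-threaded model
(no READ_ONCE/WRITE_ONCE, no per-CPU maps); all names here (counter,
snapshot, record, grace_period_elapsed, NCPU) are illustrative stand-ins and
do not appear in the scheduler. A usage sketch for both sides follows the
diff.

/*
 * Simplified model of the RCU-like protocol in this commit.
 * An odd counter value means that CPU is inside a critical section.
 */
#include <stdbool.h>
#include <stdint.h>

#define NCPU 4

static uint32_t counter[NCPU];	/* per-CPU event counters            */
static uint32_t snapshot[NCPU];	/* writer's view at publication time */

/* Reader side: called once on entry and once on exit. */
static void critical_section_toggle(int cpu)
{
	counter[cpu]++;		/* odd = inside, even = outside */
}

/* Writer side: snapshot all counters right after publishing new state. */
static void record(void)
{
	for (int cpu = 0; cpu < NCPU; cpu++)
		snapshot[cpu] = counter[cpu];
}

/* Writer side: true once no CPU can still be using the old state. */
static bool grace_period_elapsed(void)
{
	for (int cpu = 0; cpu < NCPU; cpu++) {
		if (!(snapshot[cpu] & 1))
			continue;	/* was outside a critical section */
		if (counter[cpu] == snapshot[cpu])
			return false;	/* still inside that same section */
	}
	return true;
}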


@@ -235,6 +235,91 @@ static inline const struct cpumask *lookup_cell_cpumask(int idx)
return (const struct cpumask *)cpumaskw->cpumask;
}
/*
* This is an RCU-like implementation to keep track of scheduling events so we
* can establish when cell assignments have propagated completely.
*/
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, u32);
__type(value, u32);
__uint(max_entries, 1);
} percpu_critical_sections SEC(".maps");
/* Same implementation for enter/exit */
static __always_inline int critical_section()
{
u32 zero = 0;
u32 *data;
if (!(data = bpf_map_lookup_elem(&percpu_critical_sections, &zero))) {
scx_bpf_error("no percpu_critical_sections");
return -1;
}
/*
* Bump the counter; the LSB indicates we are in a critical section and the
* rest of the bits keep track of how many critical sections we have entered.
*/
WRITE_ONCE(*data, *data + 1);
return 0;
}
#define critical_section_enter() critical_section()
#define critical_section_exit() critical_section()
u32 critical_section_state[MAX_CPUS];
/*
* Write side will record the current state and then poll to check that the
* generation has advanced (somewhat like call_rcu)
*/
static __always_inline int critical_section_record()
{
u32 zero = 0;
u32 *data;
int nr_cpus = nr_possible_cpus;
if (nr_cpus > MAX_CPUS)
nr_cpus = MAX_CPUS;
for (int i = 0; i < nr_cpus; ++i) {
if (!(data = bpf_map_lookup_percpu_elem(
&percpu_critical_sections, &zero, i))) {
scx_bpf_error("no percpu_critical_sections");
return -1;
}
critical_section_state[i] = READ_ONCE(*data);
}
return 0;
}
static __always_inline int critical_section_poll()
{
u32 zero = 0;
u32 *data;
int nr_cpus = nr_possible_cpus;
if (nr_cpus > MAX_CPUS)
nr_cpus = MAX_CPUS;
for (int i = 0; i < nr_cpus; ++i) {
/* If not in a critical section at the time of record, then it passes */
if (!(critical_section_state[i] & 1))
continue;
if (!(data = bpf_map_lookup_percpu_elem(
&percpu_critical_sections, &zero, i))) {
scx_bpf_error("no percpu_critical_sections");
return -1;
}
if (READ_ONCE(*data) == critical_section_state[i])
return 1;
}
return 0;
}
/*
* Along with a user_global_seq bump, indicates that cgroup->cell assignment
* changed
@@ -264,6 +349,16 @@ int BPF_PROG(sched_tick_fentry)
* scheduler tick. This is a crude way of mimicking RCU synchronization.
*/
if (READ_ONCE(draining)) {
if (critical_section_poll())
return 0;
/* FIXME: If a cell is being destroyed, we need to make sure that its DSQ is
* drained before removing it from all the cpus
*
* Additionally, the handling of pinned tasks is broken here - we send
* them to a cell DSQ if there's overlap of the cell's CPUs and the
task's cpumask, but if the cell's CPUs change we might stall the
* task indefinitely.
*/
bpf_for(cpu_idx, 0, nr_possible_cpus)
{
if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx)))
@@ -422,6 +517,11 @@ int BPF_PROG(sched_tick_fentry)
/* Bump the global seq last to ensure that prior stores are now visible. This synchronizes with the read of global_seq */
barrier();
WRITE_ONCE(global_seq, global_seq + 1);
/*
* On subsequent ticks we'll check that all in-flight enqueues are done so
* we can clear the prev_cell for each cpu. Record the state here.
*/
critical_section_record();
return 0;
}
@@ -611,8 +711,17 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu,
if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)))
return prev_cpu;
if (maybe_refresh_cell(p, tctx) < 0)
return prev_cpu;
/*
* This is a lightweight (RCU-like) critical section covering from when we
* refresh cell information to when we enqueue onto the task's assigned
* cell's DSQ. This allows us to publish new cell assignments and establish
* a point at which all future enqueues will be on the new assignments.
*/
critical_section_enter();
if (maybe_refresh_cell(p, tctx) < 0) {
cpu = prev_cpu;
goto out;
}
if ((cpu = pick_idle_cpu(p, prev_cpu, cctx, tctx)) >= 0) {
cstat_inc(CSTAT_LOCAL, tctx->cell, cctx);
@@ -622,10 +731,12 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu,
scx_bpf_error(
"select_cpu returned cpu %d belonging to cell %d but task belongs to cell %d",
cpu, cctx->cell, tctx->cell);
return cpu;
goto out;
}
return prev_cpu;
cpu = prev_cpu;
out:
critical_section_exit();
return cpu;
}
static __always_inline bool pick_idle_cpu_and_kick(struct task_struct *p,
@@ -661,11 +772,18 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags)
if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)))
return;
/*
* This is a lightweight (RCU-like) critical section covering from when we
* refresh cell information to when we enqueue onto the task's assigned
* cell's DSQ. This allows us to publish new cell assignments and establish
* a point at which all future enqueues will be on the new assignments.
*/
critical_section_enter();
if (maybe_refresh_cell(p, tctx) < 0)
return;
goto out;
if (!(cell = lookup_cell(tctx->cell)))
return;
goto out;
/*
* Limit the amount of budget that an idling task can accumulate
@@ -689,6 +807,8 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags)
*/
if (!(enq_flags & SCX_ENQ_WAKEUP))
pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx);
out:
critical_section_exit();
}
void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev)
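
To tie the pieces together, here is how the two sides interact, sketched
against the model shown before the diff. This is illustrative only:
reader_path, publish_new_assignments, tick, and the draining flag are
hypothetical stand-ins for the critical sections in mitosis_select_cpu and
mitosis_enqueue and for the global_seq/draining handling in
sched_tick_fentry.

/* Reader path (select_cpu/enqueue): wrap refresh + enqueue in a section. */
static void reader_path(int cpu)
{
	critical_section_toggle(cpu);	/* enter: counter becomes odd  */
	/* ... refresh cell assignment and enqueue onto the cell's DSQ ... */
	critical_section_toggle(cpu);	/* exit: counter becomes even  */
}

/* Writer path: publish, snapshot, then poll on later ticks. */
static bool draining;

static void publish_new_assignments(void)
{
	/* ... update cgroup->cell mappings and bump the generation ... */
	record();		/* snapshot per-CPU counters */
	draining = true;
}

static void tick(void)
{
	if (!draining)
		return;
	if (!grace_period_elapsed())
		return;		/* some CPU may still see the old state; retry */
	/* Safe to clear per-CPU prev_cell and tear down old cell state. */
	draining = false;
}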