Mirror of https://github.com/sched-ext/scx.git (synced 2024-11-28 05:30:24 +00:00)
scx_mitosis: add RCU-like synchronization
scx_mitosis relied on the implicit assumption that after a sched tick, all outstanding scheduling events had completed, but this might not actually be correct. This feels like a natural use case for RCU, but there is no way to directly make use of RCU in BPF. Instead, this commit implements an RCU-like synchronization mechanism.

Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
parent 0f29854e51
commit af2cb1abbe
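Before the diff, a sketch of the idea in the commit message: each CPU bumps a per-CPU counter when it enters and again when it leaves a scheduling critical section, so an odd value means "currently inside". After publishing new cell assignments, the writer snapshots every counter and then polls until each CPU that was inside at snapshot time has bumped its counter again. The following is a minimal single-threaded userspace model of that protocol, not the BPF code itself; the names (cs_enter, cs_exit, cs_record, cs_quiescent, NR_CPUS) are made up for illustration, and it deliberately omits the per-CPU maps and READ_ONCE/WRITE_ONCE ordering the real implementation needs.

#include <assert.h>

#define NR_CPUS 4 /* illustrative; the BPF code sizes this by MAX_CPUS */

static unsigned int cs_counter[NR_CPUS];  /* per-CPU event counter */
static unsigned int cs_snapshot[NR_CPUS]; /* writer's recorded snapshot */

/* Read side: one bump on entry and one on exit, so the LSB tracks
 * "currently inside a critical section". */
static void cs_enter(int cpu) { cs_counter[cpu]++; }
static void cs_exit(int cpu) { cs_counter[cpu]++; }

/* Write side, step 1: snapshot every CPU after publishing new assignments. */
static void cs_record(void)
{
	for (int i = 0; i < NR_CPUS; i++)
		cs_snapshot[i] = cs_counter[i];
}

/* Write side, step 2: all clear once every CPU that was mid-critical-section
 * at record time (odd snapshot) has bumped its counter at least once. */
static int cs_quiescent(void)
{
	for (int i = 0; i < NR_CPUS; i++) {
		if (!(cs_snapshot[i] & 1))
			continue; /* was outside, nothing to wait for */
		if (cs_counter[i] == cs_snapshot[i])
			return 0; /* still inside the old critical section */
	}
	return 1;
}

int main(void)
{
	cs_enter(1);             /* CPU 1 is mid-enqueue */
	cs_record();             /* writer records after publishing */
	assert(!cs_quiescent()); /* must keep waiting for CPU 1 */
	cs_exit(1);
	assert(cs_quiescent());  /* every in-flight section has finished */
	return 0;
}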
@@ -235,6 +235,91 @@ static inline const struct cpumask *lookup_cell_cpumask(int idx)
 	return (const struct cpumask *)cpumaskw->cpumask;
 }
 
+/*
+ * This is an RCU-like implementation to keep track of scheduling events so we
+ * can establish when cell assignments have propagated completely.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__type(key, u32);
+	__type(value, u32);
+	__uint(max_entries, 1);
+} percpu_critical_sections SEC(".maps");
+
+/* Same implementation for enter/exit */
+static __always_inline int critical_section()
+{
+	u32 zero = 0;
+	u32 *data;
+
+	if (!(data = bpf_map_lookup_elem(&percpu_critical_sections, &zero))) {
+		scx_bpf_error("no percpu_critical_sections");
+		return -1;
+	}
+
+	/*
+	 * Bump the counter, the LSB indicates we are in a critical section and the
+	 * rest of the bits keep track of how many critical sections.
+	 */
+	WRITE_ONCE(*data, *data + 1);
+	return 0;
+}
+
+#define critical_section_enter() critical_section()
+#define critical_section_exit() critical_section()
+
+u32 critical_section_state[MAX_CPUS];
+/*
+ * Write side will record the current state and then poll to check that the
+ * generation has advanced (somewhat like call_rcu)
+ */
+static __always_inline int critical_section_record()
+{
+	u32 zero = 0;
+	u32 *data;
+	int nr_cpus = nr_possible_cpus;
+	if (nr_cpus > MAX_CPUS)
+		nr_cpus = MAX_CPUS;
+
+	for (int i = 0; i < nr_cpus; ++i) {
+		if (!(data = bpf_map_lookup_percpu_elem(
+			      &percpu_critical_sections, &zero, i))) {
+			scx_bpf_error("no percpu_critical_sections");
+			return -1;
+		}
+
+		critical_section_state[i] = READ_ONCE(*data);
+	}
+	return 0;
+}
+
+static __always_inline int critical_section_poll()
+{
+	u32 zero = 0;
+	u32 *data;
+
+	int nr_cpus = nr_possible_cpus;
+	if (nr_cpus > MAX_CPUS)
+		nr_cpus = MAX_CPUS;
+
+	for (int i = 0; i < nr_cpus; ++i) {
+		/* If not in a critical section at the time of record, then it passes */
+		if (!(critical_section_state[i] & 1))
+			continue;
+
+		if (!(data = bpf_map_lookup_percpu_elem(
+			      &percpu_critical_sections, &zero, i))) {
+			scx_bpf_error("no percpu_critical_sections");
+			return -1;
+		}
+
+		if (READ_ONCE(*data) == critical_section_state[i])
+			return 1;
+	}
+
+	return 0;
+}
+
 /*
  * Along with a user_global_seq bump, indicates that cgroup->cell assignment
  * changed
@@ -264,6 +349,16 @@ int BPF_PROG(sched_tick_fentry)
 	 * scheduler tick. This is a crude way of mimicing RCU synchronization.
 	 */
 	if (READ_ONCE(draining)) {
+		if (critical_section_poll())
+			return 0;
+		/* FIXME: If a cell is being destroyed, we need to make sure that dsq is
+		 * drained before removing it from all the cpus
+		 *
+		 * Additionally, the handling of pinned tasks is broken here - we send
+		 * them to a cell DSQ if there's overlap of the cell's CPUs and the
+		 * task's cpumask but if the cell's CPU change we might stall the
+		 * task indefinitely.
+		 */
 		bpf_for(cpu_idx, 0, nr_possible_cpus)
 		{
 			if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx)))
@@ -422,6 +517,11 @@ int BPF_PROG(sched_tick_fentry)
 	/* Bump the global seq last to ensure that prior stores are now visible. This synchronizes with the read of global_seq */
 	barrier();
 	WRITE_ONCE(global_seq, global_seq + 1);
+	/*
+	 * On subsequent ticks we'll check that all in-flight enqueues are done so
+	 * we can clear the prev_cell for each cpu. Record the state here.
+	 */
+	critical_section_record();
 	return 0;
 }
 
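The ordering in this hunk matters: the new assignments and the global_seq bump are published before critical_section_record(), so any critical section entered after the snapshot already observes the new state via maybe_refresh_cell(); the poll only has to outwait sections that were in flight at snapshot time. A hypothetical walkthrough with made-up counter values (not scheduler code) shows why the simple equality check in critical_section_poll() is sufficient even if a CPU has re-entered a fresh critical section by the time the writer polls again:

#include <assert.h>

int main(void)
{
	unsigned int counter = 5;        /* odd: this CPU was inside at record time */
	unsigned int snapshot = counter; /* critical_section_record() */

	assert(counter == snapshot);     /* poll() would return 1: keep waiting */

	counter++;                       /* the CPU runs critical_section_exit() */
	assert(counter != snapshot);     /* poll() stops waiting on this CPU... */

	counter++;                       /* ...even if it immediately re-enters: that
	                                  * section began after the new assignments
	                                  * were published */
	assert(counter != snapshot);
	return 0;
}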
@@ -611,8 +711,17 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu,
 	if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)))
 		return prev_cpu;
 
-	if (maybe_refresh_cell(p, tctx) < 0)
-		return prev_cpu;
+	/*
+	 * This is a lightweight (RCU-like) critical section covering from when we
+	 * refresh cell information to when we enqueue onto the task's assigned
+	 * cell's DSQ. This allows us to publish new cell assignments and establish
+	 * a point at which all future enqueues will be on the new assignments.
+	 */
+	critical_section_enter();
+	if (maybe_refresh_cell(p, tctx) < 0) {
+		cpu = prev_cpu;
+		goto out;
+	}
 
 	if ((cpu = pick_idle_cpu(p, prev_cpu, cctx, tctx)) >= 0) {
 		cstat_inc(CSTAT_LOCAL, tctx->cell, cctx);
@@ -622,10 +731,12 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu,
 		scx_bpf_error(
 			"select_cpu returned cpu %d belonging to cell %d but task belongs to cell %d",
 			cpu, cctx->cell, tctx->cell);
-		return cpu;
+		goto out;
 	}
 
-	return prev_cpu;
+	cpu = prev_cpu;
+out:
+	critical_section_exit();
+	return cpu;
 }
 
 static __always_inline bool pick_idle_cpu_and_kick(struct task_struct *p,
@@ -661,11 +772,18 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags)
 	if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)))
 		return;
 
+	/*
+	 * This is a lightweight (RCU-like) critical section covering from when we
+	 * refresh cell information to when we enqueue onto the task's assigned
+	 * cell's DSQ. This allows us to publish new cell assignments and establish
+	 * a point at which all future enqueues will be on the new assignments.
+	 */
+	critical_section_enter();
 	if (maybe_refresh_cell(p, tctx) < 0)
-		return;
+		goto out;
 
 	if (!(cell = lookup_cell(tctx->cell)))
-		return;
+		goto out;
 
 	/*
 	 * Limit the amount of budget that an idling task can accumulate
@@ -689,6 +807,8 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags)
 	 */
 	if (!(enq_flags & SCX_ENQ_WAKEUP))
 		pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx);
+out:
+	critical_section_exit();
 }
 
 void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev)