diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
index 42e8fcb..da87a95 100644
--- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
+++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
@@ -235,6 +235,91 @@ static inline const struct cpumask *lookup_cell_cpumask(int idx)
 	return (const struct cpumask *)cpumaskw->cpumask;
 }
 
+/*
+ * This is an RCU-like implementation to keep track of scheduling events so we
+ * can establish when cell assignments have propagated completely.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__type(key, u32);
+	__type(value, u32);
+	__uint(max_entries, 1);
+} percpu_critical_sections SEC(".maps");
+
+/* Same implementation for enter/exit */
+static __always_inline int critical_section()
+{
+	u32 zero = 0;
+	u32 *data;
+
+	if (!(data = bpf_map_lookup_elem(&percpu_critical_sections, &zero))) {
+		scx_bpf_error("no percpu_critical_sections");
+		return -1;
+	}
+
+	/*
+	 * Bump the counter; the LSB indicates we are in a critical section and
+	 * the rest of the bits keep track of how many critical sections have occurred.
+	 */
+	WRITE_ONCE(*data, *data + 1);
+	return 0;
+}
+
+#define critical_section_enter() critical_section()
+#define critical_section_exit() critical_section()
+
+u32 critical_section_state[MAX_CPUS];
+/*
+ * The write side records the current state and then polls to check that
+ * the generation has advanced (somewhat like call_rcu).
+ */
+static __always_inline int critical_section_record()
+{
+	u32 zero = 0;
+	u32 *data;
+	int nr_cpus = nr_possible_cpus;
+	if (nr_cpus > MAX_CPUS)
+		nr_cpus = MAX_CPUS;
+
+	for (int i = 0; i < nr_cpus; ++i) {
+		if (!(data = bpf_map_lookup_percpu_elem(
+			      &percpu_critical_sections, &zero, i))) {
+			scx_bpf_error("no percpu_critical_sections");
+			return -1;
+		}
+
+		critical_section_state[i] = READ_ONCE(*data);
+	}
+	return 0;
+}
+
+static __always_inline int critical_section_poll()
+{
+	u32 zero = 0;
+	u32 *data;
+
+	int nr_cpus = nr_possible_cpus;
+	if (nr_cpus > MAX_CPUS)
+		nr_cpus = MAX_CPUS;
+
+	for (int i = 0; i < nr_cpus; ++i) {
+		/* If not in a critical section at the time of record, then it passes */
+		if (!(critical_section_state[i] & 1))
+			continue;
+
+		if (!(data = bpf_map_lookup_percpu_elem(
+			      &percpu_critical_sections, &zero, i))) {
+			scx_bpf_error("no percpu_critical_sections");
+			return -1;
+		}
+
+		if (READ_ONCE(*data) == critical_section_state[i])
+			return 1;
+	}
+
+	return 0;
+}
+
 /*
  * Along with a user_global_seq bump, indicates that cgroup->cell assignment
  * changed
@@ -264,6 +349,16 @@ int BPF_PROG(sched_tick_fentry)
 	 * scheduler tick. This is a crude way of mimicing RCU synchronization.
 	 */
 	if (READ_ONCE(draining)) {
+		if (critical_section_poll())
+			return 0;
+		/* FIXME: If a cell is being destroyed, we need to make sure that dsq is
+		 * drained before removing it from all the cpus
+		 *
+		 * Additionally, the handling of pinned tasks is broken here - we send
+		 * them to a cell DSQ if there's overlap of the cell's CPUs and the
+		 * task's cpumask but if the cell's CPUs change we might stall the
+		 * task indefinitely.
+		 */
 		bpf_for(cpu_idx, 0, nr_possible_cpus)
 		{
 			if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx)))
@@ -422,6 +517,11 @@ int BPF_PROG(sched_tick_fentry)
 	/* Bump the global seq last to ensure that prior stores are now visible. This synchronizes with the read of global_seq */
 	barrier();
 	WRITE_ONCE(global_seq, global_seq + 1);
+	/*
+	 * On subsequent ticks we'll check that all in-flight enqueues are done so
+	 * we can clear the prev_cell for each cpu. Record the state here.
+	 */
+	critical_section_record();
 	return 0;
 }
 
@@ -611,8 +711,17 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu,
 	if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)))
 		return prev_cpu;
 
-	if (maybe_refresh_cell(p, tctx) < 0)
-		return prev_cpu;
+	/*
+	 * This is a lightweight (RCU-like) critical section covering from when we
+	 * refresh cell information to when we enqueue onto the task's assigned
+	 * cell's DSQ. This allows us to publish new cell assignments and establish
+	 * a point at which all future enqueues will be on the new assignments.
+	 */
+	critical_section_enter();
+	if (maybe_refresh_cell(p, tctx) < 0) {
+		cpu = prev_cpu;
+		goto out;
+	}
 
 	if ((cpu = pick_idle_cpu(p, prev_cpu, cctx, tctx)) >= 0) {
 		cstat_inc(CSTAT_LOCAL, tctx->cell, cctx);
@@ -622,10 +731,13 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu,
 		scx_bpf_error(
 			"select_cpu returned cpu %d belonging to cell %d but task belongs to cell %d",
 			cpu, cctx->cell, tctx->cell);
-		return cpu;
+		goto out;
 	}
 
-	return prev_cpu;
+	cpu = prev_cpu;
+out:
+	critical_section_exit();
+	return cpu;
 }
 
 static __always_inline bool pick_idle_cpu_and_kick(struct task_struct *p,
@@ -661,11 +773,18 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags)
 	if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)))
 		return;
 
+	/*
+	 * This is a lightweight (RCU-like) critical section covering from when we
+	 * refresh cell information to when we enqueue onto the task's assigned
+	 * cell's DSQ. This allows us to publish new cell assignments and establish
+	 * a point at which all future enqueues will be on the new assignments.
+	 */
+	critical_section_enter();
 	if (maybe_refresh_cell(p, tctx) < 0)
-		return;
+		goto out;
 
 	if (!(cell = lookup_cell(tctx->cell)))
-		return;
+		goto out;
 
 	/*
 	 * Limit the amount of budget that an idling task can accumulate
@@ -689,6 +808,8 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags)
 	 */
 	if (!(enq_flags & SCX_ENQ_WAKEUP))
 		pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx);
+out:
+	critical_section_exit();
 }
 
 void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev)
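
For reference, the synchronization scheme in this patch can be modeled outside of BPF. Below is a minimal userspace sketch, not part of the patch: pthreads stand in for CPUs, C11 atomics stand in for WRITE_ONCE/READ_ONCE on the per-CPU map entry, and the names (NR_CPUS, record, still_waiting, reader) are illustrative only. Each "reader" bumps its counter on enter and on exit, so an odd value means it is mid-section; the writer snapshots all counters after publishing a change, then polls until every counter that was odd at record time has moved on. That is the grace period sched_tick_fentry waits for before clearing prev_cell.

/*
 * rcu_like_demo.c - userspace model of the per-CPU even/odd counter scheme.
 * Build: cc -O2 -pthread rcu_like_demo.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define NR_CPUS 4

/* One counter per "CPU"; an odd value means inside a critical section. */
static _Atomic unsigned int section[NR_CPUS];
static unsigned int snapshot[NR_CPUS];
static _Atomic bool stop;

static void critical_section_enter(int cpu)
{
	atomic_fetch_add(&section[cpu], 1);	/* even -> odd */
}

static void critical_section_exit(int cpu)
{
	atomic_fetch_add(&section[cpu], 1);	/* odd -> even */
}

/* Write side: snapshot all counters after publishing new assignments. */
static void record(void)
{
	for (int i = 0; i < NR_CPUS; i++)
		snapshot[i] = atomic_load(&section[i]);
}

/*
 * Write side: true while any "CPU" that was mid-section at record time
 * is still in that same section (odd snapshot, counter unchanged).
 */
static bool still_waiting(void)
{
	for (int i = 0; i < NR_CPUS; i++) {
		if (!(snapshot[i] & 1))
			continue;	/* was outside a section: passes */
		if (atomic_load(&section[i]) == snapshot[i])
			return true;	/* still inside the same section */
	}
	return false;
}

static void *reader(void *arg)
{
	int cpu = (int)(long)arg;

	while (!atomic_load(&stop)) {
		critical_section_enter(cpu);
		/* ... look up cell assignment and enqueue here ... */
		usleep(1000);
		critical_section_exit(cpu);
	}
	return NULL;
}

int main(void)
{
	pthread_t tid[NR_CPUS];

	for (long i = 0; i < NR_CPUS; i++)
		pthread_create(&tid[i], NULL, reader, (void *)i);

	usleep(10000);		/* let readers get going */
	record();		/* new assignments published; snapshot counters */
	while (still_waiting())	/* mirrors critical_section_poll() != 0 */
		usleep(100);
	printf("grace period elapsed: all in-flight sections done\n");

	atomic_store(&stop, true);
	for (int i = 0; i < NR_CPUS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}

Note the polarity in the patch itself: critical_section_poll() returns 1 while some CPU is still inside a section that was observed at record time, so the drain path simply returns early from the tick until a later tick finds the grace period over. The writer never blocks readers, and each poll is bounded by at most one outstanding critical section per CPU, which keeps the wait to roughly one select_cpu/enqueue window.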