Mirror of https://github.com/sched-ext/scx.git (synced 2024-11-28 05:30:24 +00:00)
scx_mitosis: add RCU-like synchronization
scx_mitosis relied on the implicit assumption that after a sched tick, all outstanding scheduling events had completed, but this might not actually be correct. This feels like a natural use case for RCU, but there is no way to directly make use of RCU in BPF. Instead, this commit implements an RCU-like synchronization mechanism.

Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
parent 0f29854e51
commit af2cb1abbe
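Before the diff, a sketch of the idea in the commit message: each CPU bumps a per-CPU counter when it enters and again when it leaves a scheduling critical section, so an odd value means "currently inside". After publishing new cell assignments, the writer snapshots every counter and then polls until each CPU that was inside at snapshot time has bumped its counter again. The following is a minimal single-threaded userspace model of that protocol, not the BPF code itself; the names (cs_enter, cs_exit, cs_record, cs_quiescent, NR_CPUS) are made up for illustration, and it deliberately omits the per-CPU maps and READ_ONCE/WRITE_ONCE ordering the real implementation needs.

#include <assert.h>

#define NR_CPUS 4 /* illustrative; the BPF code sizes this by MAX_CPUS */

static unsigned int cs_counter[NR_CPUS];  /* per-CPU event counter */
static unsigned int cs_snapshot[NR_CPUS]; /* writer's recorded snapshot */

/* Read side: one bump on entry and one on exit, so the LSB tracks
 * "currently inside a critical section". */
static void cs_enter(int cpu) { cs_counter[cpu]++; }
static void cs_exit(int cpu) { cs_counter[cpu]++; }

/* Write side, step 1: snapshot every CPU after publishing new assignments. */
static void cs_record(void)
{
	for (int i = 0; i < NR_CPUS; i++)
		cs_snapshot[i] = cs_counter[i];
}

/* Write side, step 2: all clear once every CPU that was mid-critical-section
 * at record time (odd snapshot) has bumped its counter at least once. */
static int cs_quiescent(void)
{
	for (int i = 0; i < NR_CPUS; i++) {
		if (!(cs_snapshot[i] & 1))
			continue; /* was outside, nothing to wait for */
		if (cs_counter[i] == cs_snapshot[i])
			return 0; /* still inside the old critical section */
	}
	return 1;
}

int main(void)
{
	cs_enter(1);             /* CPU 1 is mid-enqueue */
	cs_record();             /* writer records after publishing */
	assert(!cs_quiescent()); /* must keep waiting for CPU 1 */
	cs_exit(1);
	assert(cs_quiescent());  /* every in-flight section has finished */
	return 0;
}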
@@ -235,6 +235,91 @@ static inline const struct cpumask *lookup_cell_cpumask(int idx)
 	return (const struct cpumask *)cpumaskw->cpumask;
 }
 
+/*
+ * This is an RCU-like implementation to keep track of scheduling events so we
+ * can establish when cell assignments have propagated completely.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__type(key, u32);
+	__type(value, u32);
+	__uint(max_entries, 1);
+} percpu_critical_sections SEC(".maps");
+
+/* Same implementation for enter/exit */
+static __always_inline int critical_section()
+{
+	u32 zero = 0;
+	u32 *data;
+
+	if (!(data = bpf_map_lookup_elem(&percpu_critical_sections, &zero))) {
+		scx_bpf_error("no percpu_critical_sections");
+		return -1;
+	}
+
+	/*
+	 * Bump the counter, the LSB indicates we are in a critical section and the
+	 * rest of the bits keep track of how many critical sections.
+	 */
+	WRITE_ONCE(*data, *data + 1);
+	return 0;
+}
+
+#define critical_section_enter() critical_section()
+#define critical_section_exit() critical_section()
+
+u32 critical_section_state[MAX_CPUS];
+/*
+ * Write side will record the current state and then poll to check that the
+ * generation has advanced (somewhat like call_rcu)
+ */
+static __always_inline int critical_section_record()
+{
+	u32 zero = 0;
+	u32 *data;
+	int nr_cpus = nr_possible_cpus;
+	if (nr_cpus > MAX_CPUS)
+		nr_cpus = MAX_CPUS;
+
+	for (int i = 0; i < nr_cpus; ++i) {
+		if (!(data = bpf_map_lookup_percpu_elem(
+			      &percpu_critical_sections, &zero, i))) {
+			scx_bpf_error("no percpu_critical_sections");
+			return -1;
+		}
+
+		critical_section_state[i] = READ_ONCE(*data);
+	}
+	return 0;
+}
+
+static __always_inline int critical_section_poll()
+{
+	u32 zero = 0;
+	u32 *data;
+
+	int nr_cpus = nr_possible_cpus;
+	if (nr_cpus > MAX_CPUS)
+		nr_cpus = MAX_CPUS;
+
+	for (int i = 0; i < nr_cpus; ++i) {
+		/* If not in a critical section at the time of record, then it passes */
+		if (!(critical_section_state[i] & 1))
+			continue;
+
+		if (!(data = bpf_map_lookup_percpu_elem(
+			      &percpu_critical_sections, &zero, i))) {
+			scx_bpf_error("no percpu_critical_sections");
+			return -1;
+		}
+
+		if (READ_ONCE(*data) == critical_section_state[i])
+			return 1;
+	}
+
+	return 0;
+}
+
 /*
  * Along with a user_global_seq bump, indicates that cgroup->cell assignment
  * changed
@@ -264,6 +349,16 @@ int BPF_PROG(sched_tick_fentry)
 	 * scheduler tick. This is a crude way of mimicing RCU synchronization.
 	 */
 	if (READ_ONCE(draining)) {
+		if (critical_section_poll())
+			return 0;
+		/* FIXME: If a cell is being destroyed, we need to make sure that dsq is
+		 * drained before removing it from all the cpus
+		 *
+		 * Additionally, the handling of pinned tasks is broken here - we send
+		 * them to a cell DSQ if there's overlap of the cell's CPUs and the
+		 * task's cpumask but if the cell's CPU change we might stall the
+		 * task indefinitely.
+		 */
 		bpf_for(cpu_idx, 0, nr_possible_cpus)
 		{
 			if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx)))
@@ -422,6 +517,11 @@ int BPF_PROG(sched_tick_fentry)
 	/* Bump the global seq last to ensure that prior stores are now visible. This synchronizes with the read of global_seq */
 	barrier();
 	WRITE_ONCE(global_seq, global_seq + 1);
+	/*
+	 * On subsequent ticks we'll check that all in-flight enqueues are done so
+	 * we can clear the prev_cell for each cpu. Record the state here.
+	 */
+	critical_section_record();
 	return 0;
 }
 
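The ordering in this hunk matters: the new assignments and the global_seq bump are published before critical_section_record(), so any critical section entered after the snapshot already observes the new state via maybe_refresh_cell(); the poll only has to outwait sections that were in flight at snapshot time. A hypothetical walkthrough with made-up counter values (not scheduler code) shows why the simple equality check in critical_section_poll() is sufficient even if a CPU has re-entered a fresh critical section by the time the writer polls again:

#include <assert.h>

int main(void)
{
	unsigned int counter = 5;        /* odd: this CPU was inside at record time */
	unsigned int snapshot = counter; /* critical_section_record() */

	assert(counter == snapshot);     /* poll() would return 1: keep waiting */

	counter++;                       /* the CPU runs critical_section_exit() */
	assert(counter != snapshot);     /* poll() stops waiting on this CPU... */

	counter++;                       /* ...even if it immediately re-enters: that
	                                  * section began after the new assignments
	                                  * were published */
	assert(counter != snapshot);
	return 0;
}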
@@ -611,8 +711,17 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu,
 	if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)))
 		return prev_cpu;
 
-	if (maybe_refresh_cell(p, tctx) < 0)
-		return prev_cpu;
+	/*
+	 * This is a lightweight (RCU-like) critical section covering from when we
+	 * refresh cell information to when we enqueue onto the task's assigned
+	 * cell's DSQ. This allows us to publish new cell assignments and establish
+	 * a point at which all future enqueues will be on the new assignments.
+	 */
+	critical_section_enter();
+	if (maybe_refresh_cell(p, tctx) < 0) {
+		cpu = prev_cpu;
+		goto out;
+	}
 
 	if ((cpu = pick_idle_cpu(p, prev_cpu, cctx, tctx)) >= 0) {
 		cstat_inc(CSTAT_LOCAL, tctx->cell, cctx);
@@ -622,10 +731,12 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu,
 		scx_bpf_error(
 			"select_cpu returned cpu %d belonging to cell %d but task belongs to cell %d",
 			cpu, cctx->cell, tctx->cell);
-		return cpu;
+		goto out;
 	}
 
-	return prev_cpu;
+	cpu = prev_cpu;
+out:
+	critical_section_exit();
+	return cpu;
 }
 
 static __always_inline bool pick_idle_cpu_and_kick(struct task_struct *p,
@@ -661,11 +772,18 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags)
 	if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)))
 		return;
 
+	/*
+	 * This is a lightweight (RCU-like) critical section covering from when we
+	 * refresh cell information to when we enqueue onto the task's assigned
+	 * cell's DSQ. This allows us to publish new cell assignments and establish
+	 * a point at which all future enqueues will be on the new assignments.
+	 */
+	critical_section_enter();
 	if (maybe_refresh_cell(p, tctx) < 0)
-		return;
+		goto out;
 
 	if (!(cell = lookup_cell(tctx->cell)))
-		return;
+		goto out;
 
 	/*
 	 * Limit the amount of budget that an idling task can accumulate
@@ -689,6 +807,8 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags)
 	 */
 	if (!(enq_flags & SCX_ENQ_WAKEUP))
 		pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx);
+out:
+	critical_section_exit();
 }
 
 void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev)