scx_mitosis: add RCU-like synchronization

scx_mitosis relied on the implicit assumption that after a sched tick,
all outstanding scheduling events had completed, but this might not
actually be correct. This feels like a natural use case for RCU, but
there is no way to directly make use of RCU in BPF. Instead, this commit
implements an RCU-like synchronization mechanism.

Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Dan Schatzberg 2024-11-06 08:33:29 -08:00
parent 0f29854e51
commit af2cb1abbe
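
The mechanism added below amounts to a per-CPU event counter whose low bit
means "currently inside a critical section": the scheduling paths bump the
counter on entry and again on exit, while the writer snapshots every CPU's
counter after publishing a change and then polls until each CPU that was
mid-section at snapshot time has moved past it. The following plain-C sketch
models that protocol outside of BPF as a simplified, single-threaded model
(no READ_ONCE/WRITE_ONCE, no per-CPU maps); all names here (counter,
snapshot, record, grace_period_elapsed, NCPU) are illustrative stand-ins and
do not appear in the scheduler. A usage sketch for both sides follows the
diff.

/*
 * Simplified model of the RCU-like protocol in this commit.
 * An odd counter value means that CPU is inside a critical section.
 */
#include <stdbool.h>
#include <stdint.h>

#define NCPU 4

static uint32_t counter[NCPU];	/* per-CPU event counters            */
static uint32_t snapshot[NCPU];	/* writer's view at publication time */

/* Reader side: called once on entry and once on exit. */
static void critical_section_toggle(int cpu)
{
	counter[cpu]++;		/* odd = inside, even = outside */
}

/* Writer side: snapshot all counters right after publishing new state. */
static void record(void)
{
	for (int cpu = 0; cpu < NCPU; cpu++)
		snapshot[cpu] = counter[cpu];
}

/* Writer side: true once no CPU can still be using the old state. */
static bool grace_period_elapsed(void)
{
	for (int cpu = 0; cpu < NCPU; cpu++) {
		if (!(snapshot[cpu] & 1))
			continue;	/* was outside a critical section */
		if (counter[cpu] == snapshot[cpu])
			return false;	/* still inside that same section */
	}
	return true;
}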


@@ -235,6 +235,91 @@ static inline const struct cpumask *lookup_cell_cpumask(int idx)
return (const struct cpumask *)cpumaskw->cpumask;
}
/*
* This is an RCU-like implementation to keep track of scheduling events so we
* can establish when cell assignments have propagated completely.
*/
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, u32);
__type(value, u32);
__uint(max_entries, 1);
} percpu_critical_sections SEC(".maps");
/* Same implementation for enter/exit */
static __always_inline int critical_section()
{
u32 zero = 0;
u32 *data;
if (!(data = bpf_map_lookup_elem(&percpu_critical_sections, &zero))) {
scx_bpf_error("no percpu_critical_sections");
return -1;
}
/*
* Bump the counter; the LSB indicates we are in a critical section and the
* rest of the bits keep track of how many critical sections we have entered.
*/
WRITE_ONCE(*data, *data + 1);
return 0;
}
#define critical_section_enter() critical_section()
#define critical_section_exit() critical_section()
u32 critical_section_state[MAX_CPUS];
/*
* Write side will record the current state and then poll to check that the
* generation has advanced (somewhat like call_rcu)
*/
static __always_inline int critical_section_record()
{
u32 zero = 0;
u32 *data;
int nr_cpus = nr_possible_cpus;
if (nr_cpus > MAX_CPUS)
nr_cpus = MAX_CPUS;
for (int i = 0; i < nr_cpus; ++i) {
if (!(data = bpf_map_lookup_percpu_elem(
&percpu_critical_sections, &zero, i))) {
scx_bpf_error("no percpu_critical_sections");
return -1;
}
critical_section_state[i] = READ_ONCE(*data);
}
return 0;
}
static __always_inline int critical_section_poll()
{
u32 zero = 0;
u32 *data;
int nr_cpus = nr_possible_cpus;
if (nr_cpus > MAX_CPUS)
nr_cpus = MAX_CPUS;
for (int i = 0; i < nr_cpus; ++i) {
/* If not in a critical section at the time of record, then it passes */
if (!(critical_section_state[i] & 1))
continue;
if (!(data = bpf_map_lookup_percpu_elem(
&percpu_critical_sections, &zero, i))) {
scx_bpf_error("no percpu_critical_sections");
return -1;
}
if (READ_ONCE(*data) == critical_section_state[i])
return 1;
}
return 0;
}
/*
* Along with a user_global_seq bump, indicates that cgroup->cell assignment
* changed
@@ -264,6 +349,16 @@ int BPF_PROG(sched_tick_fentry)
* scheduler tick. This is a crude way of mimicking RCU synchronization.
*/
if (READ_ONCE(draining)) {
if (critical_section_poll())
return 0;
/* FIXME: If a cell is being destroyed, we need to make sure that its DSQ is
* drained before removing it from all the cpus
*
* Additionally, the handling of pinned tasks is broken here - we send
* them to a cell DSQ if there's overlap of the cell's CPUs and the
task's cpumask, but if the cell's CPUs change we might stall the
* task indefinitely.
*/
bpf_for(cpu_idx, 0, nr_possible_cpus)
{
if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx)))
@@ -422,6 +517,11 @@ int BPF_PROG(sched_tick_fentry)
/* Bump the global seq last to ensure that prior stores are now visible. This synchronizes with the read of global_seq */
barrier();
WRITE_ONCE(global_seq, global_seq + 1);
/*
* On subsequent ticks we'll check that all in-flight enqueues are done so
* we can clear the prev_cell for each cpu. Record the state here.
*/
critical_section_record();
return 0;
}
@@ -611,8 +711,17 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu,
if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)))
return prev_cpu;
if (maybe_refresh_cell(p, tctx) < 0)
return prev_cpu;
/*
* This is a lightweight (RCU-like) critical section covering from when we
* refresh cell information to when we enqueue onto the task's assigned
* cell's DSQ. This allows us to publish new cell assignments and establish
* a point at which all future enqueues will be on the new assignments.
*/
critical_section_enter();
if (maybe_refresh_cell(p, tctx) < 0) {
cpu = prev_cpu;
goto out;
}
if ((cpu = pick_idle_cpu(p, prev_cpu, cctx, tctx)) >= 0) {
cstat_inc(CSTAT_LOCAL, tctx->cell, cctx);
@@ -622,10 +731,12 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu,
scx_bpf_error(
"select_cpu returned cpu %d belonging to cell %d but task belongs to cell %d",
cpu, cctx->cell, tctx->cell);
return cpu;
goto out;
}
return prev_cpu;
cpu = prev_cpu;
out:
critical_section_exit();
return cpu;
}
static __always_inline bool pick_idle_cpu_and_kick(struct task_struct *p,
@@ -661,11 +772,18 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags)
if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)))
return;
/*
* This is a lightweight (RCU-like) critical section covering from when we
* refresh cell information to when we enqueue onto the task's assigned
* cell's DSQ. This allows us to publish new cell assignments and establish
* a point at which all future enqueues will be on the new assignments.
*/
critical_section_enter();
if (maybe_refresh_cell(p, tctx) < 0)
return;
goto out;
if (!(cell = lookup_cell(tctx->cell)))
return;
goto out;
/*
* Limit the amount of budget that an idling task can accumulate
@@ -689,6 +807,8 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags)
*/
if (!(enq_flags & SCX_ENQ_WAKEUP))
pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx);
out:
critical_section_exit();
}
void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev)
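
To tie the pieces together, here is how the two sides interact, sketched
against the model shown before the diff. This is illustrative only:
reader_path, publish_new_assignments, tick, and the draining flag are
hypothetical stand-ins for the critical sections in mitosis_select_cpu and
mitosis_enqueue and for the global_seq/draining handling in
sched_tick_fentry.

/* Reader path (select_cpu/enqueue): wrap refresh + enqueue in a section. */
static void reader_path(int cpu)
{
	critical_section_toggle(cpu);	/* enter: counter becomes odd  */
	/* ... refresh cell assignment and enqueue onto the cell's DSQ ... */
	critical_section_toggle(cpu);	/* exit: counter becomes even  */
}

/* Writer path: publish, snapshot, then poll on later ticks. */
static bool draining;

static void publish_new_assignments(void)
{
	/* ... update cgroup->cell mappings and bump the generation ... */
	record();		/* snapshot per-CPU counters */
	draining = true;
}

static void tick(void)
{
	if (!draining)
		return;
	if (!grace_period_elapsed())
		return;		/* some CPU may still see the old state; retry */
	/* Safe to clear per-CPU prev_cell and tear down old cell state. */
	draining = false;
}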