bpf_rustland: do not dispatch the scheduler to the global DSQ

Never dispatch the user-space scheduler to the global DSQ while all
the other tasks are dispatched to the local per-CPU DSQs.

Since tasks are consumed from the local DSQs first and then from the
global DSQ, we may end up starving the scheduler if it is the only
task dispatched to the global DSQ.
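
To make the ordering concrete, here is a minimal sketch of the pick
logic (consume_dsq() and local_dsq_of() are hypothetical helpers used
for illustration only, not sched_ext API):

  static struct task_struct *pick_next_task_sketch(s32 cpu)
  {
          struct task_struct *p;

          /* The per-CPU local DSQ is always drained first... */
          p = consume_dsq(local_dsq_of(cpu));
          if (p)
                  return p;
          /*
           * ...and the global DSQ is only a fallback: a scheduler
           * dispatched exclusively here can starve for as long as
           * the local DSQs keep being refilled.
           */
          return consume_dsq(SCX_DSQ_GLOBAL);
  }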

In fact, it is really easy to trigger a stall with a workload that
generates many context switches in the system, for example (on an
8-core system):

 $ stress-ng --cpu 32 --iomix 4 --vm 2 --vm-bytes 128M --fork 4 --timeout 30s

 ...
 09:28:11 [WARN] EXIT: scx_rustland[1455943] failed to run for 5.275s
 09:28:11 [INFO] Unregister RustLand scheduler

To prevent this from happening, also dispatch the user-space scheduler
to a local DSQ: use the current CPU where .dispatch() is called, if
possible, or the previously used CPU otherwise.

Apply the same logic when the scheduler is congested: dispatch the
task to the local DSQ of its previously used CPU.

In this way all tasks always get the same "dispatch priority",
preventing the scheduler starvation issue.
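
For reference, every dispatch now funnels through dispatch_on_cpu(),
which (as shown in the diff below) boils down to:

  /* Fall back to the previously used CPU if @cpu is not usable. */
  if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
          cpu = scx_bpf_task_cpu(p);
  scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns,
                   enq_flags | SCX_ENQ_LOCAL);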

Note that with this change in place dispatch_global() is never used,
so we can get rid of it.

Signed-off-by: Andrea Righi <andrea.righi@canonical.com>

@@ -252,20 +252,17 @@ static void dispatch_local(struct task_struct *p, u64 enq_flags)
  */
 static void dispatch_on_cpu(struct task_struct *p, s32 cpu, u64 enq_flags)
 {
+	/*
+	 * If it's not possible to dispatch on the selected CPU, re-use the
+	 * previously used one.
+	 */
+	if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
+		cpu = scx_bpf_task_cpu(p);
 	dbg_msg("%s: pid=%d cpu=%ld", __func__, p->pid, cpu);
 	scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns,
 			 enq_flags | SCX_ENQ_LOCAL);
 }
 
-/*
- * Dispatch a task on the global FIFO.
- */
-static void dispatch_global(struct task_struct *p, u64 enq_flags)
-{
-	dbg_msg("%s: pid=%d", __func__, p->pid);
-	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, enq_flags);
-}
-
 /*
  * Select the target CPU where a task can be directly dispatched to from
  * .enqueue().
@@ -389,14 +386,15 @@ void BPF_STRUCT_OPS(rustland_enqueue, struct task_struct *p, u64 enq_flags)
 	 * processed by the user-space scheduler.
 	 *
 	 * If @queued list is full (user-space scheduler is congested) tasks
-	 * will be dispatched directly from the kernel to the global FIFO.
+	 * will be dispatched directly from the kernel (re-using their
+	 * previously used CPU in this case).
 	 */
 	get_task_info(&task, p);
 	dbg_msg("enqueue: pid=%d", task.pid);
 	if (bpf_map_push_elem(&queued, &task, 0)) {
 		dbg_msg("scheduler congested: pid=%d", task.pid);
 		__sync_fetch_and_add(&nr_sched_congested, 1);
-		dispatch_global(p, enq_flags);
+		dispatch_on_cpu(p, task.cpu, enq_flags);
 		__sync_fetch_and_add(&nr_kernel_dispatches, 1);
 		return;
 	}
@@ -418,7 +416,11 @@ static void dispatch_user_scheduler(void)
 		scx_bpf_error("Failed to find usersched task %d", usersched_pid);
 		return;
 	}
-	dispatch_global(p, 0);
+	/*
+	 * Always try to dispatch the user-space scheduler on the current CPU,
+	 * if possible.
+	 */
+	dispatch_on_cpu(p, bpf_get_smp_processor_id(), 0);
 	__sync_fetch_and_add(&nr_kernel_dispatches, 1);
 	bpf_task_release(p);
 }
@@ -439,7 +441,6 @@ void BPF_STRUCT_OPS(rustland_dispatch, s32 cpu, struct task_struct *prev)
 	bpf_repeat(MAX_ENQUEUED_TASKS) {
 		struct task_struct *p;
 		struct dispatched_task_ctx task;
-		s32 prev_cpu;
 
 		if (!scx_bpf_dispatch_nr_slots())
 			break;
@@ -459,10 +460,7 @@ void BPF_STRUCT_OPS(rustland_dispatch, s32 cpu, struct task_struct *prev)
 		 */
 		dbg_msg("usersched: pid=%d cpu=%d payload=%llu",
 			task.pid, task.cpu, task.payload);
-		if (bpf_cpumask_test_cpu(task.cpu, p->cpus_ptr))
-			dispatch_on_cpu(p, task.cpu, 0);
-		else
-			dispatch_global(p, 0);
+		dispatch_on_cpu(p, task.cpu, 0);
 		__sync_fetch_and_add(&nr_user_dispatches, 1);
 		bpf_task_release(p);
 	}