bpf_rustland: do not dispatch the scheduler to the global DSQ

Never dispatch the user-space scheduler to the global DSQ while all
the other tasks are dispatched to the local per-CPU DSQs.

Since tasks are consumed from the local DSQs first and then from the
global DSQ, we may end up starving the scheduler if it is the only
task dispatched to the global DSQ.
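
To make the ordering concrete, here is a minimal sketch of the pick
logic (consume_dsq() and local_dsq_of() are hypothetical helpers used
for illustration only, not sched_ext API):

  static struct task_struct *pick_next_task_sketch(s32 cpu)
  {
          struct task_struct *p;

          /* The per-CPU local DSQ is always drained first... */
          p = consume_dsq(local_dsq_of(cpu));
          if (p)
                  return p;
          /*
           * ...and the global DSQ is only a fallback: a scheduler
           * dispatched exclusively here can starve for as long as
           * the local DSQs keep being refilled.
           */
          return consume_dsq(SCX_DSQ_GLOBAL);
  }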

In fact, it is really easy to trigger a stall with a workload that
generates many context switches in the system, for example (on an
8-core system):

 $ stress-ng --cpu 32 --iomix 4 --vm 2 --vm-bytes 128M --fork 4 --timeout 30s

 ...
 09:28:11 [WARN] EXIT: scx_rustland[1455943] failed to run for 5.275s
 09:28:11 [INFO] Unregister RustLand scheduler

To prevent this from happening, also dispatch the user-space scheduler
to a local DSQ: use the current CPU where .dispatch() is called, if
possible, or the previously used CPU otherwise.

Apply the same logic when the scheduler is congested: dispatch the
task to the local DSQ of its previously used CPU.

In this way all tasks always get the same "dispatch priority",
preventing the scheduler starvation issue.
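
For reference, every dispatch now funnels through dispatch_on_cpu(),
which (as shown in the diff below) boils down to:

  /* Fall back to the previously used CPU if @cpu is not usable. */
  if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
          cpu = scx_bpf_task_cpu(p);
  scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns,
                   enq_flags | SCX_ENQ_LOCAL);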

Note that with this change in place dispatch_global() is never used,
so we can get rid of it.

Signed-off-by: Andrea Righi <andrea.righi@canonical.com>

@@ -252,20 +252,17 @@ static void dispatch_local(struct task_struct *p, u64 enq_flags)
  */
 static void dispatch_on_cpu(struct task_struct *p, s32 cpu, u64 enq_flags)
 {
+	/*
+	 * If it's not possible to dispatch on the selected CPU, re-use the
+	 * previously used one.
+	 */
+	if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
+		cpu = scx_bpf_task_cpu(p);
 	dbg_msg("%s: pid=%d cpu=%ld", __func__, p->pid, cpu);
 	scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns,
 			 enq_flags | SCX_ENQ_LOCAL);
 }
 
-/*
- * Dispatch a task on the global FIFO.
- */
-static void dispatch_global(struct task_struct *p, u64 enq_flags)
-{
-	dbg_msg("%s: pid=%d", __func__, p->pid);
-	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, enq_flags);
-}
-
 /*
  * Select the target CPU where a task can be directly dispatched to from
  * .enqueue().
@@ -389,14 +386,15 @@ void BPF_STRUCT_OPS(rustland_enqueue, struct task_struct *p, u64 enq_flags)
 	 * processed by the user-space scheduler.
 	 *
 	 * If @queued list is full (user-space scheduler is congested) tasks
-	 * will be dispatched directly from the kernel to the global FIFO.
+	 * will be dispatched directly from the kernel (re-using their
+	 * previously used CPU in this case).
 	 */
 	get_task_info(&task, p);
 	dbg_msg("enqueue: pid=%d", task.pid);
 	if (bpf_map_push_elem(&queued, &task, 0)) {
 		dbg_msg("scheduler congested: pid=%d", task.pid);
 		__sync_fetch_and_add(&nr_sched_congested, 1);
-		dispatch_global(p, enq_flags);
+		dispatch_on_cpu(p, task.cpu, enq_flags);
 		__sync_fetch_and_add(&nr_kernel_dispatches, 1);
 		return;
 	}
@@ -418,7 +416,11 @@ static void dispatch_user_scheduler(void)
 		scx_bpf_error("Failed to find usersched task %d", usersched_pid);
 		return;
 	}
-	dispatch_global(p, 0);
+	/*
+	 * Always try to dispatch the user-space scheduler on the current CPU,
+	 * if possible.
+	 */
+	dispatch_on_cpu(p, bpf_get_smp_processor_id(), 0);
 	__sync_fetch_and_add(&nr_kernel_dispatches, 1);
 	bpf_task_release(p);
 }
@@ -439,7 +441,6 @@ void BPF_STRUCT_OPS(rustland_dispatch, s32 cpu, struct task_struct *prev)
 	bpf_repeat(MAX_ENQUEUED_TASKS) {
 		struct task_struct *p;
 		struct dispatched_task_ctx task;
-		s32 prev_cpu;
 
 		if (!scx_bpf_dispatch_nr_slots())
 			break;
@@ -459,10 +460,7 @@ void BPF_STRUCT_OPS(rustland_dispatch, s32 cpu, struct task_struct *prev)
 		 */
 		dbg_msg("usersched: pid=%d cpu=%d payload=%llu",
 			task.pid, task.cpu, task.payload);
-		if (bpf_cpumask_test_cpu(task.cpu, p->cpus_ptr))
-			dispatch_on_cpu(p, task.cpu, 0);
-		else
-			dispatch_global(p, 0);
+		dispatch_on_cpu(p, task.cpu, 0);
 		__sync_fetch_and_add(&nr_user_dispatches, 1);
 		bpf_task_release(p);
 	}