From b7a9d3775a2fbb1ed5d9e58ea508beb598a03249 Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Wed, 3 Jan 2024 01:00:53 +0100
Subject: [PATCH 1/3] scx_rustland: schedule non-cpu intensive kthreads normally

With commit a7677fd ("scx_rustland: bypass user-space scheduler for
short-lived kthreads") we were trying to mitigate a problem that was
actually introduced by using the wrong formula to evaluate weighted
vruntime, see commit 2900b20 ("scx_rustland: evaluate the proper
vruntime delta").

Reverting that (pseudo-)optimization doesn't seem to introduce any
performance/latency regression and it makes the code more elegant,
therefore drop it.

Signed-off-by: Andrea Righi
---
 scheds/rust/scx_rustland/src/bpf/main.bpf.c | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/scheds/rust/scx_rustland/src/bpf/main.bpf.c b/scheds/rust/scx_rustland/src/bpf/main.bpf.c
index 661fc80..74c00b2 100644
--- a/scheds/rust/scx_rustland/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_rustland/src/bpf/main.bpf.c
@@ -319,19 +319,6 @@ static bool is_task_cpu_available(struct task_struct *p, u64 enq_flags)
 	if (is_kthread(p) && p->nr_cpus_allowed == 1)
 		return true;
 
-	/*
-	 * Moreover, immediately dispatch kthreads that still have more than
-	 * half of their runtime budget. As they are likely to release the CPU
-	 * soon, granting them a substantial priority boost can enhance the
-	 * overall system performance.
-	 *
-	 * In the event that one of these kthreads turns into a CPU hog, it
-	 * will deplete its runtime budget and therefore it will be scheduled
-	 * like any other normal task.
-	 */
-	if (is_kthread(p) && p->scx.slice > slice_ns / 2)
-		return true;
-
 	/*
 	 * For regular tasks always rely on force_local to determine if we can
 	 * bypass the scheduler.
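
For context, the "proper vruntime delta" referenced above charges a task its
raw cputime delta scaled inversely by its weight (the same `* 100 / weight`
formula used by the user-space scheduler in src/main.rs). A minimal standalone
sketch of that accounting, with illustrative values only (the function name is
hypothetical, and 100 is assumed to be the default task weight):

    // Weighted vruntime delta: a task is charged its real cputime scaled
    // inversely by its weight, so higher-weight tasks accumulate virtual
    // time more slowly and therefore get picked to run more often.
    fn vruntime_delta(exec_delta_ns: u64, weight: u64) -> u64 {
        exec_delta_ns * 100 / weight
    }

    fn main() {
        // Two tasks that each consumed 10ms of real CPU time:
        let normal = vruntime_delta(10_000_000, 100);  // default weight
        let boosted = vruntime_delta(10_000_000, 200); // double weight
        assert_eq!(normal, 10_000_000);
        assert_eq!(boosted, 5_000_000); // charged half the virtual time
    }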

From 50b5f6e8c694330429485612ecaea6734e56e329 Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Wed, 3 Jan 2024 09:10:20 +0100
Subject: [PATCH 2/3] scx_rustland: do not update exiting tasks statistics

Avoid updating task information for tasks that are exiting, as this
information won't be used by the user-space scheduler.

Signed-off-by: Andrea Righi
---
 scheds/rust/scx_rustland/src/bpf/main.bpf.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/scheds/rust/scx_rustland/src/bpf/main.bpf.c b/scheds/rust/scx_rustland/src/bpf/main.bpf.c
index 74c00b2..88de56c 100644
--- a/scheds/rust/scx_rustland/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_rustland/src/bpf/main.bpf.c
@@ -339,13 +339,17 @@ static void get_task_info(struct queued_task_ctx *task,
 			  const struct task_struct *p, bool exiting)
 {
 	task->pid = p->pid;
-	task->sum_exec_runtime = p->se.sum_exec_runtime;
-	task->weight = p->scx.weight;
 	/*
 	 * Use a negative CPU number to notify that the task is exiting, so
 	 * that we can free up its resources in the user-space scheduler.
 	 */
-	task->cpu = exiting ? -1 : scx_bpf_task_cpu(p);
+	if (exiting) {
+		task->cpu = -1;
+		return;
+	}
+	task->sum_exec_runtime = p->se.sum_exec_runtime;
+	task->weight = p->scx.weight;
+	task->cpu = scx_bpf_task_cpu(p);
 }
 
 /*
@@ -555,7 +559,7 @@ s32 BPF_STRUCT_OPS(rustland_prep_enable, struct task_struct *p,
  */
 void BPF_STRUCT_OPS(rustland_disable, struct task_struct *p)
 {
-	struct queued_task_ctx task;
+	struct queued_task_ctx task = {};
 
 	dbg_msg("exiting: pid=%d", task.pid);
 	get_task_info(&task, p, true);
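
On the user-space side, the negative CPU number is all that is needed to tear
down the per-task state, which is why the runtime statistics can be skipped
entirely for exiting tasks. A rough sketch of what the consuming end could
look like (hypothetical and heavily simplified, not the actual scx_rustland
user-space code):

    use std::collections::HashMap;

    // Simplified user-space mirror of the BPF-side queued_task_ctx.
    struct QueuedTaskCtx {
        pid: i32,
        cpu: i32, // negative => the task is exiting
        sum_exec_runtime: u64,
        weight: u64,
    }

    #[derive(Default)]
    struct TaskInfo {
        sum_exec_runtime: u64,
        vruntime: u64,
    }

    fn handle_queued_task(tasks: &mut HashMap<i32, TaskInfo>, task: &QueuedTaskCtx) {
        if task.cpu < 0 {
            // Exiting task: free its resources; runtime/weight were never
            // filled in by the BPF side, so they must not be consumed here.
            tasks.remove(&task.pid);
            return;
        }
        // ...normal enqueue path: update vruntime, push into the task pool...
        let _ = (task.sum_exec_runtime, task.weight);
    }

    fn main() {
        let mut tasks: HashMap<i32, TaskInfo> = HashMap::new();
        tasks.insert(42, TaskInfo::default());

        // Simulate the exit notification produced by get_task_info(..., true).
        let exit = QueuedTaskCtx { pid: 42, cpu: -1, sum_exec_runtime: 0, weight: 0 };
        handle_queued_task(&mut tasks, &exit);
        assert!(!tasks.contains_key(&42));
    }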

From 5d9182d9c358b46e3d0a39d5bcc6ab73d9f1d05a Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Wed, 3 Jan 2024 11:11:02 +0100
Subject: [PATCH 3/3] scx_rustland: prioritize interactive workloads

The current implementation of the user-space scheduler strongly
prioritizes newly created tasks by setting their initial vruntime to
(min_vruntime + 1); this prioritization places them ahead of the other
tasks waiting to run.

While this approach is efficient for processing short-lived tasks, it
makes the scheduler vulnerable to fork-bomb attacks and significantly
penalizes interactive workloads (e.g., "foreground" applications),
particularly in the presence of background applications that spawn
multiple tasks, such as parallel builds.

Instead of prioritizing newly created tasks, do the opposite and
account (max_slice_ns / 2) to their initial vruntime, to make sure
they are not scheduled before the other tasks that are already waiting
for the CPU in the current scheduler run.

This mitigates potential fork-bomb attacks and strongly improves the
responsiveness of interactive applications (such as UIs, audio/video
streams, gaming, etc.).

With this change applied, under certain conditions, scx_rustland can
even outperform the default Linux scheduler. For example, with a
parallel kernel build (make -j32) running in the background, I can
play Terraria at a constant ~30-40 fps, while the default Linux
scheduler can only sustain ~20-30 fps under the same conditions.

Signed-off-by: Andrea Righi
---
 scheds/rust/scx_rustland/src/main.rs | 46 ++++++++++++++++++----------
 1 file changed, 29 insertions(+), 17 deletions(-)

diff --git a/scheds/rust/scx_rustland/src/main.rs b/scheds/rust/scx_rustland/src/main.rs
index d9e6fd3..91087bf 100644
--- a/scheds/rust/scx_rustland/src/main.rs
+++ b/scheds/rust/scx_rustland/src/main.rs
@@ -314,23 +314,40 @@ impl<'a> Scheduler<'a> {
         idle_cpus
     }
 
-    // Update task's vruntime based on the information collected from the kernel part.
+    // Update task's vruntime based on the information collected from the kernel.
+    //
+    // This method implements the main task ordering logic of the scheduler.
     fn update_enqueued(
         task_info: &mut TaskInfo,
         sum_exec_runtime: u64,
         weight: u64,
         min_vruntime: u64,
-        max_slice_ns: u64,
+        slice_ns: u64,
     ) {
-        // Add cputime delta normalized by weight to the vruntime (if delta > 0).
-        if sum_exec_runtime > task_info.sum_exec_runtime {
-            let delta = (sum_exec_runtime - task_info.sum_exec_runtime) * 100 / weight;
-            // Never account more than max_slice_ns. This helps to prevent starving a task for too
-            // long in the scheduler task pool.
-            task_info.vruntime += delta.min(max_slice_ns);
+        // Scale the maximum allowed time slice by a factor of 10 to increase the
+        // range of allowed time delta and give a better chance to prioritize tasks
+        // with shorter time delta / higher weight.
+        let max_slice_ns = slice_ns * 10;
+
+        // Evaluate last time slot used by the task, scaled by its priority (weight).
+        let mut delta = (sum_exec_runtime - task_info.sum_exec_runtime) * 100 / weight;
+
+        // Account (max_slice_ns / 2) to new tasks to avoid granting excessive priority without
+        // understanding their nature. This allows to mitigate potential system starvation caused
+        // by spawning a massive amount of tasks (e.g., fork-bomb attacks).
+        if task_info.sum_exec_runtime == 0 {
+            delta = max_slice_ns / 2;
         }
-        // Make sure vruntime is moving forward (> current minimum).
-        task_info.vruntime = task_info.vruntime.max(min_vruntime);
+
+        // Never account more than max_slice_ns, to prevent starving a task for too long in the
+        // scheduler task pool, but still give a range large enough to be able to prioritize
+        // tasks with short delta / higher weight.
+        task_info.vruntime += delta.min(max_slice_ns);
+
+        // Also make sure that the global vruntime is always progressing (at least by +1)
+        // during each scheduler run, to prevent excessive starvation of the other tasks
+        // sitting in the self.task_pool tree, waiting to be dispatched.
+        task_info.vruntime = task_info.vruntime.max(min_vruntime + 1);
 
         // Update total task cputime.
         task_info.sum_exec_runtime = sum_exec_runtime;
@@ -362,7 +379,7 @@ impl<'a> Scheduler<'a> {
             .tasks
             .entry(task.pid)
             .or_insert_with_key(|&_pid| TaskInfo {
-                sum_exec_runtime: task.sum_exec_runtime,
+                sum_exec_runtime: 0,
                 vruntime: self.min_vruntime,
             });
 
@@ -371,12 +388,7 @@ impl<'a> Scheduler<'a> {
             task_info,
             task.sum_exec_runtime,
             task.weight,
-            // Make sure the global vruntime is always progressing (at least by +1)
-            // during each scheduler run, providing a priority boost to newer tasks
-            // (that is still beneficial for potential short-lived tasks), while also
-            // preventing excessive starvation of the other tasks sitting in the
-            // self.task_pool tree, waiting to be dispatched.
-            self.min_vruntime + 1,
+            self.min_vruntime,
             self.skel.rodata().slice_ns,
         );
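
The effect of the new accounting is easier to see with concrete numbers.
Below, the reworked update_enqueued() logic is extracted into a standalone
sketch (names and logic follow the patch above; the 20ms slice and all the
values in main() are illustrative assumptions):

    struct TaskInfo {
        sum_exec_runtime: u64,
        vruntime: u64,
    }

    fn update_enqueued(
        task_info: &mut TaskInfo,
        sum_exec_runtime: u64,
        weight: u64,
        min_vruntime: u64,
        slice_ns: u64,
    ) {
        // Same logic as the patch, without the surrounding scheduler state.
        let max_slice_ns = slice_ns * 10;
        let mut delta = (sum_exec_runtime - task_info.sum_exec_runtime) * 100 / weight;
        // New tasks (no runtime recorded yet) are charged half the maximum slice.
        if task_info.sum_exec_runtime == 0 {
            delta = max_slice_ns / 2;
        }
        task_info.vruntime += delta.min(max_slice_ns);
        task_info.vruntime = task_info.vruntime.max(min_vruntime + 1);
        task_info.sum_exec_runtime = sum_exec_runtime;
    }

    fn main() {
        let slice_ns = 20_000_000; // assuming a 20ms time slice
        let min_vruntime = 1_000_000_000;

        // A newly created task starts at min_vruntime with no recorded runtime:
        // it is charged max_slice_ns / 2 = 100ms and queued *behind* the tasks
        // already waiting in the pool, instead of jumping ahead of them.
        let mut new_task = TaskInfo { sum_exec_runtime: 0, vruntime: min_vruntime };
        update_enqueued(&mut new_task, 5_000_000, 100, min_vruntime, slice_ns);
        assert_eq!(new_task.vruntime, min_vruntime + 100_000_000);

        // An already-known task that ran for 5ms is charged only its weighted delta.
        let mut old_task = TaskInfo { sum_exec_runtime: 5_000_000, vruntime: min_vruntime };
        update_enqueued(&mut old_task, 10_000_000, 100, min_vruntime, slice_ns);
        assert_eq!(old_task.vruntime, min_vruntime + 5_000_000);
    }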