Merge pull request #64 from arighi/improve-interactive-workloads

scx_rustland: improve interactive workloads
David Vernet 2024-01-03 12:10:26 -06:00 committed by GitHub
commit 9f1a3973d8
2 changed files with 37 additions and 34 deletions

@@ -319,19 +319,6 @@ static bool is_task_cpu_available(struct task_struct *p, u64 enq_flags)
if (is_kthread(p) && p->nr_cpus_allowed == 1)
return true;
/*
* Moreover, immediately dispatch kthreads that still have more than
* half of their runtime budget. As they are likely to release the CPU
* soon, granting them a substantial priority boost can enhance the
* overall system performance.
*
* In the event that one of these kthreads turns into a CPU hog, it
* will deplete its runtime budget and therefore it will be scheduled
* like any other normal task.
*/
if (is_kthread(p) && p->scx.slice > slice_ns / 2)
return true;
/*
* For regular tasks always rely on force_local to determine if we can
* bypass the scheduler.
@@ -352,13 +339,17 @@ static void get_task_info(struct queued_task_ctx *task,
const struct task_struct *p, bool exiting)
{
task->pid = p->pid;
task->sum_exec_runtime = p->se.sum_exec_runtime;
task->weight = p->scx.weight;
/*
* Use a negative CPU number to notify that the task is exiting, so
* that we can free up its resources in the user-space scheduler.
*/
task->cpu = exiting ? -1 : scx_bpf_task_cpu(p);
if (exiting) {
task->cpu = -1;
return;
}
task->sum_exec_runtime = p->se.sum_exec_runtime;
task->weight = p->scx.weight;
task->cpu = scx_bpf_task_cpu(p);
}
/*
@@ -568,7 +559,7 @@ s32 BPF_STRUCT_OPS(rustland_prep_enable, struct task_struct *p,
*/
void BPF_STRUCT_OPS(rustland_disable, struct task_struct *p)
{
struct queued_task_ctx task;
struct queued_task_ctx task = {};
dbg_msg("exiting: pid=%d", task.pid);
get_task_info(&task, p, true);
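For reference, the user-space scheduler can detect the exiting case by checking for the negative CPU number described in the comment above. Below is a minimal hypothetical sketch of such a consumer; the struct and field names are assumptions made for illustration and are not the actual scx_rustland user-space code:

use std::collections::HashMap;

// Hypothetical mirror of the kernel-side queued_task_ctx (illustrative only).
struct QueuedTask {
    pid: i32,
    cpu: i32,
}

// Hypothetical per-task state kept by the user-space scheduler.
struct TaskInfo {
    sum_exec_runtime: u64,
    vruntime: u64,
}

// Free the user-space state of a task once the kernel reports cpu == -1.
// Returns true if the task was treated as exiting.
fn handle_exit_notification(tasks: &mut HashMap<i32, TaskInfo>, task: &QueuedTask) -> bool {
    if task.cpu < 0 {
        tasks.remove(&task.pid);
        return true;
    }
    false
}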

@@ -314,23 +314,40 @@ impl<'a> Scheduler<'a> {
idle_cpus
}
// Update task's vruntime based on the information collected from the kernel part.
// Update task's vruntime based on the information collected from the kernel.
//
// This method implements the main task ordering logic of the scheduler.
fn update_enqueued(
task_info: &mut TaskInfo,
sum_exec_runtime: u64,
weight: u64,
min_vruntime: u64,
max_slice_ns: u64,
slice_ns: u64,
) {
// Add cputime delta normalized by weight to the vruntime (if delta > 0).
if sum_exec_runtime > task_info.sum_exec_runtime {
let delta = (sum_exec_runtime - task_info.sum_exec_runtime) * 100 / weight;
// Never account more than max_slice_ns. This helps to prevent starving a task for too
// long in the scheduler task pool.
task_info.vruntime += delta.min(max_slice_ns);
// Scale the maximum allowed time slice by a factor of 10 to increase the
// range of allowed time delta and give a better chance to prioritize tasks
// with shorter time delta / higher weight.
let max_slice_ns = slice_ns * 10;
// Evaluate last time slot used by the task, scaled by its priority (weight).
let mut delta = (sum_exec_runtime - task_info.sum_exec_runtime) * 100 / weight;
// Account (max_slice_ns / 2) to new tasks to avoid granting excessive priority without
// understanding their nature. This helps mitigate potential system starvation caused
// by spawning a massive number of tasks (e.g., fork-bomb attacks).
if task_info.sum_exec_runtime == 0 {
delta = max_slice_ns / 2;
}
// Make sure vruntime is moving forward (> current minimum).
task_info.vruntime = task_info.vruntime.max(min_vruntime);
// Never account more than max_slice_ns, to prevent starving a task for too long in the
// scheduler task pool, but still give a range large enough to be able to prioritize
// tasks with short delta / higher weight.
task_info.vruntime += delta.min(max_slice_ns);
// Also make sure that the global vruntime is always progressing (at least by +1)
// during each scheduler run, to prevent excessive starvation of the other tasks
// sitting in the self.task_pool tree, waiting to be dispatched.
task_info.vruntime = task_info.vruntime.max(min_vruntime + 1);
// Update total task cputime.
task_info.sum_exec_runtime = sum_exec_runtime;
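To make the ordering rule above easier to follow, here is a free-standing sketch that approximates the logic of update_enqueued; the function name, parameter layout, and the numbers in the example below are illustrative and not part of the commit:

// Free-standing approximation of the vruntime accounting introduced above
// (illustrative only; the authoritative logic is Scheduler::update_enqueued).
fn vruntime_update(
    vruntime: u64,         // task's current vruntime
    prev_runtime: u64,     // sum_exec_runtime seen at the previous update
    sum_exec_runtime: u64, // sum_exec_runtime just reported by the kernel
    weight: u64,           // scx weight (100 = default priority)
    min_vruntime: u64,     // global minimum vruntime
    slice_ns: u64,         // base time slice
) -> u64 {
    // Allow deltas up to 10x the base slice to widen the priority range.
    let max_slice_ns = slice_ns * 10;
    // CPU time used since the last update, scaled by weight:
    // a higher weight results in a smaller charged delta.
    let mut delta = (sum_exec_runtime - prev_runtime) * 100 / weight;
    // Brand new tasks (no runtime observed yet) are charged half of the
    // maximum slice, so a burst of forked tasks cannot monopolize the pool.
    if prev_runtime == 0 {
        delta = max_slice_ns / 2;
    }
    // Never lag behind the global minimum, cap the charged delta, and
    // always advance by at least +1 over the global minimum.
    (vruntime.max(min_vruntime) + delta.min(max_slice_ns)).max(min_vruntime + 1)
}

For example, with a 5 ms base slice a default-weight task (weight = 100) that ran for 1 ms since its last update is charged 1 ms of vruntime, while a weight-1000 task that ran for the same time is charged only 0.1 ms, so it sorts earlier in the task pool on the next dispatch round.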
@@ -362,7 +379,7 @@ impl<'a> Scheduler<'a> {
.tasks
.entry(task.pid)
.or_insert_with_key(|&_pid| TaskInfo {
sum_exec_runtime: task.sum_exec_runtime,
sum_exec_runtime: 0,
vruntime: self.min_vruntime,
});
@@ -371,12 +388,7 @@ impl<'a> Scheduler<'a> {
task_info,
task.sum_exec_runtime,
task.weight,
// Make sure the global vruntime is always progressing (at least by +1)
// during each scheduler run, providing a priority boost to newer tasks
// (that is still beneficial for potential short-lived tasks), while also
// preventing excessive starvation of the other tasks sitting in the
// self.task_pool tree, waiting to be dispatched.
self.min_vruntime + 1,
self.min_vruntime,
self.skel.rodata().slice_ns,
);