mirror of
https://github.com/JakeHillion/scx.git
synced 2024-11-29 12:40:24 +00:00
scx_rustland: aggressively prioritize interactive tasks
scx_rustland was originally designed as a PoC to showcase the benefits of implementing specialized schedulers via sched_ext, focusing on a very specific use case: prioritize game responsiveness regardless of what runs in the background. Its original design was subsequently modified to better serve as a general-purpose scheduler, balancing the prioritization of interactive tasks with CPU-intensive ones to prevent over-prioritization. With scx_bpfland serving as a more "general-purpose" scheduler, it makes sense to revisit scx_rustland's original goal and make it much more aggressive at prioritizing interactive tasks, determined as a function of their average number of context switches. This change makes scx_rustland again a really good PoC to showcase the benefits of having specialized schedulers, by focusing only on a very specific use case: provide a high and stable frames-per-second (fps) rate while a kernel build is running in the background. = Results = - Test: Run a WebGL application [1] while building the kernel (make -j32) - Hardware: 8-core 11th Gen Intel(R) Core(TM) i7-1195G7 @ 2.90GHz +----------------------+--------+--------+ | Scheduler | avg fps| stdev | +----------------------+--------+--------+ | EEVDF | 28 | 4.00 | | scx_rustland-before | 43 | 1.25 | | scx_rustland-after | 60 | 0.25 | +----------------------+--------+--------+ [1] https://webglsamples.org/aquarium/aquarium.html Signed-off-by: Andrea Righi <andrea.righi@linux.dev>
This commit is contained in:
parent
172fe1efc6
commit
d0fb29a0f7
@ -118,10 +118,13 @@ struct Opts {
|
||||
const NSEC_PER_USEC: u64 = 1_000;
|
||||
const NSEC_PER_SEC: u64 = 1_000_000_000;
|
||||
|
||||
// Maximum factor used to scale down the task's vruntime as a function of its average number of
|
||||
// context switches per second.
|
||||
const RL_MAX_VTIME_FACTOR: u64 = 4;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct TaskStat {
|
||||
pid: i32,
|
||||
comm: String,
|
||||
nvcsw: u64,
|
||||
}
|
||||
|
||||
@ -129,13 +132,19 @@ fn parse_proc_pid_stat(pid: i32) -> std::io::Result<TaskStat> {
|
||||
let path = format!("/proc/{}/status", pid);
|
||||
let content = std::fs::read_to_string(&path)?;
|
||||
|
||||
let mut comm = String::new();
|
||||
let mut nvcsw = 0;
|
||||
|
||||
for line in content.lines() {
|
||||
if line.starts_with("Name:") {
|
||||
comm = line.split_whitespace().nth(1).unwrap_or("").to_string();
|
||||
} else if line.starts_with("voluntary_ctxt_switches:") {
|
||||
let name = line.split_whitespace().nth(1).unwrap_or("");
|
||||
// Prioritize kthreads by assigning max nvcsw rate.
|
||||
if name.starts_with('[') && name.ends_with(']') {
|
||||
nvcsw = u64::MAX;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if line.starts_with("voluntary_ctxt_switches:") {
|
||||
nvcsw = line
|
||||
.split_whitespace()
|
||||
.nth(1)
|
||||
@ -145,7 +154,7 @@ fn parse_proc_pid_stat(pid: i32) -> std::io::Result<TaskStat> {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(TaskStat { pid, comm, nvcsw })
|
||||
Ok(TaskStat { pid, nvcsw })
|
||||
}
|
||||
|
||||
fn get_all_pids() -> std::io::Result<Vec<i32>> {
|
||||
@ -196,7 +205,6 @@ struct Task {
|
||||
qtask: QueuedTask, // queued task
|
||||
vruntime: u64, // total vruntime (that determines the order how tasks are dispatched)
|
||||
timestamp: u64, // task enqueue timestamp
|
||||
is_interactive: bool, // task is interactive
|
||||
}
|
||||
|
||||
// Sort tasks by their interactive status first (interactive tasks are always scheduled before
|
||||
@ -204,10 +212,8 @@ struct Task {
|
||||
// pid.
|
||||
impl Ord for Task {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
other
|
||||
.is_interactive
|
||||
.cmp(&self.is_interactive)
|
||||
.then_with(|| self.vruntime.cmp(&other.vruntime))
|
||||
self.vruntime
|
||||
.cmp(&other.vruntime)
|
||||
.then_with(|| self.timestamp.cmp(&other.timestamp))
|
||||
.then_with(|| self.qtask.pid.cmp(&other.qtask.pid))
|
||||
}
|
||||
@ -260,7 +266,7 @@ struct Scheduler<'a> {
|
||||
task_pool: TaskTree, // tasks ordered by vruntime
|
||||
task_map: TaskInfoMap, // map pids to the corresponding task information
|
||||
proc_stats: HashMap<i32, u64>, // Task statistics from procfs
|
||||
interactive_pids: Vec<i32>, // List of interactive tasks
|
||||
interactive_pids: HashMap<i32, u64>, // Map of task PIDs to their avg_nvcsw metric
|
||||
min_vruntime: u64, // Keep track of the minimum vruntime across all tasks
|
||||
init_page_faults: u64, // Initial page faults counter
|
||||
slice_ns: u64, // Default time slice (in ns)
|
||||
@ -282,7 +288,7 @@ impl<'a> Scheduler<'a> {
|
||||
task_pool: TaskTree::new(),
|
||||
task_map: TaskInfoMap::new(),
|
||||
proc_stats: HashMap::new(),
|
||||
interactive_pids: Vec::new(),
|
||||
interactive_pids: HashMap::new(),
|
||||
min_vruntime: 0,
|
||||
init_page_faults: 0,
|
||||
slice_ns: opts.slice_us * NSEC_PER_USEC,
|
||||
@ -363,8 +369,25 @@ impl<'a> Scheduler<'a> {
|
||||
// Update total task cputime.
|
||||
task_info.sum_exec_runtime = task.sum_exec_runtime;
|
||||
|
||||
// Evaluate the target vruntime for the task.
|
||||
let mut vruntime = task_info.vruntime;
|
||||
|
||||
// Strongly prioritize interactive tasks by scaling their vruntime as a function of their
|
||||
// average number of context switches.
|
||||
//
|
||||
// This essentially creates multiple priority lanes (up to RL_MAX_VTIME_FACTOR + 1), where
|
||||
// tasks with a higher average context-switch rate are scheduled before others. Tasks with the
|
||||
// same average context switch rate are still ordered by their weighted used time slice.
|
||||
let avg_nvcsw = self.interactive_pids.get(&task.pid).copied().unwrap_or(0);
|
||||
vruntime = vruntime / (avg_nvcsw.min(RL_MAX_VTIME_FACTOR) + 1) + slice * 100 / task.weight;
|
||||
|
||||
// Make sure to not de-prioritize tasks too much to prevent starvation.
|
||||
if vruntime > self.min_vruntime + self.slice_ns {
|
||||
vruntime = self.min_vruntime + self.slice_ns;
|
||||
}
|
||||
|
||||
// Return the task vruntime.
|
||||
task_info.vruntime
|
||||
vruntime
|
||||
}
|
||||
|
||||
// Drain all the tasks from the queued list, update their vruntime (Self::update_enqueued()),
|
||||
@ -376,14 +399,12 @@ impl<'a> Scheduler<'a> {
|
||||
// Update task information and determine vruntime.
|
||||
let vruntime = self.update_enqueued(&task);
|
||||
let timestamp = Self::now();
|
||||
let is_interactive = self.interactive_pids.contains(&task.pid);
|
||||
|
||||
// Insert task in the task pool (ordered by vruntime).
|
||||
self.task_pool.push(Task {
|
||||
qtask: task,
|
||||
vruntime,
|
||||
timestamp,
|
||||
is_interactive,
|
||||
});
|
||||
}
|
||||
Ok(None) => {
|
||||
@ -406,7 +427,7 @@ impl<'a> Scheduler<'a> {
|
||||
}
|
||||
|
||||
// Dispatch the first task from the task pool (sending them to the BPF dispatcher).
|
||||
fn dispatch_task(&mut self) {
|
||||
fn dispatch_tasks(&mut self) {
|
||||
match self.task_pool.pop() {
|
||||
Some(task) => {
|
||||
// Update global minimum vruntime.
|
||||
@ -423,7 +444,7 @@ impl<'a> Scheduler<'a> {
|
||||
// Create a new task to dispatch.
|
||||
let mut dispatched_task = DispatchedTask::new(&task.qtask);
|
||||
|
||||
// Assign the time slice to the task.
|
||||
// Assign the time slice to the task and propagate the vruntime.
|
||||
dispatched_task.slice_ns = slice_ns;
|
||||
|
||||
// Try to pick an idle CPU for the task.
|
||||
@ -457,7 +478,7 @@ impl<'a> Scheduler<'a> {
|
||||
// and dispatch them to the BPF part via the dispatched list).
|
||||
fn schedule(&mut self) {
|
||||
self.drain_queued_tasks();
|
||||
self.dispatch_task();
|
||||
self.dispatch_tasks();
|
||||
|
||||
// Notify the dispatcher if there are still pending tasks to be processed,
|
||||
self.bpf.notify_complete(self.task_pool.tasks.len() as u64);
|
||||
@ -487,16 +508,9 @@ impl<'a> Scheduler<'a> {
|
||||
}
|
||||
|
||||
fn sync_interactive_tasks(&mut self, stats: &[TaskStat]) {
|
||||
self.interactive_pids.clear();
|
||||
|
||||
for i in 0..stats.len() {
|
||||
let stat = &stats[i];
|
||||
|
||||
// At least 10 context switches per sec are required to consider the
|
||||
// task as interactive.
|
||||
if stat.nvcsw >= 10 {
|
||||
self.interactive_pids.push(stat.pid);
|
||||
}
|
||||
self.interactive_pids.insert(stat.pid, stat.nvcsw);
|
||||
}
|
||||
}
|
||||
|
||||
@ -506,7 +520,7 @@ impl<'a> Scheduler<'a> {
|
||||
for pid in get_all_pids()? {
|
||||
if let Ok(stat) = parse_proc_pid_stat(pid) {
|
||||
// Retrieve the previous nvcsw value, or 0 if not present.
|
||||
let prev_nvcsw = self.proc_stats.get(&stat.pid).copied().unwrap_or_default();
|
||||
let prev_nvcsw = self.proc_stats.get(&stat.pid).copied().unwrap_or(0);
|
||||
|
||||
// Update the proc_stats entry with the new nvcsw.
|
||||
self.proc_stats.insert(stat.pid, stat.nvcsw);
|
||||
@ -518,7 +532,6 @@ impl<'a> Scheduler<'a> {
|
||||
let delta_nvcsw = stat.nvcsw.saturating_sub(prev_nvcsw);
|
||||
new_stats.push(TaskStat {
|
||||
pid: stat.pid,
|
||||
comm: stat.comm,
|
||||
nvcsw: delta_nvcsw,
|
||||
});
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user