scx_rustland_core: implement effective time slice on a per-task basis

Drop the global effective time slice and use the more fine-grained per-task time slice to implement the dynamic time-slice capability. This reduces the scheduler's overhead (by dropping the global time-slice volatile variable shared between user space and BPF) and provides more fine-grained control over the per-task time slice.

Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
2024-11-29 20:50:22 +00:00 · 2024-05-09 07:03:36 +02:00 · 2024-05-09 07:03:36 +02:00 · f052493005
commit f052493005
parent 382ef72999
3 changed files with 16 additions and 46 deletions
--- a/rust/scx_rustland_core/assets/bpf.rs
+++ b/rust/scx_rustland_core/assets/bpf.rs
@ -293,24 +293,6 @@ impl<'cb> BpfScheduler<'cb> {
        }
    }

-    // Override the default scheduler time slice (in us).
-    #[allow(dead_code)]
-    pub fn set_effective_slice_us(&mut self, slice_us: u64) {
-        self.skel.bss_mut().effective_slice_ns = slice_us * 1000;
-    }
-
-    // Get current value of time slice (slice_ns).
-    #[allow(dead_code)]
-    pub fn get_effective_slice_us(&mut self) -> u64 {
-        let slice_ns = self.skel.bss().effective_slice_ns;
-
-        if slice_ns > 0 {
-            slice_ns / 1000
-        } else {
-            self.skel.rodata().slice_ns / 1000
-        }
-    }
-
    // Counter of queued tasks.
    #[allow(dead_code)]
    pub fn nr_queued_mut(&mut self) -> &mut u64 {
--- a/rust/scx_rustland_core/assets/bpf/main.bpf.c
+++ b/rust/scx_rustland_core/assets/bpf/main.bpf.c
@ -52,12 +52,6 @@ u32 usersched_pid; /* User-space scheduler PID */
 const volatile bool switch_partial; /* Switch all tasks or SCHED_EXT tasks */
 const volatile u64 slice_ns = SCX_SLICE_DFL; /* Base time slice duration */

-/*
- * Effective time slice: allow the scheduler to override the default time slice
- * (slice_ns) if this one is set.
- */
-volatile u64 effective_slice_ns;
-
 /*
 * Number of tasks that are queued for scheduling.
 *
@ -321,8 +315,7 @@ dispatch_task(struct task_struct *p, u64 dsq_id,
 	      u64 cpumask_cnt, u64 task_slice_ns, u64 enq_flags)
 {
 	struct task_ctx *tctx;
-	u64 slice = task_slice_ns ? :
-		__sync_fetch_and_add(&effective_slice_ns, 0) ? : slice_ns;
+	u64 slice = task_slice_ns ? : slice_ns;
 	u64 curr_cpumask_cnt;
 	bool force_shared = false;
 	s32 cpu;
--- a/scheds/rust/scx_rustland/src/main.rs
+++ b/scheds/rust/scx_rustland/src/main.rs
@ -150,9 +150,8 @@ struct Opts {
 }

 // Time constants.
-const USEC_PER_NSEC: u64 = 1_000;
 const NSEC_PER_USEC: u64 = 1_000;
-const MSEC_PER_SEC: u64 = 1_000;
+const NSEC_PER_MSEC: u64 = 1_000_000;
 const NSEC_PER_SEC: u64 = 1_000_000_000;

 // Basic item stored in the task information map.
@ -375,9 +374,6 @@ impl<'a> Scheduler<'a> {
        // Cache the current timestamp.
        let now = Self::now();

-        // Get the current effective time slice.
-        let slice_ns = self.bpf.get_effective_slice_us() * MSEC_PER_SEC;
-
        // Update dynamic slice boost.
        //
        // The slice boost is dynamically adjusted as a function of the amount of CPUs
@ -445,7 +441,7 @@ impl<'a> Scheduler<'a> {
        //
        // Moreover, limiting the accounted time slice to slice_ns, allows to prevent starving the
        // current task for too long in the scheduler task pool.
-        task_info.vruntime = self.min_vruntime + slice.clamp(1, slice_ns);
+        task_info.vruntime = self.min_vruntime + slice.clamp(1, self.slice_ns);

        // Update total task cputime.
        task_info.sum_exec_runtime = task.sum_exec_runtime;
@ -503,21 +499,24 @@ impl<'a> Scheduler<'a> {
        }
    }

-    // Dynamically adjust the time slice based on the amount of waiting tasks.
-    fn scale_slice_ns(&mut self) {
-        let nr_scheduled = self.task_pool.tasks.len() as u64;
-        let slice_us_max = self.slice_ns / NSEC_PER_USEC;
-
+    // Return the target time slice, scaled down in proportion to the total number of tasks
+    // waiting to be scheduled (more tasks waiting => shorter time slice).
+    fn effective_slice_ns(&mut self, nr_scheduled: u64) -> u64 {
        // Scale time slice as a function of nr_scheduled, but never scale below 250 us.
+        //
+        // The goal here is to adjust the time slice allocated to tasks based on the number of
+        // tasks currently awaiting scheduling. When the system is heavily loaded, shorter time
+        // slices are assigned to provide more opportunities for all tasks to receive CPU time.
        let scaling = ((nr_scheduled + 1) / 2).max(1);
-        let slice_us = (slice_us_max / scaling).max(USEC_PER_NSEC / 4);
+        let slice_ns = (self.slice_ns / scaling).max(NSEC_PER_MSEC / 4);

-        // Apply new scaling.
-        self.bpf.set_effective_slice_us(slice_us);
+        slice_ns
    }

    // Dispatch tasks from the task pool in order (sending them to the BPF dispatcher).
    fn dispatch_tasks(&mut self) {
+        let nr_scheduled = self.task_pool.tasks.len() as u64;
+
        // Dispatch only a batch of tasks equal to the amount of idle CPUs in the system.
        //
        // This allows to have more tasks sitting in the task pool, reducing the pressure on the
@ -546,6 +545,8 @@ impl<'a> Scheduler<'a> {
                        // maximum static time slice allowed.
                        dispatched_task.set_slice_ns(self.slice_ns);
                        dispatched_task.set_flag(RL_PREEMPT_CPU);
+                    } else {
+                        dispatched_task.set_slice_ns(self.effective_slice_ns(nr_scheduled));
                    }

                    // Send task to the BPF dispatcher.
@ -576,9 +577,6 @@ impl<'a> Scheduler<'a> {
        self.drain_queued_tasks();
        self.dispatch_tasks();

-        // Adjust the dynamic time slice immediately after dispatching the tasks.
-        self.scale_slice_ns();
-
        // Yield to avoid using too much CPU from the scheduler itself.
        thread::yield_now();
    }
@ -702,9 +700,6 @@ impl<'a> Scheduler<'a> {
        // Show total page faults of the user-space scheduler.
        self.print_faults();

-        // Show current used time slice.
-        info!("time slice = {} us", self.bpf.get_effective_slice_us());
-
        // Show current slice boost.
        info!("slice boost = {}", self.eff_slice_boost);