Merge pull request #235 from sched-ext/rustland-preemption

rustland: enable preemption
Andrea Righi 2024-04-23 17:20:39 +02:00 committed by GitHub
commit a8226f0fde
4 changed files with 111 additions and 31 deletions


@ -26,11 +26,15 @@ use scx_rustland_core::ALLOCATOR;
// Defined in UAPI
const SCHED_EXT: i32 = 7;
// Do not assign any specific CPU to the task.
#[allow(dead_code)]
pub const NO_CPU: i32 = -1;

// Allow the task to be dispatched on any CPU.
//
// The task will be dispatched to the global shared DSQ and will run on the
// first CPU available.
pub const RL_CPU_ANY: u64 = bpf_intf::RL_CPU_ANY as u64;

// Allow the task to preempt the target CPU when it is dispatched.
#[allow(dead_code)]
pub const RL_PREEMPT_CPU: u64 = bpf_intf::RL_PREEMPT_CPU as u64;
/// High-level Rust abstraction to interact with a generic sched-ext BPF component.
///
@ -71,6 +75,7 @@ pub struct QueuedTask {
pub struct DispatchedTask {
pid: i32, // pid that uniquely identifies a task
cpu: i32, // target CPU selected by the scheduler
flags: u64, // special dispatch flags
slice_ns: u64, // time slice assigned to the task (0 = default)
cpumask_cnt: u64, // cpumask generation counter (private)
}
@ -84,6 +89,7 @@ impl DispatchedTask {
DispatchedTask {
pid: task.pid,
cpu: task.cpu,
flags: 0,
cpumask_cnt: task.cpumask_cnt,
slice_ns: 0, // use default time slice
}
@ -95,6 +101,12 @@ impl DispatchedTask {
self.cpu = cpu;
}
// Set a dispatch flag on the task.
#[allow(dead_code)]
pub fn set_flag(&mut self, flag: u64) {
self.flags |= flag;
}
// Assign a specific time slice to a task.
#[allow(dead_code)]
pub fn set_slice_ns(&mut self, slice_ns: u64) {
@ -141,6 +153,7 @@ impl DispatchedMessage {
let dispatched_task_struct = bpf_intf::dispatched_task_ctx {
pid: task.pid,
cpu: task.cpu,
flags: task.flags,
cpumask_cnt: task.cpumask_cnt,
slice_ns: task.slice_ns,
};
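Put together, these pieces form the dispatch API that a user-space scheduler consumes. A minimal sketch of the intended flow, assuming a QueuedTask at hand and a hypothetical SLICE_NS tunable (not part of the API above):

// Sketch: build a dispatched task that can run on any CPU and is
// allowed to preempt whatever is running there (SLICE_NS is a
// hypothetical tunable, not defined by this API).
const SLICE_NS: u64 = 5_000_000;

fn dispatch_interactive(qtask: &QueuedTask) -> DispatchedTask {
	let mut task = DispatchedTask::new(qtask);
	task.set_flag(RL_CPU_ANY);     // run on the first CPU available
	task.set_flag(RL_PREEMPT_CPU); // mapped to SCX_ENQ_PREEMPT in BPF
	task.set_slice_ns(SLICE_NS);   // override the default time slice
	task
}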


@ -26,6 +26,35 @@ typedef unsigned long long u64;
typedef long long s64;
#endif
/* Fail the build if the condition is true */
#define BUILD_BUG_ON(expr) \
do { \
extern char __build_assert__[(expr) ? -1 : 1] \
__attribute__((unused)); \
} while(0)
/*
 * Maximum number of CPUs supported by this scheduler (this defines the size of
* cpu_map that is used to store the idle state and CPU ownership).
*/
#define MAX_CPUS 1024
/* Special dispatch flags */
enum {
/*
* Do not assign any specific CPU to the task.
*
 * The task will be dispatched to the global shared DSQ and will run
* on the first CPU available.
*/
RL_CPU_ANY = 1 << 0,
/*
 * Allow the task to preempt the target CPU when it is dispatched.
*/
RL_PREEMPT_CPU = 1 << 1,
};
/*
* Task sent to the user-space scheduler by the BPF dispatcher.
*
@ -49,6 +78,7 @@ struct queued_task_ctx {
struct dispatched_task_ctx {
s32 pid;
s32 cpu; /* CPU where the task should be dispatched */
u64 flags; /* special dispatch flags */
u64 cpumask_cnt; /* cpumask generation counter */
u64 slice_ns; /* time slice assigned to the task (0=default) */
};
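Because the two flags occupy distinct bits of the u64 flags field, the BPF dispatcher can test them independently. A quick sketch of the set/test semantics, with the constants mirrored in Rust:

// Sketch: dispatch flags are independent bits in a u64 bitmask.
const RL_CPU_ANY: u64 = 1 << 0;
const RL_PREEMPT_CPU: u64 = 1 << 1;

fn main() {
	let flags = RL_CPU_ANY | RL_PREEMPT_CPU;
	// Each bit can be tested on its own, as the BPF side does.
	assert!(flags & RL_CPU_ANY != 0);
	assert!(flags & RL_PREEMPT_CPU != 0);
}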


@ -33,12 +33,6 @@ char _license[] SEC("license") = "GPL";
UEI_DEFINE(uei);
/*
* Introduce a custom DSQ shared across all the CPUs, where we can dispatch
* tasks that will be executed on the first CPU available.
@ -570,6 +564,7 @@ void BPF_STRUCT_OPS(rustland_dispatch, s32 cpu, struct task_struct *prev)
bpf_repeat(MAX_ENQUEUED_TASKS) {
struct task_struct *p;
struct dispatched_task_ctx task;
u64 enq_flags = 0;
/*
* Pop first task from the dispatched queue, stop if dispatch
@ -582,21 +577,28 @@ void BPF_STRUCT_OPS(rustland_dispatch, s32 cpu, struct task_struct *prev)
p = bpf_task_from_pid(task.pid);
if (!p)
continue;
dbg_msg("usersched: pid=%d cpu=%d cpumask_cnt=%llu slice_ns=%llu flags=%llx",
task.pid, task.cpu, task.cpumask_cnt, task.slice_ns, task.flags);
/*
* Map RL_PREEMPT_CPU to SCX_ENQ_PREEMPT and allow this task to
* preempt others.
*/
if (task.flags & RL_PREEMPT_CPU)
enq_flags = SCX_ENQ_PREEMPT;
/*
* Check whether the user-space scheduler assigned a different
* CPU to the task and migrate (if possible).
*
 * If the task has been submitted with RL_CPU_ANY, then
 * dispatch it to the shared DSQ and run it on the first CPU
 * available.
 */
if (task.flags & RL_CPU_ANY)
	dispatch_task(p, SHARED_DSQ, 0, task.slice_ns, enq_flags);
else
	dispatch_task(p, cpu_to_dsq(task.cpu),
		      task.cpumask_cnt, task.slice_ns, enq_flags);
bpf_task_release(p);
__sync_fetch_and_add(&nr_user_dispatches, 1);
}
@ -856,6 +858,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(rustland_init)
{
int err;
/* Compile-time checks */
BUILD_BUG_ON((MAX_CPUS % 2));
/* Initialize rustland core */
err = dsq_init();
if (err)
return err;
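BUILD_BUG_ON((MAX_CPUS % 2)) above makes the build fail if MAX_CPUS is ever set to an odd value. The same guard could be expressed on the Rust side as a const assertion; a sketch, assuming one wanted to mirror the constraint there:

// Sketch: Rust analogue of BUILD_BUG_ON((MAX_CPUS % 2)); the assert!
// in a const context turns a violated invariant into a build failure.
const MAX_CPUS: usize = 1024;
const _: () = assert!(MAX_CPUS % 2 == 0, "MAX_CPUS must be even");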


@ -108,6 +108,16 @@ struct Opts {
#[clap(short = 'i', long, action = clap::ArgAction::SetTrue)]
builtin_idle: bool,
/// If specified, disable task preemption.
///
/// Disabling task preemption can help improve the throughput of CPU-intensive tasks, while
/// still providing a good level of system responsiveness.
///
/// Preemption is enabled by default to provide a higher level of responsiveness to
/// interactive tasks.
#[clap(short = 'n', long, action = clap::ArgAction::SetTrue)]
no_preemption: bool,
/// If specified, all the scheduling events and actions will be processed in user-space,
/// disabling any form of in-kernel optimization.
///
@ -173,8 +183,9 @@ impl TaskInfoMap {
#[derive(Debug, PartialEq, Eq, PartialOrd, Clone)]
struct Task {
qtask: QueuedTask, // queued task
vruntime: u64, // total vruntime (that determines the order in which tasks are dispatched)
is_interactive: bool, // task can preempt other tasks
}
// Make sure tasks are ordered by vruntime; if multiple tasks have the same vruntime, order by pid.
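One way to express that ordering is a manual Ord implementation over (vruntime, pid); a sketch consistent with the Task fields above (the actual implementation lives in the elided lines below):

// Sketch: order tasks by vruntime, breaking ties by pid, so a sorted
// task pool always pops the task with the smallest vruntime first.
impl Ord for Task {
	fn cmp(&self, other: &Self) -> std::cmp::Ordering {
		self.vruntime
			.cmp(&other.vruntime)
			.then_with(|| self.qtask.pid.cmp(&other.qtask.pid))
	}
}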
@ -237,7 +248,8 @@ struct Scheduler<'a> {
slice_boost: u64, // Slice booster
eff_slice_boost: u64, // Effective slice booster
init_page_faults: u64, // Initial page faults counter
builtin_idle: bool, // Use sched-ext built-in idle selection logic
no_preemption: bool, // Disable task preemption
}
impl<'a> Scheduler<'a> {
@ -255,6 +267,9 @@ impl<'a> Scheduler<'a> {
// Use built-in idle selection logic.
let builtin_idle = opts.builtin_idle;
// Disable task preemption.
let no_preemption = opts.no_preemption;
// Scheduler task pool to sort tasks by vruntime.
let task_pool = TaskTree::new();
@ -291,6 +306,7 @@ impl<'a> Scheduler<'a> {
eff_slice_boost,
init_page_faults,
builtin_idle,
no_preemption,
})
}
@ -327,11 +343,12 @@ impl<'a> Scheduler<'a> {
ts.as_nanos() as u64
}
// Update task's vruntime based on the information collected from the kernel and return to the
// caller the evaluated weighted time slice along with a flag indicating whether the task is
// interactive or not (interactive tasks are allowed to preempt other tasks).
//
// This method implements the main task ordering logic of the scheduler.
fn update_enqueued(&mut self, task: &QueuedTask) -> (u64, bool) {
// Determine if a task is new or old, based on its current and previous runtime counters.
//
@ -389,18 +406,19 @@ impl<'a> Scheduler<'a> {
task.sum_exec_runtime - task_info.sum_exec_runtime
};
// Determine if a task is interactive, based on the moving average of voluntary context
// switches over time.
//
// NOTE: we should make this threshold a tunable, but for now let's assume that a moving
// average of 10 voluntary context switches per second is enough to classify the task as
// interactive.
let is_interactive = task_info.avg_nvcsw >= 10;
// Apply the slice boost to interactive tasks.
//
// NOTE: some tasks may have a very high weight, which can potentially disrupt our slice
// boost optimizations, therefore always limit the task priority to a max of 1000.
let weight = if is_interactive {
task.weight.min(1000) * self.slice_boost.max(1)
} else {
task.weight.min(1000)
@ -435,8 +453,8 @@ impl<'a> Scheduler<'a> {
task_info.nvcsw_ts = now;
}
// Return the task vruntime and a flag indicating if the task is interactive.
(task_info.vruntime, is_interactive)
}
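The classification above relies on a per-task moving average of voluntary context switches. A sketch of how such an average might be maintained (the blend factor is an assumption; the actual update happens in the elided parts of update_enqueued()):

// Sketch: fold the measured voluntary context switch rate into a
// per-task moving average, then classify against a fixed threshold.
fn update_avg_nvcsw(avg_nvcsw: u64, delta_nvcsw: u64, delta_t_sec: u64) -> u64 {
	let rate = delta_nvcsw / delta_t_sec.max(1);
	// Simple exponential blend: half old average, half new sample.
	(avg_nvcsw + rate) / 2
}

fn is_interactive(avg_nvcsw: u64) -> bool {
	// ~10 voluntary context switches/sec marks a task as interactive
	// (see the NOTE above about making this threshold tunable).
	avg_nvcsw >= 10
}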
// Drain all the tasks from the queued list, update their vruntime (Self::update_enqueued()),
@ -452,13 +470,14 @@ impl<'a> Scheduler<'a> {
continue;
}
// Update task information and determine vruntime and interactiveness.
let (vruntime, is_interactive) = self.update_enqueued(&task);
// Insert task in the task pool (ordered by vruntime).
self.task_pool.push(Task {
qtask: task,
vruntime,
is_interactive,
});
}
Ok(None) => {
@ -505,8 +524,20 @@ impl<'a> Scheduler<'a> {
// If built-in idle selection logic is disabled, dispatch on the first CPU
// available.
let mut dispatched_task = DispatchedTask::new(&task.qtask);
// Set special dispatch flags.
if !self.builtin_idle {
dispatched_task.set_flag(RL_CPU_ANY);
}
if task.is_interactive && !self.no_preemption {
// Assign the maximum time slice to this task and allow it to preempt others.
//
// NOTE: considering that, with preemption enabled, interactive tasks can
// preempt each other (for now) and they are also more likely to release
// the CPU before its assigned time slice expires, always give them the
// maximum static time slice allowed.
dispatched_task.set_slice_ns(self.slice_ns);
dispatched_task.set_flag(RL_PREEMPT_CPU);
}
// Send task to the BPF dispatcher.