scx_rustland_core: move CPU idle selection logic to user-space

Allow user-space scheduler to pick an idle CPU via
self.bpf.select_cpu(pid, prev_task, flags), mimicking the BPF's
select_cpu() interface.

Also remove the full_user option and always rely on the idle selection
logic from user-space.

Signed-off-by: Andrea Righi <andrea.righi@linux.dev>
This commit is contained in:
Andrea Righi 2024-08-24 11:04:45 +02:00
parent 1dd329dd7d
commit 5d544ea264
5 changed files with 86 additions and 92 deletions

View File

@ -6,10 +6,13 @@
use std::mem::MaybeUninit;
use crate::bpf_intf;
use crate::bpf_intf::*;
use crate::bpf_skel::*;
use std::fs::File;
use std::io::Read;
use std::ffi::c_int;
use std::ffi::c_ulong;
use anyhow::Context;
use anyhow::Result;
@ -17,6 +20,7 @@ use anyhow::Result;
use plain::Plain;
use libbpf_rs::OpenObject;
use libbpf_rs::ProgramInput;
use libbpf_rs::skel::OpenSkel;
use libbpf_rs::skel::Skel;
use libbpf_rs::skel::SkelBuilder;
@ -82,11 +86,11 @@ pub struct QueuedTask {
// Task queued for dispatching to the BPF component (see bpf_intf::dispatched_task_ctx).
#[derive(Debug, PartialEq, Eq, PartialOrd, Clone)]
pub struct DispatchedTask {
pid: i32, // pid that uniquely identifies a task
cpu: i32, // target CPU selected by the scheduler
flags: u64, // special dispatch flags
slice_ns: u64, // time slice assigned to the task (0 = default)
cpumask_cnt: u64, // cpumask generation counter (private)
pub pid: i32, // pid that uniquely identifies a task
pub cpu: i32, // target CPU selected by the scheduler
pub flags: u64, // special dispatch flags
pub slice_ns: u64, // time slice assigned to the task (0 = default)
cpumask_cnt: u64, // cpumask generation counter (private)
}
impl DispatchedTask {
@ -103,24 +107,6 @@ impl DispatchedTask {
slice_ns: 0, // use default time slice
}
}
// Assign a specific CPU to a task.
#[allow(dead_code)]
pub fn set_cpu(&mut self, cpu: i32) {
self.cpu = cpu;
}
// Assign a specific dispatch flag to a task.
#[allow(dead_code)]
pub fn set_flag(&mut self, flag: u64) {
self.flags |= flag;
}
// Assign a specific time slice to a task.
#[allow(dead_code)]
pub fn set_slice_ns(&mut self, slice_ns: u64) {
self.slice_ns = slice_ns;
}
}
// Helpers used to submit tasks to the BPF user ring buffer.
@ -196,7 +182,6 @@ impl<'cb> BpfScheduler<'cb> {
exit_dump_len: u32,
partial: bool,
slice_us: u64,
full_user: bool,
low_power: bool,
verbose: bool,
debug: bool,
@ -256,7 +241,6 @@ impl<'cb> BpfScheduler<'cb> {
skel.maps.bss_data.usersched_pid = std::process::id();
skel.maps.rodata_data.slice_ns = slice_us * 1000;
skel.maps.rodata_data.full_user = full_user;
skel.maps.rodata_data.low_power = low_power;
skel.maps.rodata_data.debug = debug;
@ -390,6 +374,28 @@ impl<'cb> BpfScheduler<'cb> {
unsafe { pthread_setschedparam(pthread_self(), SCHED_EXT, &param as *const sched_param) }
}
// Pick an idle CPU for the target PID.
//
// This invokes the `rs_select_cpu` BPF "syscall" program via test_run(),
// handing it the (pid, cpu, flags) triple packed in a `task_cpu_arg`.
// The program's return value is the CPU picked by the in-kernel idle
// selection logic (negative value on failure).
pub fn select_cpu(&mut self, pid: i32, cpu: i32, flags: u64) -> i32 {
    // Argument buffer passed as the program's context.
    let mut arg = task_cpu_arg {
        pid: pid as c_int,
        cpu: cpu as c_int,
        flags: flags as c_ulong,
    };
    // SAFETY: `arg` is a plain-old-data struct that outlives the
    // test_run() call below; the slice covers exactly its bytes.
    let ctx = unsafe {
        std::slice::from_raw_parts_mut(
            &mut arg as *mut _ as *mut u8,
            std::mem::size_of_val(&arg),
        )
    };
    let input = ProgramInput {
        context_in: Some(ctx),
        ..Default::default()
    };
    let output = self.skel.progs.rs_select_cpu.test_run(input).unwrap();
    output.return_value as i32
}
// Receive a task to be scheduled from the BPF dispatcher.
//
// NOTE: if task.cpu is negative the task is exiting and it does not require to be scheduled.

View File

@ -18,13 +18,19 @@
#define __kptr
#endif
#ifndef __KERNEL__
#ifndef __VMLINUX_H__
typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned int u32;
typedef int s32;
typedef unsigned long long u64;
typedef long long s64;
#endif
typedef unsigned long u64;
typedef signed char s8;
typedef signed short s16;
typedef signed int s32;
typedef signed long s64;
typedef int pid_t;
#endif /* __VMLINUX_H__ */
/* Check a condition at build time */
#define BUILD_BUG_ON(expr) \
@ -55,6 +61,15 @@ enum {
RL_PREEMPT_CPU = 1 << 1,
};
/*
 * Specify a target CPU for a specific PID.
 *
 * Used as the context argument of the rs_select_cpu syscall program, which
 * forwards @cpu and @flags to the idle-CPU selection logic.
 */
struct task_cpu_arg {
	pid_t pid;	/* task to pick a CPU for */
	s32 cpu;	/* previously used CPU (hint for idle selection) */
	u64 flags;	/* wake flags passed through to the CPU picker */
};
/*
* Task sent to the user-space scheduler by the BPF dispatcher.
*
@ -78,8 +93,8 @@ struct dispatched_task_ctx {
s32 pid;
s32 cpu; /* CPU where the task should be dispatched */
u64 flags; /* special dispatch flags */
u64 cpumask_cnt; /* cpumask generation counter */
u64 slice_ns; /* time slice assigned to the task (0=default) */
u64 cpumask_cnt; /* cpumask generation counter */
};
#endif /* __INTF_H */

View File

@ -87,14 +87,6 @@ const volatile bool debug;
bpf_printk(_fmt, ##__VA_ARGS__); \
} while(0)
/*
* Enable/disable full user-space mode.
*
* In full user-space mode all events and actions will be sent to user-space,
* basically disabling any optimization to bypass the user-space scheduler.
*/
const volatile bool full_user;
/*
* Enable/disable low-power mode.
*
@ -387,19 +379,8 @@ dispatch_task(struct task_struct *p, u64 dsq_id,
break;
default:
tctx = lookup_task_ctx(p);
if (!tctx) {
/*
* If the task doesn't have a context anymore, simply
* bounce it to the first CPU available.
*/
scx_bpf_dispatch(p, SHARED_DSQ, slice, enq_flags);
__sync_fetch_and_add(&nr_bounce_dispatches, 1);
dbg_msg("dispatch: pid=%d (%s) dsq=%llu enq_flags=%llx slice=%llu bounce",
p->pid, p->comm, dsq_id, enq_flags, slice);
if (!tctx)
return;
}
/*
* Dispatch a task to a specific per-CPU DSQ if the target CPU
* can be used (according to the cpumask), otherwise redirect
@ -624,38 +605,37 @@ out_put_cpumask:
return cpu;
}
/*
* Select the target CPU where a task can be executed.
*
* The idea here is to try to find an idle CPU in the system, and preferably
* maintain the task on the same CPU. If we can find an idle CPU in the system
* dispatch the task directly bypassing the user-space scheduler. Otherwise,
* send the task to the user-space scheduler, maintaining the previously used
* CPU as a hint for the scheduler.
*
* Decision made in this function is not final. The user-space scheduler may
* decide to move the task to a different CPU later, if needed.
*/
s32 BPF_STRUCT_OPS(rustland_select_cpu, struct task_struct *p, s32 prev_cpu,
u64 wake_flags)
{
s32 cpu;
/*
* When full_user is enabled, the user-space scheduler is responsible
* for selecting a target CPU based on its scheduling logic and
* possibly its own idle tracking mechanism.
* Completely delegate the CPU selection logic to the user-space
* scheduler.
*/
if (full_user)
return prev_cpu;
cpu = pick_idle_cpu(p, prev_cpu, wake_flags);
if (cpu >= 0 && !dispatch_direct_cpu(p, cpu, 0))
return cpu;
return prev_cpu;
}
/*
 * Select an idle CPU for a specific task from the user-space scheduler.
 *
 * Invoked from user-space via BPF_PROG_TEST_RUN with a struct task_cpu_arg
 * as context. Returns the CPU chosen by pick_idle_cpu(), or -EINVAL if the
 * PID cannot be resolved to a task.
 */
SEC("syscall")
int rs_select_cpu(struct task_cpu_arg *input)
{
	struct task_struct *p;
	int cpu;
	/* Takes a reference on the task; must be released before returning. */
	p = bpf_task_from_pid(input->pid);
	if (!p)
		return -EINVAL;
	/* Hold the RCU read lock while inspecting the task's CPU state. */
	bpf_rcu_read_lock();
	cpu = pick_idle_cpu(p, input->cpu, input->flags);
	bpf_rcu_read_unlock();
	bpf_task_release(p);
	return cpu;
}
/*
* Fill @task with all the information that need to be sent to the user-space
* scheduler.
@ -719,7 +699,7 @@ void BPF_STRUCT_OPS(rustland_enqueue, struct task_struct *p, u64 enq_flags)
* interactivity problems or unfairness if there are too many softirqs
* being scheduled (e.g., in presence of high RX network traffic).
*/
if (!full_user && is_kthread(p) && p->nr_cpus_allowed == 1)
if (is_kthread(p) && p->nr_cpus_allowed == 1)
if (!dispatch_direct_cpu(p, cpu, enq_flags))
return;

View File

@ -33,7 +33,6 @@ impl<'a> Scheduler<'a> {
0, // exit_dump_len (buffer size of exit info)
false, // partial (include all tasks if false)
5000, // slice_ns (default task time slice)
true, // full_user (schedule all tasks in user-space)
false, // low_power (low power mode)
false, // verbose (verbose output)
false, // debug (debug mode)
@ -59,7 +58,7 @@ impl<'a> Scheduler<'a> {
let mut dispatched_task = DispatchedTask::new(&task);
// Allow to dispatch on the first CPU available.
dispatched_task.set_flag(RL_CPU_ANY);
dispatched_task.flags |= RL_CPU_ANY;
let _ = self.bpf.dispatch_task(&dispatched_task);

View File

@ -83,14 +83,6 @@ struct Opts {
#[clap(short = 'S', long, default_value = "500")]
slice_us_min: u64,
/// If specified, all the scheduling events and actions will be processed in user-space,
/// disabling any form of in-kernel optimization.
///
/// This mode will likely make the system less responsive, but more predictable in terms of
/// performance.
#[clap(short = 'u', long, action = clap::ArgAction::SetTrue)]
full_user: bool,
/// When low-power mode is enabled, the scheduler behaves in a more non-work conserving way:
/// the CPUs operate at reduced capacity, which slows down CPU-bound tasks, enhancing the
/// prioritization of interactive workloads. In summary, enabling low-power mode will limit
@ -283,7 +275,6 @@ impl<'a> Scheduler<'a> {
opts.exit_dump_len,
opts.partial,
opts.slice_us,
opts.full_user,
opts.low_power,
opts.verbose,
opts.debug,
@ -423,12 +414,15 @@ impl<'a> Scheduler<'a> {
let mut dispatched_task = DispatchedTask::new(&task.qtask);
// Assign the time slice to the task.
dispatched_task.set_slice_ns(slice_ns);
dispatched_task.slice_ns = slice_ns;
// Dispatch task on the first CPU available if it is classified as
// interactive, non-interactive tasks will continue to run on the same CPU.
if task.is_interactive {
dispatched_task.set_flag(RL_CPU_ANY);
// Try to pick an idle CPU for the task.
let cpu = self.bpf.select_cpu(dispatched_task.pid, dispatched_task.cpu, 0);
if cpu >= 0 {
dispatched_task.cpu = cpu;
} else {
// Dispatch task on the first CPU available.
dispatched_task.flags |= RL_CPU_ANY;
}
// Send task to the BPF dispatcher.