mirror of
https://github.com/JakeHillion/scx.git
synced 2024-11-26 03:20:24 +00:00
scx_rustland_core: move CPU idle selection logic in user-space
Allow the user-space scheduler to pick an idle CPU via self.bpf.select_cpu(pid, prev_cpu, flags), mimicking the BPF select_cpu() interface. Also remove the full_user option and always rely on the idle selection logic from user-space. Signed-off-by: Andrea Righi <andrea.righi@linux.dev>
This commit is contained in:
parent
1dd329dd7d
commit
5d544ea264
@ -6,10 +6,13 @@
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
use crate::bpf_intf;
|
||||
use crate::bpf_intf::*;
|
||||
use crate::bpf_skel::*;
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
use std::ffi::c_int;
|
||||
use std::ffi::c_ulong;
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::Result;
|
||||
@ -17,6 +20,7 @@ use anyhow::Result;
|
||||
use plain::Plain;
|
||||
|
||||
use libbpf_rs::OpenObject;
|
||||
use libbpf_rs::ProgramInput;
|
||||
use libbpf_rs::skel::OpenSkel;
|
||||
use libbpf_rs::skel::Skel;
|
||||
use libbpf_rs::skel::SkelBuilder;
|
||||
@ -82,11 +86,11 @@ pub struct QueuedTask {
|
||||
// Task queued for dispatching to the BPF component (see bpf_intf::dispatched_task_ctx).
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Clone)]
|
||||
pub struct DispatchedTask {
|
||||
pid: i32, // pid that uniquely identifies a task
|
||||
cpu: i32, // target CPU selected by the scheduler
|
||||
flags: u64, // special dispatch flags
|
||||
slice_ns: u64, // time slice assigned to the task (0 = default)
|
||||
cpumask_cnt: u64, // cpumask generation counter (private)
|
||||
pub pid: i32, // pid that uniquely identifies a task
|
||||
pub cpu: i32, // target CPU selected by the scheduler
|
||||
pub flags: u64, // special dispatch flags
|
||||
pub slice_ns: u64, // time slice assigned to the task (0 = default)
|
||||
cpumask_cnt: u64, // cpumask generation counter (private)
|
||||
}
|
||||
|
||||
impl DispatchedTask {
|
||||
@ -103,24 +107,6 @@ impl DispatchedTask {
|
||||
slice_ns: 0, // use default time slice
|
||||
}
|
||||
}
|
||||
|
||||
// Assign a specific CPU to a task.
|
||||
#[allow(dead_code)]
|
||||
pub fn set_cpu(&mut self, cpu: i32) {
|
||||
self.cpu = cpu;
|
||||
}
|
||||
|
||||
// Assign a specific dispatch flag to a task.
|
||||
#[allow(dead_code)]
|
||||
pub fn set_flag(&mut self, flag: u64) {
|
||||
self.flags |= flag;
|
||||
}
|
||||
|
||||
// Assign a specific time slice to a task.
|
||||
#[allow(dead_code)]
|
||||
pub fn set_slice_ns(&mut self, slice_ns: u64) {
|
||||
self.slice_ns = slice_ns;
|
||||
}
|
||||
}
|
||||
|
||||
// Helpers used to submit tasks to the BPF user ring buffer.
|
||||
@ -196,7 +182,6 @@ impl<'cb> BpfScheduler<'cb> {
|
||||
exit_dump_len: u32,
|
||||
partial: bool,
|
||||
slice_us: u64,
|
||||
full_user: bool,
|
||||
low_power: bool,
|
||||
verbose: bool,
|
||||
debug: bool,
|
||||
@ -256,7 +241,6 @@ impl<'cb> BpfScheduler<'cb> {
|
||||
|
||||
skel.maps.bss_data.usersched_pid = std::process::id();
|
||||
skel.maps.rodata_data.slice_ns = slice_us * 1000;
|
||||
skel.maps.rodata_data.full_user = full_user;
|
||||
skel.maps.rodata_data.low_power = low_power;
|
||||
skel.maps.rodata_data.debug = debug;
|
||||
|
||||
@ -390,6 +374,28 @@ impl<'cb> BpfScheduler<'cb> {
|
||||
unsafe { pthread_setschedparam(pthread_self(), SCHED_EXT, ¶m as *const sched_param) }
|
||||
}
|
||||
|
||||
// Pick an idle CPU for the target PID.
|
||||
pub fn select_cpu(&mut self, pid: i32, cpu: i32, flags: u64) -> i32 {
|
||||
let prog = &mut self.skel.progs.rs_select_cpu;
|
||||
let mut args = task_cpu_arg {
|
||||
pid: pid as c_int,
|
||||
cpu: cpu as c_int,
|
||||
flags: flags as c_ulong,
|
||||
};
|
||||
let input = ProgramInput {
|
||||
context_in: Some(unsafe {
|
||||
std::slice::from_raw_parts_mut(
|
||||
&mut args as *mut _ as *mut u8,
|
||||
std::mem::size_of_val(&args),
|
||||
)
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
let out = prog.test_run(input).unwrap();
|
||||
|
||||
out.return_value as i32
|
||||
}
|
||||
|
||||
// Receive a task to be scheduled from the BPF dispatcher.
|
||||
//
|
||||
// NOTE: if task.cpu is negative the task is exiting and it does not require to be scheduled.
|
||||
|
@ -18,13 +18,19 @@
|
||||
#define __kptr
|
||||
#endif
|
||||
|
||||
#ifndef __KERNEL__
|
||||
#ifndef __VMLINUX_H__
|
||||
typedef unsigned char u8;
|
||||
typedef unsigned short u16;
|
||||
typedef unsigned int u32;
|
||||
typedef int s32;
|
||||
typedef unsigned long long u64;
|
||||
typedef long long s64;
|
||||
#endif
|
||||
typedef unsigned long u64;
|
||||
|
||||
typedef signed char s8;
|
||||
typedef signed short s16;
|
||||
typedef signed int s32;
|
||||
typedef signed long s64;
|
||||
|
||||
typedef int pid_t;
|
||||
#endif /* __VMLINUX_H__ */
|
||||
|
||||
/* Check a condition at build time */
|
||||
#define BUILD_BUG_ON(expr) \
|
||||
@ -55,6 +61,15 @@ enum {
|
||||
RL_PREEMPT_CPU = 1 << 1,
|
||||
};
|
||||
|
||||
/*
|
||||
* Specify a target CPU for a specific PID.
|
||||
*/
|
||||
struct task_cpu_arg {
|
||||
pid_t pid;
|
||||
s32 cpu;
|
||||
u64 flags;
|
||||
};
|
||||
|
||||
/*
|
||||
* Task sent to the user-space scheduler by the BPF dispatcher.
|
||||
*
|
||||
@ -78,8 +93,8 @@ struct dispatched_task_ctx {
|
||||
s32 pid;
|
||||
s32 cpu; /* CPU where the task should be dispatched */
|
||||
u64 flags; /* special dispatch flags */
|
||||
u64 cpumask_cnt; /* cpumask generation counter */
|
||||
u64 slice_ns; /* time slice assigned to the task (0=default) */
|
||||
u64 cpumask_cnt; /* cpumask generation counter */
|
||||
};
|
||||
|
||||
#endif /* __INTF_H */
|
||||
|
@ -87,14 +87,6 @@ const volatile bool debug;
|
||||
bpf_printk(_fmt, ##__VA_ARGS__); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
* Enable/disable full user-space mode.
|
||||
*
|
||||
* In full user-space mode all events and actions will be sent to user-space,
|
||||
* basically disabling any optimization to bypass the user-space scheduler.
|
||||
*/
|
||||
const volatile bool full_user;
|
||||
|
||||
/*
|
||||
* Enable/disable low-power mode.
|
||||
*
|
||||
@ -387,19 +379,8 @@ dispatch_task(struct task_struct *p, u64 dsq_id,
|
||||
break;
|
||||
default:
|
||||
tctx = lookup_task_ctx(p);
|
||||
if (!tctx) {
|
||||
/*
|
||||
* If the task doesn't have a context anymore, simply
|
||||
* bounce it to the first CPU available.
|
||||
*/
|
||||
scx_bpf_dispatch(p, SHARED_DSQ, slice, enq_flags);
|
||||
__sync_fetch_and_add(&nr_bounce_dispatches, 1);
|
||||
|
||||
dbg_msg("dispatch: pid=%d (%s) dsq=%llu enq_flags=%llx slice=%llu bounce",
|
||||
p->pid, p->comm, dsq_id, enq_flags, slice);
|
||||
if (!tctx)
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Dispatch a task to a specific per-CPU DSQ if the target CPU
|
||||
* can be used (according to the cpumask), otherwise redirect
|
||||
@ -624,38 +605,37 @@ out_put_cpumask:
|
||||
return cpu;
|
||||
}
|
||||
|
||||
/*
|
||||
* Select the target CPU where a task can be executed.
|
||||
*
|
||||
* The idea here is to try to find an idle CPU in the system, and preferably
|
||||
* maintain the task on the same CPU. If we can find an idle CPU in the system
|
||||
* dispatch the task directly bypassing the user-space scheduler. Otherwise,
|
||||
* send the task to the user-space scheduler, maintaining the previously used
|
||||
* CPU as a hint for the scheduler.
|
||||
*
|
||||
* Decision made in this function is not final. The user-space scheduler may
|
||||
* decide to move the task to a different CPU later, if needed.
|
||||
*/
|
||||
s32 BPF_STRUCT_OPS(rustland_select_cpu, struct task_struct *p, s32 prev_cpu,
|
||||
u64 wake_flags)
|
||||
{
|
||||
s32 cpu;
|
||||
|
||||
/*
|
||||
* When full_user is enabled, the user-space scheduler is responsible
|
||||
* for selecting a target CPU based on its scheduling logic and
|
||||
* possibly its own idle tracking mechanism.
|
||||
* Completely delegate the CPU selection logic to the user-space
|
||||
* scheduler.
|
||||
*/
|
||||
if (full_user)
|
||||
return prev_cpu;
|
||||
|
||||
cpu = pick_idle_cpu(p, prev_cpu, wake_flags);
|
||||
if (cpu >= 0 && !dispatch_direct_cpu(p, cpu, 0))
|
||||
return cpu;
|
||||
|
||||
return prev_cpu;
|
||||
}
|
||||
|
||||
/*
|
||||
* Select an idle CPU for a specific task from the user-space scheduler.
|
||||
*/
|
||||
SEC("syscall")
|
||||
int rs_select_cpu(struct task_cpu_arg *input)
|
||||
{
|
||||
struct task_struct *p;
|
||||
int cpu;
|
||||
|
||||
p = bpf_task_from_pid(input->pid);
|
||||
if (!p)
|
||||
return -EINVAL;
|
||||
bpf_rcu_read_lock();
|
||||
cpu = pick_idle_cpu(p, input->cpu, input->flags);
|
||||
bpf_rcu_read_unlock();
|
||||
|
||||
bpf_task_release(p);
|
||||
|
||||
return cpu;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fill @task with all the information that need to be sent to the user-space
|
||||
* scheduler.
|
||||
@ -719,7 +699,7 @@ void BPF_STRUCT_OPS(rustland_enqueue, struct task_struct *p, u64 enq_flags)
|
||||
* interactivity problems or unfairness if there are too many softirqs
|
||||
* being scheduled (e.g., in presence of high RX network traffic).
|
||||
*/
|
||||
if (!full_user && is_kthread(p) && p->nr_cpus_allowed == 1)
|
||||
if (is_kthread(p) && p->nr_cpus_allowed == 1)
|
||||
if (!dispatch_direct_cpu(p, cpu, enq_flags))
|
||||
return;
|
||||
|
||||
|
@ -33,7 +33,6 @@ impl<'a> Scheduler<'a> {
|
||||
0, // exit_dump_len (buffer size of exit info)
|
||||
false, // partial (include all tasks if false)
|
||||
5000, // slice_ns (default task time slice)
|
||||
true, // full_user (schedule all tasks in user-space)
|
||||
false, // low_power (low power mode)
|
||||
false, // verbose (verbose output)
|
||||
false, // debug (debug mode)
|
||||
@ -59,7 +58,7 @@ impl<'a> Scheduler<'a> {
|
||||
let mut dispatched_task = DispatchedTask::new(&task);
|
||||
|
||||
// Allow to dispatch on the first CPU available.
|
||||
dispatched_task.set_flag(RL_CPU_ANY);
|
||||
dispatched_task.flags |= RL_CPU_ANY;
|
||||
|
||||
let _ = self.bpf.dispatch_task(&dispatched_task);
|
||||
|
||||
|
@ -83,14 +83,6 @@ struct Opts {
|
||||
#[clap(short = 'S', long, default_value = "500")]
|
||||
slice_us_min: u64,
|
||||
|
||||
/// If specified, all the scheduling events and actions will be processed in user-space,
|
||||
/// disabling any form of in-kernel optimization.
|
||||
///
|
||||
/// This mode will likely make the system less responsive, but more predictable in terms of
|
||||
/// performance.
|
||||
#[clap(short = 'u', long, action = clap::ArgAction::SetTrue)]
|
||||
full_user: bool,
|
||||
|
||||
/// When low-power mode is enabled, the scheduler behaves in a more non-work conserving way:
|
||||
/// the CPUs operate at reduced capacity, which slows down CPU-bound tasks, enhancing the
|
||||
/// prioritization of interactive workloads. In summary, enabling low-power mode will limit
|
||||
@ -283,7 +275,6 @@ impl<'a> Scheduler<'a> {
|
||||
opts.exit_dump_len,
|
||||
opts.partial,
|
||||
opts.slice_us,
|
||||
opts.full_user,
|
||||
opts.low_power,
|
||||
opts.verbose,
|
||||
opts.debug,
|
||||
@ -423,12 +414,15 @@ impl<'a> Scheduler<'a> {
|
||||
let mut dispatched_task = DispatchedTask::new(&task.qtask);
|
||||
|
||||
// Assign the time slice to the task.
|
||||
dispatched_task.set_slice_ns(slice_ns);
|
||||
dispatched_task.slice_ns = slice_ns;
|
||||
|
||||
// Dispatch task on the first CPU available if it is classified as
|
||||
// interactive, non-interactive tasks will continue to run on the same CPU.
|
||||
if task.is_interactive {
|
||||
dispatched_task.set_flag(RL_CPU_ANY);
|
||||
// Try to pick an idle CPU for the task.
|
||||
let cpu = self.bpf.select_cpu(dispatched_task.pid, dispatched_task.cpu, 0);
|
||||
if cpu >= 0 {
|
||||
dispatched_task.cpu = cpu;
|
||||
} else {
|
||||
// Dispatch task on the first CPU available.
|
||||
dispatched_task.flags |= RL_CPU_ANY;
|
||||
}
|
||||
|
||||
// Send task to the BPF dispatcher.
|
||||
|
Loading…
Reference in New Issue
Block a user