scx_rustland_core: move CPU idle selection logic to user-space

Allow user-space scheduler to pick an idle CPU via
self.bpf.select_cpu(pid, prev_task, flags), mimicking the BPF's
select_cpu() interface.

Also remove the full_user option and always rely on the idle selection
logic from user-space.

Signed-off-by: Andrea Righi <andrea.righi@linux.dev>
This commit is contained in:
Andrea Righi 2024-08-24 11:04:45 +02:00
parent 1dd329dd7d
commit 5d544ea264
5 changed files with 86 additions and 92 deletions

View File

@ -6,10 +6,13 @@
use std::mem::MaybeUninit;
use crate::bpf_intf;
use crate::bpf_intf::*;
use crate::bpf_skel::*;
use std::fs::File;
use std::io::Read;
use std::ffi::c_int;
use std::ffi::c_ulong;
use anyhow::Context;
use anyhow::Result;
@ -17,6 +20,7 @@ use anyhow::Result;
use plain::Plain;
use libbpf_rs::OpenObject;
use libbpf_rs::ProgramInput;
use libbpf_rs::skel::OpenSkel;
use libbpf_rs::skel::Skel;
use libbpf_rs::skel::SkelBuilder;
@ -82,11 +86,11 @@ pub struct QueuedTask {
// Task queued for dispatching to the BPF component (see bpf_intf::dispatched_task_ctx).
#[derive(Debug, PartialEq, Eq, PartialOrd, Clone)]
pub struct DispatchedTask {
pid: i32, // pid that uniquely identifies a task
cpu: i32, // target CPU selected by the scheduler
flags: u64, // special dispatch flags
slice_ns: u64, // time slice assigned to the task (0 = default)
cpumask_cnt: u64, // cpumask generation counter (private)
pub pid: i32, // pid that uniquely identifies a task
pub cpu: i32, // target CPU selected by the scheduler
pub flags: u64, // special dispatch flags
pub slice_ns: u64, // time slice assigned to the task (0 = default)
cpumask_cnt: u64, // cpumask generation counter (private)
}
impl DispatchedTask {
@ -103,24 +107,6 @@ impl DispatchedTask {
slice_ns: 0, // use default time slice
}
}
// Assign a specific CPU to a task.
#[allow(dead_code)]
pub fn set_cpu(&mut self, cpu: i32) {
self.cpu = cpu;
}
// Assign a specific dispatch flag to a task.
#[allow(dead_code)]
pub fn set_flag(&mut self, flag: u64) {
self.flags |= flag;
}
// Assign a specific time slice to a task.
#[allow(dead_code)]
pub fn set_slice_ns(&mut self, slice_ns: u64) {
self.slice_ns = slice_ns;
}
}
// Helpers used to submit tasks to the BPF user ring buffer.
@ -196,7 +182,6 @@ impl<'cb> BpfScheduler<'cb> {
exit_dump_len: u32,
partial: bool,
slice_us: u64,
full_user: bool,
low_power: bool,
verbose: bool,
debug: bool,
@ -256,7 +241,6 @@ impl<'cb> BpfScheduler<'cb> {
skel.maps.bss_data.usersched_pid = std::process::id();
skel.maps.rodata_data.slice_ns = slice_us * 1000;
skel.maps.rodata_data.full_user = full_user;
skel.maps.rodata_data.low_power = low_power;
skel.maps.rodata_data.debug = debug;
@ -390,6 +374,28 @@ impl<'cb> BpfScheduler<'cb> {
unsafe { pthread_setschedparam(pthread_self(), SCHED_EXT, &param as *const sched_param) }
}
// Pick an idle CPU for the target PID.
//
// This invokes the `rs_select_cpu` BPF "syscall" program via test_run(),
// handing it the (pid, cpu, flags) triple packed in a `task_cpu_arg`.
// The program's return value is the CPU picked by the in-kernel idle
// selection logic (negative value on failure).
pub fn select_cpu(&mut self, pid: i32, cpu: i32, flags: u64) -> i32 {
    // Argument buffer passed as the program's context.
    let mut arg = task_cpu_arg {
        pid: pid as c_int,
        cpu: cpu as c_int,
        flags: flags as c_ulong,
    };
    // SAFETY: `arg` is a plain-old-data struct that outlives the
    // test_run() call below; the slice covers exactly its bytes.
    let ctx = unsafe {
        std::slice::from_raw_parts_mut(
            &mut arg as *mut _ as *mut u8,
            std::mem::size_of_val(&arg),
        )
    };
    let input = ProgramInput {
        context_in: Some(ctx),
        ..Default::default()
    };
    let output = self.skel.progs.rs_select_cpu.test_run(input).unwrap();
    output.return_value as i32
}
// Receive a task to be scheduled from the BPF dispatcher.
//
// NOTE: if task.cpu is negative the task is exiting and it does not require to be scheduled.

View File

@ -18,13 +18,19 @@
#define __kptr
#endif
#ifndef __KERNEL__
#ifndef __VMLINUX_H__
typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned int u32;
typedef int s32;
typedef unsigned long long u64;
typedef long long s64;
#endif
typedef unsigned long u64;
typedef signed char s8;
typedef signed short s16;
typedef signed int s32;
typedef signed long s64;
typedef int pid_t;
#endif /* __VMLINUX_H__ */
/* Check a condition at build time */
#define BUILD_BUG_ON(expr) \
@ -55,6 +61,15 @@ enum {
RL_PREEMPT_CPU = 1 << 1,
};
/*
 * Specify a target CPU for a specific PID.
 *
 * Used as the context argument of the rs_select_cpu syscall program, which
 * forwards @cpu and @flags to the idle-CPU selection logic.
 */
struct task_cpu_arg {
	pid_t pid;	/* task to pick a CPU for */
	s32 cpu;	/* previously used CPU (hint for idle selection) */
	u64 flags;	/* wake flags passed through to the CPU picker */
};
/*
* Task sent to the user-space scheduler by the BPF dispatcher.
*
@ -78,8 +93,8 @@ struct dispatched_task_ctx {
s32 pid;
s32 cpu; /* CPU where the task should be dispatched */
u64 flags; /* special dispatch flags */
u64 cpumask_cnt; /* cpumask generation counter */
u64 slice_ns; /* time slice assigned to the task (0=default) */
u64 cpumask_cnt; /* cpumask generation counter */
};
#endif /* __INTF_H */

View File

@ -87,14 +87,6 @@ const volatile bool debug;
bpf_printk(_fmt, ##__VA_ARGS__); \
} while(0)
/*
* Enable/disable full user-space mode.
*
* In full user-space mode all events and actions will be sent to user-space,
* basically disabling any optimization to bypass the user-space scheduler.
*/
const volatile bool full_user;
/*
* Enable/disable low-power mode.
*
@ -387,19 +379,8 @@ dispatch_task(struct task_struct *p, u64 dsq_id,
break;
default:
tctx = lookup_task_ctx(p);
if (!tctx) {
/*
* If the task doesn't have a context anymore, simply
* bounce it to the first CPU available.
*/
scx_bpf_dispatch(p, SHARED_DSQ, slice, enq_flags);
__sync_fetch_and_add(&nr_bounce_dispatches, 1);
dbg_msg("dispatch: pid=%d (%s) dsq=%llu enq_flags=%llx slice=%llu bounce",
p->pid, p->comm, dsq_id, enq_flags, slice);
if (!tctx)
return;
}
/*
* Dispatch a task to a specific per-CPU DSQ if the target CPU
* can be used (according to the cpumask), otherwise redirect
@ -624,38 +605,37 @@ out_put_cpumask:
return cpu;
}
/*
* Select the target CPU where a task can be executed.
*
* The idea here is to try to find an idle CPU in the system, and preferably
* maintain the task on the same CPU. If we can find an idle CPU in the system
* dispatch the task directly bypassing the user-space scheduler. Otherwise,
* send the task to the user-space scheduler, maintaining the previously used
* CPU as a hint for the scheduler.
*
* Decision made in this function is not final. The user-space scheduler may
* decide to move the task to a different CPU later, if needed.
*/
s32 BPF_STRUCT_OPS(rustland_select_cpu, struct task_struct *p, s32 prev_cpu,
u64 wake_flags)
{
s32 cpu;
/*
* When full_user is enabled, the user-space scheduler is responsible
* for selecting a target CPU based on its scheduling logic and
* possibly its own idle tracking mechanism.
* Completely delegate the CPU selection logic to the user-space
* scheduler.
*/
if (full_user)
return prev_cpu;
cpu = pick_idle_cpu(p, prev_cpu, wake_flags);
if (cpu >= 0 && !dispatch_direct_cpu(p, cpu, 0))
return cpu;
return prev_cpu;
}
/*
 * Select an idle CPU for a specific task from the user-space scheduler.
 *
 * Invoked from user-space via BPF_PROG_TEST_RUN with a struct task_cpu_arg
 * as context. Returns the CPU chosen by pick_idle_cpu(), or -EINVAL if the
 * PID cannot be resolved to a task.
 */
SEC("syscall")
int rs_select_cpu(struct task_cpu_arg *input)
{
	struct task_struct *p;
	int cpu;
	/* Takes a reference on the task; must be released before returning. */
	p = bpf_task_from_pid(input->pid);
	if (!p)
		return -EINVAL;
	/* Hold the RCU read lock while inspecting the task's CPU state. */
	bpf_rcu_read_lock();
	cpu = pick_idle_cpu(p, input->cpu, input->flags);
	bpf_rcu_read_unlock();
	bpf_task_release(p);
	return cpu;
}
/*
* Fill @task with all the information that need to be sent to the user-space
* scheduler.
@ -719,7 +699,7 @@ void BPF_STRUCT_OPS(rustland_enqueue, struct task_struct *p, u64 enq_flags)
* interactivity problems or unfairness if there are too many softirqs
* being scheduled (e.g., in presence of high RX network traffic).
*/
if (!full_user && is_kthread(p) && p->nr_cpus_allowed == 1)
if (is_kthread(p) && p->nr_cpus_allowed == 1)
if (!dispatch_direct_cpu(p, cpu, enq_flags))
return;

View File

@ -33,7 +33,6 @@ impl<'a> Scheduler<'a> {
0, // exit_dump_len (buffer size of exit info)
false, // partial (include all tasks if false)
5000, // slice_ns (default task time slice)
true, // full_user (schedule all tasks in user-space)
false, // low_power (low power mode)
false, // verbose (verbose output)
false, // debug (debug mode)
@ -59,7 +58,7 @@ impl<'a> Scheduler<'a> {
let mut dispatched_task = DispatchedTask::new(&task);
// Allow to dispatch on the first CPU available.
dispatched_task.set_flag(RL_CPU_ANY);
dispatched_task.flags |= RL_CPU_ANY;
let _ = self.bpf.dispatch_task(&dispatched_task);

View File

@ -83,14 +83,6 @@ struct Opts {
#[clap(short = 'S', long, default_value = "500")]
slice_us_min: u64,
/// If specified, all the scheduling events and actions will be processed in user-space,
/// disabling any form of in-kernel optimization.
///
/// This mode will likely make the system less responsive, but more predictable in terms of
/// performance.
#[clap(short = 'u', long, action = clap::ArgAction::SetTrue)]
full_user: bool,
/// When low-power mode is enabled, the scheduler behaves in a more non-work conserving way:
/// the CPUs operate at reduced capacity, which slows down CPU-bound tasks, enhancing the
/// prioritization of interactive workloads. In summary, enabling low-power mode will limit
@ -283,7 +275,6 @@ impl<'a> Scheduler<'a> {
opts.exit_dump_len,
opts.partial,
opts.slice_us,
opts.full_user,
opts.low_power,
opts.verbose,
opts.debug,
@ -423,12 +414,15 @@ impl<'a> Scheduler<'a> {
let mut dispatched_task = DispatchedTask::new(&task.qtask);
// Assign the time slice to the task.
dispatched_task.set_slice_ns(slice_ns);
dispatched_task.slice_ns = slice_ns;
// Dispatch task on the first CPU available if it is classified as
// interactive, non-interactive tasks will continue to run on the same CPU.
if task.is_interactive {
dispatched_task.set_flag(RL_CPU_ANY);
// Try to pick an idle CPU for the task.
let cpu = self.bpf.select_cpu(dispatched_task.pid, dispatched_task.cpu, 0);
if cpu >= 0 {
dispatched_task.cpu = cpu;
} else {
// Dispatch task on the first CPU available.
dispatched_task.flags |= RL_CPU_ANY;
}
// Send task to the BPF dispatcher.