Merge pull request #235 from sched-ext/rustland-preemption

rustland: enable preemption
Andrea Righi 2024-04-23 17:20:39 +02:00 committed by GitHub
commit a8226f0fde
4 changed files with 111 additions and 31 deletions


@ -26,11 +26,15 @@ use scx_rustland_core::ALLOCATOR;
// Defined in UAPI
const SCHED_EXT: i32 = 7;
// Do not assign any specific CPU to the task.
#[allow(dead_code)]
pub const NO_CPU: i32 = -1;

// Allow the task to be dispatched on any CPU.
//
// The task will be dispatched to the global shared DSQ and will run on the
// first CPU available.
pub const RL_CPU_ANY: u64 = bpf_intf::RL_CPU_ANY as u64;

// Allow the task to preempt the target CPU when it is dispatched.
#[allow(dead_code)]
pub const RL_PREEMPT_CPU: u64 = bpf_intf::RL_PREEMPT_CPU as u64;
/// High-level Rust abstraction to interact with a generic sched-ext BPF component.
///
@ -71,6 +75,7 @@ pub struct QueuedTask {
pub struct DispatchedTask {
pid: i32, // pid that uniquely identifies a task
cpu: i32, // target CPU selected by the scheduler
flags: u64, // special dispatch flags
slice_ns: u64, // time slice assigned to the task (0 = default)
cpumask_cnt: u64, // cpumask generation counter (private)
}
@ -84,6 +89,7 @@ impl DispatchedTask {
DispatchedTask {
pid: task.pid,
cpu: task.cpu,
flags: 0,
cpumask_cnt: task.cpumask_cnt,
slice_ns: 0, // use default time slice
}
@ -95,6 +101,12 @@ impl DispatchedTask {
self.cpu = cpu;
}
// Set a dispatch flag on the task.
#[allow(dead_code)]
pub fn set_flag(&mut self, flag: u64) {
self.flags |= flag;
}
// Assign a specific time slice to a task.
#[allow(dead_code)]
pub fn set_slice_ns(&mut self, slice_ns: u64) {
@ -141,6 +153,7 @@ impl DispatchedMessage {
let dispatched_task_struct = bpf_intf::dispatched_task_ctx {
pid: task.pid,
cpu: task.cpu,
flags: task.flags,
cpumask_cnt: task.cpumask_cnt,
slice_ns: task.slice_ns,
};
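Put together, these pieces form the dispatch API that a user-space scheduler consumes. A minimal sketch of the intended flow, assuming a QueuedTask at hand and a hypothetical SLICE_NS tunable (not part of the API above):

// Sketch: build a dispatched task that can run on any CPU and is
// allowed to preempt whatever is running there (SLICE_NS is a
// hypothetical tunable, not defined by this API).
const SLICE_NS: u64 = 5_000_000;

fn dispatch_interactive(qtask: &QueuedTask) -> DispatchedTask {
	let mut task = DispatchedTask::new(qtask);
	task.set_flag(RL_CPU_ANY);     // run on the first CPU available
	task.set_flag(RL_PREEMPT_CPU); // mapped to SCX_ENQ_PREEMPT in BPF
	task.set_slice_ns(SLICE_NS);   // override the default time slice
	task
}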


@ -26,6 +26,35 @@ typedef unsigned long long u64;
typedef long long s64;
#endif
/* Fail the build if the condition is true */
#define BUILD_BUG_ON(expr) \
do { \
extern char __build_assert__[(expr) ? -1 : 1] \
__attribute__((unused)); \
} while(0)
/*
 * Maximum number of CPUs supported by this scheduler (this defines the size of
* cpu_map that is used to store the idle state and CPU ownership).
*/
#define MAX_CPUS 1024
/* Special dispatch flags */
enum {
/*
* Do not assign any specific CPU to the task.
*
 * The task will be dispatched to the global shared DSQ and will run
* on the first CPU available.
*/
RL_CPU_ANY = 1 << 0,
/*
 * Allow the task to preempt the target CPU when it is dispatched.
*/
RL_PREEMPT_CPU = 1 << 1,
};
/*
* Task sent to the user-space scheduler by the BPF dispatcher.
*
@ -49,6 +78,7 @@ struct queued_task_ctx {
struct dispatched_task_ctx {
s32 pid;
s32 cpu; /* CPU where the task should be dispatched */
u64 flags; /* special dispatch flags */
u64 cpumask_cnt; /* cpumask generation counter */
u64 slice_ns; /* time slice assigned to the task (0=default) */
};
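Because the two flags occupy distinct bits of the u64 flags field, the BPF dispatcher can test them independently. A quick sketch of the set/test semantics, with the constants mirrored in Rust:

// Sketch: dispatch flags are independent bits in a u64 bitmask.
const RL_CPU_ANY: u64 = 1 << 0;
const RL_PREEMPT_CPU: u64 = 1 << 1;

fn main() {
	let flags = RL_CPU_ANY | RL_PREEMPT_CPU;
	// Each bit can be tested on its own, as the BPF side does.
	assert!(flags & RL_CPU_ANY != 0);
	assert!(flags & RL_PREEMPT_CPU != 0);
}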


@ -33,12 +33,6 @@ char _license[] SEC("license") = "GPL";
UEI_DEFINE(uei);
/*
* Introduce a custom DSQ shared across all the CPUs, where we can dispatch
* tasks that will be executed on the first CPU available.
@ -570,6 +564,7 @@ void BPF_STRUCT_OPS(rustland_dispatch, s32 cpu, struct task_struct *prev)
bpf_repeat(MAX_ENQUEUED_TASKS) {
struct task_struct *p;
struct dispatched_task_ctx task;
u64 enq_flags = 0;
/*
* Pop first task from the dispatched queue, stop if dispatch
@ -582,21 +577,28 @@ void BPF_STRUCT_OPS(rustland_dispatch, s32 cpu, struct task_struct *prev)
p = bpf_task_from_pid(task.pid);
if (!p)
continue;
dbg_msg("usersched: pid=%d cpu=%d cpumask_cnt=%llu slice_ns=%llu flags=%llx",
task.pid, task.cpu, task.cpumask_cnt, task.slice_ns, task.flags);
/*
* Map RL_PREEMPT_CPU to SCX_ENQ_PREEMPT and allow this task to
* preempt others.
*/
if (task.flags & RL_PREEMPT_CPU)
enq_flags = SCX_ENQ_PREEMPT;
/*
* Check whether the user-space scheduler assigned a different
* CPU to the task and migrate (if possible).
*
 * If the task has been submitted with RL_CPU_ANY, then
 * dispatch it to the shared DSQ and run it on the first CPU
 * available.
 */
if (task.flags & RL_CPU_ANY)
	dispatch_task(p, SHARED_DSQ, 0, task.slice_ns, enq_flags);
else
	dispatch_task(p, cpu_to_dsq(task.cpu),
		      task.cpumask_cnt, task.slice_ns, enq_flags);
bpf_task_release(p);
__sync_fetch_and_add(&nr_user_dispatches, 1);
}
@ -856,6 +858,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(rustland_init)
{
int err;
/* Compile-time checks */
BUILD_BUG_ON((MAX_CPUS % 2));
/* Initialize rustland core */
err = dsq_init();
if (err)
return err;
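BUILD_BUG_ON((MAX_CPUS % 2)) above makes the build fail if MAX_CPUS is ever set to an odd value. The same guard could be expressed on the Rust side as a const assertion; a sketch, assuming one wanted to mirror the constraint there:

// Sketch: Rust analogue of BUILD_BUG_ON((MAX_CPUS % 2)); the assert!
// in a const context turns a violated invariant into a build failure.
const MAX_CPUS: usize = 1024;
const _: () = assert!(MAX_CPUS % 2 == 0, "MAX_CPUS must be even");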


@ -108,6 +108,16 @@ struct Opts {
#[clap(short = 'i', long, action = clap::ArgAction::SetTrue)]
builtin_idle: bool,
/// If specified, disable task preemption.
///
/// Disabling task preemption can help improve the throughput of CPU-intensive tasks, while
/// still providing a good level of system responsiveness.
///
/// Preemption is enabled by default to provide a higher level of responsiveness to
/// interactive tasks.
#[clap(short = 'n', long, action = clap::ArgAction::SetTrue)]
no_preemption: bool,
/// If specified, all the scheduling events and actions will be processed in user-space,
/// disabling any form of in-kernel optimization.
///
@ -173,8 +183,9 @@ impl TaskInfoMap {
#[derive(Debug, PartialEq, Eq, PartialOrd, Clone)]
struct Task {
qtask: QueuedTask, // queued task
vruntime: u64, // total vruntime (that determines the order in which tasks are dispatched)
is_interactive: bool, // task can preempt other tasks
}
// Make sure tasks are ordered by vruntime; if multiple tasks have the same vruntime, order by pid.
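One way to express that ordering is a manual Ord implementation over (vruntime, pid); a sketch consistent with the Task fields above (the actual implementation lives in the elided lines below):

// Sketch: order tasks by vruntime, breaking ties by pid, so a sorted
// task pool always pops the task with the smallest vruntime first.
impl Ord for Task {
	fn cmp(&self, other: &Self) -> std::cmp::Ordering {
		self.vruntime
			.cmp(&other.vruntime)
			.then_with(|| self.qtask.pid.cmp(&other.qtask.pid))
	}
}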
@ -237,7 +248,8 @@ struct Scheduler<'a> {
slice_boost: u64, // Slice booster
eff_slice_boost: u64, // Effective slice booster
init_page_faults: u64, // Initial page faults counter
builtin_idle: bool, // Use sched-ext built-in idle selection logic
no_preemption: bool, // Disable task preemption
}
impl<'a> Scheduler<'a> {
@ -255,6 +267,9 @@ impl<'a> Scheduler<'a> {
// Use built-in idle selection logic.
let builtin_idle = opts.builtin_idle;
// Disable task preemption.
let no_preemption = opts.no_preemption;
// Scheduler task pool to sort tasks by vruntime.
let task_pool = TaskTree::new();
@ -291,6 +306,7 @@ impl<'a> Scheduler<'a> {
eff_slice_boost,
init_page_faults,
builtin_idle,
no_preemption,
})
}
@ -327,11 +343,12 @@ impl<'a> Scheduler<'a> {
ts.as_nanos() as u64
}
// Update task's vruntime based on the information collected from the kernel and return to the
// caller the evaluated weighted time slice along with a flag indicating whether the task is
// interactive or not (interactive tasks are allowed to preempt other tasks).
//
// This method implements the main task ordering logic of the scheduler.
fn update_enqueued(&mut self, task: &QueuedTask) -> (u64, bool) {
// Determine if a task is new or old, based on its current and previous runtime counters.
//
@ -389,18 +406,19 @@ impl<'a> Scheduler<'a> {
task.sum_exec_runtime - task_info.sum_exec_runtime
};
// Determine if a task is interactive, based on the moving average of voluntary context
// switches over time.
//
// NOTE: we should make this threshold a tunable, but for now let's assume that a moving
// average of 10 voluntary context switches per second is enough to classify the task as
// interactive.
let is_interactive = task_info.avg_nvcsw >= 10;
// Apply the slice boost to interactive tasks.
//
// NOTE: some tasks may have a very high weight, which can potentially disrupt our slice
// boost optimizations, therefore always limit the task priority to a max of 1000.
let weight = if is_interactive {
task.weight.min(1000) * self.slice_boost.max(1)
} else {
task.weight.min(1000)
@ -435,8 +453,8 @@ impl<'a> Scheduler<'a> {
task_info.nvcsw_ts = now;
}
// Return the task vruntime and a flag indicating if the task is interactive.
(task_info.vruntime, is_interactive)
}
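The classification above relies on a per-task moving average of voluntary context switches. A sketch of how such an average might be maintained (the blend factor is an assumption; the actual update happens in the elided parts of update_enqueued()):

// Sketch: fold the measured voluntary context switch rate into a
// per-task moving average, then classify against a fixed threshold.
fn update_avg_nvcsw(avg_nvcsw: u64, delta_nvcsw: u64, delta_t_sec: u64) -> u64 {
	let rate = delta_nvcsw / delta_t_sec.max(1);
	// Simple exponential blend: half old average, half new sample.
	(avg_nvcsw + rate) / 2
}

fn is_interactive(avg_nvcsw: u64) -> bool {
	// ~10 voluntary context switches/sec marks a task as interactive
	// (see the NOTE above about making this threshold tunable).
	avg_nvcsw >= 10
}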
// Drain all the tasks from the queued list, update their vruntime (Self::update_enqueued()),
@ -452,13 +470,14 @@ impl<'a> Scheduler<'a> {
continue;
}
// Update task information and determine vruntime and interactiveness.
let (vruntime, is_interactive) = self.update_enqueued(&task);
// Insert task in the task pool (ordered by vruntime).
self.task_pool.push(Task {
qtask: task,
vruntime,
is_interactive,
});
}
Ok(None) => {
@ -505,8 +524,20 @@ impl<'a> Scheduler<'a> {
// If built-in idle selection logic is disabled, dispatch on the first CPU
// available.
let mut dispatched_task = DispatchedTask::new(&task.qtask);
// Set special dispatch flags.
if !self.builtin_idle {
dispatched_task.set_flag(RL_CPU_ANY);
}
if task.is_interactive && !self.no_preemption {
// Assign the maximum time slice to this task and allow it to preempt others.
//
// NOTE: considering that, with preemption enabled, interactive tasks can
// preempt each other (for now) and they are also more likely to release
// the CPU before its assigned time slice expires, always give them the
// maximum static time slice allowed.
dispatched_task.set_slice_ns(self.slice_ns);
dispatched_task.set_flag(RL_PREEMPT_CPU);
}
// Send task to the BPF dispatcher.