mirror of
https://github.com/sched-ext/scx.git
synced 2024-11-28 21:50:23 +00:00
Merge pull request #235 from sched-ext/rustland-preemption
rustland: enable preemption
This commit is contained in:
commit
a8226f0fde
@ -26,11 +26,15 @@ use scx_rustland_core::ALLOCATOR;
|
||||
// Defined in UAPI
|
||||
const SCHED_EXT: i32 = 7;
|
||||
|
||||
// Do not assign any specific CPU to the task.
|
||||
// Allow to dispatch the task on any CPU.
|
||||
//
|
||||
// The task will be dispatched to the global shared DSQ and it will run on the first CPU available.
|
||||
#[allow(dead_code)]
|
||||
pub const NO_CPU: i32 = -1;
|
||||
pub const RL_CPU_ANY: u64 = bpf_intf::RL_CPU_ANY as u64;
|
||||
|
||||
// Allow to preempt the target CPU when dispatching the task.
|
||||
#[allow(dead_code)]
|
||||
pub const RL_PREEMPT_CPU: u64 = bpf_intf::RL_PREEMPT_CPU as u64;
|
||||
|
||||
/// High-level Rust abstraction to interact with a generic sched-ext BPF component.
|
||||
///
|
||||
@ -71,6 +75,7 @@ pub struct QueuedTask {
|
||||
pub struct DispatchedTask {
|
||||
pid: i32, // pid that uniquely identifies a task
|
||||
cpu: i32, // target CPU selected by the scheduler
|
||||
flags: u64, // special dispatch flags
|
||||
slice_ns: u64, // time slice assigned to the task (0 = default)
|
||||
cpumask_cnt: u64, // cpumask generation counter (private)
|
||||
}
|
||||
@ -84,6 +89,7 @@ impl DispatchedTask {
|
||||
DispatchedTask {
|
||||
pid: task.pid,
|
||||
cpu: task.cpu,
|
||||
flags: 0,
|
||||
cpumask_cnt: task.cpumask_cnt,
|
||||
slice_ns: 0, // use default time slice
|
||||
}
|
||||
@ -95,6 +101,12 @@ impl DispatchedTask {
|
||||
self.cpu = cpu;
|
||||
}
|
||||
|
||||
// Assign a specific dispatch flag to a task.
|
||||
#[allow(dead_code)]
|
||||
pub fn set_flag(&mut self, flag: u64) {
|
||||
self.flags |= flag;
|
||||
}
|
||||
|
||||
// Assign a specific time slice to a task.
|
||||
#[allow(dead_code)]
|
||||
pub fn set_slice_ns(&mut self, slice_ns: u64) {
|
||||
@ -141,6 +153,7 @@ impl DispatchedMessage {
|
||||
let dispatched_task_struct = bpf_intf::dispatched_task_ctx {
|
||||
pid: task.pid,
|
||||
cpu: task.cpu,
|
||||
flags: task.flags,
|
||||
cpumask_cnt: task.cpumask_cnt,
|
||||
slice_ns: task.slice_ns,
|
||||
};
|
||||
|
@ -26,6 +26,35 @@ typedef unsigned long long u64;
|
||||
typedef long long s64;
|
||||
#endif
|
||||
|
||||
/* Check a condition at build time */
|
||||
#define BUILD_BUG_ON(expr) \
|
||||
do { \
|
||||
extern char __build_assert__[(expr) ? -1 : 1] \
|
||||
__attribute__((unused)); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
* Maximum amount of CPUs supported by this scheduler (this defines the size of
|
||||
* cpu_map that is used to store the idle state and CPU ownership).
|
||||
*/
|
||||
#define MAX_CPUS 1024
|
||||
|
||||
/* Special dispatch flags */
|
||||
enum {
|
||||
/*
|
||||
* Do not assign any specific CPU to the task.
|
||||
*
|
||||
* The task will be dispatched to the global shared DSQ and it will run
|
||||
* on the first CPU available.
|
||||
*/
|
||||
RL_CPU_ANY = 1 << 0,
|
||||
|
||||
/*
|
||||
* Allow to preempt the target CPU when dispatching the task.
|
||||
*/
|
||||
RL_PREEMPT_CPU = 1 << 1,
|
||||
};
|
||||
|
||||
/*
|
||||
* Task sent to the user-space scheduler by the BPF dispatcher.
|
||||
*
|
||||
@ -49,6 +78,7 @@ struct queued_task_ctx {
|
||||
struct dispatched_task_ctx {
|
||||
s32 pid;
|
||||
s32 cpu; /* CPU where the task should be dispatched */
|
||||
u64 flags; /* special dispatch flags */
|
||||
u64 cpumask_cnt; /* cpumask generation counter */
|
||||
u64 slice_ns; /* time slice assigned to the task (0=default) */
|
||||
};
|
||||
|
@ -33,12 +33,6 @@ char _license[] SEC("license") = "GPL";
|
||||
|
||||
UEI_DEFINE(uei);
|
||||
|
||||
/*
|
||||
* Maximum amount of CPUs supported by this scheduler (this defines the size of
|
||||
* cpu_map that is used to store the idle state and CPU ownership).
|
||||
*/
|
||||
#define MAX_CPUS 1024
|
||||
|
||||
/*
|
||||
* Introduce a custom DSQ shared across all the CPUs, where we can dispatch
|
||||
* tasks that will be executed on the first CPU available.
|
||||
@ -570,6 +564,7 @@ void BPF_STRUCT_OPS(rustland_dispatch, s32 cpu, struct task_struct *prev)
|
||||
bpf_repeat(MAX_ENQUEUED_TASKS) {
|
||||
struct task_struct *p;
|
||||
struct dispatched_task_ctx task;
|
||||
u64 enq_flags = 0;
|
||||
|
||||
/*
|
||||
* Pop first task from the dispatched queue, stop if dispatch
|
||||
@ -582,21 +577,28 @@ void BPF_STRUCT_OPS(rustland_dispatch, s32 cpu, struct task_struct *prev)
|
||||
p = bpf_task_from_pid(task.pid);
|
||||
if (!p)
|
||||
continue;
|
||||
|
||||
dbg_msg("usersched: pid=%d cpu=%d cpumask_cnt=%llu slice_ns=%llu flags=%llx",
|
||||
task.pid, task.cpu, task.cpumask_cnt, task.slice_ns, task.flags);
|
||||
/*
|
||||
* Map RL_PREEMPT_CPU to SCX_ENQ_PREEMPT and allow this task to
|
||||
* preempt others.
|
||||
*/
|
||||
if (task.flags & RL_PREEMPT_CPU)
|
||||
enq_flags = SCX_ENQ_PREEMPT;
|
||||
/*
|
||||
* Check whether the user-space scheduler assigned a different
|
||||
* CPU to the task and migrate (if possible).
|
||||
*
|
||||
* If no CPU has been specified (task.cpu < 0), then dispatch
|
||||
* the task to the shared DSQ and rely on the built-in idle CPU
|
||||
* selection.
|
||||
* If the task has been submitted with RL_CPU_ANY, then
|
||||
* dispatch it to the shared DSQ and run it on the first CPU
|
||||
* available.
|
||||
*/
|
||||
dbg_msg("usersched: pid=%d cpu=%d cpumask_cnt=%llu slice_ns=%llu",
|
||||
task.pid, task.cpu, task.cpumask_cnt, task.slice_ns);
|
||||
if (task.cpu < 0)
|
||||
dispatch_task(p, SHARED_DSQ, 0, task.slice_ns, 0);
|
||||
if (task.flags & RL_CPU_ANY)
|
||||
dispatch_task(p, SHARED_DSQ, 0, task.slice_ns, enq_flags);
|
||||
else
|
||||
dispatch_task(p, cpu_to_dsq(task.cpu), task.cpumask_cnt,
|
||||
task.slice_ns, 0);
|
||||
dispatch_task(p, cpu_to_dsq(task.cpu),
|
||||
task.cpumask_cnt, task.slice_ns, enq_flags);
|
||||
bpf_task_release(p);
|
||||
__sync_fetch_and_add(&nr_user_dispatches, 1);
|
||||
}
|
||||
@ -856,6 +858,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(rustland_init)
|
||||
{
|
||||
int err;
|
||||
|
||||
/* Compile-time checks */
|
||||
BUILD_BUG_ON((MAX_CPUS % 2));
|
||||
|
||||
/* Initialize rustland core */
|
||||
err = dsq_init();
|
||||
if (err)
|
||||
return err;
|
||||
|
@ -108,6 +108,16 @@ struct Opts {
|
||||
#[clap(short = 'i', long, action = clap::ArgAction::SetTrue)]
|
||||
builtin_idle: bool,
|
||||
|
||||
/// If specified, disable task preemption.
|
||||
///
|
||||
/// Disabling task preemption can help to improve the throughput of CPU-intensive tasks, while
|
||||
/// still providing a good level of system responsiveness.
|
||||
///
|
||||
/// Preemption is enabled by default to provide a higher level of responsiveness to the
|
||||
/// interactive tasks.
|
||||
#[clap(short = 'n', long, action = clap::ArgAction::SetTrue)]
|
||||
no_preemption: bool,
|
||||
|
||||
/// If specified, all the scheduling events and actions will be processed in user-space,
|
||||
/// disabling any form of in-kernel optimization.
|
||||
///
|
||||
@ -173,8 +183,9 @@ impl TaskInfoMap {
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Clone)]
|
||||
struct Task {
|
||||
qtask: QueuedTask, // queued task
|
||||
vruntime: u64, // total vruntime (that determines the order how tasks are dispatched)
|
||||
qtask: QueuedTask, // queued task
|
||||
vruntime: u64, // total vruntime (that determines the order how tasks are dispatched)
|
||||
is_interactive: bool, // task can preempt other tasks
|
||||
}
|
||||
|
||||
// Make sure tasks are ordered by vruntime, if multiple tasks have the same vruntime order by pid.
|
||||
@ -237,7 +248,8 @@ struct Scheduler<'a> {
|
||||
slice_boost: u64, // Slice booster
|
||||
eff_slice_boost: u64, // Effective slice booster
|
||||
init_page_faults: u64, // Initial page faults counter
|
||||
builtin_idle: bool, // Use sched-ext built-in idle selection logic
|
||||
builtin_idle: bool, // Use sched-ext built-in idle selection logic
|
||||
no_preemption: bool, // Disable task preemption
|
||||
}
|
||||
|
||||
impl<'a> Scheduler<'a> {
|
||||
@ -255,6 +267,9 @@ impl<'a> Scheduler<'a> {
|
||||
// Use built-in idle selection logic.
|
||||
let builtin_idle = opts.builtin_idle;
|
||||
|
||||
// Disable task preemption.
|
||||
let no_preemption = opts.no_preemption;
|
||||
|
||||
// Scheduler task pool to sort tasks by vruntime.
|
||||
let task_pool = TaskTree::new();
|
||||
|
||||
@ -291,6 +306,7 @@ impl<'a> Scheduler<'a> {
|
||||
eff_slice_boost,
|
||||
init_page_faults,
|
||||
builtin_idle,
|
||||
no_preemption,
|
||||
})
|
||||
}
|
||||
|
||||
@ -327,11 +343,12 @@ impl<'a> Scheduler<'a> {
|
||||
ts.as_nanos() as u64
|
||||
}
|
||||
|
||||
// Update task's vruntime based on the information collected from the kernel and return the
|
||||
// evaluated weighted time slice to the caller.
|
||||
// Update task's vruntime based on the information collected from the kernel and return to the
|
||||
// caller the evaluated weighted time slice along with a flag indicating whether the task is
|
||||
// interactive or not (interactive tasks are allowed to preempt other tasks).
|
||||
//
|
||||
// This method implements the main task ordering logic of the scheduler.
|
||||
fn update_enqueued(&mut self, task: &QueuedTask) -> u64 {
|
||||
fn update_enqueued(&mut self, task: &QueuedTask) -> (u64, bool) {
|
||||
// Determine if a task is new or old, based on their current runtime and previous runtime
|
||||
// counters.
|
||||
//
|
||||
@ -389,18 +406,19 @@ impl<'a> Scheduler<'a> {
|
||||
task.sum_exec_runtime - task_info.sum_exec_runtime
|
||||
};
|
||||
|
||||
// Apply the slice boost to interactive tasks.
|
||||
//
|
||||
// Determine if a task is interactive, based on the moving average of voluntary context
|
||||
// switches over time.
|
||||
//
|
||||
// NOTE: we should make this threshold a tunable, but for now let's assume that a moving
|
||||
// average of 10 voluntary context switch per second is enough to classify the task as
|
||||
// interactive.
|
||||
let is_interactive = task_info.avg_nvcsw >= 10;
|
||||
|
||||
// Apply the slice boost to interactive tasks.
|
||||
//
|
||||
// NOTE: some tasks may have a very high weight, that can potentially disrupt our slice
|
||||
// boost optimizations, therefore always limit the task priority to a max of 1000.
|
||||
let weight = if task_info.avg_nvcsw >= 10 {
|
||||
let weight = if is_interactive {
|
||||
task.weight.min(1000) * self.slice_boost.max(1)
|
||||
} else {
|
||||
task.weight.min(1000)
|
||||
@ -435,8 +453,8 @@ impl<'a> Scheduler<'a> {
|
||||
task_info.nvcsw_ts = now;
|
||||
}
|
||||
|
||||
// Return the task vruntime.
|
||||
task_info.vruntime
|
||||
// Return the task vruntime and a flag indicating if the task is interactive.
|
||||
(task_info.vruntime, is_interactive)
|
||||
}
|
||||
|
||||
// Drain all the tasks from the queued list, update their vruntime (Self::update_enqueued()),
|
||||
@ -452,13 +470,14 @@ impl<'a> Scheduler<'a> {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Update task information.
|
||||
let vruntime = self.update_enqueued(&task);
|
||||
// Update task information and determine vruntime and interactiveness.
|
||||
let (vruntime, is_interactive) = self.update_enqueued(&task);
|
||||
|
||||
// Insert task in the task pool (ordered by vruntime).
|
||||
self.task_pool.push(Task {
|
||||
qtask: task,
|
||||
vruntime,
|
||||
is_interactive,
|
||||
});
|
||||
}
|
||||
Ok(None) => {
|
||||
@ -505,8 +524,20 @@ impl<'a> Scheduler<'a> {
|
||||
// If built-in idle selection logic is disabled, dispatch on the first CPU
|
||||
// available.
|
||||
let mut dispatched_task = DispatchedTask::new(&task.qtask);
|
||||
|
||||
// Set special dispatch flags.
|
||||
if !self.builtin_idle {
|
||||
dispatched_task.set_cpu(NO_CPU);
|
||||
dispatched_task.set_flag(RL_CPU_ANY);
|
||||
}
|
||||
if task.is_interactive && !self.no_preemption {
|
||||
// Assign the maximum time slice to this task and allow to preempt others.
|
||||
//
|
||||
// NOTE: considering that, with preemption enabled, interactive tasks can
|
||||
// preempt each other (for now) and they are also more likely to release
|
||||
// the CPU before its assigned time slice expires, always give them the
|
||||
// maximum static time slice allowed.
|
||||
dispatched_task.set_slice_ns(self.slice_ns);
|
||||
dispatched_task.set_flag(RL_PREEMPT_CPU);
|
||||
}
|
||||
|
||||
// Send task to the BPF dispatcher.
|
||||
|
Loading…
Reference in New Issue
Block a user