Merge pull request #345 from sched-ext/rustland-prevent-starvation
scx_rustland: prevent starvation
commit fafbc90fa5
@@ -367,7 +367,6 @@ dispatch_task(struct task_struct *p, u64 dsq_id,
         s32 cpu;
 
         switch (dsq_id) {
-        case SCX_DSQ_LOCAL:
         case SHARED_DSQ:
                 scx_bpf_dispatch(p, dsq_id, slice, enq_flags);
                 dbg_msg("dispatch: pid=%d (%s) dsq=%llu enq_flags=%llx slice=%llu",
@@ -457,10 +456,22 @@ static void dispatch_user_scheduler(void)
          * Dispatch the scheduler on the first CPU available, likely the
          * current one.
          */
-        dispatch_task(p, SHARED_DSQ, 0, 0, 0);
+        dispatch_task(p, SHARED_DSQ, 0, 0, SCX_ENQ_PREEMPT);
         bpf_task_release(p);
 }
 
+/*
+ * Directly dispatch a task on a target CPU bypassing the user-space scheduler.
+ */
+static void
+dispatch_direct_cpu(struct task_struct *p, s32 cpu, u64 slice_ns, u64 enq_flags)
+{
+        scx_bpf_dispatch(p, cpu_to_dsq(cpu), slice_ns, enq_flags);
+        scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE);
+
+        __sync_fetch_and_add(&nr_kernel_dispatches, 1);
+}
+
 /*
  * Select the target CPU where a task can be executed.
  *
@@ -495,14 +506,10 @@ s32 BPF_STRUCT_OPS(rustland_select_cpu, struct task_struct *p, s32 prev_cpu,
         /*
          * If the previously used CPU is still available, keep using it to take
          * advantage of the cached working set.
-         *
-         * NOTE: assign a shorter time slice (slice_ns / 4) to a task directly
-         * dispatched to prevent it from gaining excessive CPU bandwidth.
          */
         if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
                 tctx->allow_migration = false;
-                dispatch_task(p, SCX_DSQ_LOCAL, 0, slice_ns / 4, 0);
-                __sync_fetch_and_add(&nr_kernel_dispatches, 1);
+                dispatch_direct_cpu(p, prev_cpu, slice_ns, 0);
                 return prev_cpu;
         }
 
@@ -523,8 +530,7 @@ s32 BPF_STRUCT_OPS(rustland_select_cpu, struct task_struct *p, s32 prev_cpu,
         cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
         if (is_idle) {
                 tctx->allow_migration = false;
-                dispatch_task(p, SCX_DSQ_LOCAL, 0, slice_ns / 4, 0);
-                __sync_fetch_and_add(&nr_kernel_dispatches, 1);
+                dispatch_direct_cpu(p, cpu, slice_ns, 0);
                 return cpu;
         }
         tctx->allow_migration = true;
@@ -595,7 +601,7 @@ void BPF_STRUCT_OPS(rustland_enqueue, struct task_struct *p, u64 enq_flags)
          * long (i.e., ksoftirqd/N, rcuop/N, etc.).
          */
         if (is_kthread(p) && p->nr_cpus_allowed == 1) {
-                dispatch_task(p, SCX_DSQ_LOCAL, 0, slice_ns, enq_flags);
+                scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
                 __sync_fetch_and_add(&nr_kernel_dispatches, 1);
                 return;
         }
@@ -605,12 +611,7 @@ void BPF_STRUCT_OPS(rustland_enqueue, struct task_struct *p, u64 enq_flags)
          * FIFO mode.
          */
         if (!full_user && is_fifo_enabled) {
-                s32 cpu = scx_bpf_task_cpu(p);
-
-                scx_bpf_dispatch(p, cpu_to_dsq(cpu), slice_ns, enq_flags);
-                scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE);
-
-                __sync_fetch_and_add(&nr_kernel_dispatches, 1);
+                dispatch_direct_cpu(p, scx_bpf_task_cpu(p), slice_ns, enq_flags);
                 return;
         }
 
@@ -261,6 +261,7 @@ struct Scheduler<'a> {
     task_pool: TaskTree,       // tasks ordered by vruntime
     task_map: TaskInfoMap,     // map pids to the corresponding task information
     min_vruntime: u64,         // Keep track of the minimum vruntime across all tasks
+    max_vruntime: u64,         // Keep track of the maximum vruntime across all tasks
     slice_ns: u64,             // Default time slice (in ns)
     slice_boost: u64,          // Slice booster
     init_page_faults: u64,     // Initial page faults counter
@@ -292,8 +293,9 @@ impl<'a> Scheduler<'a> {
         // Scheduler task map to store tasks information.
         let task_map = TaskInfoMap::new();
 
-        // Initialize global minimum vruntime.
+        // Initialize global minimum and maximum vruntime.
         let min_vruntime: u64 = 0;
+        let max_vruntime: u64 = 0;
 
         // Initialize initial page fault counter.
         let init_page_faults: u64 = 0;
@@ -319,6 +321,7 @@ impl<'a> Scheduler<'a> {
             task_pool,
             task_map,
             min_vruntime,
+            max_vruntime,
             slice_ns,
             slice_boost,
             init_page_faults,
@@ -438,6 +441,9 @@ impl<'a> Scheduler<'a> {
         // current task for too long in the scheduler task pool.
         task_info.vruntime = self.min_vruntime + slice.clamp(1, self.slice_ns);
 
+        // Update maximum vruntime.
+        self.max_vruntime = self.max_vruntime.max(task_info.vruntime);
+
         // Update total task cputime.
         task_info.sum_exec_runtime = task.sum_exec_runtime;
 
@@ -504,7 +510,15 @@ impl<'a> Scheduler<'a> {
         // This allows to have more tasks sitting in the task pool, reducing the pressure on the
         // dispatcher queues and giving a chance to higher priority tasks to come in and get
         // dispatched earlier, mitigating potential priority inversion issues.
-        for _ in 0..self.nr_idle_cpus().max(1) {
+        let delta_slice = self.max_vruntime - self.min_vruntime;
+        let nr_tasks = if delta_slice <= self.slice_ns {
+            self.nr_idle_cpus().max(1)
+        } else {
+            // Scheduler is getting congested, flush all tasks that are waiting to be scheduled to
+            // mitigate excessive starvation.
+            usize::MAX
+        };
+        for _ in 0..nr_tasks {
             match self.task_pool.pop() {
                 Some(task) => {
                     // Determine the task's virtual time slice.
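Note on the hunk above: the core of the starvation fix is this batch-size decision, which compares the vruntime spread of queued tasks against one default time slice before deciding how many tasks to hand to the dispatcher. A minimal standalone sketch of that decision, with hypothetical names and not taken from the patch, could look like:

// Illustrative sketch only: inputs mirror the fields used above
// (min/max vruntime, slice_ns, number of idle CPUs); names are hypothetical.
fn dispatch_batch_size(min_vruntime: u64, max_vruntime: u64,
                       slice_ns: u64, nr_idle_cpus: usize) -> usize {
    // Vruntime spread between the oldest and newest queued task.
    let delta = max_vruntime.saturating_sub(min_vruntime);
    if delta <= slice_ns {
        // Normal case: dispatch roughly one task per idle CPU, keeping the
        // rest queued so higher-priority tasks can still overtake them.
        nr_idle_cpus.max(1)
    } else {
        // Spread exceeds one slice: the pool is getting congested, so flush
        // everything to prevent long-waiting tasks from starving.
        usize::MAX
    }
}

Under this rule the scheduler behaves as before while the backlog stays within one slice, and only escalates to a full flush when the vruntime delta signals that tasks have been waiting too long.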
@@ -682,7 +696,14 @@ impl<'a> Scheduler<'a> {
     // Print internal scheduler statistics (fetched from the BPF part).
     fn print_stats(&mut self) {
         // Show minimum vruntime (this should be constantly incrementing).
-        info!("vruntime={}", self.min_vruntime);
+        let delta = self.max_vruntime - self.min_vruntime;
+        info!(
+            "min_vruntime={} max_vruntime={} delta={}us slice={}us",
+            self.min_vruntime,
+            self.max_vruntime,
+            delta / NSEC_PER_USEC,
+            self.slice_ns / NSEC_PER_USEC,
+        );
 
         // Show the total amount of tasks currently monitored by the scheduler.
         info!(" tasks={}", self.task_map.tasks.len());
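Reading the new stats line: delta and slice are both printed in microseconds, so whenever the printed delta exceeds the printed slice the dispatcher above is in the flush-all regime. A rough, hypothetical reading aid (assuming NSEC_PER_USEC is the usual 1_000, as relied on by the patch; this helper is not part of the change):

// Illustrative helper only; mirrors the format of the new info! call.
const NSEC_PER_USEC: u64 = 1_000; // assumed value of the existing constant

fn summarize(min_vruntime: u64, max_vruntime: u64, slice_ns: u64) -> String {
    let delta = max_vruntime - min_vruntime;
    format!(
        "min_vruntime={} max_vruntime={} delta={}us slice={}us{}",
        min_vruntime,
        max_vruntime,
        delta / NSEC_PER_USEC,
        slice_ns / NSEC_PER_USEC,
        // A delta above one slice marks the congested (flush-all) regime.
        if delta > slice_ns { " [congested]" } else { "" },
    )
}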