Merge pull request #755 from sched-ext/bpfland-prevent-kthread-stall

scx_bpfland: prevent per-CPU DSQ stall with per-CPU kthreads
commit e3e381dc8e
Author: Andrea Righi
Date:   2024-10-09 05:28:59 +00:00 (committed by GitHub)
3 changed files with 71 additions and 66 deletions
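In short: the fix adds a dedicated fast path in the enqueue callback so that per-CPU kthreads can no longer stall behind a per-CPU DSQ that nobody is consuming. A condensed sketch of that path, lifted from the BPF diff below (is_kthread(), local_kthreads and nr_kthread_dispatches are all defined in the scheduler source):

void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
{
        /*
         * Per-CPU kthreads (or all kthreads when the -k / local_kthreads
         * option is set) skip the per-CPU DSQs: dispatch them to the local
         * DSQ of their assigned CPU and let them preempt the current task.
         */
        if (is_kthread(p) && (local_kthreads || p->nr_cpus_allowed == 1)) {
                scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL,
                                 enq_flags | SCX_ENQ_PREEMPT);
                __sync_fetch_and_add(&nr_kthread_dispatches, 1);
                return;
        }

        /* ... regular enqueue path, see the full diff below ... */
}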


@@ -46,15 +46,12 @@ const volatile u64 slice_ns_min = 500ULL * NSEC_PER_USEC;
const volatile s64 slice_ns_lag;
/*
* When enabled always dispatch per-CPU kthreads directly on their CPU DSQ.
* When enabled always dispatch all kthreads directly.
*
* This allows to prioritize critical kernel threads that may potentially slow
* down the entire system if they are blocked for too long (i.e., ksoftirqd/N,
* rcuop/N, etc.).
*
* NOTE: this could cause interactivity problems or unfairness if there are too
* many softirqs being scheduled (e.g., in presence of high RX network RX
* traffic).
* down the entire system if they are blocked for too long, but it may also
* introduce interactivity issues or unfairness in scenarios with high kthread
* activity, such as heavy I/O or network traffic.
*/
const volatile bool local_kthreads;
@@ -121,7 +118,8 @@ static u64 starvation_prio_ts;
/*
* Scheduling statistics.
*/
volatile u64 nr_direct_dispatches, nr_shared_dispatches, nr_prio_dispatches;
volatile u64 nr_kthread_dispatches, nr_direct_dispatches,
nr_prio_dispatches, nr_shared_dispatches;
/*
* Amount of currently running tasks.
@@ -494,21 +492,20 @@ static u64 cpu_to_dsq(s32 cpu)
static int dispatch_direct_cpu(struct task_struct *p, s32 cpu, u64 enq_flags)
{
struct bpf_cpumask *offline;
u64 deadline = task_deadline(p);
u64 dsq_id = cpu_to_dsq(cpu);
s32 dsq_id;
/*
* Make sure we can dispatch the task to the target CPU according to
* its cpumask.
* Make sure the CPU is valid and usable by the task.
*/
if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) {
scx_bpf_error("%d %s can't be dispatched to CPU %d",
p->pid, p->comm, cpu);
if (cpu < 0 || !bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
return -EINVAL;
}
scx_bpf_dispatch_vtime(p, dsq_id, SCX_SLICE_DFL, deadline, enq_flags);
dsq_id = cpu_to_dsq(cpu);
/*
* Dispatch the task to the per-CPU DSQ.
*/
scx_bpf_dispatch_vtime(p, dsq_id, SCX_SLICE_DFL,
task_deadline(p), enq_flags);
/*
* If the CPU has gone offline notify that the task needs to be
* consumed from another CPU.
@@ -582,6 +579,25 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
struct cpu_ctx *cctx;
s32 cpu;
/*
* If the task isn't allowed to use its previously used CPU it means
* that it's changing affinity. In this case just pick a random CPU in
* its new allowed CPU domain.
*/
if (!bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr))
prev_cpu = bpf_cpumask_any_distribute(p->cpus_ptr);
/*
* For tasks that can run only on a single CPU, we can simply verify if
* their only allowed CPU is still idle.
*/
if (p->nr_cpus_allowed == 1 || p->migration_disabled) {
if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
return prev_cpu;
return -ENOENT;
}
tctx = try_lookup_task_ctx(p);
if (!tctx)
return -ENOENT;
@@ -593,31 +609,6 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
if (!primary)
return -ENOENT;
/*
* If the task isn't allowed to use its previously used CPU it means
* that it's rapidly changing affinity. In this case it's pointless to
* find an optimal idle CPU, just return and let the task being
* dispatched to a global DSQ.
*/
if (!bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr))
return -ENOENT;
/*
* For tasks that can run only on a single CPU, we can simply verify if
* their only allowed CPU is still idle.
*/
if (p->nr_cpus_allowed == 1) {
if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
return prev_cpu;
/*
* If local_kthreads is enabled, always dispatch per-CPU
* kthreads directly, even if their allowed CPU is not idle.
*/
if (local_kthreads && is_kthread(p))
return prev_cpu;
return -ENOENT;
}
/*
* Acquire the CPU masks to determine the online and idle CPUs in the
* system.
@@ -844,7 +835,7 @@ s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p, s32 prev_cpu, u64
return prev_cpu;
cpu = pick_idle_cpu(p, prev_cpu, wake_flags);
if (cpu >= 0 && !dispatch_direct_cpu(p, cpu, 0))
if (!dispatch_direct_cpu(p, cpu, 0))
__sync_fetch_and_add(&nr_direct_dispatches, 1);
else
cpu = prev_cpu;
@@ -861,8 +852,24 @@ s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p, s32 prev_cpu, u64
void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
{
struct task_ctx *tctx;
u64 deadline = task_deadline(p);
s32 cpu = scx_bpf_task_cpu(p);
s32 cpu, dsq_id;
/*
* Special case for per-CPU kthreads: we want to run them as soon as
* possible, as they are usually important for system performance and
* responsiveness, so dispatch them immediately on the local DSQ of
* their assigned CPU and allow them to preempt the currently running
* task, if present.
*
* If local_kthreads is enabled, consider all kthreads as critical and
* always dispatch them directly.
*/
if (is_kthread(p) && (local_kthreads || p->nr_cpus_allowed == 1)) {
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL,
enq_flags | SCX_ENQ_PREEMPT);
__sync_fetch_and_add(&nr_kthread_dispatches, 1);
return;
}
/*
* During ttwu, the kernel may decide to skip ->select_task_rq() (e.g.,
@@ -877,8 +884,8 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
*/
tctx = try_lookup_task_ctx(p);
if (tctx && !tctx->select_cpu_done) {
cpu = pick_idle_cpu(p, cpu, 0);
if (cpu >= 0 && !dispatch_direct_cpu(p, cpu, 0)) {
cpu = pick_idle_cpu(p, scx_bpf_task_cpu(p), 0);
if (!dispatch_direct_cpu(p, cpu, 0)) {
__sync_fetch_and_add(&nr_direct_dispatches, 1);
return;
}
@@ -894,14 +901,14 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
* and simply rely on the vruntime logic.
*/
if (is_task_interactive(p)) {
scx_bpf_dispatch_vtime(p, prio_dsq_id, SCX_SLICE_DFL,
deadline, enq_flags);
dsq_id = prio_dsq_id;
__sync_fetch_and_add(&nr_prio_dispatches, 1);
} else {
scx_bpf_dispatch_vtime(p, shared_dsq_id, SCX_SLICE_DFL,
deadline, enq_flags);
dsq_id = shared_dsq_id;
__sync_fetch_and_add(&nr_shared_dispatches, 1);
}
scx_bpf_dispatch_vtime(p, dsq_id, SCX_SLICE_DFL,
task_deadline(p), enq_flags);
/*
* If there are idle CPUs that are usable by the task, wake them up to
@@ -1022,7 +1029,7 @@ void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
u64 now = bpf_ktime_get_ns();
/*
* Try also to steal tasks directly dispatched to CPUs that have gone
* Try to steal tasks directly dispatched to CPUs that have gone
* offline (this allows to prevent indefinite task stalls).
*/
if (consume_offline_cpus(cpu))
@@ -1486,7 +1493,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
}
/*
* Create the global priority DSQ (for interactive tasks).
* Create the global priority and shared DSQs.
*
* Allocate a new DSQ id that does not clash with any valid CPU id.
*/
@@ -1496,12 +1503,6 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
scx_bpf_error("failed to create priority DSQ: %d", err);
return err;
}
/*
* Create the global shared DSQ (for regular tasks).
*
* Allocate a new DSQ id that does not clash with any valid CPU id..
*/
shared_dsq_id = nr_cpu_ids++;
err = scx_bpf_create_dsq(shared_dsq_id, -1);
if (err) {

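Note on the caller side of dispatch_direct_cpu(): the function now validates the target CPU itself, returning -EINVAL when the CPU is negative or not in the task's cpumask, so callers can drop the explicit "cpu >= 0 &&" guard and simply test the return value. A condensed sketch of the resulting pattern, taken from the select_cpu hunk above:

        cpu = pick_idle_cpu(p, prev_cpu, wake_flags);

        /* returns 0 on success, a negative error (e.g., -EINVAL) otherwise */
        if (!dispatch_direct_cpu(p, cpu, 0))
                __sync_fetch_and_add(&nr_direct_dispatches, 1);
        else
                cpu = prev_cpu; /* fall back to the previously used CPU */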

@@ -146,12 +146,11 @@ struct Opts {
#[clap(short = 'L', long, action = clap::ArgAction::SetTrue)]
lowlatency: bool,
/// Enable per-CPU kthreads prioritization.
/// Enable kthreads prioritization.
///
/// Enabling this can enhance the performance of interrupt-driven workloads (e.g., networking
/// throughput) over regular system/user workloads. However, it may also introduce
/// interactivity issues or unfairness under heavy interrupt-driven loads, such as high RX
/// network traffic.
/// Enabling this can improve system performance, but it may also introduce interactivity
/// issues or unfairness in scenarios with high kthread activity, such as heavy I/O or network
/// traffic.
#[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
local_kthreads: bool,
@@ -587,6 +586,7 @@ impl<'a> Scheduler<'a> {
nr_interactive: self.skel.maps.bss_data.nr_interactive,
nr_waiting: self.skel.maps.bss_data.nr_waiting,
nvcsw_avg_thresh: self.skel.maps.bss_data.nvcsw_avg_thresh,
nr_kthread_dispatches: self.skel.maps.bss_data.nr_kthread_dispatches,
nr_direct_dispatches: self.skel.maps.bss_data.nr_direct_dispatches,
nr_prio_dispatches: self.skel.maps.bss_data.nr_prio_dispatches,
nr_shared_dispatches: self.skel.maps.bss_data.nr_shared_dispatches,


@@ -25,6 +25,8 @@ pub struct Metrics {
pub nr_waiting: u64,
#[stat(desc = "Average of voluntary context switches")]
pub nvcsw_avg_thresh: u64,
#[stat(desc = "Number of kthread direct dispatches")]
pub nr_kthread_dispatches: u64,
#[stat(desc = "Number of task direct dispatches")]
pub nr_direct_dispatches: u64,
#[stat(desc = "Number of interactive task dispatches")]
@@ -37,13 +39,14 @@ impl Metrics {
fn format<W: Write>(&self, w: &mut W) -> Result<()> {
writeln!(
w,
"[{}] tasks -> run: {:>2}/{:<2} int: {:<2} wait: {:<4} | nvcsw: {:<4} | dispatch -> dir: {:<5} prio: {:<5} shr: {:<5}",
"[{}] tasks -> run: {:>2}/{:<2} int: {:<2} wait: {:<4} | nvcsw: {:<4} | dispatch -> kth: {:<5} dir: {:<5} pri: {:<5} shr: {:<5}",
crate::SCHEDULER_NAME,
self.nr_running,
self.nr_cpus,
self.nr_interactive,
self.nr_waiting,
self.nvcsw_avg_thresh,
self.nr_kthread_dispatches,
self.nr_direct_dispatches,
self.nr_prio_dispatches,
self.nr_shared_dispatches
@@ -53,6 +56,7 @@ impl Metrics {
fn delta(&self, rhs: &Self) -> Self {
Self {
nr_kthread_dispatches: self.nr_kthread_dispatches - rhs.nr_kthread_dispatches,
nr_direct_dispatches: self.nr_direct_dispatches - rhs.nr_direct_dispatches,
nr_prio_dispatches: self.nr_prio_dispatches - rhs.nr_prio_dispatches,
nr_shared_dispatches: self.nr_shared_dispatches - rhs.nr_shared_dispatches,