Merge pull request #410 from sched-ext/bpfland-smooth-perf

scx_bpfland: enhance performance consistency and predictability
Andrea Righi 2024-07-04 21:37:07 +02:00 committed by GitHub
commit 86d2f50230
3 changed files with 191 additions and 162 deletions

View File

@ -15,7 +15,9 @@
#define CLAMP(val, lo, hi) MIN(MAX(val, lo), hi)
enum consts {
NSEC_PER_SEC = 1000000000ULL,
NSEC_PER_USEC = 1000ULL,
NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC),
NSEC_PER_SEC = (1000ULL * NSEC_PER_MSEC),
};
#ifndef __VMLINUX_H__
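
For reference, here is a minimal standalone sketch (not part of the patch) showing how the new derived constants compose and how they express the defaults used later in this change: a 5 ms default slice, a 500 us minimum slice, and a 5 ms starvation threshold. The main() and asserts are illustrative only.

#include <assert.h>

enum consts {
	NSEC_PER_USEC = 1000ULL,
	NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC),
	NSEC_PER_SEC  = (1000ULL * NSEC_PER_MSEC),
};

int main(void)
{
	/* 5 ms default slice, 500 us minimum slice, 5 ms starvation threshold */
	assert(5ULL * NSEC_PER_MSEC == 5000000ULL);
	assert(500ULL * NSEC_PER_USEC == 500000ULL);
	assert(NSEC_PER_SEC == 1000000000ULL);
	return 0;
}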

View File

@ -34,12 +34,12 @@ const volatile bool debug;
/*
* Default task time slice.
*/
const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u64 slice_ns = 5ULL * NSEC_PER_MSEC;
/*
* Time slice used when the system is overcommitted.
*/
const volatile u64 slice_ns_min = 500000;
const volatile u64 slice_ns_min = 500ULL * NSEC_PER_USEC;
/*
* Maximum time slice lag.
@ -50,22 +50,35 @@ const volatile u64 slice_ns_min = 500000;
*/
const volatile u64 slice_ns_lag;
/*
* Enable built-in idle selection logic.
*/
const volatile bool builtin_idle;
/*
* Threshold of voluntary context switches used to classify a task as
* interactive.
*/
const volatile u64 nvcsw_thresh = 10;
const volatile u64 nvcsw_thresh = 10ULL;
/*
* Time threshold to prevent task starvation.
*
* The scheduler processes tasks from various DSQs in the following order:
*
* per-CPU DSQs => priority DSQ => shared DSQ
*
* Tasks in the shared DSQ may be starved by those in the priority DSQ, which
* in turn may be starved by tasks in any per-CPU DSQ.
*
* To mitigate this, store the timestamp of the last task consumption from
* both the priority DSQ and the shared DSQ. If the starvation_thresh_ns
* threshold is exceeded without consuming a task, the scheduler will be
* forced to consume a task from the corresponding DSQ.
*/
const volatile u64 starvation_thresh_ns = 5ULL * NSEC_PER_MSEC;
static u64 starvation_shared_ts;
static u64 starvation_prio_ts;
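
To make the comment above concrete, here is an illustrative sketch of the threshold test (not part of the patch): dsq_is_starving() is a hypothetical helper name, vtime_before() is the scheduler's existing time comparison, and the real logic lives in consume_starving_tasks() further down in this diff.

/*
 * Hypothetical helper: a DSQ is considered starving when more than
 * starvation_thresh_ns has elapsed since a task was last consumed from it.
 */
static bool dsq_is_starving(u64 last_consume_ts, u64 now)
{
	if (!starvation_thresh_ns)
		return false;
	return vtime_before(last_consume_ts + starvation_thresh_ns, now);
}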
/*
* Scheduling statistics.
*/
volatile u64 nr_direct_dispatches, nr_kthread_dispatches,
nr_shared_dispatches, nr_prio_dispatches;
volatile u64 nr_direct_dispatches, nr_shared_dispatches, nr_prio_dispatches;
/*
* Amount of currently running tasks.
@ -166,6 +179,19 @@ static int calloc_cpumask(struct bpf_cpumask **p_cpumask)
return 0;
}
/*
* Set the state of a CPU in a cpumask.
*/
static bool set_cpu_state(struct bpf_cpumask *cpumask, s32 cpu, bool state)
{
if (!cpumask)
return false;
if (state)
return bpf_cpumask_test_and_set_cpu(cpu, cpumask);
else
return bpf_cpumask_test_and_clear_cpu(cpu, cpumask);
}
/*
* Exponential weighted moving average (EWMA).
*
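
The body of this averaging helper falls outside the displayed hunks. As an illustration of the technique only (the exact weighting bpfland uses is not visible here), a fixed-weight EWMA typically looks like the sketch below, which keeps 3/4 of the old average and blends in 1/4 of the new sample.

/*
 * Illustrative only: 0.75/0.25 fixed-weight EWMA, using the same u64 type
 * as the surrounding BPF code. Not part of the patch.
 */
static u64 calc_ewma_example(u64 old_avg, u64 new_sample)
{
	return (old_avg - (old_avg >> 2)) + (new_sample >> 2);
}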
@ -212,9 +238,7 @@ static inline u64 task_vtime(struct task_struct *p)
*/
static inline u64 task_slice(struct task_struct *p)
{
u64 slice = p->scx.slice;
return MAX(slice, slice_ns_min);
return MAX(p->scx.slice, slice_ns_min);
}
/*
@ -230,22 +254,6 @@ static u64 cpu_to_dsq(s32 cpu)
return (u64)cpu;
}
/*
* Dispatch a per-CPU kthread directly to the local CPU DSQ.
*/
static void dispatch_kthread(struct task_struct *p, u64 enq_flags)
{
u64 slice = task_slice(p);
/*
* Use the local CPU DSQ directly for per-CPU kthreads, to give them
* maximum priority.
*/
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice, enq_flags);
__sync_fetch_and_add(&nr_kthread_dispatches, 1);
}
/*
* Dispatch a task directly to the assigned CPU DSQ (used when an idle CPU is
* found).
@ -253,6 +261,7 @@ static void dispatch_kthread(struct task_struct *p, u64 enq_flags)
static int dispatch_direct_cpu(struct task_struct *p, s32 cpu)
{
u64 slice = task_slice(p);
u64 vtime = task_vtime(p);
u64 dsq_id = cpu_to_dsq(cpu);
/*
@ -262,14 +271,7 @@ static int dispatch_direct_cpu(struct task_struct *p, s32 cpu)
if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
return -EINVAL;
/*
* We don't need to use vtime here, because we basically dispatch one
* task at a time when the corresponding CPU is idle.
*
* We could also use SCX_DSQ_LOCAL, but we want to distinguish regular
* tasks from per-CPU kthreads to give more priority to the latter.
*/
scx_bpf_dispatch(p, dsq_id, slice, 0);
scx_bpf_dispatch_vtime(p, dsq_id, slice, vtime, 0);
__sync_fetch_and_add(&nr_direct_dispatches, 1);
/*
@ -286,54 +288,20 @@ static int dispatch_direct_cpu(struct task_struct *p, s32 cpu)
return 0;
}
/*
* Dispatch a regular task.
*/
static void dispatch_task(struct task_struct *p, u64 enq_flags)
{
u64 vtime = task_vtime(p);
u64 slice = task_slice(p);
struct task_ctx *tctx;
tctx = lookup_task_ctx(p);
if (!tctx)
return;
/*
* Always dispatch interactive tasks to the priority DSQ and regular
* tasks to the shared DSQ.
*/
if (tctx->is_interactive) {
scx_bpf_dispatch_vtime(p, PRIO_DSQ, slice, vtime, enq_flags);
__sync_fetch_and_add(&nr_prio_dispatches, 1);
} else {
scx_bpf_dispatch_vtime(p, SHARED_DSQ, slice, vtime, enq_flags);
__sync_fetch_and_add(&nr_shared_dispatches, 1);
}
}
/*
* Find an idle CPU in the system.
*
* NOTE: the idle CPU selection doesn't need to be formally perfect; it is
* totally fine to accept racy conditions and potentially make mistakes by
* picking CPUs that are not idle or are even offline. The logic has been
* designed to handle these mistakes in favor of a more efficient response
* and reduced scheduling overhead.
*/
static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
const struct cpumask *online_cpumask, *idle_smtmask, *idle_cpumask;
bool prev_in_cand;
s32 cpu;
if (builtin_idle) {
bool is_idle = false;
/*
* Find an idle CPU using the sched_ext built-in idle selection
* logic.
*/
cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
if (is_idle)
return cpu;
return -ENOENT;
}
/*
* Acquire the CPU masks to determine the online and idle CPUs in the
* system.
@ -342,14 +310,28 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
idle_smtmask = scx_bpf_get_idle_smtmask();
idle_cpumask = scx_bpf_get_idle_cpumask();
prev_in_cand = bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr);
/*
* For tasks that can run only on a single CPU, we can simply verify if
* their only allowed CPU is idle.
*/
if (p->nr_cpus_allowed == 1) {
cpu = bpf_cpumask_first(p->cpus_ptr);
if (scx_bpf_test_and_clear_cpu_idle(cpu))
goto out_put_cpumask;
else
goto out_not_found;
}
/*
* Find the best idle CPU, prioritizing full idle cores in SMT systems.
*/
if (smt_enabled) {
/*
* If the task can still run on the previously used CPU and
* it's a full-idle core, keep using it.
*/
if (prev_in_cand &&
if (bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr) &&
bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) &&
scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
cpu = prev_cpu;
@ -366,10 +348,11 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
}
/*
* If a full-idle core can't be found (or if it's not an SMT system)
* If a full-idle core can't be found (or if this is not an SMT system)
* try to re-use the same CPU, even if it's not in a full-idle core.
*/
if (prev_in_cand && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
if (bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr) &&
scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
cpu = prev_cpu;
goto out_put_cpumask;
}
@ -390,16 +373,16 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
struct task_ctx *tctx;
/*
* If we are waking up a task and we couldn't find any idle CPU
* to use, at least set the task as interactive, so that it can
* be dispatched as soon as possible on the first CPU
* available.
* If we are waking up a task and we can't use the current CPU,
* at least set the task as interactive, so that it can be
* dispatched as soon as possible on the first CPU available.
*/
tctx = lookup_task_ctx(p);
if (tctx)
tctx->is_interactive = true;
}
out_not_found:
/*
* If all the previous attempts have failed, dispatch the task to the
* first CPU that will become available.
@ -425,10 +408,18 @@ s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p, s32 prev_cpu, u64
return prev_cpu;
}
/*
* Dispatch all the other tasks that were not dispatched directly in
* select_cpu().
*/
void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
{
u64 vtime = task_vtime(p);
u64 slice = task_slice(p);
struct task_ctx *tctx;
/*
* Always dispatch per-CPU kthreads immediately.
* Always dispatch per-CPU kthreads directly on their target CPU.
*
* This allows us to prioritize critical kernel threads that may
* potentially slow down the entire system if they are blocked for too
@ -440,15 +431,26 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
* this scheduler is desktop usage, this shouldn't be a problem.
*/
if (is_kthread(p) && p->nr_cpus_allowed == 1) {
dispatch_kthread(p, enq_flags);
s32 cpu = scx_bpf_task_cpu(p);
dispatch_direct_cpu(p, cpu);
return;
}
tctx = lookup_task_ctx(p);
if (!tctx)
return;
/*
* Dispatch all the other tasks that were not dispatched directly in
* select_cpu().
* Always dispatch interactive tasks to the priority DSQ and regular
* tasks to the shared DSQ.
*/
dispatch_task(p, enq_flags);
if (tctx->is_interactive) {
scx_bpf_dispatch_vtime(p, PRIO_DSQ, slice, vtime, enq_flags);
__sync_fetch_and_add(&nr_prio_dispatches, 1);
} else {
scx_bpf_dispatch_vtime(p, SHARED_DSQ, slice, vtime, enq_flags);
__sync_fetch_and_add(&nr_shared_dispatches, 1);
}
}
/*
@ -456,18 +458,17 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
*
* These tasks will be consumed on other active CPUs to prevent indefinite
* stalling.
*
* Return true if one task is consumed, false otherwise.
*/
static int dispatch_offline_cpus(s32 cpu)
static bool consume_offline_cpus(s32 cpu)
{
u64 cpu_max = scx_bpf_nr_cpu_ids();
struct bpf_cpumask *offline;
int ret = -ENOENT;
bpf_rcu_read_lock();
offline = offline_cpumask;
if (!offline)
goto out_rcu;
return false;
/*
* Cycle through all the CPUs and evenly consume tasks from the DSQs of
@ -482,22 +483,81 @@ static int dispatch_offline_cpus(s32 cpu)
* This CPU is offline: if a task has been dispatched there,
* consume it immediately on the current CPU.
*/
if (scx_bpf_consume(cpu_to_dsq(cpu))) {
ret = 0;
goto out_rcu;
}
if (scx_bpf_consume(cpu_to_dsq(cpu)))
return true;
}
out_rcu:
bpf_rcu_read_unlock();
return false;
}
/*
* Consume a task from the priority DSQ, transferring it to the local CPU DSQ.
*
* Return true if a task is consumed, false otherwise.
*/
static bool consume_prio_task(u64 now)
{
bool ret;
ret = scx_bpf_consume(PRIO_DSQ);
if (ret)
starvation_prio_ts = now;
return ret;
}
/*
* Consume a task from the shared DSQ, transferring it to the local CPU DSQ.
*
* Return true if a task is consumed, false otherwise.
*/
static bool consume_regular_task(u64 now)
{
bool ret;
ret = scx_bpf_consume(SHARED_DSQ);
if (ret)
starvation_shared_ts = now;
return ret;
}
/*
* Consume tasks that are potentially starving.
*
* In order to limit potential starvation conditions, the scheduler uses a
* time-based threshold to ensure that at least one task from the
* lower-priority DSQs is periodically consumed.
*/
static bool consume_starving_tasks(u64 now)
{
if (!starvation_thresh_ns)
return false;
if (vtime_before(starvation_shared_ts + starvation_thresh_ns, now))
if (consume_regular_task(now))
return true;
if (vtime_before(starvation_prio_ts + starvation_thresh_ns, now))
if (consume_prio_task(now))
return true;
return false;
}
void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
{
u64 now = bpf_ktime_get_ns();
/*
* First consume directly dispatched tasks, so that they can
* immediately use the CPU assigned in select_cpu().
* Make sure we are not starving tasks from the lower-priority DSQs.
*/
if (consume_starving_tasks(now))
return;
/*
* Consume directly dispatched tasks, so that they can immediately use
* the CPU assigned in select_cpu().
*/
if (scx_bpf_consume(cpu_to_dsq(cpu)))
return;
@ -506,28 +566,19 @@ void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
* Also try to steal tasks directly dispatched to CPUs that have gone
* offline (this helps prevent indefinite task stalls).
*/
if (!dispatch_offline_cpus(cpu))
if (consume_offline_cpus(cpu))
return;
/*
* Then always consume interactive tasks before regular tasks.
*
* This is fine and we shouldn't have starvation, because interactive
* tasks are classified by their amount of voluntary context switches,
* so they should naturally release the CPU quickly and give a chance
* to the regular tasks to run.
*
* TODO: Add a tunable setting to limit the number of priority tasks
* dispatched. Once this limit is reached, at least one regular task
* must be dispatched.
*/
if (scx_bpf_consume(PRIO_DSQ))
if (consume_prio_task(now))
return;
/*
* Lastly, consume regular tasks from the shared DSQ.
*/
scx_bpf_consume(SHARED_DSQ);
consume_regular_task(now);
}
void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
@ -536,6 +587,13 @@ void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
if (vtime_before(vtime_now, p->scx.dsq_vtime))
vtime_now = p->scx.dsq_vtime;
/*
* Ensure time slice never exceeds slice_ns when a task is started on a
* CPU.
*/
if (p->scx.slice > slice_ns)
p->scx.slice = slice_ns;
__sync_fetch_and_add(&nr_running, 1);
}
@ -604,37 +662,16 @@ void BPF_STRUCT_OPS(bpfland_enable, struct task_struct *p)
p->scx.dsq_vtime = vtime_now;
}
/*
* Set the offline state of a CPU, updating the global offline cpumask.
*/
static void set_cpu_offline(s32 cpu, bool state)
{
struct bpf_cpumask *offline;
bpf_rcu_read_lock();
offline = offline_cpumask;
if (!offline)
goto out_rcu;
if (state)
bpf_cpumask_set_cpu(cpu, offline);
else
bpf_cpumask_clear_cpu(cpu, offline);
out_rcu:
bpf_rcu_read_unlock();
}
void BPF_STRUCT_OPS(bpfland_cpu_online, s32 cpu)
{
/* Set the CPU state to online */
set_cpu_offline(cpu, false);
set_cpu_state(offline_cpumask, cpu, false);
}
void BPF_STRUCT_OPS(bpfland_cpu_offline, s32 cpu)
{
/* Set the CPU state to offline */
set_cpu_offline(cpu, true);
set_cpu_state(offline_cpumask, cpu, true);
}
s32 BPF_STRUCT_OPS(bpfland_init_task, struct task_struct *p,
@ -649,7 +686,7 @@ s32 BPF_STRUCT_OPS(bpfland_init_task, struct task_struct *p,
s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
{
struct bpf_cpumask *offline;
struct bpf_cpumask *mask;
int err;
s32 cpu;
@ -678,14 +715,12 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
}
/* Initialize the offline CPU mask */
bpf_rcu_read_lock();
err = calloc_cpumask(&offline_cpumask);
offline = offline_cpumask;
if (err || !offline)
err -ENOMEM;
bpf_rcu_read_unlock();
mask = offline_cpumask;
if (!mask)
err = -ENOMEM;
return 0;
return err;
}
void BPF_STRUCT_OPS(bpfland_exit, struct scx_exit_info *ei)

View File

@ -59,7 +59,7 @@ struct Opts {
exit_dump_len: u32,
/// Maximum scheduling slice duration in microseconds.
#[clap(short = 's', long, default_value = "20000")]
#[clap(short = 's', long, default_value = "5000")]
slice_us: u64,
/// Minimum scheduling slice duration in microseconds.
@ -78,9 +78,10 @@ struct Opts {
#[clap(short = 'c', long, default_value = "10")]
nvcsw_thresh: u64,
/// Enable direct dispatch via sched_ext built-in idle selection logic.
#[clap(short = 'i', long, action = clap::ArgAction::SetTrue)]
builtin_idle: bool,
/// Prevent starvation by making sure that at least one lower-priority task is scheduled every
/// starvation_thresh_us (0 = disable starvation prevention).
#[clap(short = 't', long, default_value = "5000")]
starvation_thresh_us: u64,
/// Enable the Prometheus endpoint for metrics on port 9000.
#[clap(short = 'p', long, action = clap::ArgAction::SetTrue)]
@ -101,7 +102,6 @@ struct Opts {
struct Metrics {
nr_running: Gauge,
nr_kthread_dispatches: Gauge,
nr_direct_dispatches: Gauge,
nr_prio_dispatches: Gauge,
nr_shared_dispatches: Gauge,
@ -113,9 +113,6 @@ impl Metrics {
nr_running: gauge!(
"nr_running", "info" => "Number of running tasks"
),
nr_kthread_dispatches: gauge!(
"nr_kthread_dispatches", "info" => "Number of kthread dispatches"
),
nr_direct_dispatches: gauge!(
"nr_direct_dispatches", "info" => "Number of direct dispatches"
),
@ -173,8 +170,8 @@ impl<'a> Scheduler<'a> {
skel.rodata_mut().slice_ns = opts.slice_us * 1000;
skel.rodata_mut().slice_ns_min = opts.slice_us_min * 1000;
skel.rodata_mut().slice_ns_lag = opts.slice_us_lag * 1000;
skel.rodata_mut().starvation_thresh_ns = opts.starvation_thresh_us * 1000;
skel.rodata_mut().nvcsw_thresh = opts.nvcsw_thresh;
skel.rodata_mut().builtin_idle = opts.builtin_idle;
// Attach the scheduler.
let mut skel = scx_ops_load!(skel, bpfland_ops, uei)?;
@ -196,18 +193,14 @@ impl<'a> Scheduler<'a> {
}
fn update_stats(&mut self) {
let nr_running = self.skel.bss().nr_running;
let nr_cpus = libbpf_rs::num_possible_cpus().unwrap();
let nr_kthread_dispatches = self.skel.bss().nr_kthread_dispatches;
let nr_running = self.skel.bss().nr_running;
let nr_direct_dispatches = self.skel.bss().nr_direct_dispatches;
let nr_prio_dispatches = self.skel.bss().nr_prio_dispatches;
let nr_shared_dispatches = self.skel.bss().nr_shared_dispatches;
// Update Prometheus statistics.
self.metrics.nr_running.set(nr_running as f64);
self.metrics
.nr_kthread_dispatches
.set(nr_kthread_dispatches as f64);
self.metrics
.nr_direct_dispatches
.set(nr_direct_dispatches as f64);
@ -219,10 +212,9 @@ impl<'a> Scheduler<'a> {
.set(nr_shared_dispatches as f64);
// Log scheduling statistics.
info!("running={}/{} nr_kthread_dispatches={} nr_direct_dispatches={} nr_prio_dispatches={} nr_shared_dispatches={}",
info!("running={}/{} direct_dispatches={} prio_dispatches={} shared_dispatches={}",
nr_running,
nr_cpus,
nr_kthread_dispatches,
nr_direct_dispatches,
nr_prio_dispatches,
nr_shared_dispatches);