scx_bpfland: drop lowlatency mode and the priority DSQ

Schedule all tasks using a single global DSQ. This gives better
control over potential starvation conditions.

With this change, scx_bpfland adopts logic similar to scx_rusty and
scx_lavd, prioritizing tasks based on the frequency of their wait and
wake-up events, rather than relying exclusively on their average rate
of voluntary context switches.
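
For reference, the new deadline heuristic can be summarized with the
standalone sketch below (plain C, userspace-compilable; the constants
and the weight table mirror the BPF code in this commit, while the
helper names and the sample numbers in main() are illustrative only):

#include <inttypes.h>
#include <stdio.h>

#define MAX_WAKEUP_FREQ  1024ULL
#define MAX_TASK_WEIGHT  10000ULL
#define NVCSW_MAX_THRESH 10ULL

#define MAX(x, y)          ((x) > (y) ? (x) : (y))
#define MIN(x, y)          ((x) < (y) ? (x) : (y))
#define CLAMP(val, lo, hi) MIN(MAX(val, lo), hi)

/* Same table as the kernel's sched_prio_to_weight[] (nice -20..19). */
static const uint64_t sched_prio_to_weight[40] = {
    /* -20 */ 88761, 71755, 56483, 46273, 36291,
    /* -15 */ 29154, 23254, 18705, 14949, 11916,
    /* -10 */  9548,  7620,  6100,  4904,  3906,
    /*  -5 */  3121,  2501,  1991,  1586,  1277,
    /*   0 */  1024,   820,   655,   526,   423,
    /*   5 */   335,   272,   215,   172,   137,
    /*  10 */   110,    87,    70,    56,    45,
    /*  15 */    36,    29,    23,    18,    15,
};

/* Integer log2, standing in for the BPF log2_u64() helper. */
static uint64_t log2_u64(uint64_t v)
{
    uint64_t r = 0;

    while (v >>= 1)
        r++;
    return r;
}

/*
 * weight: static task weight (100 == nice 0)
 * avg_nvcsw: average rate of voluntary context switches
 * avg_runtime: average used time slice (ns)
 * waker_freq / blocked_freq: wake-up and wait frequencies (events/sec)
 */
static uint64_t task_deadline(uint64_t weight, uint64_t avg_nvcsw,
                              uint64_t avg_runtime,
                              uint64_t waker_freq, uint64_t blocked_freq)
{
    uint64_t dyn_weight, avg_run_scaled, avg_run;
    uint64_t freq_factor, lat_prio, lat_weight;

    /* Dynamic weight: static weight scaled by the nvcsw average. */
    dyn_weight = weight * CLAMP(avg_nvcsw, 1, NVCSW_MAX_THRESH);
    dyn_weight = CLAMP(dyn_weight, 1, MAX_TASK_WEIGHT);

    /* Producers (wakers) count more than consumers (blocked tasks). */
    waker_freq = CLAMP(waker_freq, 1, MAX_WAKEUP_FREQ);
    blocked_freq = CLAMP(blocked_freq, 1, MAX_WAKEUP_FREQ);
    freq_factor = blocked_freq * waker_freq * waker_freq;

    /* Latency priority: log of the frequencies vs. the scaled runtime. */
    avg_run_scaled = avg_runtime * 100 / dyn_weight;
    avg_run = log2_u64(avg_run_scaled + 1);
    lat_prio = MIN(log2_u64(freq_factor), 39);
    lat_prio = lat_prio > avg_run ? lat_prio - avg_run : 0;

    /* Translate the priority into a weight and derive the deadline. */
    lat_weight = sched_prio_to_weight[40 - lat_prio - 1];
    return avg_runtime * 100 / lat_weight;
}

int main(void)
{
    /* A frequently waking producer vs. a CPU-bound task, both nice 0. */
    printf("producer deadline: %" PRIu64 " ns\n",
           task_deadline(100, 10, 200000, 500, 500));
    printf("cpu hog deadline:  %" PRIu64 " ns\n",
           task_deadline(100, 1, 10000000, 1, 1));
    return 0;
}

The producer ends up with a deadline hundreds of times shorter than the
CPU hog's, even though both tasks have the same static weight, and all
tasks are ordered by this deadline in the single shared DSQ.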

Tasks are still classified as interactive / non-interactive based on
their rate of voluntary context switches, but this classification now
only affects the cpufreq logic.
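
As a rough sketch of the classification path (plain C; the one-second
window, the clamping and the nvcsw_max_thresh comparison mirror the
scheduler code, while the exact calc_avg() weighting and the numbers in
main() are assumptions):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC     1000000000ULL
#define NVCSW_MAX_THRESH 10ULL   /* default nvcsw_max_thresh */

/* Moving average; the 3/4-1/4 weighting here is an assumption. */
static uint64_t calc_avg(uint64_t old_val, uint64_t new_val)
{
    return (old_val - (old_val >> 2)) + (new_val >> 2);
}

/*
 * Refresh the average rate of voluntary context switches (nvcsw events
 * observed over delta_t_ns) and classify the task. The result only
 * drives the cpufreq side; it no longer selects a separate DSQ.
 */
static bool update_interactive(uint64_t *avg_nvcsw, uint64_t nvcsw,
                               uint64_t delta_t_ns)
{
    uint64_t rate = nvcsw * NSEC_PER_SEC / delta_t_ns;
    uint64_t max = NVCSW_MAX_THRESH * 100;

    *avg_nvcsw = calc_avg(*avg_nvcsw, rate);
    if (*avg_nvcsw > max)
        *avg_nvcsw = max;

    return *avg_nvcsw >= NVCSW_MAX_THRESH;
}

int main(void)
{
    uint64_t avg_hi = 0, avg_lo = 0;
    bool hi = false, lo = false;

    /* Simulate five one-second windows for two tasks. */
    for (int i = 0; i < 5; i++) {
        hi = update_interactive(&avg_hi, 50, NSEC_PER_SEC);
        lo = update_interactive(&avg_lo, 2, NSEC_PER_SEC);
    }
    printf("50 nvcsw/s -> interactive=%d, 2 nvcsw/s -> interactive=%d\n",
           hi, lo);
    return 0;
}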

Signed-off-by: Andrea Righi <arighi@nvidia.com>
Andrea Righi 2024-11-05 10:15:41 +01:00
parent efc41dd936
commit 78101e4688
4 changed files with 252 additions and 259 deletions


@ -13,6 +13,7 @@
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define CLAMP(val, lo, hi) MIN(MAX(val, lo), hi)
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
enum consts {
NSEC_PER_USEC = 1000ULL,


@ -17,14 +17,19 @@ char _license[] SEC("license") = "GPL";
const volatile bool debug;
/*
* Priority DSQ used to dispatch interactive tasks.
* Maximum task weight.
*/
#define PRIO_DSQ 0
#define MAX_TASK_WEIGHT 10000
/*
* Maximum frequency of task wakeup events / sec.
*/
#define MAX_WAKEUP_FREQ 1024
/*
* DSQ used to dispatch regular tasks.
*/
#define SHARED_DSQ 1
#define SHARED_DSQ 0
/*
* Default task time slice.
@ -55,17 +60,6 @@ const volatile s64 slice_lag = 20ULL * NSEC_PER_MSEC;
*/
const volatile bool local_kthreads;
/*
* With lowlatency enabled, instead of classifying tasks as interactive or
* non-interactive, they all get a dynamic priority, which is adjusted in
* function of their average rate of voluntary context switches.
*
* This option guarantess less spikey behavior and it can be particularly
* useful in soft real-time scenarios, such as audio processing, multimedia,
* etc.
*/
const volatile bool lowlatency;
/*
* Maximum threshold of voluntary context switches.
*/
@ -77,31 +71,15 @@ const volatile u64 nvcsw_max_thresh = 10ULL;
*/
volatile s64 cpufreq_perf_lvl;
/*
* Time threshold to prevent task starvation.
*
* Tasks dispatched to the priority DSQ are always consumed before those
* dispatched to the shared DSQ, so tasks in shared DSQ may be starved by those
* in the priority DSQ.
*
* To mitigate this, store the timestamp of the last task consumption from
* the shared DSQ. If the starvation_thresh_ns threshold is exceeded without
* consuming a task, the scheduler will be forced to consume a task from the
* corresponding DSQ.
*/
const volatile u64 starvation_thresh_ns = 1000ULL * NSEC_PER_MSEC;
static u64 starvation_shared_ts;
/*
* Scheduling statistics.
*/
volatile u64 nr_kthread_dispatches, nr_direct_dispatches,
nr_prio_dispatches, nr_shared_dispatches;
volatile u64 nr_kthread_dispatches, nr_direct_dispatches, nr_shared_dispatches;
/*
* Amount of currently running tasks.
*/
volatile u64 nr_running, nr_interactive, nr_shared_waiting, nr_prio_waiting;
volatile u64 nr_running, nr_interactive;
/*
* Amount of online CPUs.
@ -169,26 +147,31 @@ struct task_ctx {
struct bpf_cpumask __kptr *l2_cpumask;
struct bpf_cpumask __kptr *l3_cpumask;
/*
* Total execution time of the task.
*/
u64 sum_exec_runtime;
/*
* Voluntary context switches metrics.
*/
u64 nvcsw;
u64 nvcsw_ts;
u64 avg_nvcsw;
/*
* Task's latency priority.
* Frequency with which a task is blocked (consumer).
*/
u64 lat_weight;
u64 blocked_freq;
u64 last_blocked_at;
/*
* Frequency with which a task wakes other tasks (producer).
*/
u64 waker_freq;
u64 last_woke_at;
/*
* Task's average used time slice.
*/
u64 avg_runtime;
u64 sum_runtime;
u64 last_run_at;
/*
* Task's deadline.
@ -276,75 +259,184 @@ static u64 calc_avg_clamp(u64 old_val, u64 new_val, u64 low, u64 high)
}
/*
* Return the dynamic priority multiplier (only applied in lowlatency mode).
*
* The multiplier is evaluated in function of the task's average rate of
* voluntary context switches per second.
* Evaluate the average frequency of an event over time.
*/
static u64 task_dyn_prio(struct task_struct *p)
static u64 update_freq(u64 freq, u64 delta)
{
struct task_ctx *tctx;
u64 new_freq;
if (!lowlatency)
return 1;
tctx = try_lookup_task_ctx(p);
if (!tctx)
return 1;
return MAX(tctx->lat_weight, 1);
new_freq = NSEC_PER_SEC / delta;
return calc_avg(freq, new_freq);
}
/*
* Return task's dynamic priority.
* Return the total amount of tasks that are currently waiting to be scheduled.
*/
static u64 task_prio(const struct task_struct *p)
static u64 nr_tasks_waiting(void)
{
return p->scx.weight * task_dyn_prio(p);
return scx_bpf_dsq_nr_queued(SHARED_DSQ) + 1;
}
/*
* Return task's dynamic weight.
*/
static u64 task_weight(const struct task_struct *p, const struct task_ctx *tctx)
{
/*
* Scale the static task weight by the average amount of voluntary
* context switches to determine the dynamic weight.
*/
u64 prio = p->scx.weight * CLAMP(tctx->avg_nvcsw, 1, nvcsw_max_thresh);
return CLAMP(prio, 1, MAX_TASK_WEIGHT);
}
/*
* Return a value proportionally scaled to the task's priority.
*/
static u64 scale_up_fair(const struct task_struct *p,
const struct task_ctx *tctx, u64 value)
{
return value * task_weight(p, tctx) / 100;
}
/*
* Return a value inversely proportional to the task's priority.
*/
static u64 scale_inverse_fair(const struct task_struct *p,
const struct task_ctx *tctx, u64 value)
{
return value * 100 / task_weight(p, tctx);
}
/*
* Return the task's allowed lag: used to determine how early its vruntime can
* be.
*/
static u64 task_lag(const struct task_struct *p)
static u64 task_lag(const struct task_struct *p, const struct task_ctx *tctx)
{
return slice_lag * task_prio(p) / 100;
return scale_up_fair(p, tctx, slice_lag);
}
/*
* Return a value inversely proportional to the task's weight.
* ** Taken directly from fair.c in the Linux kernel **
*
* The "10% effect" is relative and cumulative: from _any_ nice level,
* if you go up 1 level, it's -10% CPU usage, if you go down 1 level
* it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
* If a task goes up by ~10% and another task goes down by ~10% then
* the relative distance between them is ~25%.)
*/
static u64 scale_inverse_fair(const struct task_struct *p, u64 value)
const int sched_prio_to_weight[40] = {
/* -20 */ 88761, 71755, 56483, 46273, 36291,
/* -15 */ 29154, 23254, 18705, 14949, 11916,
/* -10 */ 9548, 7620, 6100, 4904, 3906,
/* -5 */ 3121, 2501, 1991, 1586, 1277,
/* 0 */ 1024, 820, 655, 526, 423,
/* 5 */ 335, 272, 215, 172, 137,
/* 10 */ 110, 87, 70, 56, 45,
/* 15 */ 36, 29, 23, 18, 15,
};
static u64 max_sched_prio(void)
{
return value * 100 / task_prio(p);
return ARRAY_SIZE(sched_prio_to_weight);
}
/*
* Compute the deadline component of a task (this value will be added to the
* task's vruntime to determine the actual deadline).
* Convert task priority to weight (following fair.c logic).
*/
static s64 task_compute_dl(const struct task_struct *p,
const struct task_ctx *tctx)
static u64 sched_prio_to_latency_weight(u64 prio)
{
/*
* Return the deadline as a function of the average runtime and the
* evaluated task's dynamic priority.
*/
return scale_inverse_fair(p, tctx->avg_runtime);
u64 max_prio = max_sched_prio();
if (prio >= max_prio) {
scx_bpf_error("invalid priority");
return 0;
}
return sched_prio_to_weight[max_prio - prio - 1];
}
/*
* Return task's evaluated vruntime.
* Evaluate task's deadline.
*
* Reuse a logic similar to scx_rusty or scx_lavd and evaluate the deadline as
* a function of the waiting and wake-up events and the average task's runtime.
*/
static u64 task_deadline(struct task_struct *p, struct task_ctx *tctx)
{
u64 min_vruntime = vtime_now - task_lag(p);
u64 waker_freq, blocked_freq;
u64 lat_prio, lat_weight;
u64 avg_run_scaled, avg_run;
u64 freq_factor;
/*
* Limit the wait and wake-up frequencies to prevent spikes.
*/
waker_freq = CLAMP(tctx->waker_freq, 1, MAX_WAKEUP_FREQ);
blocked_freq = CLAMP(tctx->blocked_freq, 1, MAX_WAKEUP_FREQ);
/*
* We want to prioritize producers (waker tasks) more than consumers
* (blocked tasks), using the following formula:
*
* freq_factor = blocked_freq * waker_freq^2
*
* This seems to improve the overall responsiveness of
* producer/consumer pipelines.
*/
freq_factor = blocked_freq * waker_freq * waker_freq;
/*
* Evaluate the "latency priority" as a function of the wake-up, block
* frequencies and average runtime, using the following formula:
*
* lat_prio = log(freq_factor / avg_run_scaled)
*
* Frequencies can grow very quickly, almost exponentially, so use
* log2_u64() to get a more linear metric that can be used as a
* priority.
*
* The avg_run_scaled component is used to scale the latency priority
* proportionally to the task's weight and inversely proportional to
* its runtime, so that a task with a higher weight / shorter runtime
* gets a higher latency priority than a task with a lower weight /
* higher runtime.
*/
avg_run_scaled = scale_inverse_fair(p, tctx, tctx->avg_runtime);
avg_run = log2_u64(avg_run_scaled + 1);
lat_prio = log2_u64(freq_factor);
lat_prio = MIN(lat_prio, max_sched_prio());
if (lat_prio >= avg_run)
lat_prio -= avg_run;
else
lat_prio = 0;
/*
* Lastly, translate the latency priority into a weight and apply it to
* the task's average runtime to determine the task's deadline.
*/
lat_weight = sched_prio_to_latency_weight(lat_prio);
return tctx->avg_runtime * 100 / lat_weight;
}
/*
* Return task's evaluated deadline applied to its vruntime.
*/
static u64 task_vtime(struct task_struct *p, struct task_ctx *tctx)
{
u64 min_vruntime = vtime_now - task_lag(p, tctx);
/*
* Limit the vruntime to avoid excessively penalizing tasks.
*/
if (vtime_before(p->scx.dsq_vtime, min_vruntime)) {
p->scx.dsq_vtime = min_vruntime;
tctx->deadline = p->scx.dsq_vtime + task_compute_dl(p, tctx);
tctx->deadline = p->scx.dsq_vtime + task_deadline(p, tctx);
}
return tctx->deadline;
@ -356,28 +448,11 @@ static u64 task_deadline(struct task_struct *p, struct task_ctx *tctx)
*/
static inline void task_refill_slice(struct task_struct *p)
{
u64 curr_prio_waiting = scx_bpf_dsq_nr_queued(PRIO_DSQ);
u64 curr_shared_waiting = scx_bpf_dsq_nr_queued(SHARED_DSQ);
u64 scale_factor;
/*
* Refresh the amount of waiting tasks to get a more accurate scaling
* factor for the time slice.
*/
nr_prio_waiting = calc_avg(nr_prio_waiting, curr_prio_waiting);
nr_shared_waiting = calc_avg(nr_shared_waiting, curr_shared_waiting);
/*
* Scale the time slice of an inversely proportional factor of the
* total amount of tasks that are waiting (use a more immediate metric
* in lowlatency mode and an average in normal mode).
* total amount of tasks that are waiting.
*/
if (lowlatency)
scale_factor = curr_shared_waiting + 1;
else
scale_factor = nr_prio_waiting + nr_shared_waiting + 1;
p->scx.slice = CLAMP(slice_max / scale_factor, slice_min, slice_max);
p->scx.slice = CLAMP(slice_max / nr_tasks_waiting(), slice_min, slice_max);
}
static void task_set_domain(struct task_struct *p, s32 cpu,
@ -459,8 +534,7 @@ static void task_set_domain(struct task_struct *p, s32 cpu,
static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle)
{
const struct cpumask *idle_smtmask, *idle_cpumask;
struct bpf_cpumask *primary, *l2_domain, *l3_domain;
struct bpf_cpumask *p_mask, *l2_mask, *l3_mask;
const struct cpumask *primary, *p_mask, *l2_mask, *l3_mask;
struct task_ctx *tctx;
bool is_prev_llc_affine = false;
s32 cpu;
@ -481,7 +555,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
if (!tctx)
return -ENOENT;
primary = primary_cpumask;
primary = cast_mask(primary_cpumask);
if (!primary)
return -EINVAL;
@ -494,21 +568,21 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
/*
* Task's scheduling domains.
*/
p_mask = tctx->cpumask;
p_mask = cast_mask(tctx->cpumask);
if (!p_mask) {
scx_bpf_error("cpumask not initialized");
cpu = -EINVAL;
goto out_put_cpumask;
}
l2_mask = tctx->l2_cpumask;
l2_mask = cast_mask(tctx->l2_cpumask);
if (!l2_mask) {
scx_bpf_error("l2 cpumask not initialized");
cpu = -EINVAL;
goto out_put_cpumask;
}
l3_mask = tctx->l3_cpumask;
l3_mask = cast_mask(tctx->l3_cpumask);
if (!l3_mask) {
scx_bpf_error("l3 cpumask not initialized");
cpu = -EINVAL;
@ -519,7 +593,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
* Check if the previously used CPU is still in the L3 task domain. If
* not, we may want to move the task back to its original L3 domain.
*/
is_prev_llc_affine = bpf_cpumask_test_cpu(prev_cpu, cast_mask(l3_mask));
is_prev_llc_affine = bpf_cpumask_test_cpu(prev_cpu, l3_mask);
/*
* If the current task is waking up another task and releasing the CPU
@ -528,7 +602,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
*/
if (wake_flags & SCX_WAKE_SYNC) {
struct task_struct *current = (void *)bpf_get_current_task_btf();
struct bpf_cpumask *curr_l3_domain;
const struct cpumask *curr_l3_domain;
struct cpu_ctx *cctx;
bool share_llc, has_idle;
@ -542,7 +616,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
goto out_put_cpumask;
}
curr_l3_domain = cctx->l3_cpumask;
curr_l3_domain = cast_mask(cctx->l3_cpumask);
if (!curr_l3_domain)
curr_l3_domain = primary;
@ -550,7 +624,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
* If both the waker and wakee share the same L3 cache keep
* using the same CPU if possible.
*/
share_llc = bpf_cpumask_test_cpu(prev_cpu, cast_mask(curr_l3_domain));
share_llc = bpf_cpumask_test_cpu(prev_cpu, curr_l3_domain);
if (share_llc &&
scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
cpu = prev_cpu;
@ -563,9 +637,9 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
* the wakee on the same CPU as the waker (since it's going to
* block and release the current CPU).
*/
has_idle = bpf_cpumask_intersects(cast_mask(curr_l3_domain), idle_cpumask);
has_idle = bpf_cpumask_intersects(curr_l3_domain, idle_cpumask);
if (has_idle &&
bpf_cpumask_test_cpu(cpu, cast_mask(p_mask)) &&
bpf_cpumask_test_cpu(cpu, p_mask) &&
!(current->flags & PF_EXITING) &&
scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu) == 0) {
*is_idle = true;
@ -593,7 +667,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
* Search for any full-idle CPU in the primary domain that
* shares the same L2 cache.
*/
cpu = scx_bpf_pick_idle_cpu(cast_mask(l2_mask), SCX_PICK_IDLE_CORE);
cpu = scx_bpf_pick_idle_cpu(l2_mask, SCX_PICK_IDLE_CORE);
if (cpu >= 0) {
*is_idle = true;
goto out_put_cpumask;
@ -603,7 +677,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
* Search for any full-idle CPU in the primary domain that
* shares the same L3 cache.
*/
cpu = scx_bpf_pick_idle_cpu(cast_mask(l3_mask), SCX_PICK_IDLE_CORE);
cpu = scx_bpf_pick_idle_cpu(l3_mask, SCX_PICK_IDLE_CORE);
if (cpu >= 0) {
*is_idle = true;
goto out_put_cpumask;
@ -612,7 +686,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
/*
* Search for any other full-idle core in the primary domain.
*/
cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), SCX_PICK_IDLE_CORE);
cpu = scx_bpf_pick_idle_cpu(p_mask, SCX_PICK_IDLE_CORE);
if (cpu >= 0) {
*is_idle = true;
goto out_put_cpumask;
@ -634,7 +708,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
* Search for any idle CPU in the primary domain that shares the same
* L2 cache.
*/
cpu = scx_bpf_pick_idle_cpu(cast_mask(l2_mask), 0);
cpu = scx_bpf_pick_idle_cpu(l2_mask, 0);
if (cpu >= 0) {
*is_idle = true;
goto out_put_cpumask;
@ -644,7 +718,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
* Search for any idle CPU in the primary domain that shares the same
* L3 cache.
*/
cpu = scx_bpf_pick_idle_cpu(cast_mask(l3_mask), 0);
cpu = scx_bpf_pick_idle_cpu(l3_mask, 0);
if (cpu >= 0) {
*is_idle = true;
goto out_put_cpumask;
@ -653,7 +727,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
/*
* Search for any idle CPU in the scheduling domain.
*/
cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0);
cpu = scx_bpf_pick_idle_cpu(p_mask, 0);
if (cpu >= 0) {
*is_idle = true;
goto out_put_cpumask;
@ -666,7 +740,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
if (is_prev_llc_affine)
cpu = prev_cpu;
else
cpu = scx_bpf_pick_any_cpu(cast_mask(l3_mask), 0);
cpu = scx_bpf_pick_any_cpu(l3_mask, 0);
out_put_cpumask:
scx_bpf_put_cpumask(idle_cpumask);
@ -716,14 +790,6 @@ static void kick_task_cpu(struct task_struct *p)
scx_bpf_kick_cpu(cpu, 0);
}
static bool is_task_interactive(const struct task_struct *p,
const struct task_ctx *tctx)
{
if (!tctx->is_interactive)
return false;
return scx_bpf_dsq_nr_queued(PRIO_DSQ) < 100;
}
/*
* Dispatch all the other tasks that were not dispatched directly in
* select_cpu().
@ -731,7 +797,6 @@ static bool is_task_interactive(const struct task_struct *p,
void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
{
struct task_ctx *tctx;
s32 dsq_id;
tctx = try_lookup_task_ctx(p);
if (!tctx)
@ -753,19 +818,10 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
/*
* Dispatch interactive tasks to the priority DSQ and regular tasks to
* the shared DSQ.
*
* When lowlatency is enabled, the separate priority DSQ is disabled,
* so in this case always dispatch to the shared DSQ.
*/
if (!lowlatency && is_task_interactive(p, tctx)) {
dsq_id = PRIO_DSQ;
__sync_fetch_and_add(&nr_prio_dispatches, 1);
} else {
dsq_id = SHARED_DSQ;
__sync_fetch_and_add(&nr_shared_dispatches, 1);
}
scx_bpf_dispatch_vtime(p, dsq_id, SCX_SLICE_DFL,
task_deadline(p, tctx), enq_flags);
scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL,
task_vtime(p, tctx), enq_flags);
__sync_fetch_and_add(&nr_shared_dispatches, 1);
/*
* If there is an idle CPU available for the task, wake it up so it can
@ -774,76 +830,13 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
kick_task_cpu(p);
}
/*
* Consume a task from the priority DSQ, transferring it to the local CPU DSQ.
*
* Return true if a task is consumed, false otherwise.
*/
static bool consume_prio_task(u64 now)
{
return scx_bpf_consume(PRIO_DSQ);
}
/*
* Consume a task from the shared DSQ, transferring it to the local CPU DSQ.
*
* Return true if a task is consumed, false otherwise.
*/
static bool consume_regular_task(u64 now)
{
bool ret;
ret = scx_bpf_consume(SHARED_DSQ);
if (ret)
starvation_shared_ts = now;
return ret;
}
/*
* Consume tasks that are potentially starving.
*
* In order to limit potential starvation conditions the scheduler uses a
* time-based threshold to ensure that at least one task from the
* lower-priority DSQs is periodically consumed.
*/
static bool consume_starving_tasks(u64 now)
{
if (!starvation_thresh_ns)
return false;
if (vtime_before(starvation_shared_ts + starvation_thresh_ns, now))
if (consume_regular_task(now))
return true;
return false;
}
/*
* Consume regular tasks from the per-CPU DSQ or a shared DSQ, transferring
* them to the local CPU DSQ.
*
* Return true if at least a task is consumed, false otherwise.
*/
static bool consume_shared_tasks(s32 cpu, u64 now)
{
/*
* The priority DSQ can starve the shared DSQ, so to mitigate this
* starvation we have the starvation_thresh_ns, see also
* consume_starving_tasks().
*/
if (consume_prio_task(now) || consume_regular_task(now))
return true;
return false;
}
void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
{
u64 now = bpf_ktime_get_ns();
if (consume_starving_tasks(now))
return;
if (consume_shared_tasks(cpu, now))
/*
* Consume regular tasks from the shared DSQ, transferring them to the
* local CPU DSQ.
*/
if (scx_bpf_consume(SHARED_DSQ))
return;
/*
* If the current task expired its time slice and no other task wants
@ -922,6 +915,7 @@ void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
tctx = try_lookup_task_ctx(p);
if (!tctx)
return;
tctx->last_run_at = bpf_ktime_get_ns();
/*
* Adjust target CPU frequency before the task starts to run.
@ -989,18 +983,15 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
/*
* Update task's average runtime.
*/
slice = p->se.sum_exec_runtime - tctx->sum_exec_runtime;
if (lowlatency)
slice = CLAMP(slice, slice_min, slice_max);
tctx->sum_exec_runtime = p->se.sum_exec_runtime;
tctx->avg_runtime = calc_avg(tctx->avg_runtime, slice);
slice = now - tctx->last_run_at;
tctx->sum_runtime += slice;
tctx->avg_runtime = calc_avg(tctx->avg_runtime, tctx->sum_runtime);
/*
* Update task vruntime charging the weighted used time slice.
*/
slice = scale_inverse_fair(p, slice);
p->scx.dsq_vtime += slice;
tctx->deadline = p->scx.dsq_vtime + task_compute_dl(p, tctx);
p->scx.dsq_vtime += scale_inverse_fair(p, tctx, slice);
tctx->deadline = p->scx.dsq_vtime + task_deadline(p, tctx);
/*
* Refresh voluntary context switch metrics.
@ -1011,7 +1002,7 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
delta_t = (s64)(now - tctx->nvcsw_ts);
if (delta_t > NSEC_PER_SEC) {
u64 avg_nvcsw = tctx->nvcsw * NSEC_PER_SEC / delta_t;
u64 max_lat_weight = nvcsw_max_thresh * 100;
u64 max_nvcsw = nvcsw_max_thresh * 100;
tctx->nvcsw = 0;
tctx->nvcsw_ts = now;
@ -1021,8 +1012,7 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
* of voluntary context switches (limited to prevent
* excessive spikes).
*/
tctx->lat_weight = calc_avg_clamp(tctx->lat_weight, avg_nvcsw,
0, max_lat_weight);
tctx->avg_nvcsw = calc_avg_clamp(tctx->avg_nvcsw, avg_nvcsw, 0, max_nvcsw);
/*
* Classify the task based on the average of voluntary context
@ -1032,10 +1022,53 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
* it is classified as interactive, otherwise the task is
* classified as regular.
*/
tctx->is_interactive = tctx->lat_weight >= nvcsw_max_thresh;
tctx->is_interactive = tctx->avg_nvcsw >= nvcsw_max_thresh;
}
}
void BPF_STRUCT_OPS(bpfland_runnable, struct task_struct *p, u64 enq_flags)
{
u64 now = bpf_ktime_get_ns(), delta;
struct task_struct *waker;
struct task_ctx *tctx;
tctx = try_lookup_task_ctx(p);
if (!tctx)
return;
tctx->sum_runtime = 0;
waker = bpf_get_current_task_btf();
tctx = try_lookup_task_ctx(waker);
if (!tctx)
return;
delta = MAX(now - tctx->last_woke_at, 1);
tctx->waker_freq = update_freq(tctx->waker_freq, delta);
tctx->last_woke_at = now;
}
void BPF_STRUCT_OPS(bpfland_quiescent, struct task_struct *p, u64 deq_flags)
{
u64 now = bpf_ktime_get_ns(), delta;
struct task_ctx *tctx;
tctx = try_lookup_task_ctx(p);
if (!tctx)
return;
delta = MAX(now - tctx->last_blocked_at, 1);
tctx->blocked_freq = update_freq(tctx->blocked_freq, delta);
tctx->last_blocked_at = now;
}
void BPF_STRUCT_OPS(bpfland_set_cpumask, struct task_struct *p,
const struct cpumask *cpumask)
{
s32 cpu = bpf_get_smp_processor_id();
task_set_domain(p, cpu, cpumask);
}
void BPF_STRUCT_OPS(bpfland_enable, struct task_struct *p)
{
u64 now = bpf_ktime_get_ns();
@ -1048,17 +1081,11 @@ void BPF_STRUCT_OPS(bpfland_enable, struct task_struct *p)
tctx = try_lookup_task_ctx(p);
if (!tctx)
return;
tctx->sum_exec_runtime = p->se.sum_exec_runtime;
tctx->nvcsw_ts = now;
tctx->deadline = vtime_now;
}
tctx->last_woke_at = now;
tctx->last_blocked_at = now;
void BPF_STRUCT_OPS(bpfland_set_cpumask, struct task_struct *p,
const struct cpumask *cpumask)
{
s32 cpu = bpf_get_smp_processor_id();
task_set_domain(p, cpu, cpumask);
tctx->deadline = p->scx.dsq_vtime + task_deadline(p, tctx);
}
s32 BPF_STRUCT_OPS(bpfland_init_task, struct task_struct *p,
@ -1216,16 +1243,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
nr_online_cpus = get_nr_online_cpus();
/*
* Create the global priority and shared DSQs.
*
* Allocate a new DSQ id that does not clash with any valid CPU id.
* Create the global shared DSQ.
*/
err = scx_bpf_create_dsq(PRIO_DSQ, -1);
if (err) {
scx_bpf_error("failed to create priority DSQ: %d", err);
return err;
}
err = scx_bpf_create_dsq(SHARED_DSQ, -1);
if (err) {
scx_bpf_error("failed to create shared DSQ: %d", err);
@ -1251,8 +1270,10 @@ SCX_OPS_DEFINE(bpfland_ops,
.dispatch = (void *)bpfland_dispatch,
.running = (void *)bpfland_running,
.stopping = (void *)bpfland_stopping,
.enable = (void *)bpfland_enable,
.runnable = (void *)bpfland_runnable,
.quiescent = (void *)bpfland_quiescent,
.set_cpumask = (void *)bpfland_set_cpumask,
.enable = (void *)bpfland_enable,
.init_task = (void *)bpfland_init_task,
.init = (void *)bpfland_init,
.exit = (void *)bpfland_exit,


@ -138,15 +138,6 @@ struct Opts {
#[clap(short = 'l', long, allow_hyphen_values = true, default_value = "20000")]
slice_us_lag: i64,
/// With lowlatency enabled, instead of classifying tasks as interactive or non-interactive,
/// they all get a dynamic priority, which is adjusted in function of their average rate of
/// voluntary context switches.
///
/// This option guarantess less spikey behavior and it can be particularly useful in soft
/// real-time scenarios, such as audio processing, multimedia, etc.
#[clap(short = 'L', long, action = clap::ArgAction::SetTrue)]
lowlatency: bool,
/// Enable kthreads prioritization.
///
/// Enabling this can improve system performance, but it may also introduce interactivity
@ -181,11 +172,6 @@ struct Opts {
#[clap(short = 'c', long, default_value = "10")]
nvcsw_max_thresh: u64,
/// Prevent starvation by making sure that at least one lower priority task is scheduled every
/// starvation_thresh_us (0 = disable starvation prevention).
#[clap(short = 't', long, default_value = "1000")]
starvation_thresh_us: u64,
/// Enable stats monitoring with the specified interval.
#[clap(long)]
stats: Option<f64>,
@ -259,12 +245,10 @@ impl<'a> Scheduler<'a> {
// Override default BPF scheduling parameters.
skel.maps.rodata_data.debug = opts.debug;
skel.maps.rodata_data.smt_enabled = smt_enabled;
skel.maps.rodata_data.lowlatency = opts.lowlatency;
skel.maps.rodata_data.local_kthreads = opts.local_kthreads;
skel.maps.rodata_data.slice_max = opts.slice_us * 1000;
skel.maps.rodata_data.slice_min = opts.slice_us_min * 1000;
skel.maps.rodata_data.slice_lag = opts.slice_us_lag * 1000;
skel.maps.rodata_data.starvation_thresh_ns = opts.starvation_thresh_us * 1000;
skel.maps.rodata_data.nvcsw_max_thresh = opts.nvcsw_max_thresh;
// Load the BPF program for validation.
@ -556,11 +540,8 @@ impl<'a> Scheduler<'a> {
nr_running: self.skel.maps.bss_data.nr_running,
nr_cpus: self.skel.maps.bss_data.nr_online_cpus,
nr_interactive: self.skel.maps.bss_data.nr_interactive,
nr_prio_waiting: self.skel.maps.bss_data.nr_prio_waiting,
nr_shared_waiting: self.skel.maps.bss_data.nr_shared_waiting,
nr_kthread_dispatches: self.skel.maps.bss_data.nr_kthread_dispatches,
nr_direct_dispatches: self.skel.maps.bss_data.nr_direct_dispatches,
nr_prio_dispatches: self.skel.maps.bss_data.nr_prio_dispatches,
nr_shared_dispatches: self.skel.maps.bss_data.nr_shared_dispatches,
}
}


@ -21,16 +21,10 @@ pub struct Metrics {
pub nr_cpus: u64,
#[stat(desc = "Number of running interactive tasks")]
pub nr_interactive: u64,
#[stat(desc = "Average amount of regular tasks waiting to be dispatched")]
pub nr_shared_waiting: u64,
#[stat(desc = "Average amount of interactive tasks waiting to be dispatched")]
pub nr_prio_waiting: u64,
#[stat(desc = "Number of kthread direct dispatches")]
pub nr_kthread_dispatches: u64,
#[stat(desc = "Number of task direct dispatches")]
pub nr_direct_dispatches: u64,
#[stat(desc = "Number of interactive task dispatches")]
pub nr_prio_dispatches: u64,
#[stat(desc = "Number of regular task dispatches")]
pub nr_shared_dispatches: u64,
}
@ -39,16 +33,13 @@ impl Metrics {
fn format<W: Write>(&self, w: &mut W) -> Result<()> {
writeln!(
w,
"[{}] tasks -> r: {:>2}/{:<2} i: {:<2} pw: {:<4} w: {:<4} | dispatch -> k: {:<5} d: {:<5} p: {:<5} s: {:<5}",
"[{}] tasks -> r: {:>2}/{:<2} i: {:<2} | dispatch -> k: {:<5} d: {:<5} s: {:<5}",
crate::SCHEDULER_NAME,
self.nr_running,
self.nr_cpus,
self.nr_interactive,
self.nr_prio_waiting,
self.nr_shared_waiting,
self.nr_kthread_dispatches,
self.nr_direct_dispatches,
self.nr_prio_dispatches,
self.nr_shared_dispatches
)?;
Ok(())
@ -58,7 +49,6 @@ impl Metrics {
Self {
nr_kthread_dispatches: self.nr_kthread_dispatches - rhs.nr_kthread_dispatches,
nr_direct_dispatches: self.nr_direct_dispatches - rhs.nr_direct_dispatches,
nr_prio_dispatches: self.nr_prio_dispatches - rhs.nr_prio_dispatches,
nr_shared_dispatches: self.nr_shared_dispatches - rhs.nr_shared_dispatches,
..self.clone()
}