mirror of https://github.com/sched-ext/scx.git
scx_bpfland: drop lowlatency mode and the priority DSQ
Schedule all tasks using a single global DSQ. This gives better control to prevent potential starvation conditions.

With this change, scx_bpfland adopts a logic similar to scx_rusty and scx_lavd, prioritizing tasks based on the frequency of their wait and wake-up events, rather than relying exclusively on the average amount of voluntary context switches.

Tasks are still classified as interactive / non-interactive based on the amount of voluntary context switches, but this classification now only affects the cpufreq logic.

Signed-off-by: Andrea Righi <arighi@nvidia.com>
This commit is contained in:
parent
efc41dd936
commit
78101e4688
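To make the new heuristic easier to follow before reading the diff, here is a rough standalone sketch of it in plain C. It is not taken from the patch: the 75/25 running average, the example wake/block intervals and the 1 ms average runtime are hypothetical, and the task-weight scaling that the real scale_inverse_fair() applies is left out. It only mirrors the general shape of the update_freq() and task_deadline() helpers introduced below.

/*
 * Rough standalone sketch of the frequency-based deadline idea (not the BPF
 * code): track how often a task wakes others / blocks, and derive a latency
 * priority from those frequencies and the average runtime.
 */
#include <stdio.h>
#include <stdint.h>
#include <math.h>

#define NSEC_PER_SEC	1000000000ULL
#define MAX_WAKEUP_FREQ	1024ULL
#define MAX(x, y)	((x) > (y) ? (x) : (y))
#define MIN(x, y)	((x) < (y) ? (x) : (y))
#define CLAMP(v, lo, hi) MIN(MAX(v, lo), hi)

/* 75/25 running average, similar in spirit to the scheduler's calc_avg(). */
static uint64_t calc_avg(uint64_t old_val, uint64_t new_val)
{
	return (old_val - (old_val >> 2)) + (new_val >> 2);
}

/* Update an event frequency (events/sec) from the delta since the last event. */
static uint64_t update_freq(uint64_t freq, uint64_t delta_ns)
{
	return calc_avg(freq, NSEC_PER_SEC / delta_ns);
}

int main(void)
{
	/* Hypothetical task: wakes a consumer every 2 ms, blocks every 4 ms,
	 * and runs for ~1 ms per activation. */
	uint64_t waker_freq = 0, blocked_freq = 0, avg_runtime = 1000000;

	for (int i = 0; i < 32; i++) {
		waker_freq = update_freq(waker_freq, 2 * 1000000);
		blocked_freq = update_freq(blocked_freq, 4 * 1000000);
	}
	waker_freq = CLAMP(waker_freq, 1, MAX_WAKEUP_FREQ);
	blocked_freq = CLAMP(blocked_freq, 1, MAX_WAKEUP_FREQ);

	/* Bias toward producers: blocked_freq * waker_freq^2, then compress
	 * with log2 and subtract the log-compressed average runtime. */
	uint64_t freq_factor = blocked_freq * waker_freq * waker_freq;
	uint64_t lat_prio = (uint64_t)log2((double)freq_factor);
	uint64_t avg_run = (uint64_t)log2((double)avg_runtime + 1);

	lat_prio = lat_prio >= avg_run ? lat_prio - avg_run : 0;

	printf("waker=%llu/s blocked=%llu/s lat_prio=%llu\n",
	       (unsigned long long)waker_freq,
	       (unsigned long long)blocked_freq,
	       (unsigned long long)lat_prio);
	return 0;
}

Compiled with "cc sketch.c -lm", this example prints a latency priority of around 6: frequent wake-ups push the priority up, a longer average runtime pulls it back down, and the scheduler then turns that priority into a weight (via the fair.c table added by the patch) that shortens or stretches the task's deadline.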
@@ -13,6 +13,7 @@
 #define MAX(x, y) ((x) > (y) ? (x) : (y))
 #define MIN(x, y) ((x) < (y) ? (x) : (y))
 #define CLAMP(val, lo, hi) MIN(MAX(val, lo), hi)
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 
 enum consts {
 	NSEC_PER_USEC = 1000ULL,
@@ -17,14 +17,19 @@ char _license[] SEC("license") = "GPL";
 const volatile bool debug;
 
 /*
- * Priority DSQ used to dispatch interactive tasks.
+ * Maximum task weight.
  */
-#define PRIO_DSQ 0
+#define MAX_TASK_WEIGHT 10000
+
+/*
+ * Maximum frequency of task wakeup events / sec.
+ */
+#define MAX_WAKEUP_FREQ 1024
 
 /*
  * DSQ used to dispatch regular tasks.
  */
-#define SHARED_DSQ 1
+#define SHARED_DSQ 0
 
 /*
  * Default task time slice.
@@ -55,17 +60,6 @@ const volatile s64 slice_lag = 20ULL * NSEC_PER_MSEC;
  */
 const volatile bool local_kthreads;
 
-/*
- * With lowlatency enabled, instead of classifying tasks as interactive or
- * non-interactive, they all get a dynamic priority, which is adjusted in
- * function of their average rate of voluntary context switches.
- *
- * This option guarantess less spikey behavior and it can be particularly
- * useful in soft real-time scenarios, such as audio processing, multimedia,
- * etc.
- */
-const volatile bool lowlatency;
-
 /*
  * Maximum threshold of voluntary context switches.
  */
@@ -77,31 +71,15 @@ const volatile u64 nvcsw_max_thresh = 10ULL;
  */
 volatile s64 cpufreq_perf_lvl;
 
-/*
- * Time threshold to prevent task starvation.
- *
- * Tasks dispatched to the priority DSQ are always consumed before those
- * dispatched to the shared DSQ, so tasks in shared DSQ may be starved by those
- * in the priority DSQ.
- *
- * To mitigate this, store the timestamp of the last task consumption from
- * the shared DSQ. If the starvation_thresh_ns threshold is exceeded without
- * consuming a task, the scheduler will be forced to consume a task from the
- * corresponding DSQ.
- */
-const volatile u64 starvation_thresh_ns = 1000ULL * NSEC_PER_MSEC;
-static u64 starvation_shared_ts;
-
 /*
  * Scheduling statistics.
  */
-volatile u64 nr_kthread_dispatches, nr_direct_dispatches,
-	     nr_prio_dispatches, nr_shared_dispatches;
+volatile u64 nr_kthread_dispatches, nr_direct_dispatches, nr_shared_dispatches;
 
 /*
  * Amount of currently running tasks.
  */
-volatile u64 nr_running, nr_interactive, nr_shared_waiting, nr_prio_waiting;
+volatile u64 nr_running, nr_interactive;
 
 /*
  * Amount of online CPUs.
@@ -169,26 +147,31 @@ struct task_ctx {
 	struct bpf_cpumask __kptr *l2_cpumask;
 	struct bpf_cpumask __kptr *l3_cpumask;
 
-	/*
-	 * Total execution time of the task.
-	 */
-	u64 sum_exec_runtime;
-
 	/*
 	 * Voluntary context switches metrics.
 	 */
 	u64 nvcsw;
 	u64 nvcsw_ts;
+	u64 avg_nvcsw;
 
 	/*
-	 * Task's latency priority.
+	 * Frequency with which a task is blocked (consumer).
 	 */
-	u64 lat_weight;
+	u64 blocked_freq;
+	u64 last_blocked_at;
+
+	/*
+	 * Frequency with which a task wakes other tasks (producer).
+	 */
+	u64 waker_freq;
+	u64 last_woke_at;
 
 	/*
 	 * Task's average used time slice.
 	 */
 	u64 avg_runtime;
+	u64 sum_runtime;
+	u64 last_run_at;
 
 	/*
 	 * Task's deadline.
@@ -276,75 +259,184 @@ static u64 calc_avg_clamp(u64 old_val, u64 new_val, u64 low, u64 high)
 }
 
 /*
- * Return the dynamic priority multiplier (only applied in lowlatency mode).
- *
- * The multiplier is evaluated in function of the task's average rate of
- * voluntary context switches per second.
+ * Evaluate the average frequency of an event over time.
  */
-static u64 task_dyn_prio(struct task_struct *p)
+static u64 update_freq(u64 freq, u64 delta)
 {
-	struct task_ctx *tctx;
+	u64 new_freq;
 
-	if (!lowlatency)
-		return 1;
-	tctx = try_lookup_task_ctx(p);
-	if (!tctx)
-		return 1;
-	return MAX(tctx->lat_weight, 1);
+	new_freq = NSEC_PER_SEC / delta;
+	return calc_avg(freq, new_freq);
 }
 
 /*
- * Return task's dynamic priority.
+ * Return the total amount of tasks that are currently waiting to be scheduled.
  */
-static u64 task_prio(const struct task_struct *p)
+static u64 nr_tasks_waiting(void)
 {
-	return p->scx.weight * task_dyn_prio(p);
+	return scx_bpf_dsq_nr_queued(SHARED_DSQ) + 1;
+}
+
+/*
+ * Return task's dynamic weight.
+ */
+static u64 task_weight(const struct task_struct *p, const struct task_ctx *tctx)
+{
+	/*
+	 * Scale the static task weight by the average amount of voluntary
+	 * context switches to determine the dynamic weight.
+	 */
+	u64 prio = p->scx.weight * CLAMP(tctx->avg_nvcsw, 1, nvcsw_max_thresh);
+
+	return CLAMP(prio, 1, MAX_TASK_WEIGHT);
+}
+
+/*
+ * Return a value proportionally scaled to the task's priority.
+ */
+static u64 scale_up_fair(const struct task_struct *p,
+			 const struct task_ctx *tctx, u64 value)
+{
+	return value * task_weight(p, tctx) / 100;
+}
+
+/*
+ * Return a value inversely proportional to the task's priority.
+ */
+static u64 scale_inverse_fair(const struct task_struct *p,
+			      const struct task_ctx *tctx, u64 value)
+{
+	return value * 100 / task_weight(p, tctx);
 }
 
 /*
  * Return the task's allowed lag: used to determine how early its vruntime can
  * be.
  */
-static u64 task_lag(const struct task_struct *p)
+static u64 task_lag(const struct task_struct *p, const struct task_ctx *tctx)
 {
-	return slice_lag * task_prio(p) / 100;
+	return scale_up_fair(p, tctx, slice_lag);
 }
 
 /*
- * Return a value inversely proportional to the task's weight.
+ * ** Taken directly from fair.c in the Linux kernel **
+ *
+ * The "10% effect" is relative and cumulative: from _any_ nice level,
+ * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
+ * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
+ * If a task goes up by ~10% and another task goes down by ~10% then
+ * the relative distance between them is ~25%.)
  */
-static u64 scale_inverse_fair(const struct task_struct *p, u64 value)
+const int sched_prio_to_weight[40] = {
+	/* -20 */ 88761, 71755, 56483, 46273, 36291,
+	/* -15 */ 29154, 23254, 18705, 14949, 11916,
+	/* -10 */ 9548, 7620, 6100, 4904, 3906,
+	/*  -5 */ 3121, 2501, 1991, 1586, 1277,
+	/*   0 */ 1024, 820, 655, 526, 423,
+	/*   5 */ 335, 272, 215, 172, 137,
+	/*  10 */ 110, 87, 70, 56, 45,
+	/*  15 */ 36, 29, 23, 18, 15,
+};
+
+static u64 max_sched_prio(void)
 {
-	return value * 100 / task_prio(p);
+	return ARRAY_SIZE(sched_prio_to_weight);
 }
 
 /*
- * Compute the deadline component of a task (this value will be added to the
- * task's vruntime to determine the actual deadline).
+ * Convert task priority to weight (following fair.c logic).
  */
-static s64 task_compute_dl(const struct task_struct *p,
-			   const struct task_ctx *tctx)
+static u64 sched_prio_to_latency_weight(u64 prio)
 {
-	/*
-	 * Return the deadline as a function of the average runtime and the
-	 * evaluated task's dynamic priority.
-	 */
-	return scale_inverse_fair(p, tctx->avg_runtime);
+	u64 max_prio = max_sched_prio();
+
+	if (prio >= max_prio) {
+		scx_bpf_error("invalid priority");
+		return 0;
+	}
+
+	return sched_prio_to_weight[max_prio - prio - 1];
 }
 
 /*
- * Return task's evaluated vruntime.
+ * Evaluate task's deadline.
+ *
+ * Reuse a logic similar to scx_rusty or scx_lavd and evaluate the deadline as
+ * a function of the waiting and wake-up events and the average task's runtime.
  */
 static u64 task_deadline(struct task_struct *p, struct task_ctx *tctx)
 {
-	u64 min_vruntime = vtime_now - task_lag(p);
+	u64 waker_freq, blocked_freq;
+	u64 lat_prio, lat_weight;
+	u64 avg_run_scaled, avg_run;
+	u64 freq_factor;
+
+	/*
+	 * Limit the wait and wake-up frequencies to prevent spikes.
+	 */
+	waker_freq = CLAMP(tctx->waker_freq, 1, MAX_WAKEUP_FREQ);
+	blocked_freq = CLAMP(tctx->blocked_freq, 1, MAX_WAKEUP_FREQ);
+
+	/*
+	 * We want to prioritize producers (waker tasks) more than consumers
+	 * (blocked tasks), using the following formula:
+	 *
+	 *   freq_factor = blocked_freq * waker_freq^2
+	 *
+	 * This seems to improve the overall responsiveness of
+	 * producer/consumer pipelines.
+	 */
+	freq_factor = blocked_freq * waker_freq * waker_freq;
+
+	/*
+	 * Evaluate the "latency priority" as a function of the wake-up, block
+	 * frequencies and average runtime, using the following formula:
+	 *
+	 *   lat_prio = log(freq_factor / avg_run_scaled)
+	 *
+	 * Frequencies can grow very quickly, almost exponential, so use
+	 * log2_u64() to get a more linear metric that can be used as a
+	 * priority.
+	 *
+	 * The avg_run_scaled component is used to scale the latency priority
+	 * proportionally to the task's weight and inversely proportional to
+	 * its runtime, so that a task with a higher weight / shorter runtime
+	 * gets a higher latency priority than a task with a lower weight /
+	 * higher runtime.
+	 */
+	avg_run_scaled = scale_inverse_fair(p, tctx, tctx->avg_runtime);
+	avg_run = log2_u64(avg_run_scaled + 1);
+
+	lat_prio = log2_u64(freq_factor);
+	lat_prio = MIN(lat_prio, max_sched_prio());
+
+	if (lat_prio >= avg_run)
+		lat_prio -= avg_run;
+	else
+		lat_prio = 0;
+
+	/*
+	 * Lastly, translate the latency priority into a weight and apply it to
+	 * the task's average runtime to determine the task's deadline.
+	 */
+	lat_weight = sched_prio_to_latency_weight(lat_prio);
+
+	return tctx->avg_runtime * 100 / lat_weight;
+}
+
+/*
+ * Return task's evaluated deadline applied to its vruntime.
+ */
+static u64 task_vtime(struct task_struct *p, struct task_ctx *tctx)
+{
+	u64 min_vruntime = vtime_now - task_lag(p, tctx);
 
 	/*
 	 * Limit the vruntime to to avoid excessively penalizing tasks.
 	 */
 	if (vtime_before(p->scx.dsq_vtime, min_vruntime)) {
 		p->scx.dsq_vtime = min_vruntime;
-		tctx->deadline = p->scx.dsq_vtime + task_compute_dl(p, tctx);
+		tctx->deadline = p->scx.dsq_vtime + task_deadline(p, tctx);
 	}
 
 	return tctx->deadline;
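As a side note, not part of the patch: the last step above (latency priority -> weight -> deadline) can be sanity-checked with a tiny standalone C program that uses the same 40-entry fair.c table and the same avg_runtime * 100 / weight formula, with a hypothetical 1 ms average runtime.

/* Standalone check of the priority -> weight -> deadline mapping (sketch only). */
#include <stdio.h>
#include <stdint.h>

static const int sched_prio_to_weight[40] = {
	88761, 71755, 56483, 46273, 36291, 29154, 23254, 18705, 14949, 11916,
	 9548,  7620,  6100,  4904,  3906,  3121,  2501,  1991,  1586,  1277,
	 1024,   820,   655,   526,   423,   335,   272,   215,   172,   137,
	  110,    87,    70,    56,    45,    36,    29,    23,    18,    15,
};

int main(void)
{
	uint64_t avg_runtime = 1000000;	/* hypothetical: 1 ms average runtime */

	for (uint64_t lat_prio = 0; lat_prio < 40; lat_prio += 10) {
		/* Same indexing idea as sched_prio_to_latency_weight(): a higher
		 * lat_prio selects an entry closer to the head of the table,
		 * i.e. a larger weight and therefore a shorter deadline. */
		uint64_t weight = sched_prio_to_weight[40 - lat_prio - 1];
		uint64_t deadline = avg_runtime * 100 / weight;

		printf("lat_prio=%2llu weight=%5llu deadline=%llu ns\n",
		       (unsigned long long)lat_prio,
		       (unsigned long long)weight,
		       (unsigned long long)deadline);
	}
	return 0;
}

Higher latency priorities index closer to the head of the table, so the weight grows and the computed deadline shrinks; that ordering is what the single shared DSQ now relies on instead of a separate priority DSQ.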
@@ -356,28 +448,11 @@ static u64 task_deadline(struct task_struct *p, struct task_ctx *tctx)
  */
 static inline void task_refill_slice(struct task_struct *p)
 {
-	u64 curr_prio_waiting = scx_bpf_dsq_nr_queued(PRIO_DSQ);
-	u64 curr_shared_waiting = scx_bpf_dsq_nr_queued(SHARED_DSQ);
-	u64 scale_factor;
-
-	/*
-	 * Refresh the amount of waiting tasks to get a more accurate scaling
-	 * factor for the time slice.
-	 */
-	nr_prio_waiting = calc_avg(nr_prio_waiting, curr_prio_waiting);
-	nr_shared_waiting = calc_avg(nr_shared_waiting, curr_shared_waiting);
-
 	/*
 	 * Scale the time slice of an inversely proportional factor of the
-	 * total amount of tasks that are waiting (use a more immediate metric
-	 * in lowlatency mode and an average in normal mode).
+	 * total amount of tasks that are waiting.
 	 */
-	if (lowlatency)
-		scale_factor = curr_shared_waiting + 1;
-	else
-		scale_factor = nr_prio_waiting + nr_shared_waiting + 1;
-
-	p->scx.slice = CLAMP(slice_max / scale_factor, slice_min, slice_max);
+	p->scx.slice = CLAMP(slice_max / nr_tasks_waiting(), slice_min, slice_max);
 }
 
 static void task_set_domain(struct task_struct *p, s32 cpu,
@@ -459,8 +534,7 @@ static void task_set_domain(struct task_struct *p, s32 cpu,
 static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle)
 {
 	const struct cpumask *idle_smtmask, *idle_cpumask;
-	struct bpf_cpumask *primary, *l2_domain, *l3_domain;
-	struct bpf_cpumask *p_mask, *l2_mask, *l3_mask;
+	const struct cpumask *primary, *p_mask, *l2_mask, *l3_mask;
 	struct task_ctx *tctx;
 	bool is_prev_llc_affine = false;
 	s32 cpu;
@@ -481,7 +555,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 	if (!tctx)
 		return -ENOENT;
 
-	primary = primary_cpumask;
+	primary = cast_mask(primary_cpumask);
 	if (!primary)
 		return -EINVAL;
 
@@ -494,21 +568,21 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 	/*
 	 * Task's scheduling domains.
 	 */
-	p_mask = tctx->cpumask;
+	p_mask = cast_mask(tctx->cpumask);
 	if (!p_mask) {
 		scx_bpf_error("cpumask not initialized");
 		cpu = -EINVAL;
 		goto out_put_cpumask;
 	}
 
-	l2_mask = tctx->l2_cpumask;
+	l2_mask = cast_mask(tctx->l2_cpumask);
 	if (!l2_mask) {
 		scx_bpf_error("l2 cpumask not initialized");
 		cpu = -EINVAL;
 		goto out_put_cpumask;
 	}
 
-	l3_mask = tctx->l3_cpumask;
+	l3_mask = cast_mask(tctx->l3_cpumask);
 	if (!l3_mask) {
 		scx_bpf_error("l3 cpumask not initialized");
 		cpu = -EINVAL;
@@ -519,7 +593,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 	 * Check if the previously used CPU is still in the L3 task domain. If
 	 * not, we may want to move the task back to its original L3 domain.
 	 */
-	is_prev_llc_affine = bpf_cpumask_test_cpu(prev_cpu, cast_mask(l3_mask));
+	is_prev_llc_affine = bpf_cpumask_test_cpu(prev_cpu, l3_mask);
 
 	/*
 	 * If the current task is waking up another task and releasing the CPU
@@ -528,7 +602,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 	 */
 	if (wake_flags & SCX_WAKE_SYNC) {
 		struct task_struct *current = (void *)bpf_get_current_task_btf();
-		struct bpf_cpumask *curr_l3_domain;
+		const struct cpumask *curr_l3_domain;
 		struct cpu_ctx *cctx;
 		bool share_llc, has_idle;
 
@@ -542,7 +616,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 			goto out_put_cpumask;
 		}
 
-		curr_l3_domain = cctx->l3_cpumask;
+		curr_l3_domain = cast_mask(cctx->l3_cpumask);
 		if (!curr_l3_domain)
 			curr_l3_domain = primary;
 
@@ -550,7 +624,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 		 * If both the waker and wakee share the same L3 cache keep
 		 * using the same CPU if possible.
 		 */
-		share_llc = bpf_cpumask_test_cpu(prev_cpu, cast_mask(curr_l3_domain));
+		share_llc = bpf_cpumask_test_cpu(prev_cpu, curr_l3_domain);
 		if (share_llc &&
 		    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
 			cpu = prev_cpu;
@@ -563,9 +637,9 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 		 * the wakee on the same CPU as the waker (since it's going to
 		 * block and release the current CPU).
 		 */
-		has_idle = bpf_cpumask_intersects(cast_mask(curr_l3_domain), idle_cpumask);
+		has_idle = bpf_cpumask_intersects(curr_l3_domain, idle_cpumask);
 		if (has_idle &&
-		    bpf_cpumask_test_cpu(cpu, cast_mask(p_mask)) &&
+		    bpf_cpumask_test_cpu(cpu, p_mask) &&
 		    !(current->flags & PF_EXITING) &&
 		    scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu) == 0) {
 			*is_idle = true;
@@ -593,7 +667,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 		 * Search for any full-idle CPU in the primary domain that
 		 * shares the same L2 cache.
 		 */
-		cpu = scx_bpf_pick_idle_cpu(cast_mask(l2_mask), SCX_PICK_IDLE_CORE);
+		cpu = scx_bpf_pick_idle_cpu(l2_mask, SCX_PICK_IDLE_CORE);
 		if (cpu >= 0) {
 			*is_idle = true;
 			goto out_put_cpumask;
@@ -603,7 +677,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 		 * Search for any full-idle CPU in the primary domain that
 		 * shares the same L3 cache.
 		 */
-		cpu = scx_bpf_pick_idle_cpu(cast_mask(l3_mask), SCX_PICK_IDLE_CORE);
+		cpu = scx_bpf_pick_idle_cpu(l3_mask, SCX_PICK_IDLE_CORE);
 		if (cpu >= 0) {
 			*is_idle = true;
 			goto out_put_cpumask;
@@ -612,7 +686,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 		/*
 		 * Search for any other full-idle core in the primary domain.
 		 */
-		cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), SCX_PICK_IDLE_CORE);
+		cpu = scx_bpf_pick_idle_cpu(p_mask, SCX_PICK_IDLE_CORE);
 		if (cpu >= 0) {
 			*is_idle = true;
 			goto out_put_cpumask;
@@ -634,7 +708,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 	 * Search for any idle CPU in the primary domain that shares the same
 	 * L2 cache.
 	 */
-	cpu = scx_bpf_pick_idle_cpu(cast_mask(l2_mask), 0);
+	cpu = scx_bpf_pick_idle_cpu(l2_mask, 0);
 	if (cpu >= 0) {
 		*is_idle = true;
 		goto out_put_cpumask;
@@ -644,7 +718,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 	 * Search for any idle CPU in the primary domain that shares the same
 	 * L3 cache.
 	 */
-	cpu = scx_bpf_pick_idle_cpu(cast_mask(l3_mask), 0);
+	cpu = scx_bpf_pick_idle_cpu(l3_mask, 0);
 	if (cpu >= 0) {
 		*is_idle = true;
 		goto out_put_cpumask;
@@ -653,7 +727,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 	/*
 	 * Search for any idle CPU in the scheduling domain.
 	 */
-	cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0);
+	cpu = scx_bpf_pick_idle_cpu(p_mask, 0);
 	if (cpu >= 0) {
 		*is_idle = true;
 		goto out_put_cpumask;
@@ -666,7 +740,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 	if (is_prev_llc_affine)
 		cpu = prev_cpu;
 	else
-		cpu = scx_bpf_pick_any_cpu(cast_mask(l3_mask), 0);
+		cpu = scx_bpf_pick_any_cpu(l3_mask, 0);
 
 out_put_cpumask:
 	scx_bpf_put_cpumask(idle_cpumask);
@@ -716,14 +790,6 @@ static void kick_task_cpu(struct task_struct *p)
 		scx_bpf_kick_cpu(cpu, 0);
 }
 
-static bool is_task_interactive(const struct task_struct *p,
-				const struct task_ctx *tctx)
-{
-	if (!tctx->is_interactive)
-		return false;
-	return scx_bpf_dsq_nr_queued(PRIO_DSQ) < 100;
-}
-
 /*
  * Dispatch all the other tasks that were not dispatched directly in
  * select_cpu().
@@ -731,7 +797,6 @@ static bool is_task_interactive(const struct task_struct *p,
 void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
 {
 	struct task_ctx *tctx;
-	s32 dsq_id;
 
 	tctx = try_lookup_task_ctx(p);
 	if (!tctx)
@@ -753,19 +818,10 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
-	/*
-	 * Dispatch interactive tasks to the priority DSQ and regular tasks to
-	 * the shared DSQ.
-	 *
-	 * When lowlatency is enabled, the separate priority DSQ is disabled,
-	 * so in this case always dispatch to the shared DSQ.
-	 */
-	if (!lowlatency && is_task_interactive(p, tctx)) {
-		dsq_id = PRIO_DSQ;
-		__sync_fetch_and_add(&nr_prio_dispatches, 1);
-	} else {
-		dsq_id = SHARED_DSQ;
-		__sync_fetch_and_add(&nr_shared_dispatches, 1);
-	}
-	scx_bpf_dispatch_vtime(p, dsq_id, SCX_SLICE_DFL,
-			       task_deadline(p, tctx), enq_flags);
+	scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL,
+			       task_vtime(p, tctx), enq_flags);
+	__sync_fetch_and_add(&nr_shared_dispatches, 1);
 
 	/*
 	 * If there is an idle CPU available for the task, wake it up so it can
@@ -774,76 +830,13 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
 	kick_task_cpu(p);
 }
 
-/*
- * Consume a task from the priority DSQ, transferring it to the local CPU DSQ.
- *
- * Return true if a task is consumed, false otherwise.
- */
-static bool consume_prio_task(u64 now)
-{
-	return scx_bpf_consume(PRIO_DSQ);
-}
-
-/*
- * Consume a task from the shared DSQ, transferring it to the local CPU DSQ.
- *
- * Return true if a task is consumed, false otherwise.
- */
-static bool consume_regular_task(u64 now)
-{
-	bool ret;
-
-	ret = scx_bpf_consume(SHARED_DSQ);
-	if (ret)
-		starvation_shared_ts = now;
-
-	return ret;
-}
-
-/*
- * Consume tasks that are potentially starving.
- *
- * In order to limit potential starvation conditions the scheduler uses a
- * time-based threshold to ensure that at least one task from the
- * lower-priority DSQs is periodically consumed.
- */
-static bool consume_starving_tasks(u64 now)
-{
-	if (!starvation_thresh_ns)
-		return false;
-
-	if (vtime_before(starvation_shared_ts + starvation_thresh_ns, now))
-		if (consume_regular_task(now))
-			return true;
-
-	return false;
-}
-
-/*
- * Consume regular tasks from the per-CPU DSQ or a shared DSQ, transferring
- * them to the local CPU DSQ.
- *
- * Return true if at least a task is consumed, false otherwise.
- */
-static bool consume_shared_tasks(s32 cpu, u64 now)
-{
-	/*
-	 * The priority DSQ can starve the shared DSQ, so to mitigate this
-	 * starvation we have the starvation_thresh_ns, see also
-	 * consume_starving_tasks().
-	 */
-	if (consume_prio_task(now) || consume_regular_task(now))
-		return true;
-	return false;
-}
-
 void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
 {
-	u64 now = bpf_ktime_get_ns();
-
-	if (consume_starving_tasks(now))
-		return;
-	if (consume_shared_tasks(cpu, now))
+	/*
+	 * Consume regular tasks from the shared DSQ, transferring them to the
+	 * local CPU DSQ.
+	 */
+	if (scx_bpf_consume(SHARED_DSQ))
 		return;
 	/*
 	 * If the current task expired its time slice and no other task wants
@@ -922,6 +915,7 @@ void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
 	tctx = try_lookup_task_ctx(p);
 	if (!tctx)
 		return;
+	tctx->last_run_at = bpf_ktime_get_ns();
 
 	/*
 	 * Adjust target CPU frequency before the task starts to run.
@@ -989,18 +983,15 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
 	/*
 	 * Update task's average runtime.
 	 */
-	slice = p->se.sum_exec_runtime - tctx->sum_exec_runtime;
-	if (lowlatency)
-		slice = CLAMP(slice, slice_min, slice_max);
-	tctx->sum_exec_runtime = p->se.sum_exec_runtime;
-	tctx->avg_runtime = calc_avg(tctx->avg_runtime, slice);
+	slice = now - tctx->last_run_at;
+	tctx->sum_runtime += slice;
+	tctx->avg_runtime = calc_avg(tctx->avg_runtime, tctx->sum_runtime);
 
 	/*
 	 * Update task vruntime charging the weighted used time slice.
 	 */
-	slice = scale_inverse_fair(p, slice);
-	p->scx.dsq_vtime += slice;
-	tctx->deadline = p->scx.dsq_vtime + task_compute_dl(p, tctx);
+	p->scx.dsq_vtime += scale_inverse_fair(p, tctx, slice);
+	tctx->deadline = p->scx.dsq_vtime + task_deadline(p, tctx);
 
 	/*
 	 * Refresh voluntary context switch metrics.
@@ -1011,7 +1002,7 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
 	delta_t = (s64)(now - tctx->nvcsw_ts);
 	if (delta_t > NSEC_PER_SEC) {
 		u64 avg_nvcsw = tctx->nvcsw * NSEC_PER_SEC / delta_t;
-		u64 max_lat_weight = nvcsw_max_thresh * 100;
+		u64 max_nvcsw = nvcsw_max_thresh * 100;
 
 		tctx->nvcsw = 0;
 		tctx->nvcsw_ts = now;
@@ -1021,8 +1012,7 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
 		 * of voluntary context switches (limited to to prevent
 		 * excessive spikes).
 		 */
-		tctx->lat_weight = calc_avg_clamp(tctx->lat_weight, avg_nvcsw,
-						  0, max_lat_weight);
+		tctx->avg_nvcsw = calc_avg_clamp(tctx->avg_nvcsw, avg_nvcsw, 0, max_nvcsw);
 
 		/*
 		 * Classify the task based on the average of voluntary context
@@ -1032,10 +1022,53 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
 		 * it is classified as interactive, otherwise the task is
 		 * classified as regular.
 		 */
-		tctx->is_interactive = tctx->lat_weight >= nvcsw_max_thresh;
+		tctx->is_interactive = tctx->avg_nvcsw >= nvcsw_max_thresh;
 	}
 }
 
+void BPF_STRUCT_OPS(bpfland_runnable, struct task_struct *p, u64 enq_flags)
+{
+	u64 now = bpf_ktime_get_ns(), delta;
+	struct task_struct *waker;
+	struct task_ctx *tctx;
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+	tctx->sum_runtime = 0;
+
+	waker = bpf_get_current_task_btf();
+	tctx = try_lookup_task_ctx(waker);
+	if (!tctx)
+		return;
+
+	delta = MAX(now - tctx->last_woke_at, 1);
+	tctx->waker_freq = update_freq(tctx->waker_freq, delta);
+	tctx->last_woke_at = now;
+}
+
+void BPF_STRUCT_OPS(bpfland_quiescent, struct task_struct *p, u64 deq_flags)
+{
+	u64 now = bpf_ktime_get_ns(), delta;
+	struct task_ctx *tctx;
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+
+	delta = MAX(now - tctx->last_blocked_at, 1);
+	tctx->blocked_freq = update_freq(tctx->blocked_freq, delta);
+	tctx->last_blocked_at = now;
+}
+
+void BPF_STRUCT_OPS(bpfland_set_cpumask, struct task_struct *p,
+		    const struct cpumask *cpumask)
+{
+	s32 cpu = bpf_get_smp_processor_id();
+
+	task_set_domain(p, cpu, cpumask);
+}
+
 void BPF_STRUCT_OPS(bpfland_enable, struct task_struct *p)
 {
 	u64 now = bpf_ktime_get_ns();
@@ -1048,17 +1081,11 @@ void BPF_STRUCT_OPS(bpfland_enable, struct task_struct *p)
 	tctx = try_lookup_task_ctx(p);
 	if (!tctx)
 		return;
-	tctx->sum_exec_runtime = p->se.sum_exec_runtime;
 	tctx->nvcsw_ts = now;
-	tctx->deadline = vtime_now;
-}
+	tctx->last_woke_at = now;
+	tctx->last_blocked_at = now;
 
-void BPF_STRUCT_OPS(bpfland_set_cpumask, struct task_struct *p,
-		    const struct cpumask *cpumask)
-{
-	s32 cpu = bpf_get_smp_processor_id();
-
-	task_set_domain(p, cpu, cpumask);
+	tctx->deadline = p->scx.dsq_vtime + task_deadline(p, tctx);
 }
@@ -1216,16 +1243,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
 	nr_online_cpus = get_nr_online_cpus();
 
 	/*
-	 * Create the global priority and shared DSQs.
-	 *
-	 * Allocate a new DSQ id that does not clash with any valid CPU id.
+	 * Create the global shared DSQ.
 	 */
-	err = scx_bpf_create_dsq(PRIO_DSQ, -1);
-	if (err) {
-		scx_bpf_error("failed to create priority DSQ: %d", err);
-		return err;
-	}
-
 	err = scx_bpf_create_dsq(SHARED_DSQ, -1);
 	if (err) {
 		scx_bpf_error("failed to create shared DSQ: %d", err);
@@ -1251,8 +1270,10 @@ SCX_OPS_DEFINE(bpfland_ops,
 	       .dispatch = (void *)bpfland_dispatch,
 	       .running = (void *)bpfland_running,
 	       .stopping = (void *)bpfland_stopping,
-	       .enable = (void *)bpfland_enable,
+	       .runnable = (void *)bpfland_runnable,
+	       .quiescent = (void *)bpfland_quiescent,
 	       .set_cpumask = (void *)bpfland_set_cpumask,
+	       .enable = (void *)bpfland_enable,
 	       .init_task = (void *)bpfland_init_task,
 	       .init = (void *)bpfland_init,
 	       .exit = (void *)bpfland_exit,
@@ -138,15 +138,6 @@ struct Opts {
     #[clap(short = 'l', long, allow_hyphen_values = true, default_value = "20000")]
     slice_us_lag: i64,
 
-    /// With lowlatency enabled, instead of classifying tasks as interactive or non-interactive,
-    /// they all get a dynamic priority, which is adjusted in function of their average rate of
-    /// voluntary context switches.
-    ///
-    /// This option guarantess less spikey behavior and it can be particularly useful in soft
-    /// real-time scenarios, such as audio processing, multimedia, etc.
-    #[clap(short = 'L', long, action = clap::ArgAction::SetTrue)]
-    lowlatency: bool,
-
     /// Enable kthreads prioritization.
     ///
     /// Enabling this can improve system performance, but it may also introduce interactivity
@@ -181,11 +172,6 @@ struct Opts {
     #[clap(short = 'c', long, default_value = "10")]
     nvcsw_max_thresh: u64,
 
-    /// Prevent starvation by making sure that at least one lower priority task is scheduled every
-    /// starvation_thresh_us (0 = disable starvation prevention).
-    #[clap(short = 't', long, default_value = "1000")]
-    starvation_thresh_us: u64,
-
     /// Enable stats monitoring with the specified interval.
     #[clap(long)]
     stats: Option<f64>,
@@ -259,12 +245,10 @@ impl<'a> Scheduler<'a> {
         // Override default BPF scheduling parameters.
         skel.maps.rodata_data.debug = opts.debug;
         skel.maps.rodata_data.smt_enabled = smt_enabled;
-        skel.maps.rodata_data.lowlatency = opts.lowlatency;
         skel.maps.rodata_data.local_kthreads = opts.local_kthreads;
         skel.maps.rodata_data.slice_max = opts.slice_us * 1000;
         skel.maps.rodata_data.slice_min = opts.slice_us_min * 1000;
         skel.maps.rodata_data.slice_lag = opts.slice_us_lag * 1000;
-        skel.maps.rodata_data.starvation_thresh_ns = opts.starvation_thresh_us * 1000;
         skel.maps.rodata_data.nvcsw_max_thresh = opts.nvcsw_max_thresh;
 
         // Load the BPF program for validation.
@@ -556,11 +540,8 @@ impl<'a> Scheduler<'a> {
             nr_running: self.skel.maps.bss_data.nr_running,
             nr_cpus: self.skel.maps.bss_data.nr_online_cpus,
             nr_interactive: self.skel.maps.bss_data.nr_interactive,
-            nr_prio_waiting: self.skel.maps.bss_data.nr_prio_waiting,
-            nr_shared_waiting: self.skel.maps.bss_data.nr_shared_waiting,
             nr_kthread_dispatches: self.skel.maps.bss_data.nr_kthread_dispatches,
             nr_direct_dispatches: self.skel.maps.bss_data.nr_direct_dispatches,
-            nr_prio_dispatches: self.skel.maps.bss_data.nr_prio_dispatches,
             nr_shared_dispatches: self.skel.maps.bss_data.nr_shared_dispatches,
         }
     }
@@ -21,16 +21,10 @@ pub struct Metrics {
     pub nr_cpus: u64,
     #[stat(desc = "Number of running interactive tasks")]
    pub nr_interactive: u64,
-    #[stat(desc = "Average amount of regular tasks waiting to be dispatched")]
-    pub nr_shared_waiting: u64,
-    #[stat(desc = "Average amount of interactive tasks waiting to be dispatched")]
-    pub nr_prio_waiting: u64,
     #[stat(desc = "Number of kthread direct dispatches")]
     pub nr_kthread_dispatches: u64,
     #[stat(desc = "Number of task direct dispatches")]
     pub nr_direct_dispatches: u64,
-    #[stat(desc = "Number of interactive task dispatches")]
-    pub nr_prio_dispatches: u64,
     #[stat(desc = "Number of regular task dispatches")]
     pub nr_shared_dispatches: u64,
 }
@@ -39,16 +33,13 @@ impl Metrics {
     fn format<W: Write>(&self, w: &mut W) -> Result<()> {
         writeln!(
             w,
-            "[{}] tasks -> r: {:>2}/{:<2} i: {:<2} pw: {:<4} w: {:<4} | dispatch -> k: {:<5} d: {:<5} p: {:<5} s: {:<5}",
+            "[{}] tasks -> r: {:>2}/{:<2} i: {:<2} | dispatch -> k: {:<5} d: {:<5} s: {:<5}",
             crate::SCHEDULER_NAME,
             self.nr_running,
             self.nr_cpus,
             self.nr_interactive,
-            self.nr_prio_waiting,
-            self.nr_shared_waiting,
             self.nr_kthread_dispatches,
             self.nr_direct_dispatches,
-            self.nr_prio_dispatches,
             self.nr_shared_dispatches
         )?;
         Ok(())
@@ -58,7 +49,6 @@ impl Metrics {
         Self {
             nr_kthread_dispatches: self.nr_kthread_dispatches - rhs.nr_kthread_dispatches,
             nr_direct_dispatches: self.nr_direct_dispatches - rhs.nr_direct_dispatches,
-            nr_prio_dispatches: self.nr_prio_dispatches - rhs.nr_prio_dispatches,
             nr_shared_dispatches: self.nr_shared_dispatches - rhs.nr_shared_dispatches,
             ..self.clone()
         }