scx_bpfland: rework lowlatency mode to adjust task priorities

Rework lowlatency mode as follows (see the sketch below):
 - introduce a dynamic task priority: the task's weight multiplied by
   its average rate of voluntary context switches
 - use the dynamic priority to determine the task's vruntime (instead
   of the static task weight)
 - evaluate the task's minimum vruntime as a function of its dynamic
   priority (tasks with a higher dynamic priority can have a smaller
   vruntime than tasks with a lower dynamic priority)
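
A minimal, standalone sketch of the new priority math (plain C with
integer parameters instead of the BPF task context; it mirrors
task_dyn_prio(), task_prio(), task_lag() and scale_inverse_fair() from
the diff below, and the numbers in main() are only illustrative):

  #include <stdio.h>
  #include <stdint.h>

  /* Dynamic priority: static weight times the latency weight, i.e. the
   * clamped average rate of voluntary context switches per second
   * (the multiplier is only applied in lowlatency mode). */
  static uint64_t task_prio(uint64_t weight, uint64_t lat_weight, int lowlatency)
  {
          uint64_t dyn = lowlatency ? (lat_weight > 1 ? lat_weight : 1) : 1;

          return weight * dyn;
  }

  /* Allowed vruntime lag: a higher dynamic priority lets the task's
   * vruntime fall further behind the global vtime. */
  static uint64_t task_lag(uint64_t slice_lag, uint64_t prio)
  {
          return slice_lag * prio / 100;
  }

  /* Charge the used time slice inversely to the dynamic priority. */
  static uint64_t scale_inverse_fair(uint64_t value, uint64_t prio)
  {
          return value * 100 / prio;
  }

  int main(void)
  {
          /* weight=100 (default), ~50 voluntary switches/sec, lowlatency on */
          uint64_t prio = task_prio(100, 50, 1);

          printf("prio=%llu lag_ns=%llu charged_ns=%llu\n",
                 (unsigned long long)prio,
                 (unsigned long long)task_lag(5000000ULL, prio),
                 (unsigned long long)scale_inverse_fair(5000000ULL, prio));
          return 0;
  }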

The dynamic priority makes it possible to maintain good system
responsiveness even without classifying tasks as "interactive" or
"regular"; therefore, in lowlatency mode only the shared DSQ is used
(the priority DSQ is disabled). The averaging that feeds this priority
is sketched below.
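
For reference, the latency weight that drives the dynamic priority is
refreshed roughly once per second from the task's voluntary
context-switch rate and clamped to MAX_LATENCY_WEIGHT. A simplified
standalone sketch (the helper names avg_clamp()/update_lat_weight() and
the plain (old + new) / 2 averaging are placeholders for the
scheduler's calc_avg_clamp() logic):

  #include <stdint.h>

  #define NSEC_PER_SEC       1000000000ULL
  #define MAX_LATENCY_WEIGHT 1000ULL  /* cap for the dynamic multiplier */

  /* Placeholder moving average, clamped to [low, high]. */
  static uint64_t avg_clamp(uint64_t old_val, uint64_t new_val,
                            uint64_t low, uint64_t high)
  {
          uint64_t avg = (old_val + new_val) / 2;

          if (avg < low)
                  return low;
          if (avg > high)
                  return high;
          return avg;
  }

  /* Fold the voluntary context switches observed over the last delta_t
   * nanoseconds into the task's latency weight. */
  static uint64_t update_lat_weight(uint64_t lat_weight, uint64_t delta_nvcsw,
                                    uint64_t delta_t)
  {
          uint64_t avg_nvcsw = delta_nvcsw * NSEC_PER_SEC / delta_t;

          return avg_clamp(lat_weight, avg_nvcsw, 0, MAX_LATENCY_WEIGHT);
  }

Without --lowlatency the multiplier stays at 1, so the scheduler falls
back to plain weight-based vruntime ordering plus the interactive /
regular classification.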

Using a separate priority queue to dispatch "interactive" tasks makes
the scheduler less fair, but it keeps latency-sensitive tasks
responsive even when a very large number of tasks is running in the
system (e.g., `stress-ng -c 1024` or similar scenarios), where relying
solely on the dynamic priority may not be sufficient.

On the other hand, disabling the classification of "interactive" tasks
results in a fairer scheduler and more predictable performance, making
it better suited for soft real-time applications (e.g., audio and
multimedia).

Therefore, the --lowlatency option is retained to allow users to choose
between more predictable performance (by disabling the interactive task
classification) and a more responsive system (the default).

Signed-off-by: Andrea Righi <andrea.righi@linux.dev>
Andrea Righi 2024-10-09 15:15:33 +02:00
parent d336892c71
commit 4d68133f3b
3 changed files with 152 additions and 175 deletions


@ -26,15 +26,20 @@ const volatile bool debug;
*/
#define SHARED_DSQ 1
/*
* Maximum multiplier for the dynamic task priority.
*/
#define MAX_LATENCY_WEIGHT 1000
/*
* Default task time slice.
*/
const volatile u64 slice_ns = 5ULL * NSEC_PER_MSEC;
const volatile u64 slice_max = 5ULL * NSEC_PER_MSEC;
/*
* Time slice used when the system is overcommitted.
*/
const volatile u64 slice_ns_min = 500ULL * NSEC_PER_USEC;
const volatile u64 slice_min = 1ULL * NSEC_PER_MSEC;
/*
* Maximum time slice lag.
@ -43,7 +48,7 @@ const volatile u64 slice_ns_min = 500ULL * NSEC_PER_USEC;
* tasks at the cost of making regular and newly created tasks less responsive
* (0 = disabled).
*/
const volatile s64 slice_ns_lag;
const volatile s64 slice_lag = 5ULL * NSEC_PER_MSEC;
/*
* When enabled always dispatch all kthreads directly.
@ -56,23 +61,13 @@ const volatile s64 slice_ns_lag;
const volatile bool local_kthreads;
/*
* Boost interactive tasks, by shortening their deadline as a function of their
* average amount of voluntary context switches.
* With lowlatency enabled, instead of classifying tasks as interactive or
* non-interactive, they all get a dynamic priority, which is adjusted as a
* function of their average rate of voluntary context switches.
*
* Tasks are already classified as interactive if their average amount of
* context switches exceeds nvcsw_avg_thresh, which grants them higher
* priority.
*
* When this option is enabled, tasks will receive a deadline boost in addition
* to their interactive vs. regular classification, with the boost being
* proportional to their average number of context switches.
*
* This ensures that within the main scheduling classes (interactive and
* regular), tasks that more frequently voluntarily yield the CPU receive an
* even higher priority.
*
* This option is particularly useful in soft real-time scenarios, such as
* audio processing, multimedia, etc.
* This option guarantees less spiky behavior and can be particularly
* useful in soft real-time scenarios, such as audio processing, multimedia,
* etc.
*/
const volatile bool lowlatency;
@ -108,7 +103,7 @@ volatile s64 cpufreq_perf_lvl;
* consuming a task, the scheduler will be forced to consume a task from the
* corresponding DSQ.
*/
const volatile u64 starvation_thresh_ns = 5ULL * NSEC_PER_MSEC;
const volatile u64 starvation_thresh_ns = 5000ULL * NSEC_PER_MSEC;
static u64 starvation_shared_ts;
/*
@ -120,7 +115,12 @@ volatile u64 nr_kthread_dispatches, nr_direct_dispatches,
/*
* Amount of currently running tasks.
*/
volatile u64 nr_running, nr_waiting, nr_interactive, nr_online_cpus;
volatile u64 nr_running, nr_interactive, nr_shared_waiting, nr_prio_waiting;
/*
* Amount of online CPUs.
*/
volatile u64 nr_online_cpus;
/*
* Exit information.
@ -193,18 +193,17 @@ struct task_ctx {
*/
u64 nvcsw;
u64 nvcsw_ts;
u64 avg_nvcsw;
/*
* Task's latency priority.
*/
u64 lat_weight;
/*
* Task's average used time slice.
*/
u64 avg_runtime;
/*
* Last task's execution time.
*/
u64 last_running;
/*
* Task's deadline.
*/
@ -233,15 +232,6 @@ struct task_ctx *try_lookup_task_ctx(const struct task_struct *p)
(struct task_struct *)p, 0, 0);
}
/*
* Return true if interactive tasks classification via voluntary context
* switches is enabled, false otherwise.
*/
static bool is_nvcsw_enabled(void)
{
return !!nvcsw_max_thresh;
}
/*
* Compare two vruntime values, returns true if the first value is less than
* the second one.
@ -253,19 +243,6 @@ static inline bool vtime_before(u64 a, u64 b)
return (s64)(a - b) < 0;
}
/*
* Return true if the task is interactive, false otherwise.
*/
static bool is_task_interactive(struct task_struct *p)
{
struct task_ctx *tctx;
tctx = try_lookup_task_ctx(p);
if (!tctx)
return false;
return tctx->is_interactive;
}
/*
* Return true if the target task @p is a kernel thread.
*/
@ -313,11 +290,46 @@ static u64 calc_avg_clamp(u64 old_val, u64 new_val, u64 low, u64 high)
}
/*
* Return a value inversely proportional to a weight.
* Return the dynamic priority multiplier (only applied in lowlatency mode).
*
* The multiplier is evaluated as a function of the task's average rate of
* voluntary context switches per second.
*/
static u64 scale_inverse_fair(u64 value, u64 weight)
static u64 task_dyn_prio(struct task_struct *p)
{
return value * 100 / weight;
struct task_ctx *tctx;
if (!lowlatency)
return 1;
tctx = try_lookup_task_ctx(p);
if (!tctx)
return 1;
return MAX(tctx->lat_weight, 1);
}
/*
* Return task's dynamic priority.
*/
static u64 task_prio(struct task_struct *p)
{
return p->scx.weight * task_dyn_prio(p);
}
/*
* Return the task's allowed lag: used to determine how early its vruntime can
* be.
*/
static u64 task_lag(struct task_struct *p)
{
return slice_lag * task_prio(p) / 100;
}
/*
* Return a value inversely proportional to the task's weight.
*/
static u64 scale_inverse_fair(struct task_struct *p, u64 value)
{
return value * 100 / task_prio(p);
}
/*
@ -326,41 +338,19 @@ static u64 scale_inverse_fair(u64 value, u64 weight)
*/
static s64 task_compute_dl(struct task_struct *p ,struct task_ctx *tctx)
{
/*
* The amount of voluntary context switches contributes to determine
* the task's priority.
*/
u64 task_prio = p->scx.weight + tctx->avg_nvcsw;
/*
* If not in "lowlatency" mode, always apply a pure vruntime based
* scheduling.
*/
if (!lowlatency)
return 0;
/*
* If the task has not ran during the previous slice_ns period, use its
* vruntime as deadline to give it a priority boost. This allows to
* speed up tasks that are mostly sleeping and they suddenly need to
* react fast.
*/
if (vtime_before(tctx->last_running + slice_ns, bpf_ktime_get_ns()))
return 0;
/*
* Return the deadline as a function of the average runtime and the
* evaluated task's dynamic priority.
*/
return scale_inverse_fair(tctx->avg_runtime, task_prio);
return scale_inverse_fair(p, tctx->avg_runtime);
}
/*
* Return task's evaluated deadline.
* Return task's evaluated vruntime.
*/
static inline u64 task_deadline(struct task_struct *p)
{
u64 min_vruntime = vtime_now - slice_ns_lag;
u64 min_vruntime = vtime_now - task_lag(p);
struct task_ctx *tctx;
tctx = try_lookup_task_ctx(p);
@ -368,16 +358,7 @@ static inline u64 task_deadline(struct task_struct *p)
return min_vruntime;
/*
* Limit the vruntime to (vtime_now - slice_ns_lag) to avoid
* excessively penalizing tasks.
*
* A positive slice_ns_lag can enhance vruntime scheduling
* effectiveness, but it may lead to more "spikey" performance as tasks
* could remain in the queue for too long.
*
* Instead, a negative slice_ns_lag can result in more consistent
* performance (less spikey), smoothing the reordering of the vruntime
* scheduling and making the scheduler closer to a FIFO.
* Limit the vruntime to avoid excessively penalizing tasks.
*/
if (vtime_before(p->scx.dsq_vtime, min_vruntime)) {
p->scx.dsq_vtime = min_vruntime;
@ -387,36 +368,36 @@ static inline u64 task_deadline(struct task_struct *p)
return tctx->deadline;
}
/*
* Return the amount of tasks waiting to be dispatched.
*/
static u64 nr_tasks_waiting(void)
{
return scx_bpf_dsq_nr_queued(PRIO_DSQ) +
scx_bpf_dsq_nr_queued(SHARED_DSQ);
}
/*
* Evaluate the task's time slice as a function of the total number of tasks
* that are waiting to be dispatched and the task's weight.
*/
static inline void task_refill_slice(struct task_struct *p)
{
u64 slice;
u64 curr_prio_waiting = scx_bpf_dsq_nr_queued(PRIO_DSQ);
u64 curr_shared_waiting = scx_bpf_dsq_nr_queued(SHARED_DSQ);
u64 scale_factor;
/*
* Refresh the amount of waiting tasks to get a more accurate scaling
* factor for the time slice.
*/
nr_waiting = (nr_waiting + nr_tasks_waiting()) / 2;
nr_prio_waiting = calc_avg(nr_prio_waiting, curr_prio_waiting);
nr_shared_waiting = calc_avg(nr_shared_waiting, curr_shared_waiting);
slice = slice_ns / (nr_waiting + 1);
p->scx.slice = CLAMP(slice, slice_ns_min, slice_ns);
/*
* Scale the time slice by a factor inversely proportional to the
* total number of tasks that are waiting (use a more immediate metric
* in lowlatency mode and an average in normal mode).
*/
if (lowlatency)
scale_factor = curr_shared_waiting + 1;
else
scale_factor = nr_prio_waiting + nr_shared_waiting + 1;
p->scx.slice = CLAMP(slice_max / scale_factor, slice_min, slice_max);
}
/*
* Return true if priority DSQ is congested, false otherwise.
*/
static bool is_prio_congested(void)
{
return scx_bpf_dsq_nr_queued(PRIO_DSQ) > nr_online_cpus * 4;
@ -439,7 +420,7 @@ static void handle_sync_wakeup(struct task_struct *p)
* the tasks that are already classified as interactive.
*/
tctx = try_lookup_task_ctx(p);
if (tctx && is_nvcsw_enabled() && !is_prio_congested())
if (tctx && !is_prio_congested())
tctx->is_interactive = true;
}
@ -738,8 +719,13 @@ static void kick_task_cpu(struct task_struct *p)
*/
void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
{
struct task_ctx *tctx;
s32 dsq_id;
tctx = try_lookup_task_ctx(p);
if (!tctx)
return;
/*
* Per-CPU kthreads are critical for system responsiveness so make sure
* they are dispatched before any other task.
@ -757,12 +743,10 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
* Dispatch interactive tasks to the priority DSQ and regular tasks to
* the shared DSQ.
*
* However, avoid queuing too many tasks to the priority DSQ: if we
* have a storm of interactive tasks (more than 4x the amount of CPUs
* that can consume them) we can just dispatch them to the shared DSQ
* and simply rely on the vruntime logic.
* When lowlatency is enabled, the separate priority DSQ is disabled,
* so in this case always dispatch to the shared DSQ.
*/
if (is_task_interactive(p)) {
if (!lowlatency && tctx->is_interactive) {
dsq_id = PRIO_DSQ;
__sync_fetch_and_add(&nr_prio_dispatches, 1);
} else {
@ -863,7 +847,7 @@ void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
* Scale target CPU frequency based on the performance level selected
* from user-space and the CPU utilization.
*/
static void update_cpuperf_target(struct task_struct *p)
static void update_cpuperf_target(struct task_struct *p, struct task_ctx *tctx)
{
u64 now = bpf_ktime_get_ns();
s32 cpu = scx_bpf_task_cpu(p);
@ -882,7 +866,7 @@ static void update_cpuperf_target(struct task_struct *p)
/*
* Auto mode: always set max performance for interactive tasks.
*/
if (is_task_interactive(p)) {
if (tctx->is_interactive) {
scx_bpf_cpuperf_set(cpu, SCX_CPUPERF_ONE);
return;
}
@ -916,46 +900,28 @@ void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
{
struct task_ctx *tctx;
__sync_fetch_and_add(&nr_running, 1);
/*
* Refresh task's time slice immediately before it starts to run on its
* assigned CPU.
*/
task_refill_slice(p);
tctx = try_lookup_task_ctx(p);
if (!tctx)
return;
/*
* Adjust target CPU frequency before the task starts to run.
*/
update_cpuperf_target(p);
update_cpuperf_target(p, tctx);
tctx = try_lookup_task_ctx(p);
if (tctx) {
/*
* Update CPU interactive state.
*/
if (tctx->is_interactive)
__sync_fetch_and_add(&nr_interactive, 1);
/*
* Update task's running timestamp.
*/
tctx->last_running = bpf_ktime_get_ns();
}
__sync_fetch_and_add(&nr_running, 1);
}
static void update_task_interactive(struct task_ctx *tctx)
{
/*
* Classify the task based on the average of voluntary context
* switches.
*
* If the task has an average greater than the global average
* (nvcsw_avg_thresh) it is classified as interactive, otherwise the
* task is classified as regular.
* Update CPU interactive state.
*/
if (is_nvcsw_enabled())
tctx->is_interactive = tctx->avg_nvcsw >= nvcsw_avg_thresh;
if (tctx->is_interactive)
__sync_fetch_and_add(&nr_interactive, 1);
}
/*
@ -964,7 +930,7 @@ static void update_task_interactive(struct task_ctx *tctx)
*/
void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
{
u64 now = bpf_ktime_get_ns(), task_slice;
u64 now = bpf_ktime_get_ns(), slice;
s32 cpu = scx_bpf_task_cpu(p);
s64 delta_t;
struct cpu_ctx *cctx;
@ -986,22 +952,23 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
/*
* Update task's average runtime.
*/
task_slice = p->se.sum_exec_runtime - tctx->sum_exec_runtime;
slice = p->se.sum_exec_runtime - tctx->sum_exec_runtime;
if (lowlatency)
slice = CLAMP(slice, slice_min, slice_max);
tctx->sum_exec_runtime = p->se.sum_exec_runtime;
tctx->avg_runtime = calc_avg(tctx->avg_runtime, task_slice);
tctx->avg_runtime = calc_avg(tctx->avg_runtime, slice);
/*
* Update task vruntime and deadline, charging the weighted used time
* slice.
* Update task vruntime charging the weighted used time slice.
*/
task_slice = scale_inverse_fair(task_slice, p->scx.weight);
p->scx.dsq_vtime += task_slice;
slice = scale_inverse_fair(p, slice);
p->scx.dsq_vtime += slice;
tctx->deadline = p->scx.dsq_vtime + task_compute_dl(p, tctx);
/*
* Update global vruntime.
*/
vtime_now += task_slice;
vtime_now += slice;
/*
* Refresh voluntary context switch metrics.
@ -1009,23 +976,25 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
* Evaluate the average number of voluntary context switches per second
* using an exponentially weighted moving average, see calc_avg().
*/
if (!lowlatency && !is_nvcsw_enabled())
return;
delta_t = (s64)(now - tctx->nvcsw_ts);
if (delta_t > NSEC_PER_SEC) {
u64 delta_nvcsw = p->nvcsw - tctx->nvcsw;
u64 avg_nvcsw = delta_nvcsw * NSEC_PER_SEC / delta_t;
u64 max_lat_weight = lowlatency ? MAX_LATENCY_WEIGHT :
MIN(nvcsw_max_thresh, MAX_LATENCY_WEIGHT);
/*
* Evaluate the average nvcsw for the task, limited to the
* range [0 .. 1000] to prevent excessive spikes.
*/
tctx->avg_nvcsw = calc_avg_clamp(tctx->avg_nvcsw, avg_nvcsw,
0, MAX(nvcsw_max_thresh, 1000));
tctx->nvcsw = p->nvcsw;
tctx->nvcsw_ts = now;
/*
* Evaluate the latency weight of the task as its average rate
* of voluntary context switches (limited to the max_lat_weight
* to prevent excessive spikes).
*/
tctx->lat_weight = calc_avg_clamp(tctx->lat_weight, avg_nvcsw,
0, max_lat_weight);
/*
* Update the global voluntary context switches average using
* an exponentially weighted moving average (EWMA) with the
* formula:
@ -1039,13 +1008,19 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
* Additionally, restrict the global nvcsw_avg_thresh average
* to the range [1 .. nvcsw_max_thresh] to always allow the
* classification of some tasks as interactive.
*/
*/
nvcsw_avg_thresh = calc_avg_clamp(nvcsw_avg_thresh, avg_nvcsw,
1, nvcsw_max_thresh);
/*
* Reresh task status: interactive or regular.
* Classify the task based on the average of voluntary context
* switches.
*
* If the task's average exceeds the nvcsw_max_thresh threshold it is
* classified as interactive, otherwise the task is classified as
* regular.
*/
update_task_interactive(tctx);
tctx->is_interactive = tctx->lat_weight >= nvcsw_max_thresh;
}
}
@ -1064,12 +1039,9 @@ void BPF_STRUCT_OPS(bpfland_enable, struct task_struct *p)
tctx->sum_exec_runtime = p->se.sum_exec_runtime;
tctx->nvcsw = p->nvcsw;
tctx->nvcsw_ts = now;
tctx->avg_nvcsw = p->nvcsw * NSEC_PER_SEC / tctx->nvcsw_ts;
tctx->avg_runtime = slice_ns;
tctx->lat_weight = p->nvcsw * NSEC_PER_SEC / tctx->nvcsw_ts;
tctx->avg_runtime = slice_max;
tctx->deadline = vtime_now;
tctx->last_running = now;
update_task_interactive(tctx);
}
s32 BPF_STRUCT_OPS(bpfland_init_task, struct task_struct *p,


@ -138,11 +138,12 @@ struct Opts {
#[clap(short = 'l', long, allow_hyphen_values = true, default_value = "0")]
slice_us_lag: i64,
/// Shorten interactive tasks' deadline based on their average amount of voluntary context
/// switches.
/// With lowlatency enabled, instead of classifying tasks as interactive or non-interactive,
/// they all get a dynamic priority, which is adjusted as a function of their average rate of
/// voluntary context switches.
///
/// Enabling this option can be beneficial in soft real-time scenarios, such as audio
/// processing, multimedia, etc.
/// This option guarantees less spiky behavior and can be particularly useful in soft
/// real-time scenarios, such as audio processing, multimedia, etc.
#[clap(short = 'L', long, action = clap::ArgAction::SetTrue)]
lowlatency: bool,
@ -260,9 +261,9 @@ impl<'a> Scheduler<'a> {
skel.maps.rodata_data.smt_enabled = smt_enabled;
skel.maps.rodata_data.lowlatency = opts.lowlatency;
skel.maps.rodata_data.local_kthreads = opts.local_kthreads;
skel.maps.rodata_data.slice_ns = opts.slice_us * 1000;
skel.maps.rodata_data.slice_ns_min = opts.slice_us_min * 1000;
skel.maps.rodata_data.slice_ns_lag = opts.slice_us_lag * 1000;
skel.maps.rodata_data.slice_max = opts.slice_us * 1000;
skel.maps.rodata_data.slice_min = opts.slice_us_min * 1000;
skel.maps.rodata_data.slice_lag = opts.slice_us_lag * 1000;
skel.maps.rodata_data.starvation_thresh_ns = opts.starvation_thresh_us * 1000;
skel.maps.rodata_data.nvcsw_max_thresh = opts.nvcsw_max_thresh;
@ -555,7 +556,8 @@ impl<'a> Scheduler<'a> {
nr_running: self.skel.maps.bss_data.nr_running,
nr_cpus: self.skel.maps.bss_data.nr_online_cpus,
nr_interactive: self.skel.maps.bss_data.nr_interactive,
nr_waiting: self.skel.maps.bss_data.nr_waiting,
nr_prio_waiting: self.skel.maps.bss_data.nr_prio_waiting,
nr_shared_waiting: self.skel.maps.bss_data.nr_shared_waiting,
nvcsw_avg_thresh: self.skel.maps.bss_data.nvcsw_avg_thresh,
nr_kthread_dispatches: self.skel.maps.bss_data.nr_kthread_dispatches,
nr_direct_dispatches: self.skel.maps.bss_data.nr_direct_dispatches,


@ -21,8 +21,10 @@ pub struct Metrics {
pub nr_cpus: u64,
#[stat(desc = "Number of running interactive tasks")]
pub nr_interactive: u64,
#[stat(desc = "Average amount of tasks waiting to be dispatched")]
pub nr_waiting: u64,
#[stat(desc = "Average amount of regular tasks waiting to be dispatched")]
pub nr_shared_waiting: u64,
#[stat(desc = "Average amount of interactive tasks waiting to be dispatched")]
pub nr_prio_waiting: u64,
#[stat(desc = "Average of voluntary context switches")]
pub nvcsw_avg_thresh: u64,
#[stat(desc = "Number of kthread direct dispatches")]
@ -39,12 +41,13 @@ impl Metrics {
fn format<W: Write>(&self, w: &mut W) -> Result<()> {
writeln!(
w,
"[{}] tasks -> run: {:>2}/{:<2} int: {:<2} wait: {:<4} | nvcsw: {:<4} | dispatch -> kth: {:<5} dir: {:<5} pri: {:<5} shr: {:<5}",
"[{}] tasks -> r: {:>2}/{:<2} i: {:<2} pw: {:<4} w: {:<4} | nvcsw: {:<4} | dispatch -> k: {:<5} d: {:<5} p: {:<5} s: {:<5}",
crate::SCHEDULER_NAME,
self.nr_running,
self.nr_cpus,
self.nr_interactive,
self.nr_waiting,
self.nr_prio_waiting,
self.nr_shared_waiting,
self.nvcsw_avg_thresh,
self.nr_kthread_dispatches,
self.nr_direct_dispatches,