mirror of
https://github.com/sched-ext/scx.git
synced 2024-11-24 20:00:22 +00:00
scx_lavd: improve latency criticality to latency priority mapping
The old approach maps [0, maximum latency criticality] to [-boost range, boost range). This approach is easily skewed by a single outlier maximum value and suffers from integer truncation error. The new approach divides the range in two -- [minimum latency criticality, average latency criticality) and [average latency criticality, maximum latency criticality] -- and maps them to [boost range/2, 0) and [0, -boost range/2), respectively.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
This commit is contained in:
parent 2b5d3c1300
commit 83169481a6
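Before the diff, a minimal userspace sketch of the idea -- not the scheduler's actual code -- contrasting the old one-segment mapping with the new two-segment mapping. The boost range of 40 and the sample criticality values are made-up numbers for illustration only.

#include <stdio.h>

#define BOOST_RANGE 40  /* assumed value, for illustration only */

/* Old: map [0, max_lc] linearly; a single outlier max_lc compresses
 * everyone else's boost into a narrow band. */
static int old_boost(long lc, long max_lc)
{
        return (BOOST_RANGE - (lc * BOOST_RANGE) / max_lc) - (BOOST_RANGE >> 1);
}

/* New: map [min_lc, avg_lc) onto [BOOST_RANGE/2, 0) and [avg_lc, max_lc]
 * onto [0, -BOOST_RANGE/2), so the average task lands at boost 0
 * regardless of outliers. */
static int new_boost(long lc, long min_lc, long avg_lc, long max_lc)
{
        long inc1k, base_lc, base_prio;

        if (lc < avg_lc) {
                inc1k = ((BOOST_RANGE >> 1) * 1000) / (avg_lc - min_lc);
                base_lc = min_lc;
                base_prio = BOOST_RANGE >> 1;
        } else {
                inc1k = ((BOOST_RANGE >> 1) * 1000) / (max_lc + 1 - avg_lc);
                base_lc = avg_lc;
                base_prio = 0;
        }
        return base_prio - ((lc - base_lc) * inc1k + 500) / 1000;
}

int main(void)
{
        /* Most tasks sit around lc = 10..12; one outlier spikes max_lc to 100. */
        long lcs[] = {10, 11, 12, 100};

        for (int i = 0; i < 4; i++)
                printf("lc=%3ld old=%3d new=%3d\n", lcs[i],
                       old_boost(lcs[i], 100),
                       new_boost(lcs[i], 10, 11, 100));
        return 0;
}

Compiled as plain C, this prints old boosts of 16, 16, 16, -20 against new boosts of 20, 0, 0, -20: the outlier no longer squashes the ordinary tasks into one bucket, and the average task lands exactly at zero.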
@@ -85,6 +85,13 @@ struct sys_cpu_util {
        volatile u64    load_ideal;     /* average ideal load of runnable tasks */
        volatile u64    load_actual;    /* average actual load of runnable tasks */

        volatile u64    avg_lat_cri;    /* average latency criticality (LC) */
        volatile u64    max_lat_cri;    /* maximum latency criticality (LC) */
        volatile u64    min_lat_cri;    /* minimum latency criticality (LC) */

        volatile s64    inc1k_low;      /* increment from low LC to priority mapping */
        volatile s64    inc1k_high;     /* increment from high LC to priority mapping */
};

/*
@@ -102,6 +109,14 @@ struct cpu_ctx {
 */
        volatile u64    load_actual;    /* actual load of runnable tasks */
        volatile u64    load_ideal;     /* ideal load of runnable tasks */

        /*
         * Information used to keep track of latency criticality
         */
        volatile u64    max_lat_cri;    /* maximum latency criticality */
        volatile u64    min_lat_cri;    /* minimum latency criticality */
        volatile u64    sum_lat_cri;    /* sum of latency criticality */
        volatile u64    sched_nr;       /* number of schedules */
};

/*
@@ -135,6 +150,7 @@ struct task_ctx {
        u64     eligible_delta_ns;
        u64     slice_ns;
        u64     greedy_ratio;
        u64     lat_cri;
        u16     stat;              /* NIL -> ENQ -> RUN -> STOP -> NIL ... */
        u16     slice_boost_prio;  /* how many times a task fully consumed the slice */
        u16     lat_prio;          /* latency priority */
@@ -147,6 +163,9 @@ struct task_ctx_x {
        u16     static_prio;    /* nice priority */
        u16     cpu_id;         /* where a task ran */
        u64     cpu_util;       /* cpu utilization in [0..100] */
        u64     max_lat_cri;    /* maximum latency criticality */
        u64     min_lat_cri;    /* minimum latency criticality */
        u64     avg_lat_cri;    /* average latency criticality */
};

@@ -123,7 +123,6 @@ volatile u64 nr_cpus_onln;
static struct sys_cpu_util      __sys_cpu_util[2];
static volatile int             __sys_cpu_util_idx;

volatile u64                    max_lat_cri;
struct user_exit_info           uei;

const volatile u8               verbose;
@@ -520,6 +519,10 @@ static int submit_task_ctx(struct task_struct *p, struct task_ctx *taskc,
        m->taskc_x.static_prio = get_nice_prio(p);
        m->taskc_x.cpu_util = cutil_cur->util / 10;
        m->taskc_x.cpu_id = cpu_id;
        m->taskc_x.max_lat_cri = cutil_cur->max_lat_cri;
        m->taskc_x.min_lat_cri = cutil_cur->min_lat_cri;
        m->taskc_x.avg_lat_cri = cutil_cur->avg_lat_cri;

        memcpy(&m->taskc, taskc, sizeof(m->taskc));

        bpf_ringbuf_submit(m, 0);
@@ -654,6 +657,8 @@ static void update_sys_cpu_load(void)
        u64 now, duration, duration_total;
        u64 idle_total = 0, compute_total = 0;
        u64 load_actual = 0, load_ideal = 0;
        s64 max_lat_cri = 0, min_lat_cri = UINT_MAX, avg_lat_cri = 0;
        u64 sum_lat_cri = 0, sched_nr = 0;
        u64 new_util;
        int cpu;

@@ -673,6 +678,27 @@ static void update_sys_cpu_load(void)
                load_actual += cpuc->load_actual;
                load_ideal += cpuc->load_ideal;

                /*
                 * Accumulate the task's latency criticality information.
                 *
                 * Updating cpuc->* is racy. However, it could degrade the
                 * accuracy a little bit only in very rare cases, so we
                 * embrace it for performance on purpose.
                 */
                sum_lat_cri += cpuc->sum_lat_cri;
                cpuc->sum_lat_cri = 0;

                sched_nr += cpuc->sched_nr;
                cpuc->sched_nr = 0;

                if (cpuc->max_lat_cri > max_lat_cri)
                        max_lat_cri = cpuc->max_lat_cri;
                cpuc->max_lat_cri = 0;

                if (cpuc->min_lat_cri < min_lat_cri)
                        min_lat_cri = cpuc->min_lat_cri;
                cpuc->min_lat_cri = UINT_MAX;

                /*
                 * If the CPU is in an idle state (i.e., idle_start_clk is
                 * non-zero), accumulate the current idle period so far.
@@ -698,44 +724,53 @@ static void update_sys_cpu_load(void)
        if (duration_total > idle_total)
                compute_total = duration_total - idle_total;
        new_util = (compute_total * LAVD_CPU_UTIL_MAX) / duration_total;
        if (sched_nr > 0)
                avg_lat_cri = sum_lat_cri / sched_nr;

        /*
         * Update the CPU utilization to the next version, and then make the
         * next version atomically visible.
         * Update the CPU utilization to the next version.
         */
        cutil_next->load_actual = calc_avg(cutil_cur->load_actual, load_actual);
        cutil_next->load_ideal = calc_avg(cutil_cur->load_ideal, load_ideal);
        cutil_next->util = calc_avg(cutil_cur->util, new_util);

        /*
         * Calculate the increment for latency criticality to priority mapping
         *  - Case 1. inc1k_low:  [min_lc, avg_lc) -> [half_range, 0)
         *  - Case 2. inc1k_high: [avg_lc, max_lc] -> [0, -half_range)
         */
        cutil_next->min_lat_cri = calc_avg(cutil_cur->min_lat_cri, min_lat_cri);
        cutil_next->max_lat_cri = calc_avg(cutil_cur->max_lat_cri, max_lat_cri);
        cutil_next->avg_lat_cri = calc_avg(cutil_cur->avg_lat_cri, avg_lat_cri);

        if (cutil_next->avg_lat_cri == cutil_next->min_lat_cri)
                cutil_next->inc1k_low = 0;
        else {
                cutil_next->inc1k_low = ((LAVD_BOOST_RANGE >> 1) * 1000) /
                                        (cutil_next->avg_lat_cri -
                                         cutil_next->min_lat_cri);
        }

        if ((cutil_next->max_lat_cri + 1) == cutil_next->avg_lat_cri)
                cutil_next->inc1k_high = 0;
        else {
                cutil_next->inc1k_high = ((LAVD_BOOST_RANGE >> 1) * 1000) /
                                         (cutil_next->max_lat_cri + 1 -
                                          cutil_next->avg_lat_cri);
        }

        /*
         * Make the next version atomically visible.
         */
        cutil_next->last_update_clk = now;
        flip_sys_cpu_util();
}
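calc_avg() itself is outside this hunk; a plausible standalone sketch, assuming it is a simple exponentially weighted moving average that keeps 3/4 of the old value and blends in 1/4 of the new sample:

#include <stdio.h>

typedef unsigned long long u64;

/* Assumed EWMA shape: next = 0.75 * old + 0.25 * new, in integer
 * arithmetic. The real helper is defined elsewhere in the scheduler. */
static u64 calc_avg(u64 old_val, u64 new_val)
{
        return (old_val - (old_val >> 2)) + (new_val >> 2);
}

int main(void)
{
        /* A utilization series smooths toward 800 instead of jumping. */
        u64 util = 0;
        u64 samples[] = {800, 800, 800, 800};

        for (int i = 0; i < 4; i++) {
                util = calc_avg(util, samples[i]);
                printf("interval %d: util=%llu\n", i, util);
        }
        return 0;
}

Under that assumption, each interval nudges the published value a quarter of the way toward the new reading (0, 200, 350, 462, 546, ...), which smooths transient spikes in the load, utilization, and min/avg/max criticality statistics.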

static void decay_max_lat_cri(void)
{
        /*
         * Decrease max_lat_cri by 3.13% (>> 5) every update interval. The
         * maximum can be bumped up for a moment, after which the actual
         * lat_cri values may never reach that maximum again. To address
         * this, we decrease the maximum value here so that it can be bumped
         * up again to the actual maximum. This keeps max_lat_cri tracking
         * the actual maximum value tightly. Note that we don't need to
         * handle the CAS failure, because a CAS failure means another task
         * bumped the old maximum to the higher, actual maximum.
         */
        u64 cur_max = max_lat_cri;
        u64 delta = (cur_max >> 5) ? : 1;
        s64 new_max = cur_max - delta;

        if (new_max > 0)
                __sync_val_compare_and_swap(&max_lat_cri, cur_max, new_max);
}
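To see the decay rate concretely: a right shift by 5 subtracts 1/32 of the current value, i.e. about 3.13% per interval. A small standalone sketch of just the arithmetic (the atomics are elided and the starting value is illustrative):

#include <stdio.h>

typedef unsigned long long u64;

int main(void)
{
        u64 max = 1000;  /* a stale maximum left over from a past spike */

        /* After ~22 intervals the stale maximum roughly halves, so the
         * real (lower) maximum can re-assert itself via cmpxchg. */
        for (int i = 1; i <= 22; i++) {
                u64 delta = (max >> 5) ? (max >> 5) : 1;
                max -= delta;
        }
        printf("decayed max after 22 intervals: %llu\n", max);
        return 0;
}

Starting from 1000, the integer sequence lands at 505 after 22 intervals, close to the (31/32)^22 ~= 0.5 a continuous decay would predict.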

static int update_timer_fn(void *map, int *key, struct bpf_timer *timer)
{
        int err;

        update_sys_cpu_load();
        decay_max_lat_cri();

        err = bpf_timer_start(timer, LAVD_CPU_UTIL_INTERVAL_NS, 0);
        if (err)
@@ -923,11 +958,60 @@ static int sum_prios_for_lat(struct task_struct *p, int nice_prio,
        return prio;
}

static int map_lat_cri_to_lat_prio(u64 lat_cri)
{
        /*
         * Latency criticality is an absolute metric representing how
         * latency-critical a task is. However, latency priority is a
         * relative metric compared to the other co-running tasks.
         * Especially when the tasks' latency criticalities are in a small
         * range, the relative metric is advantageous in mitigating integer
         * truncation errors. In the relative metric, we map
         *
         *  - Case 1. inc1k_low:  [min_lc, avg_lc) -> [boost_range/2, 0)
         *  - Case 2. inc1k_high: [avg_lc, max_lc] -> [0, -boost_range/2)
         *
         * Hence, latency priority 20 now means that a task has an average
         * latency criticality among the co-running tasks.
         */

        struct sys_cpu_util *cutil_cur = get_sys_cpu_util_cur();
        s64 base_lat_cri, inc1k;
        int base_prio, lat_prio;

        /*
         * Set up the params for Cases 1 and 2.
         */
        if (lat_cri < cutil_cur->avg_lat_cri) {
                inc1k = cutil_cur->inc1k_low;
                base_lat_cri = cutil_cur->min_lat_cri;
                base_prio = LAVD_BOOST_RANGE >> 1;
        }
        else {
                inc1k = cutil_cur->inc1k_high;
                base_lat_cri = cutil_cur->avg_lat_cri;
                base_prio = 0;
        }

        /*
         * The task's lat_cri could be more up-to-date than cutil_cur's.
         * In that case, just take cutil_cur's value.
         */
        if (lat_cri >= base_lat_cri) {
                lat_prio = base_prio -
                           (((lat_cri - base_lat_cri) * inc1k + 500) / 1000);
        }
        else
                lat_prio = base_prio;

        return lat_prio;
}
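To make the fixed-point arithmetic concrete, here is a small self-contained sketch with made-up statistics (min_lc = 100, avg_lc = 400, max_lc = 1000) and an assumed LAVD_BOOST_RANGE of 40. The 1000x scaling keeps sub-integer slope precision, and the +500 rounds to nearest:

#include <stdio.h>

#define BOOST_RANGE 40  /* assumed value, for illustration only */

int main(void)
{
        long min_lc = 100, avg_lc = 400, max_lc = 1000;

        /* Slopes scaled by 1000, mirroring update_sys_cpu_load(). */
        long inc1k_low = ((BOOST_RANGE >> 1) * 1000) / (avg_lc - min_lc);       /* 66 */
        long inc1k_high = ((BOOST_RANGE >> 1) * 1000) / (max_lc + 1 - avg_lc);  /* 33 */

        /* Case 1: lat_cri = 250 in [min_lc, avg_lc) -> positive priority. */
        long p1 = (BOOST_RANGE >> 1) - ((250 - min_lc) * inc1k_low + 500) / 1000;  /* 10 */

        /* Case 2: lat_cri = 1000 in [avg_lc, max_lc] -> negative priority. */
        long p2 = 0 - ((1000 - avg_lc) * inc1k_high + 500) / 1000;  /* -20 */

        printf("inc1k_low=%ld inc1k_high=%ld p1=%ld p2=%ld\n",
               inc1k_low, inc1k_high, p1, p2);
        return 0;
}

The average task maps to 0, the minimum to +BOOST_RANGE/2, and the maximum to roughly -BOOST_RANGE/2, matching the two cases in the comment above.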

static int boost_lat(struct task_struct *p, struct task_ctx *taskc,
                     bool is_wakeup)
                     struct cpu_ctx *cpuc, bool is_wakeup)
{
        u64 run_time_ft = 0, wait_freq_ft = 0, wake_freq_ft = 0;
        u64 lat_cri_raw = 0, cur_max = 0, lat_cri = 0;
        u64 lat_cri_raw = 0;
        u16 static_prio;
        int boost;

@@ -993,36 +1077,13 @@ static int boost_lat(struct task_struct *p, struct task_ctx *taskc,
         * conversion, we mitigate the exponentially skewed distribution to a
         * non-linear distribution.
         */
        lat_cri = bpf_log2l(lat_cri_raw + 1);

        /*
         * Update the global @max_lat_cri if necessary. Updating @max_lat_cri
         * is racy because it can be tested and updated concurrently from
         * multiple CPUs. So we use an atomic cmpxchg.
         *
         * Note that the bounded retry (@LAVD_MAX_CAS_RETRY) does *not*
         * guarantee the update of @max_lat_cri. However, that is unlikely to
         * happen, and even if it does, the incorrect value will be corrected
         * next time anyway. So let's just live with it.
         *
         * We decay @max_lat_cri periodically (at decay_max_lat_cri) to keep
         * @max_lat_cri up-to-date.
         */
        cur_max = max_lat_cri;
        for (int i = 0; lat_cri > cur_max && i < LAVD_MAX_CAS_RETRY; i++) {
                cur_max = __sync_val_compare_and_swap(&max_lat_cri, cur_max,
                                                      lat_cri);
        }
        taskc->lat_cri = bpf_log2l(lat_cri_raw + 1);

        /*
         * Convert @p's latency criticality to its boost priority linearly.
         */
        boost = (LAVD_BOOST_RANGE - ((lat_cri * LAVD_BOOST_RANGE) / cur_max)) -
                (LAVD_BOOST_RANGE >> 1);

        /*
         * When a task is waking up, boost its latency boost priority by 1.
         */
        boost = map_lat_cri_to_lat_prio(taskc->lat_cri);
        if (is_wakeup)
                boost -= LAVD_BOOST_WAKEUP_LAT;

@@ -1035,14 +1096,15 @@ out:
}
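For intuition on the bpf_log2l() step that this change keeps: the raw criticality is a product of run-time and wait/wake-frequency factors and can span many orders of magnitude. Taking the integer base-2 logarithm (assuming floor semantics) compresses it: a raw value around 10^3 yields lat_cri of about 9, 10^6 about 19, and 10^9 about 29. A million-fold spread in raw criticality thus becomes a difference of roughly 20 in lat_cri, which map_lat_cri_to_lat_prio() can then spread over the boost range without one huge task dwarfing everyone else.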

static u64 calc_latency_weight(struct task_struct *p, struct task_ctx *taskc,
                               bool is_wakeup)
                               struct cpu_ctx *cpuc, bool is_wakeup)
{
        boost_lat(p, taskc, is_wakeup);
        boost_lat(p, taskc, cpuc, is_wakeup);
        return sched_prio_to_latency_weight[taskc->lat_prio];
}

static u64 calc_virtual_dealine_delta(struct task_struct *p,
                                      struct task_ctx *taskc,
                                      struct cpu_ctx *cpuc,
                                      u64 enq_flags)
{
        u64 vdeadline_delta_ns, weight;
@@ -1062,7 +1124,7 @@ static u64 calc_virtual_dealine_delta(struct task_struct *p,
         * boost priority (and weight).
         */
        is_wakeup = is_wakeup_ef(enq_flags);
        weight = calc_latency_weight(p, taskc, is_wakeup);
        weight = calc_latency_weight(p, taskc, cpuc, is_wakeup);
        vdeadline_delta_ns = (LAVD_SLICE_MAX_NS * weight) / 1000;
        taskc->vdeadline_delta_ns = vdeadline_delta_ns;
        return vdeadline_delta_ns;
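Downstream, the latency weight selected by taskc->lat_prio turns into a virtual-deadline offset through another /1000 fixed-point step. As an illustration with assumed numbers only (the real LAVD_SLICE_MAX_NS and weight-table entries are defined elsewhere): if LAVD_SLICE_MAX_NS were 3,000,000 ns, a weight of 1000 would give vdeadline_delta_ns = 3,000,000 * 1000 / 1000 = 3,000,000 ns, while a task whose latency priority yielded a weight of 500 would get a 1,500,000 ns offset and therefore an earlier virtual deadline.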
@@ -1235,6 +1297,19 @@ static void update_stat_for_run(struct task_struct *p, struct task_ctx *taskc,
        interval = taskc->run_time_ns + wait_period;
        taskc->run_freq = calc_avg_freq(taskc->run_freq, interval);

        /*
         * Update the per-CPU latency criticality information for tasks
         * that have ever been scheduled.
         */
        if (have_scheduled(taskc)) {
                if (cpuc->max_lat_cri < taskc->lat_cri)
                        cpuc->max_lat_cri = taskc->lat_cri;
                if (cpuc->min_lat_cri > taskc->lat_cri)
                        cpuc->min_lat_cri = taskc->lat_cri;
                cpuc->sum_lat_cri += taskc->lat_cri;
                cpuc->sched_nr++;
        }

        /*
         * Update the task state when it starts running.
         */
@@ -1270,7 +1345,7 @@ static void update_stat_for_stop(struct task_struct *p, struct task_ctx *taskc,
}

static void calc_when_to_run(struct task_struct *p, struct task_ctx *taskc,
                             u64 enq_flags)
                             struct cpu_ctx *cpuc, u64 enq_flags)
{
        /*
         * Before enqueueing a task to a run queue, we should decide when a
@@ -1278,7 +1353,7 @@ static void calc_when_to_run(struct task_struct *p, struct task_ctx *taskc,
         * urgent it is - vdeadline_delta_ns - and when it becomes eligible if
         * overscheduled - eligible_time_ns.
         */
        calc_virtual_dealine_delta(p, taskc, enq_flags);
        calc_virtual_dealine_delta(p, taskc, cpuc, enq_flags);
        calc_eligible_delta(p, taskc);
}

@@ -1287,6 +1362,10 @@ static bool put_local_rq(struct task_struct *p, struct task_ctx *taskc,
{
        struct cpu_ctx *cpuc;

        cpuc = get_cpu_ctx();
        if (!cpuc)
                return false;

        /*
         * Calculate when a task can be scheduled. If a task cannot be
         * scheduled soonish (i.e., the task is ineligible since
@@ -1296,7 +1375,7 @@ static bool put_local_rq(struct task_struct *p, struct task_ctx *taskc,
         * Note that the task's time slice will be calculated and reassigned
         * right before running at ops.running().
         */
        calc_when_to_run(p, taskc, enq_flags);
        calc_when_to_run(p, taskc, cpuc, enq_flags);
        if (!is_eligible(taskc))
                return false;

@@ -1305,10 +1384,6 @@ static bool put_local_rq(struct task_struct *p, struct task_ctx *taskc,
         * rq. Statistics will be adjusted when more accurate statistics
         * become available (ops.running).
         */
        cpuc = get_cpu_ctx();
        if (!cpuc)
                return false;

        if (transit_task_stat(taskc, LAVD_TASK_STAT_ENQ))
                update_stat_for_enq(p, taskc, cpuc);

@@ -1329,20 +1404,21 @@ static bool put_global_rq(struct task_struct *p, struct task_ctx *taskc,
        struct cpu_ctx *cpuc;
        u64 vdeadline;

        cpuc = get_cpu_ctx();
        if (!cpuc)
                return false;

        /*
         * Calculate when a task can be scheduled.
         *
         * Note that the task's time slice will be calculated and reassigned
         * right before running at ops.running().
         */
        calc_when_to_run(p, taskc, enq_flags);
        calc_when_to_run(p, taskc, cpuc, enq_flags);

        /*
         * Reflect the task's load immediately.
         */
        cpuc = get_cpu_ctx();
        if (!cpuc)
                return false;
        if (transit_task_stat(taskc, LAVD_TASK_STAT_ENQ))
                update_stat_for_enq(p, taskc, cpuc);

@@ -170,9 +170,10 @@ impl<'a> Scheduler<'a> {
            "| {:9} | {:8} | {:17} \
             | {:4} | {:9} | {:9} \
             | {:10} | {:9} | {:8} \
             | {:12} | {:7} | {:9} \
             | {:7} | {:7} | {:7} \
             | {:7} | {:12} | {:7} \
             | {:9} | {:9} | {:9} \
             | {:9} | {:8} |",
             | {:9} | {:9} | {:8} |",
            "mseq",
            "pid",
            "comm",
@@ -182,6 +183,10 @@ impl<'a> Scheduler<'a> {
            "slice_ns",
            "grdy_rt",
            "lat_prio",
            "lat_cri",
            "min_lc",
            "avg_lc",
            "max_lc",
            "static_prio",
            "lat_bst",
            "slice_bst",
@@ -201,9 +206,10 @@ impl<'a> Scheduler<'a> {
            "| {:9} | {:8} | {:17} \
             | {:4} | {:9} | {:9} \
             | {:10} | {:9} | {:8} \
             | {:12} | {:7} | {:9} \
             | {:7} | {:7} | {:7} \
             | {:7} | {:12} | {:7} \
             | {:9} | {:9} | {:9} \
             | {:9} | {:8} | ",
             | {:9} | {:9} | {:8} | ",
            mseq,
            tx.pid,
            tx_comm,
@@ -213,6 +219,10 @@ impl<'a> Scheduler<'a> {
            tc.slice_ns,
            tc.greedy_ratio,
            tc.lat_prio,
            tc.lat_cri,
            tx.min_lat_cri,
            tx.avg_lat_cri,
            tx.max_lat_cri,
            tx.static_prio,
            tc.lat_boost_prio,
            tc.slice_boost_prio,