mirror of
https://github.com/JakeHillion/scx.git
synced 2024-11-26 11:30:22 +00:00
Merge pull request #417 from multics69/lavd-vdeadline
scx_lavd: improve virtual deadline and current clock handling
This commit is contained in: commit 00fdc1d949
@@ -54,7 +54,7 @@ enum consts {
 	NSEC_PER_MSEC		= (1000ULL * NSEC_PER_USEC),
 	LAVD_TIME_ONE_SEC	= (1000ULL * NSEC_PER_MSEC),
 	LAVD_TIME_INFINITY_NS	= SCX_SLICE_INF,
-	LAVD_MAX_CAS_RETRY	= 4,
+	LAVD_MAX_RETRY		= 4,
 
 	LAVD_TARGETED_LATENCY_NS	= (15 * NSEC_PER_MSEC),
 	LAVD_SLICE_MIN_NS	= ( 1 * NSEC_PER_MSEC),	/* min time slice */
@@ -63,7 +63,7 @@ enum consts {
 	LAVD_SLICE_GREEDY_FT	= 3,
 	LAVD_LOAD_FACTOR_ADJ	= 6,	/* adjustment for better estimation */
 	LAVD_LOAD_FACTOR_MAX	= (20 * 1000),
-	LAVD_LOAD_FACTOR_FT	= 80,	/* factor to stretch the time line */
+	LAVD_LOAD_FACTOR_FT	= 4,	/* factor to stretch the time line */
 
 	LAVD_LC_FREQ_MAX	= 1000000,
 	LAVD_LC_RUNTIME_MAX	= LAVD_TARGETED_LATENCY_NS,
@@ -74,8 +74,9 @@ enum consts {
 	LAVD_SLICE_BOOST_MAX_FT	= 2,	/* maximum additional 2x of slice */
 	LAVD_SLICE_BOOST_MAX_STEP = 8,	/* 8 slice exhaustions in a row */
 	LAVD_GREEDY_RATIO_MAX	= USHRT_MAX,
 	LAVD_LAT_PRIO_NEW	= 10,
 	LAVD_LAT_PRIO_IDLE	= USHRT_MAX,
-	LAVD_LAT_WEIGHT_SHIFT	= 3,
+	LAVD_LAT_WEIGHT_FT	= 88761,
+
 	LAVD_ELIGIBLE_TIME_LAT_FT = 16,
 	LAVD_ELIGIBLE_TIME_MAX	= (100 * NSEC_PER_USEC),
@@ -210,6 +211,7 @@ struct task_ctx {
 	/*
 	 * Task deadline and time slice
 	 */
+	u64	vdeadline_log_clk;	/* logical clock of the deadline */
 	u64	vdeadline_delta_ns;	/* time delta until task's virtual deadline */
 	u64	eligible_delta_ns;	/* time delta until task becomes eligible */
 	u64	slice_ns;		/* time slice */
@@ -213,6 +213,11 @@ private(LAVD) struct bpf_cpumask __kptr *ovrflw_cpumask; /* CPU mask for overflow
  */
 const volatile u16 cpu_order[LAVD_CPU_ID_MAX]; /* ordered by cpus->core->llc->numa */
 
+/*
+ * Logical current clock
+ */
+u64 cur_logical_clk;
+
 /*
  * Options
  */
@@ -363,137 +368,6 @@ static const u64 sched_prio_to_slice_weight[NICE_WIDTH] = {
 	15,	/*  19	39 */
 };
 
-/*
- * A nice priority to latency weight array
- * ---------------------------------------
- *
- * It is used to determine the virtual deadline. Each step increases by 10%.
- * The idea behind the virtual deadline is to limit the competition window
- * among concurrent tasks. For example, in the case of a normal priority task
- * with nice 0, its corresponding value is 7.5 msec (when LAVD_LAT_WEIGHT_SHIFT
- * is 0). This guarantees that any tasks enqueued in 7.5 msec after the task is
- * enqueued will not compete for CPU time with the task. This array is the
- * inverse of sched_prio_to_latency_weight with some normalization. Suppose the
- * maximum time slice per schedule (LAVD_SLICE_MAX_NS) is 3 msec. We normalized
- * the values so that the normal priority (nice 0) has a deadline of 7.5 msec,
- * a center of the targeted latency (i.e., when LAVD_TARGETED_LATENCY_NS is 15
- * msec). The virtual deadline ranges from 87 usec to 512 msec. As the maximum
- * time slice becomes shorter, the deadlines become tighter.
- */
-static const u64 sched_prio_to_latency_weight[NICE_WIDTH] = {
-	/* weight	nice priority	sched priority	vdeadline (usec)    */
-	/*						(max slice == 3 ms) */
-	/*					(LAVD_LAT_WEIGHT_SHIFT == 0) */
-	/* ------	-------------	--------------	------------------- */
-	29,	/* -20	 0	    87 */
-	36,	/* -19	 1	   108 */
-	45,	/* -18	 2	   135 */
-	55,	/* -17	 3	   165 */
-	71,	/* -16	 4	   213 */
-	88,	/* -15	 5	   264 */
-	110,	/* -14	 6	   330 */
-	137,	/* -13	 7	   411 */
-	171,	/* -12	 8	   513 */
-	215,	/* -11	 9	   645 */
-	268,	/* -10	10	   804 */
-	336,	/*  -9	11	  1008 */
-	420,	/*  -8	12	  1260 */
-	522,	/*  -7	13	  1566 */
-	655,	/*  -6	14	  1965 */
-	820,	/*  -5	15	  2460 */
-	1024,	/*  -4	16	  3072 */
-	1286,	/*  -3	17	  3858 */
-	1614,	/*  -2	18	  4842 */
-	2005,	/*  -1	19	  6015 */
-	2500,	/*   0	20	  7500 */
-	3122,	/*   1	21	  9366 */
-	3908,	/*   2	22	 11724 */
-	4867,	/*   3	23	 14601 */
-	6052,	/*   4	24	 18156 */
-	7642,	/*   5	25	 22926 */
-	9412,	/*   6	26	 28236 */
-	11907,	/*   7	27	 35721 */
-	14884,	/*   8	28	 44652 */
-	18686,	/*   9	29	 56058 */
-	23273,	/*  10	30	 69819 */
-	29425,	/*  11	31	 88275 */
-	36571,	/*  12	32	109713 */
-	45714,	/*  13	33	137142 */
-	56889,	/*  14	34	170667 */
-	71111,	/*  15	35	213333 */
-	88276,	/*  16	36	264828 */
-	111304,	/*  17	37	333912 */
-	142222,	/*  18	38	426666 */
-	170667,	/*  19	39	512001 */
-};
-
-/*
- * A latency priority to greedy ratios for eligibility
- * ---------------------------------------------------
- *
- * This table is nothing but sched_prio_to_slice_weight * (1000/1024) for
- * direct comparison against greedy_ratio, which is based on 1000.
- *
- * We distribute CPU time based on its nice (static) priorities described in
- * sched_prio_to_slice_weight, the same as the conventional way, for the fair
- * use of CPU time. However, when checking whether a particular task is
- * eligible, we consider its (dynamic) latency priority. Because a
- * latency-critical task may have CPU usage spikes to meet its (soft) deadline,
- * too strict fairness enforcement does not work well.
- *
- * Hence, we are more generous to a latency-critical task and aim for eventual
- * fairness of CPU time. To this end, we determine the task's time slice and
- * ineligible duration based on its nice priority for fairness. But we check if
- * a task is greedier compared to its (dynamic) _latency_ priority (not nice
- * priority). This allows the task to use more CPU time temporarily, but
- * eventually, its CPU time is under fairness control using time slice and
- * ineligibility duration calculation.
- */
-static const u64 lat_prio_to_greedy_thresholds[NICE_WIDTH] = {
-	/* weight	nice priority	sched priority */
-	/* ------	-------------	-------------- */
-	86681,	/* -20	 0 */
-	70073,	/* -19	 1 */
-	55159,	/* -18	 2 */
-	45188,	/* -17	 3 */
-	35440,	/* -16	 4 */
-	28471,	/* -15	 5 */
-	22709,	/* -14	 6 */
-	18267,	/* -13	 7 */
-	14599,	/* -12	 8 */
-	11637,	/* -11	 9 */
-	9324,	/* -10	10 */
-	7441,	/*  -9	11 */
-	5957,	/*  -8	12 */
-	4789,	/*  -7	13 */
-	3814,	/*  -6	14 */
-	3048,	/*  -5	15 */
-	2442,	/*  -4	16 */
-	1944,	/*  -3	17 */
-	1549,	/*  -2	18 */
-	1247,	/*  -1	19 */
-	1000,	/*   0	20 */
-	1000,	/*   1	21 */
-	1000,	/*   2	22 */
-	1000,	/*   3	23 */
-	1000,	/*   4	24 */
-	1000,	/*   5	25 */
-	1000,	/*   6	26 */
-	1000,	/*   7	27 */
-	1000,	/*   8	28 */
-	1000,	/*   9	29 */
-	1000,	/*  10	30 */
-	1000,	/*  11	31 */
-	1000,	/*  12	32 */
-	1000,	/*  13	33 */
-	1000,	/*  14	34 */
-	1000,	/*  15	35 */
-	1000,	/*  16	36 */
-	1000,	/*  17	37 */
-	1000,	/*  18	38 */
-	1000,	/*  19	39 */
-};
-
 static u16 get_nice_prio(struct task_struct *p);
 static u64 get_task_load_ideal(struct task_struct *p);
 static void adjust_slice_boost(struct cpu_ctx *cpuc, struct task_ctx *taskc);
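For reference, the vdeadline column of the deleted table follows directly from the old formula vdeadline_delta_ns = (LAVD_SLICE_MAX_NS * weight) / 1000, which is also removed further down. A minimal standalone C sketch of that arithmetic, assuming the comment's 3 ms maximum slice (all names below are local to the sketch, not the scheduler's):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC     1000ULL
#define NSEC_PER_MSEC     (1000ULL * NSEC_PER_USEC)
#define LAVD_SLICE_MAX_NS (3 * NSEC_PER_MSEC)   /* the comment's 3 ms example */

int main(void)
{
	/* Three entries of the deleted table: nice -20, 0, and +19. */
	const uint64_t weight[] = {29, 2500, 170667};

	for (int i = 0; i < 3; i++) {
		/* Old scheme: deadline is a fixed fraction of the max slice. */
		uint64_t vdl_ns = LAVD_SLICE_MAX_NS * weight[i] / 1000;
		printf("vdeadline = %llu usec\n",
		       (unsigned long long)(vdl_ns / NSEC_PER_USEC));
	}
	return 0;	/* prints 87, 7500, and 512001, matching the table */
}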
@@ -636,13 +510,13 @@ static void proc_introspec_sched_n(struct task_struct *p,
 	cur_nr = intrspc.arg;
 
 	/*
-	 * Note that the bounded retry (@LAVD_MAX_CAS_RETRY) does *not*
-	 * *guarantee* to decrement introspec_arg. However, it is unlikely to
-	 * happen. Even if it happens, it is nothing but a matter of delaying a
-	 * message delivery. That's because other threads will try and succeed
-	 * the CAS operation eventually. So this is good enough. ;-)
+	 * Note that the bounded retry (@LAVD_MAX_RETRY) does *not* *guarantee*
+	 * to decrement introspec_arg. However, it is unlikely to happen. Even
+	 * if it happens, it is nothing but a matter of delaying a message
+	 * delivery. That's because other threads will try and succeed the CAS
+	 * operation eventually. So this is good enough. ;-)
 	 */
-	for (i = 0; cur_nr > 0 && i < LAVD_MAX_CAS_RETRY; i++) {
+	for (i = 0; cur_nr > 0 && i < LAVD_MAX_RETRY; i++) {
 		prev_nr = __sync_val_compare_and_swap(
 				&intrspc.arg, cur_nr, cur_nr - 1);
 		/* CAS success: submit a message and done */
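The bounded-CAS pattern above is self-contained enough to demonstrate in plain userspace C. A minimal sketch, assuming the GCC/Clang __sync builtins that the BPF code also uses; counter and bounded_dec are hypothetical stand-ins for intrspc.arg and the loop above:

#include <stdio.h>

#define MAX_RETRY 4	/* mirrors LAVD_MAX_RETRY: bound the loop, tolerate failure */

static int counter = 8;	/* stand-in for intrspc.arg */

/* Try to decrement @counter once; give up after MAX_RETRY lost races. */
static int bounded_dec(void)
{
	for (int i = 0; i < MAX_RETRY; i++) {
		int cur = counter;
		if (cur <= 0)
			return 0;
		/* The CAS succeeds only if nobody changed counter since we read it. */
		if (__sync_val_compare_and_swap(&counter, cur, cur - 1) == cur)
			return 1;
	}
	return 0;	/* lost MAX_RETRY races; the caller treats this as a benign delay */
}

int main(void)
{
	printf("dec %s, counter now %d\n", bounded_dec() ? "ok" : "skipped", counter);
	return 0;
}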
@@ -838,7 +712,7 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
 	 * If the CPU is in an idle state (i.e., idle_start_clk is
 	 * non-zero), accumulate the current idle period so far.
 	 */
-	for (int i = 0; i < LAVD_MAX_CAS_RETRY; i++) {
+	for (int i = 0; i < LAVD_MAX_RETRY; i++) {
 		u64 old_clk = cpuc->idle_start_clk;
 		if (old_clk == 0)
 			break;
@@ -1177,25 +1051,9 @@ static u64 calc_lat_factor(u64 lat_prio)
 static u32 calc_greedy_factor(struct task_ctx *taskc)
 {
 	u32 greedy_ratio = taskc->greedy_ratio;
-	s16 lat_prio = taskc->lat_prio;
-	u32 greedy_threshold;
 	u32 gr_ft;
 
-	if (lat_prio < 0)
-		lat_prio = 0;
-	else if (lat_prio >= NICE_WIDTH)
-		lat_prio = NICE_WIDTH - 1;
-
-	/*
-	 * When determining how greedy a task is, we are more generous to a
-	 * latency-critical task with a low lat_prio value. That is because a
-	 * latency-critical task can temporarily overspend CPU time. However,
-	 * the time slice and ineligible duration allocation will eventually
-	 * enforce fairness.
-	 */
-	greedy_threshold = lat_prio_to_greedy_thresholds[lat_prio];
-
-	gr_ft = (greedy_ratio * 1000) / greedy_threshold;
+	gr_ft = greedy_ratio;
 	if (gr_ft < 1000)
 		gr_ft = 1000;
 	else
@@ -1206,22 +1064,7 @@ static u32 calc_greedy_factor(struct task_ctx *taskc)
 
 static bool is_eligible(struct task_ctx *taskc)
 {
-	u64 greedy_threshold;
-	s16 lat_prio = taskc->lat_prio;
-
-	if (lat_prio < 0)
-		lat_prio = 0;
-	else if (lat_prio >= NICE_WIDTH)
-		lat_prio = NICE_WIDTH - 1;
-
-	/*
-	 * Similar to the greedy factor calculation, we have a loose bound for
-	 * a latency-critical task. That makes a latency-critical task less
-	 * frequently ineligible for low (tail) latency.
-	 */
-	greedy_threshold = lat_prio_to_greedy_thresholds[lat_prio];
-
-	return taskc->greedy_ratio <= greedy_threshold;
+	return taskc->greedy_ratio <= 1000;
 }
 
 static bool is_wakeup_wf(u64 wake_flags)
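With the threshold table gone, eligibility reduces to one comparison against the per-mille baseline: a task whose greedy_ratio is at or under 1000 has consumed no more than its fair share of CPU time. A minimal standalone sketch of that check (the per-mille interpretation follows the deleted comment, which says greedy_ratio "is based on 1000"):

#include <stdbool.h>
#include <stdio.h>

/* greedy_ratio is per-mille: 1000 means exactly the fair share was used. */
static bool is_eligible(unsigned int greedy_ratio)
{
	return greedy_ratio <= 1000;
}

int main(void)
{
	/* 1 (under fair share, eligible) and 0 (greedy, ineligible). */
	printf("%d %d\n", is_eligible(900), is_eligible(1400));
	return 0;
}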
@@ -1372,7 +1215,7 @@ static int boost_lat(struct task_struct *p, struct task_ctx *taskc,
 	 * its property.
 	 */
 	if (!have_scheduled(taskc)) {
-		boost = 0;
+		boost = LAVD_LAT_PRIO_NEW;
 		goto out;
 	}
 
@@ -1448,13 +1291,11 @@ out:
 static u64 calc_latency_weight(struct task_struct *p, struct task_ctx *taskc,
 			       struct cpu_ctx *cpuc, bool is_wakeup)
 {
-	boost_lat(p, taskc, cpuc, is_wakeup);
+	u64 w;
 
-	/*
-	 * Tighten the competition window according to LAVD_LAT_WEIGHT_SHIFT.
-	 */
-	return sched_prio_to_latency_weight[taskc->lat_prio] >>
-		LAVD_LAT_WEIGHT_SHIFT;
+	boost_lat(p, taskc, cpuc, is_wakeup);
+	w = LAVD_LAT_WEIGHT_FT / sched_prio_to_slice_weight[taskc->lat_prio] + 1;
+	return w;
 }
 
 static u64 calc_virtual_deadline_delta(struct task_struct *p,
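The new weight needs no table because LAVD_LAT_WEIGHT_FT (88761) is the slice weight of nice -20, so the division maps the slice-weight range 88761..15 onto latency weights 2..5918, with +1 keeping the result nonzero. A standalone sketch of that arithmetic, using three values from sched_prio_to_slice_weight:

#include <stdio.h>
#include <stdint.h>

#define LAVD_LAT_WEIGHT_FT 88761	/* slice weight of nice -20 */

int main(void)
{
	/* Slice weights for nice -20, 0, and +19. */
	const uint64_t slice_weight[] = {88761, 1024, 15};

	for (int i = 0; i < 3; i++) {
		/* New scheme: latency weight is the inverse of the slice weight. */
		uint64_t w = LAVD_LAT_WEIGHT_FT / slice_weight[i] + 1;
		printf("w = %llu\n", (unsigned long long)w);	/* 2, 87, 5918 */
	}
	return 0;
}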
@@ -1481,8 +1322,7 @@ static u64 calc_virtual_deadline_delta(struct task_struct *p,
 	 */
 	is_wakeup = is_wakeup_ef(enq_flags);
 	weight = calc_latency_weight(p, taskc, cpuc, is_wakeup);
-	vdeadline_delta_ns = (LAVD_SLICE_MAX_NS * weight) / 1000;
-
+	vdeadline_delta_ns = (((taskc->run_time_ns + 1) * weight) + 1000) / 1000;
 	/*
 	 * When a system is overloaded (>1000), stretch time space so make time
 	 * tick logically slower to give room to execute the overloaded tasks.
@@ -1493,11 +1333,12 @@
 	 * is lower (i.e., higher value) and the load is higher.
 	 */
 	vdeadline_delta_ns = (vdeadline_delta_ns * load_factor *
-			      taskc->lat_prio * taskc->lat_prio) /
+			      (taskc->lat_prio + 1)) /
 			     (LAVD_LOAD_FACTOR_FT * 1000);
 	}
 
+	taskc->vdeadline_delta_ns = vdeadline_delta_ns;
+
 	return vdeadline_delta_ns;
 }
 
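Combining the two hunks above: the deadline delta now scales with the task's own runtime rather than the maximum slice, and under overload it is stretched by the load factor and (lat_prio + 1). A standalone sketch, assuming the overload branch is guarded by load_factor > 1000 as the preceding comment states (the guard itself sits outside the shown hunks):

#include <stdio.h>
#include <stdint.h>

#define LAVD_LOAD_FACTOR_FT 4	/* new value from this commit */

/* Sketch of the reworked deadline delta; names mirror the diff but are local. */
static uint64_t vdeadline_delta(uint64_t run_time_ns, uint64_t weight,
				uint64_t load_factor, uint64_t lat_prio)
{
	/* Deadline scales with the task's own runtime, not the max slice. */
	uint64_t d = ((run_time_ns + 1) * weight + 1000) / 1000;

	/* Overloaded system (load_factor > 1000): stretch the timeline,
	 * more so for lower-priority (higher lat_prio) tasks. */
	if (load_factor > 1000)
		d = (d * load_factor * (lat_prio + 1)) /
		    (LAVD_LOAD_FACTOR_FT * 1000);
	return d;
}

int main(void)
{
	/* 1 ms runtime, nice-0-like weight 87, idle vs. 2x-overloaded system. */
	printf("%llu %llu\n",
	       (unsigned long long)vdeadline_delta(1000000, 87, 500, 20),
	       (unsigned long long)vdeadline_delta(1000000, 87, 2000, 20));
	return 0;
}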
@@ -1646,6 +1487,11 @@ static void update_stat_for_running(struct task_struct *p,
 	u64 load_actual_ft, load_ideal_ft, wait_freq_ft, wake_freq_ft;
 	u64 perf_cri_raw;
 
+	/*
+	 * Update the current logical clock.
+	 */
+	WRITE_ONCE(cur_logical_clk, taskc->vdeadline_log_clk);
+
 	/*
 	 * Since this is the start of a new schedule for @p, we update run
 	 * frequency in a second using an exponential weighted moving average.
@@ -1749,6 +1595,17 @@ static void update_stat_for_quiescent(struct task_struct *p,
 	cpuc->load_run_time_ns -= cap_time_slice_ns(taskc->run_time_ns);
 }
 
+static u64 calc_exclusive_run_window(void)
+{
+	u64 load_factor;
+
+	load_factor = get_sys_stat_cur()->load_factor;
+	if (load_factor >= 1000)
+		return (LAVD_SLICE_MAX_NS * load_factor) / 1000;
+
+	return 0;
+}
+
 static void calc_when_to_run(struct task_struct *p, struct task_ctx *taskc,
 			     struct cpu_ctx *cpuc, u64 enq_flags)
 {
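The new helper's behavior is easy to tabulate: below the per-mille load threshold it contributes nothing, and at or above it it contributes one maximum slice scaled by the load. A standalone sketch (the 3 ms LAVD_SLICE_MAX_NS is assumed for illustration; the real constant lives in the scheduler headers):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC     1000000ULL
#define LAVD_SLICE_MAX_NS (3 * NSEC_PER_MSEC)	/* assumed max slice */

/* Mirror of calc_exclusive_run_window(): zero until the system is loaded
 * (load_factor is per-mille), then one max slice scaled by the load. */
static uint64_t exclusive_run_window(uint64_t load_factor)
{
	if (load_factor >= 1000)
		return LAVD_SLICE_MAX_NS * load_factor / 1000;
	return 0;
}

int main(void)
{
	printf("%llu %llu %llu\n",
	       (unsigned long long)exclusive_run_window(500),	/* 0: underloaded */
	       (unsigned long long)exclusive_run_window(1000),	/* 3 ms */
	       (unsigned long long)exclusive_run_window(3000));	/* 9 ms */
	return 0;
}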
@@ -1760,6 +1617,15 @@ static void calc_when_to_run(struct task_struct *p, struct task_ctx *taskc,
 	 */
 	calc_virtual_deadline_delta(p, taskc, cpuc, enq_flags);
 	calc_eligible_delta(p, taskc);
+
+	/*
+	 * Update the logical clock of the virtual deadline including
+	 * ineligible duration.
+	 */
+	taskc->vdeadline_log_clk = READ_ONCE(cur_logical_clk) +
+				   calc_exclusive_run_window() +
+				   taskc->eligible_delta_ns +
+				   taskc->vdeadline_delta_ns;
 }
 
 static u64 get_est_stopping_time(struct task_ctx *taskc)
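Taken together with the update_stat_for_running() hunk, this is the commit's logical-clock handshake: enqueue stamps each task with an absolute deadline on a shared logical timeline, and ops.running() advances the timeline to the deadline of the task that actually won. A single-threaded toy model of that handshake (the real code guards cur_logical_clk, a shared BPF global, with READ_ONCE/WRITE_ONCE):

#include <stdio.h>
#include <stdint.h>

static uint64_t cur_logical_clk;	/* advanced only when a task starts running */

struct task {
	uint64_t eligible_delta_ns;
	uint64_t vdeadline_delta_ns;
	uint64_t vdeadline_log_clk;	/* absolute vtime key used at dispatch */
};

/* Enqueue: stamp an absolute deadline on the shared logical timeline. */
static void calc_when_to_run(struct task *t, uint64_t excl_window)
{
	t->vdeadline_log_clk = cur_logical_clk + excl_window +
			       t->eligible_delta_ns + t->vdeadline_delta_ns;
}

/* Running: the clock jumps forward to the winner's deadline. */
static void on_running(const struct task *t)
{
	cur_logical_clk = t->vdeadline_log_clk;
}

int main(void)
{
	struct task a = {.eligible_delta_ns = 0,     .vdeadline_delta_ns = 87001};
	struct task b = {.eligible_delta_ns = 50000, .vdeadline_delta_ns = 913510};

	calc_when_to_run(&a, 0);
	calc_when_to_run(&b, 0);
	on_running(&a);		/* a has the earlier deadline, so it runs first */
	printf("clk=%llu, b's key=%llu\n",
	       (unsigned long long)cur_logical_clk,
	       (unsigned long long)b.vdeadline_log_clk);
	return 0;
}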
@@ -1940,7 +1806,8 @@ static struct cpu_ctx *find_victim_cpu(const struct cpumask *cpumask,
 	 */
 	switch(v) {
 	case 2: /* two candidates */
-		victim_cpu = can_task1_kick_task2(&prm_cpus[0], &prm_cpus[1]) ? &prm_cpus[0] : &prm_cpus[1];
+		victim_cpu = can_task1_kick_task2(&prm_cpus[0], &prm_cpus[1]) ?
+				&prm_cpus[0] : &prm_cpus[1];
 		goto bingo_out;
 	case 1: /* one candidate */
 		victim_cpu = &prm_cpus[0];
@@ -2077,7 +1944,6 @@ static void put_global_rq(struct task_struct *p, struct task_ctx *taskc,
 {
 	struct task_ctx *taskc_run;
 	struct task_struct *p_run;
-	u64 vdeadline;
 
 	/*
 	 * Calculate when a task can be scheduled.
@@ -2086,8 +1952,6 @@ static void put_global_rq(struct task_struct *p, struct task_ctx *taskc,
 	 * right before running at ops.running().
 	 */
 	calc_when_to_run(p, taskc, cpuc, enq_flags);
-	vdeadline = taskc->eligible_delta_ns + taskc->vdeadline_delta_ns +
-		    bpf_ktime_get_ns();
 
 	/*
 	 * Try to find and kick a victim CPU, which runs a less urgent task.
@@ -2108,7 +1972,7 @@ static void put_global_rq(struct task_struct *p, struct task_ctx *taskc,
 	 * deadline.
 	 */
 	scx_bpf_dispatch_vtime(p, LAVD_GLOBAL_DSQ, LAVD_SLICE_UNDECIDED,
-			       vdeadline, enq_flags);
+			       taskc->vdeadline_log_clk, enq_flags);
 
 }
 
@@ -3105,6 +2969,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(lavd_init)
 	if (err)
 		return err;
 
+	/*
+	 * Initialize the current logical clock.
+	 */
+	WRITE_ONCE(cur_logical_clk, 0);
 	return err;
 }
 
@@ -192,11 +192,11 @@ impl<'a> Scheduler<'a> {
             "| {:6} | {:7} | {:17} \
             | {:4} | {:4} | {:9} \
             | {:6} | {:8} | {:7} \
             | {:8} | {:7} | {:8} \
             | {:7} | {:9} | {:9} \
             | {:9} | {:9} | {:8} \
             | {:8} | {:4} | {:7} \
             | {:8} | {:7} | {:9} \
             | {:9} | {:9} | {:9} \
             | {:8} | {:8} | {:8} \
-            | {:6} | {:6} |",
+            | {:8} | {:6} | {:6} |",
             "mseq",
             "pid",
             "comm",
@@ -207,6 +207,7 @@ impl<'a> Scheduler<'a> {
             "slc_ns",
             "grdy_rt",
             "lat_prio",
             "lc",
+            "avg_lc",
             "st_prio",
             "slc_bst",
@@ -231,11 +232,11 @@ impl<'a> Scheduler<'a> {
             "| {:6} | {:7} | {:17} \
             | {:4} | {:4} | {:9} \
             | {:6} | {:8} | {:7} \
             | {:8} | {:7} | {:8} \
             | {:7} | {:9} | {:9} \
             | {:9} | {:9} | {:8} \
             | {:8} | {:4} | {:7} \
             | {:8} | {:7} | {:9} \
             | {:9} | {:9} | {:9} \
             | {:8} | {:8} | {:8} \
-            | {:6} | {:6} |",
+            | {:8} | {:6} | {:6} |",
             mseq,
             tx.pid,
             tx_comm,
@@ -246,6 +247,7 @@ impl<'a> Scheduler<'a> {
             tc.slice_ns,
             tc.greedy_ratio,
             tc.lat_prio,
             tc.lat_cri,
+            tx.avg_lat_cri,
             tx.static_prio,
             tc.slice_boost_prio,