diff --git a/scheds/rust/scx_lavd/src/bpf/intf.h b/scheds/rust/scx_lavd/src/bpf/intf.h
index a909fae..7271af1 100644
--- a/scheds/rust/scx_lavd/src/bpf/intf.h
+++ b/scheds/rust/scx_lavd/src/bpf/intf.h
@@ -54,7 +54,7 @@ enum consts {
         NSEC_PER_MSEC            = (1000ULL * NSEC_PER_USEC),
         LAVD_TIME_ONE_SEC        = (1000ULL * NSEC_PER_MSEC),
         LAVD_TIME_INFINITY_NS    = SCX_SLICE_INF,
-        LAVD_MAX_CAS_RETRY       = 4,
+        LAVD_MAX_RETRY           = 4,
 
         LAVD_TARGETED_LATENCY_NS = (15 * NSEC_PER_MSEC),
         LAVD_SLICE_MIN_NS        = ( 1 * NSEC_PER_MSEC), /* min time slice */
@@ -63,7 +63,7 @@ enum consts {
         LAVD_SLICE_GREEDY_FT     = 3,
         LAVD_LOAD_FACTOR_ADJ     = 6,    /* adjustment for better estimation */
         LAVD_LOAD_FACTOR_MAX     = (20 * 1000),
-        LAVD_LOAD_FACTOR_FT      = 80,   /* factor to stretch the time line */
+        LAVD_LOAD_FACTOR_FT      = 4,    /* factor to stretch the time line */
 
         LAVD_LC_FREQ_MAX         = 1000000,
         LAVD_LC_RUNTIME_MAX      = LAVD_TARGETED_LATENCY_NS,
@@ -74,8 +74,9 @@ enum consts {
         LAVD_SLICE_BOOST_MAX_FT  = 2,    /* maximum additional 2x of slice */
         LAVD_SLICE_BOOST_MAX_STEP = 8,   /* 8 slice exhausitions in a row */
         LAVD_GREEDY_RATIO_MAX    = USHRT_MAX,
+        LAVD_LAT_PRIO_NEW        = 10,
         LAVD_LAT_PRIO_IDLE       = USHRT_MAX,
-        LAVD_LAT_WEIGHT_SHIFT    = 3,
+        LAVD_LAT_WEIGHT_FT       = 88761,
 
         LAVD_ELIGIBLE_TIME_LAT_FT = 16,
         LAVD_ELIGIBLE_TIME_MAX   = (100 * NSEC_PER_USEC),
@@ -210,6 +211,7 @@ struct task_ctx {
         /*
          * Task deadline and time slice
          */
+        u64     vdeadline_log_clk;      /* logical clock of the deadline */
         u64     vdeadline_delta_ns;     /* time delta until task's virtual deadline */
         u64     eligible_delta_ns;      /* time delta until task becomes eligible */
         u64     slice_ns;               /* time slice */
diff --git a/scheds/rust/scx_lavd/src/bpf/main.bpf.c b/scheds/rust/scx_lavd/src/bpf/main.bpf.c
index 094b5f5..d48bd7e 100644
--- a/scheds/rust/scx_lavd/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_lavd/src/bpf/main.bpf.c
@@ -213,6 +213,11 @@ private(LAVD) struct bpf_cpumask __kptr *ovrflw_cpumask; /* CPU mask for overflo
  */
 const volatile u16 cpu_order[LAVD_CPU_ID_MAX]; /* ordered by cpus->core->llc->numa */
 
+/*
+ * Current logical clock
+ */
+u64 cur_logical_clk;
+
 /*
  * Options
  */
@@ -363,137 +368,6 @@ static const u64 sched_prio_to_slice_weight[NICE_WIDTH] = {
         15,     /*  19          39 */
 };
 
-/*
- * A nice priority to latency weight array
- * ---------------------------------------
- *
- * It is used to determine the virtual deadline. Each step increases by 10%.
- * The idea behind the virtual deadline is to limit the competition window
- * among concurrent tasks. For example, in the case of a normal priority task
- * with nice 0, its corresponding value is 7.5 msec (when LAVD_LAT_WEIGHT_SHIFT
- * is 0). This guarantees that any tasks enqueued in 7.5 msec after the task is
- * enqueued will not compete for CPU time with the task. This array is the
- * inverse of sched_prio_to_latency_weight with some normalization. Suppose the
- * maximum time slice per schedule (LAVD_SLICE_MAX_NS) is 3 msec. We normalized
- * the values so that the normal priority (nice 0) has a deadline of 7.5 msec,
- * a center of the targeted latency (i.e., when LAVD_TARGETED_LATENCY_NS is 15
- * msec). The virtual deadline ranges from 87 usec to 512 msec. As the maximum
- * time slice becomes shorter, the deadlines become tighter.
- */
-static const u64 sched_prio_to_latency_weight[NICE_WIDTH] = {
-        /* weight       nice priority   sched priority  vdeadline (usec)    */
-        /*                                              (max slice == 3 ms) */
-        /*                                              (LAVD_LAT_WEIGHT_SHIFT == 0) */
-        /* ------       -------------   --------------  ------------------- */
-        29,             /*  -20              0                     87 */
-        36,             /*  -19              1                    108 */
-        45,             /*  -18              2                    135 */
-        55,             /*  -17              3                    165 */
-        71,             /*  -16              4                    213 */
-        88,             /*  -15              5                    264 */
-        110,            /*  -14              6                    330 */
-        137,            /*  -13              7                    411 */
-        171,            /*  -12              8                    513 */
-        215,            /*  -11              9                    645 */
-        268,            /*  -10             10                    804 */
-        336,            /*   -9             11                   1008 */
-        420,            /*   -8             12                   1260 */
-        522,            /*   -7             13                   1566 */
-        655,            /*   -6             14                   1965 */
-        820,            /*   -5             15                   2460 */
-        1024,           /*   -4             16                   3072 */
-        1286,           /*   -3             17                   3858 */
-        1614,           /*   -2             18                   4842 */
-        2005,           /*   -1             19                   6015 */
-        2500,           /*    0             20                   7500 */
-        3122,           /*    1             21                   9366 */
-        3908,           /*    2             22                  11724 */
-        4867,           /*    3             23                  14601 */
-        6052,           /*    4             24                  18156 */
-        7642,           /*    5             25                  22926 */
-        9412,           /*    6             26                  28236 */
-        11907,          /*    7             27                  35721 */
-        14884,          /*    8             28                  44652 */
-        18686,          /*    9             29                  56058 */
-        23273,          /*   10             30                  69819 */
-        29425,          /*   11             31                  88275 */
-        36571,          /*   12             32                 109713 */
-        45714,          /*   13             33                 137142 */
-        56889,          /*   14             34                 170667 */
-        71111,          /*   15             35                 213333 */
-        88276,          /*   16             36                 264828 */
-        111304,         /*   17             37                 333912 */
-        142222,         /*   18             38                 426666 */
-        170667,         /*   19             39                 512001 */
-};
-
-/*
- * A latency priority to greedy ratios for eligibility
- * ---------------------------------------------------
- *
- * This table is nothing but sched_prio_to_slice_weight * (1000/1024) for
- * direct comparison against greedy_ratio, which is based on 1000.
- *
- * We distribute CPU time based on its nice (static) priorities described in
- * sched_prio_to_slice_weight, the same as the conventional way, for the fair
- * use of CPU time. However, when checking whether a particular task is
- * eligible, we consider its (dynamic) latency priority. Because a
- * latency-critical task may have CPU usage spikes to meet its (soft) deadline,
- * too strict fairness enforcement does not work well.
- *
- * Hence, we are more generous to a latency-critical task and aim for eventual
- * fairness of CPU time. To this end, we determine the task's time slice and
- * ineligible duration based on its nice priority for fairness. But we check if
- * a task is greedier compared to its (dynamic) _latency_ priority (not nice
- * priority). This allows the task to use more CPU time temporarily, but
- * eventually, its CPU time is under fairness control using time slice and
- * ineligibility duration calculation.
- */
-static const u64 lat_prio_to_greedy_thresholds[NICE_WIDTH] = {
-        /* weight       nice priority   sched priority  */
-        /* ------       -------------   --------------  */
-        86681,          /*  -20              0  */
-        70073,          /*  -19              1  */
-        55159,          /*  -18              2  */
-        45188,          /*  -17              3  */
-        35440,          /*  -16              4  */
-        28471,          /*  -15              5  */
-        22709,          /*  -14              6  */
-        18267,          /*  -13              7  */
-        14599,          /*  -12              8  */
-        11637,          /*  -11              9  */
-        9324,           /*  -10             10  */
-        7441,           /*   -9             11  */
-        5957,           /*   -8             12  */
-        4789,           /*   -7             13  */
-        3814,           /*   -6             14  */
-        3048,           /*   -5             15  */
-        2442,           /*   -4             16  */
-        1944,           /*   -3             17  */
-        1549,           /*   -2             18  */
-        1247,           /*   -1             19  */
-        1000,           /*    0             20  */
-        1000,           /*    1             21  */
-        1000,           /*    2             22  */
-        1000,           /*    3             23  */
-        1000,           /*    4             24  */
-        1000,           /*    5             25  */
-        1000,           /*    6             26  */
-        1000,           /*    7             27  */
-        1000,           /*    8             28  */
-        1000,           /*    9             29  */
-        1000,           /*   10             30  */
-        1000,           /*   11             31  */
-        1000,           /*   12             32  */
-        1000,           /*   13             33  */
-        1000,           /*   14             34  */
-        1000,           /*   15             35  */
-        1000,           /*   16             36  */
-        1000,           /*   17             37  */
-        1000,           /*   18             38  */
-        1000,           /*   19             39  */
-};
-
 static u16 get_nice_prio(struct task_struct *p);
 static u64 get_task_load_ideal(struct task_struct *p);
 static void adjust_slice_boost(struct cpu_ctx *cpuc, struct task_ctx *taskc);
@@ -636,13 +510,13 @@ static void proc_introspec_sched_n(struct task_struct *p,
         cur_nr = intrspc.arg;
 
         /*
-         * Note that the bounded retry (@LAVD_MAX_CAS_RETRY) does *not
-         * *guarantee* to decrement introspec_arg. However, it is unlikely to
-         * happen. Even if it happens, it is nothing but a matter of delaying a
-         * message delivery. That's because other threads will try and succeed
-         * the CAS operation eventually. So this is good enough. ;-)
+         * Note that the bounded retry (@LAVD_MAX_RETRY) does *not *guarantee*
+         * to decrement introspec_arg. However, it is unlikely to happen. Even
+         * if it happens, it is nothing but a matter of delaying a message
+         * delivery. That's because other threads will try and succeed the CAS
+         * operation eventually. So this is good enough. ;-)
          */
-        for (i = 0; cur_nr > 0 && i < LAVD_MAX_CAS_RETRY; i++) {
+        for (i = 0; cur_nr > 0 && i < LAVD_MAX_RETRY; i++) {
                 prev_nr = __sync_val_compare_and_swap(
                                 &intrspc.arg, cur_nr, cur_nr - 1);
                 /* CAS success: submit a message and done */
@@ -838,7 +712,7 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
                  * If the CPU is in an idle state (i.e., idle_start_clk is
                  * non-zero), accumulate the current idle peirod so far.
                  */
-                for (int i = 0; i < LAVD_MAX_CAS_RETRY; i++) {
+                for (int i = 0; i < LAVD_MAX_RETRY; i++) {
                         u64 old_clk = cpuc->idle_start_clk;
                         if (old_clk == 0)
                                 break;
@@ -1177,25 +1051,9 @@ static u64 calc_lat_factor(u64 lat_prio)
 static u32 calc_greedy_factor(struct task_ctx *taskc)
 {
         u32 greedy_ratio = taskc->greedy_ratio;
-        s16 lat_prio = taskc->lat_prio;
-        u32 greedy_threshold;
         u32 gr_ft;
 
-        if (lat_prio < 0)
-                lat_prio = 0;
-        else if (lat_prio >= NICE_WIDTH)
-                lat_prio = NICE_WIDTH - 1;
-
-        /*
-         * When determining how greedy a task is, we are more generous to a
-         * latency-critical task with a low lat_prio value. That is because a
-         * latency-critical task can temporarily overspend CPU time. However,
-         * the time slice and ineligible duration allocation will eventually
-         * enforce fairness.
-         */
-        greedy_threshold = lat_prio_to_greedy_thresholds[lat_prio];
-
-        gr_ft = (greedy_ratio * 1000) / greedy_threshold;
+        gr_ft = greedy_ratio;
         if (gr_ft < 1000)
                 gr_ft = 1000;
         else
@@ -1206,22 +1064,7 @@ static u32 calc_greedy_factor(struct task_ctx *taskc)
 
 static bool is_eligible(struct task_ctx *taskc)
 {
-        u64 greedy_threshold;
-        s16 lat_prio = taskc->lat_prio;
-
-        if (lat_prio < 0)
-                lat_prio = 0;
-        else if (lat_prio >= NICE_WIDTH)
-                lat_prio = NICE_WIDTH - 1;
-
-        /*
-         * Similar to the greedy factor calculation, we have a loose bound for
-         * a latency-critical task. That makes a latency-critical task less
-         * frequently ineligible for low (tail) latency.
-         */
-        greedy_threshold = lat_prio_to_greedy_thresholds[lat_prio];
-
-        return taskc->greedy_ratio <= greedy_threshold;
+        return taskc->greedy_ratio <= 1000;
 }
 
 static bool is_wakeup_wf(u64 wake_flags)
@@ -1372,7 +1215,7 @@ static int boost_lat(struct task_struct *p, struct task_ctx *taskc,
          * its property.
          */
         if (!have_scheduled(taskc)) {
-                boost = 0;
+                boost = LAVD_LAT_PRIO_NEW;
                 goto out;
         }
 
@@ -1448,13 +1291,11 @@ out:
 static u64 calc_latency_weight(struct task_struct *p, struct task_ctx *taskc,
                                struct cpu_ctx *cpuc, bool is_wakeup)
 {
-        boost_lat(p, taskc, cpuc, is_wakeup);
+        u64 w;
 
-        /*
-         * Tighten the competition window according to LAVD_LAT_WEIGHT_SHIFT.
-         */
-        return sched_prio_to_latency_weight[taskc->lat_prio] >>
-               LAVD_LAT_WEIGHT_SHIFT;
+        boost_lat(p, taskc, cpuc, is_wakeup);
+        w = LAVD_LAT_WEIGHT_FT / sched_prio_to_slice_weight[taskc->lat_prio] + 1;
+        return w;
 }
 
 static u64 calc_virtual_deadline_delta(struct task_struct *p,
@@ -1481,8 +1322,7 @@ static u64 calc_virtual_deadline_delta(struct task_struct *p,
          */
         is_wakeup = is_wakeup_ef(enq_flags);
         weight = calc_latency_weight(p, taskc, cpuc, is_wakeup);
-        vdeadline_delta_ns = (LAVD_SLICE_MAX_NS * weight) / 1000;
-
+        vdeadline_delta_ns = (((taskc->run_time_ns + 1) * weight) + 1000) / 1000;
         /*
          * When a system is overloaded (>1000), stretch time space so make time
          * tick logically slower to give room to execute the overloaded tasks.
@@ -1493,11 +1333,12 @@ static u64 calc_virtual_deadline_delta(struct task_struct *p,
          * is lower (i.e., higher value) and the load is higher.
          */
                 vdeadline_delta_ns = (vdeadline_delta_ns * load_factor *
-                                      taskc->lat_prio * taskc->lat_prio) /
+                                      (taskc->lat_prio + 1)) /
                                       (LAVD_LOAD_FACTOR_FT * 1000);
         }
 
         taskc->vdeadline_delta_ns = vdeadline_delta_ns;
+
         return vdeadline_delta_ns;
 }
 
@@ -1646,6 +1487,11 @@ static void update_stat_for_running(struct task_struct *p,
         u64 load_actual_ft, load_ideal_ft, wait_freq_ft, wake_freq_ft;
         u64 perf_cri_raw;
 
+        /*
+         * Update the current logical clock.
+         */
+        WRITE_ONCE(cur_logical_clk, taskc->vdeadline_log_clk);
+
         /*
          * Since this is the start of a new schedule for @p, we update run
          * frequency in a second using an exponential weighted moving average.
@@ -1749,6 +1595,17 @@ static void update_stat_for_quiescent(struct task_struct *p,
         cpuc->load_run_time_ns -= cap_time_slice_ns(taskc->run_time_ns);
 }
 
+static u64 calc_exclusive_run_window(void)
+{
+        u64 load_factor;
+
+        load_factor = get_sys_stat_cur()->load_factor;
+        if (load_factor >= 1000)
+                return (LAVD_SLICE_MAX_NS * load_factor) / 1000;
+
+        return 0;
+}
+
 static void calc_when_to_run(struct task_struct *p, struct task_ctx *taskc,
                              struct cpu_ctx *cpuc, u64 enq_flags)
 {
@@ -1760,6 +1617,15 @@ static void calc_when_to_run(struct task_struct *p, struct task_ctx *taskc,
          */
         calc_virtual_deadline_delta(p, taskc, cpuc, enq_flags);
         calc_eligible_delta(p, taskc);
+
+        /*
+         * Update the logical clock of the virtual deadline including
+         * ineligible duration.
+         */
+        taskc->vdeadline_log_clk = READ_ONCE(cur_logical_clk) +
+                                   calc_exclusive_run_window() +
+                                   taskc->eligible_delta_ns +
+                                   taskc->vdeadline_delta_ns;
 }
 
 static u64 get_est_stopping_time(struct task_ctx *taskc)
@@ -1940,7 +1806,8 @@ static struct cpu_ctx *find_victim_cpu(const struct cpumask *cpumask,
          */
         switch(v) {
         case 2: /* two dandidates */
-                victim_cpu = can_task1_kick_task2(&prm_cpus[0], &prm_cpus[1]) ? &prm_cpus[0] : &prm_cpus[1];
+                victim_cpu = can_task1_kick_task2(&prm_cpus[0], &prm_cpus[1]) ?
+                                &prm_cpus[0] : &prm_cpus[1];
                 goto bingo_out;
         case 1: /* one candidate */
                 victim_cpu = &prm_cpus[0];
@@ -2077,7 +1944,6 @@ static void put_global_rq(struct task_struct *p, struct task_ctx *taskc,
 {
         struct task_ctx *taskc_run;
         struct task_struct *p_run;
-        u64 vdeadline;
 
         /*
          * Calculate when a tack can be scheduled.
          *
         * Note that the task's time slice will be calculated and reassigned
          * right before running at ops.running().
          */
         calc_when_to_run(p, taskc, cpuc, enq_flags);
-        vdeadline = taskc->eligible_delta_ns + taskc->vdeadline_delta_ns +
-                    bpf_ktime_get_ns();
 
         /*
          * Try to find and kick a victim CPU, which runs a less urgent task.
@@ -2108,7 +1972,7 @@ static void put_global_rq(struct task_struct *p, struct task_ctx *taskc,
          * deadline.
          */
         scx_bpf_dispatch_vtime(p, LAVD_GLOBAL_DSQ, LAVD_SLICE_UNDECIDED,
-                               vdeadline, enq_flags);
+                               taskc->vdeadline_log_clk, enq_flags);
 }
 
 
@@ -3105,6 +2969,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(lavd_init)
         if (err)
                 return err;
 
+        /*
+         * Initialize the current logical clock.
+         */
+        WRITE_ONCE(cur_logical_clk, 0);
         return err;
 }
 
diff --git a/scheds/rust/scx_lavd/src/main.rs b/scheds/rust/scx_lavd/src/main.rs
index 7f1f933..43c1bc9 100644
--- a/scheds/rust/scx_lavd/src/main.rs
+++ b/scheds/rust/scx_lavd/src/main.rs
@@ -192,11 +192,11 @@ impl<'a> Scheduler<'a> {
                 "| {:6} | {:7} | {:17} \
                 | {:4} | {:4} | {:9} \
                 | {:6} | {:8} | {:7} \
-                | {:8} | {:7} | {:8} \
-                | {:7} | {:9} | {:9} \
-                | {:9} | {:9} | {:8} \
+                | {:8} | {:4} | {:7} \
+                | {:8} | {:7} | {:9} \
+                | {:9} | {:9} | {:9} \
                 | {:8} | {:8} | {:8} \
-                | {:6} | {:6} |",
+                | {:8} | {:6} | {:6} |",
                 "mseq",
                 "pid",
                 "comm",
@@ -207,6 +207,7 @@ impl<'a> Scheduler<'a> {
                 "slc_ns",
                 "grdy_rt",
                 "lat_prio",
+                "lc",
                 "avg_lc",
                 "st_prio",
                 "slc_bst",
@@ -231,11 +232,11 @@ impl<'a> Scheduler<'a> {
                 "| {:6} | {:7} | {:17} \
                 | {:4} | {:4} | {:9} \
                 | {:6} | {:8} | {:7} \
-                | {:8} | {:7} | {:8} \
-                | {:7} | {:9} | {:9} \
-                | {:9} | {:9} | {:8} \
+                | {:8} | {:4} | {:7} \
+                | {:8} | {:7} | {:9} \
+                | {:9} | {:9} | {:9} \
                 | {:8} | {:8} | {:8} \
-                | {:6} | {:6} |",
+                | {:8} | {:6} | {:6} |",
                 mseq,
                 tx.pid,
                 tx_comm,
@@ -246,6 +247,7 @@ impl<'a> Scheduler<'a> {
                 tc.slice_ns,
                 tc.greedy_ratio,
                 tc.lat_prio,
+                tc.lat_cri,
                 tx.avg_lat_cri,
                 tx.static_prio,
                 tc.slice_boost_prio,
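
The following is a minimal, self-contained user-space sketch (not part of the patch) of the ordering scheme the diff introduces: a task is stamped with an absolute position on a logical clock when its run time is decided, and the clock advances to that stamp only when the task actually starts running. The names cur_logical_clk, vdeadline_log_clk, eligible_delta_ns, and vdeadline_delta_ns mirror the BPF code; struct task, enqueue_task(), run_task(), and the literal delta values are invented for illustration, and the overload-dependent calc_exclusive_run_window() term is omitted.

/*
 * Illustrative sketch of the logical-clock deadline ordering; not part of
 * scx_lavd.  Field names mirror the patch, helpers and values are made up.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t cur_logical_clk;	/* advances only when a task starts running */

struct task {
	const char *name;
	uint64_t eligible_delta_ns;	/* ineligible duration (fairness) */
	uint64_t vdeadline_delta_ns;	/* latency-weighted deadline delta */
	uint64_t vdeadline_log_clk;	/* absolute stamp on the logical clock */
};

/* Mirrors calc_when_to_run(): pin the deadline to the logical clock. */
static void enqueue_task(struct task *t)
{
	t->vdeadline_log_clk = cur_logical_clk +
			       t->eligible_delta_ns + t->vdeadline_delta_ns;
}

/* Mirrors update_stat_for_running(): the clock jumps to the chosen stamp. */
static void run_task(struct task *t)
{
	cur_logical_clk = t->vdeadline_log_clk;
	printf("%-6s runs, logical clock -> %llu\n",
	       t->name, (unsigned long long)cur_logical_clk);
}

int main(void)
{
	struct task lat = { "lat", 0, 1000, 0 };	/* small deadline delta */
	struct task batch = { "batch", 500, 8000, 0 };	/* greedier, pushed out */

	enqueue_task(&lat);
	enqueue_task(&batch);
	/* lat has the smaller vdeadline_log_clk, so it is dispatched first. */
	run_task(&lat);
	run_task(&batch);
	return 0;
}

Because scx_bpf_dispatch_vtime() now orders the global DSQ by this logical stamp rather than by bpf_ktime_get_ns() plus a delta, a queued task's relative position no longer depends on the wall-clock moment at which it was enqueued.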