mirror of
https://github.com/sched-ext/scx.git
synced 2024-11-24 20:00:22 +00:00
scx_lavd: improve latency criticality to latency priority mapping
The old approach maps [0, maximum latency criticality] to [-boost range, boost range). This approach is easily skewed by a single outlier maximum value and suffers from integer truncation error. The new approach divides the range in two -- [minimum latency criticality, average latency criticality) and [average latency criticality, maximum latency criticality] -- and maps them to [boost range/2, 0) and [0, -boost range/2), respectively.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
This commit is contained in:
parent 2b5d3c1300
commit 83169481a6
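Before the diff, a minimal userspace sketch of the idea -- not the scheduler's actual code -- contrasting the old one-segment mapping with the new two-segment mapping. The boost range of 40 and the sample criticality values are made-up numbers for illustration only.

#include <stdio.h>

#define BOOST_RANGE 40  /* assumed value, for illustration only */

/* Old: map [0, max_lc] linearly; a single outlier max_lc compresses
 * everyone else's boost into a narrow band. */
static int old_boost(long lc, long max_lc)
{
        return (BOOST_RANGE - (lc * BOOST_RANGE) / max_lc) - (BOOST_RANGE >> 1);
}

/* New: map [min_lc, avg_lc) onto [BOOST_RANGE/2, 0) and [avg_lc, max_lc]
 * onto [0, -BOOST_RANGE/2), so the average task lands at boost 0
 * regardless of outliers. */
static int new_boost(long lc, long min_lc, long avg_lc, long max_lc)
{
        long inc1k, base_lc, base_prio;

        if (lc < avg_lc) {
                inc1k = ((BOOST_RANGE >> 1) * 1000) / (avg_lc - min_lc);
                base_lc = min_lc;
                base_prio = BOOST_RANGE >> 1;
        } else {
                inc1k = ((BOOST_RANGE >> 1) * 1000) / (max_lc + 1 - avg_lc);
                base_lc = avg_lc;
                base_prio = 0;
        }
        return base_prio - ((lc - base_lc) * inc1k + 500) / 1000;
}

int main(void)
{
        /* Most tasks sit around lc = 10..12; one outlier spikes max_lc to 100. */
        long lcs[] = {10, 11, 12, 100};

        for (int i = 0; i < 4; i++)
                printf("lc=%3ld old=%3d new=%3d\n", lcs[i],
                       old_boost(lcs[i], 100),
                       new_boost(lcs[i], 10, 11, 100));
        return 0;
}

Compiled as plain C, this prints old boosts of 16, 16, 16, -20 against new boosts of 20, 0, 0, -20: the outlier no longer squashes the ordinary tasks into one bucket, and the average task lands exactly at zero.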
@@ -85,6 +85,13 @@ struct sys_cpu_util {
        volatile u64    load_ideal;     /* average ideal load of runnable tasks */
        volatile u64    load_actual;    /* average actual load of runnable tasks */

        volatile u64    avg_lat_cri;    /* average latency criticality (LC) */
        volatile u64    max_lat_cri;    /* maximum latency criticality (LC) */
        volatile u64    min_lat_cri;    /* minimum latency criticality (LC) */

        volatile s64    inc1k_low;      /* increment from low LC to priority mapping */
        volatile s64    inc1k_high;     /* increment from high LC to priority mapping */
};

/*
@@ -102,6 +109,14 @@ struct cpu_ctx {
 */
        volatile u64    load_actual;    /* actual load of runnable tasks */
        volatile u64    load_ideal;     /* ideal load of runnable tasks */

        /*
         * Information used to keep track of latency criticality
         */
        volatile u64    max_lat_cri;    /* maximum latency criticality */
        volatile u64    min_lat_cri;    /* minimum latency criticality */
        volatile u64    sum_lat_cri;    /* sum of latency criticality */
        volatile u64    sched_nr;       /* number of schedules */
};

/*
@@ -135,6 +150,7 @@ struct task_ctx {
        u64     eligible_delta_ns;
        u64     slice_ns;
        u64     greedy_ratio;
        u64     lat_cri;
        u16     stat;              /* NIL -> ENQ -> RUN -> STOP -> NIL ... */
        u16     slice_boost_prio;  /* how many times a task fully consumed the slice */
        u16     lat_prio;          /* latency priority */
@@ -147,6 +163,9 @@ struct task_ctx_x {
        u16     static_prio;    /* nice priority */
        u16     cpu_id;         /* where a task ran */
        u64     cpu_util;       /* cpu utilization in [0..100] */
        u64     max_lat_cri;    /* maximum latency criticality */
        u64     min_lat_cri;    /* minimum latency criticality */
        u64     avg_lat_cri;    /* average latency criticality */
};

@@ -123,7 +123,6 @@ volatile u64 nr_cpus_onln;
static struct sys_cpu_util      __sys_cpu_util[2];
static volatile int             __sys_cpu_util_idx;

volatile u64                    max_lat_cri;
struct user_exit_info           uei;

const volatile u8               verbose;
@@ -520,6 +519,10 @@ static int submit_task_ctx(struct task_struct *p, struct task_ctx *taskc,
        m->taskc_x.static_prio = get_nice_prio(p);
        m->taskc_x.cpu_util = cutil_cur->util / 10;
        m->taskc_x.cpu_id = cpu_id;
        m->taskc_x.max_lat_cri = cutil_cur->max_lat_cri;
        m->taskc_x.min_lat_cri = cutil_cur->min_lat_cri;
        m->taskc_x.avg_lat_cri = cutil_cur->avg_lat_cri;

        memcpy(&m->taskc, taskc, sizeof(m->taskc));

        bpf_ringbuf_submit(m, 0);
@@ -654,6 +657,8 @@ static void update_sys_cpu_load(void)
        u64 now, duration, duration_total;
        u64 idle_total = 0, compute_total = 0;
        u64 load_actual = 0, load_ideal = 0;
        s64 max_lat_cri = 0, min_lat_cri = UINT_MAX, avg_lat_cri = 0;
        u64 sum_lat_cri = 0, sched_nr = 0;
        u64 new_util;
        int cpu;

@@ -673,6 +678,27 @@ static void update_sys_cpu_load(void)
                load_actual += cpuc->load_actual;
                load_ideal += cpuc->load_ideal;

                /*
                 * Accumulate the task's latency criticality information.
                 *
                 * Updating cpuc->* is racy. However, it could degrade the
                 * accuracy a little bit only in very rare cases, so we
                 * embrace it for performance on purpose.
                 */
                sum_lat_cri += cpuc->sum_lat_cri;
                cpuc->sum_lat_cri = 0;

                sched_nr += cpuc->sched_nr;
                cpuc->sched_nr = 0;

                if (cpuc->max_lat_cri > max_lat_cri)
                        max_lat_cri = cpuc->max_lat_cri;
                cpuc->max_lat_cri = 0;

                if (cpuc->min_lat_cri < min_lat_cri)
                        min_lat_cri = cpuc->min_lat_cri;
                cpuc->min_lat_cri = UINT_MAX;

                /*
                 * If the CPU is in an idle state (i.e., idle_start_clk is
                 * non-zero), accumulate the current idle period so far.
@@ -698,44 +724,53 @@ static void update_sys_cpu_load(void)
        if (duration_total > idle_total)
                compute_total = duration_total - idle_total;
        new_util = (compute_total * LAVD_CPU_UTIL_MAX) / duration_total;
        if (sched_nr > 0)
                avg_lat_cri = sum_lat_cri / sched_nr;

        /*
         * Update the CPU utilization to the next version, and then make the
         * next version atomically visible.
         * Update the CPU utilization to the next version.
         */
        cutil_next->load_actual = calc_avg(cutil_cur->load_actual, load_actual);
        cutil_next->load_ideal = calc_avg(cutil_cur->load_ideal, load_ideal);
        cutil_next->util = calc_avg(cutil_cur->util, new_util);

        /*
         * Calculate the increment for latency criticality to priority mapping
         *  - Case 1. inc1k_low:  [min_lc, avg_lc) -> [half_range, 0)
         *  - Case 2. inc1k_high: [avg_lc, max_lc] -> [0, -half_range)
         */
        cutil_next->min_lat_cri = calc_avg(cutil_cur->min_lat_cri, min_lat_cri);
        cutil_next->max_lat_cri = calc_avg(cutil_cur->max_lat_cri, max_lat_cri);
        cutil_next->avg_lat_cri = calc_avg(cutil_cur->avg_lat_cri, avg_lat_cri);

        if (cutil_next->avg_lat_cri == cutil_next->min_lat_cri)
                cutil_next->inc1k_low = 0;
        else {
                cutil_next->inc1k_low = ((LAVD_BOOST_RANGE >> 1) * 1000) /
                                        (cutil_next->avg_lat_cri -
                                         cutil_next->min_lat_cri);
        }

        if ((cutil_next->max_lat_cri + 1) == cutil_next->avg_lat_cri)
                cutil_next->inc1k_high = 0;
        else {
                cutil_next->inc1k_high = ((LAVD_BOOST_RANGE >> 1) * 1000) /
                                         (cutil_next->max_lat_cri + 1 -
                                          cutil_next->avg_lat_cri);
        }

        /*
         * Make the next version atomically visible.
         */
        cutil_next->last_update_clk = now;
        flip_sys_cpu_util();
}
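calc_avg() itself is outside this hunk; a plausible standalone sketch, assuming it is a simple exponentially weighted moving average that keeps 3/4 of the old value and blends in 1/4 of the new sample:

#include <stdio.h>

typedef unsigned long long u64;

/* Assumed EWMA shape: next = 0.75 * old + 0.25 * new, in integer
 * arithmetic. The real helper is defined elsewhere in the scheduler. */
static u64 calc_avg(u64 old_val, u64 new_val)
{
        return (old_val - (old_val >> 2)) + (new_val >> 2);
}

int main(void)
{
        /* A utilization series smooths toward 800 instead of jumping. */
        u64 util = 0;
        u64 samples[] = {800, 800, 800, 800};

        for (int i = 0; i < 4; i++) {
                util = calc_avg(util, samples[i]);
                printf("interval %d: util=%llu\n", i, util);
        }
        return 0;
}

Under that assumption, each interval nudges the published value a quarter of the way toward the new reading (0, 200, 350, 462, 546, ...), which smooths transient spikes in the load, utilization, and min/avg/max criticality statistics.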

static void decay_max_lat_cri(void)
{
        /*
         * Decrease max_lat_cri by 3.13% (>> 5) every update interval. The
         * maximum can be bumped up for a moment, after which the actual
         * lat_cri values may never reach that maximum again. To address
         * this, we decrease the maximum value here so that it can be bumped
         * up again to the actual maximum. This keeps max_lat_cri tracking
         * the actual maximum value tightly. Note that we don't need to
         * handle the CAS failure, because a CAS failure means another task
         * bumped the old maximum to the higher, actual maximum.
         */
        u64 cur_max = max_lat_cri;
        u64 delta = (cur_max >> 5) ? : 1;
        s64 new_max = cur_max - delta;

        if (new_max > 0)
                __sync_val_compare_and_swap(&max_lat_cri, cur_max, new_max);
}
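To see the decay rate concretely: a right shift by 5 subtracts 1/32 of the current value, i.e. about 3.13% per interval. A small standalone sketch of just the arithmetic (the atomics are elided and the starting value is illustrative):

#include <stdio.h>

typedef unsigned long long u64;

int main(void)
{
        u64 max = 1000;  /* a stale maximum left over from a past spike */

        /* After ~22 intervals the stale maximum roughly halves, so the
         * real (lower) maximum can re-assert itself via cmpxchg. */
        for (int i = 1; i <= 22; i++) {
                u64 delta = (max >> 5) ? (max >> 5) : 1;
                max -= delta;
        }
        printf("decayed max after 22 intervals: %llu\n", max);
        return 0;
}

Starting from 1000, the integer sequence lands at 505 after 22 intervals, close to the (31/32)^22 ~= 0.5 a continuous decay would predict.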

static int update_timer_fn(void *map, int *key, struct bpf_timer *timer)
{
        int err;

        update_sys_cpu_load();
        decay_max_lat_cri();

        err = bpf_timer_start(timer, LAVD_CPU_UTIL_INTERVAL_NS, 0);
        if (err)
@@ -923,11 +958,60 @@ static int sum_prios_for_lat(struct task_struct *p, int nice_prio,
        return prio;
}

static int map_lat_cri_to_lat_prio(u64 lat_cri)
{
        /*
         * Latency criticality is an absolute metric representing how
         * latency-critical a task is. However, latency priority is a
         * relative metric compared to the other co-running tasks.
         * Especially when the tasks' latency criticalities are in a small
         * range, the relative metric is advantageous in mitigating integer
         * truncation errors. In the relative metric, we map
         *
         *  - Case 1. inc1k_low:  [min_lc, avg_lc) -> [boost_range/2, 0)
         *  - Case 2. inc1k_high: [avg_lc, max_lc] -> [0, -boost_range/2)
         *
         * Hence, latency priority 20 now means that a task has an average
         * latency criticality among the co-running tasks.
         */

        struct sys_cpu_util *cutil_cur = get_sys_cpu_util_cur();
        s64 base_lat_cri, inc1k;
        int base_prio, lat_prio;

        /*
         * Set up the params for Cases 1 and 2.
         */
        if (lat_cri < cutil_cur->avg_lat_cri) {
                inc1k = cutil_cur->inc1k_low;
                base_lat_cri = cutil_cur->min_lat_cri;
                base_prio = LAVD_BOOST_RANGE >> 1;
        }
        else {
                inc1k = cutil_cur->inc1k_high;
                base_lat_cri = cutil_cur->avg_lat_cri;
                base_prio = 0;
        }

        /*
         * The task's lat_cri could be more up-to-date than cutil_cur's.
         * In that case, just take cutil_cur's value.
         */
        if (lat_cri >= base_lat_cri) {
                lat_prio = base_prio -
                           (((lat_cri - base_lat_cri) * inc1k + 500) / 1000);
        }
        else
                lat_prio = base_prio;

        return lat_prio;
}
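To make the fixed-point arithmetic concrete, here is a small self-contained sketch with made-up statistics (min_lc = 100, avg_lc = 400, max_lc = 1000) and an assumed LAVD_BOOST_RANGE of 40. The 1000x scaling keeps sub-integer slope precision, and the +500 rounds to nearest:

#include <stdio.h>

#define BOOST_RANGE 40  /* assumed value, for illustration only */

int main(void)
{
        long min_lc = 100, avg_lc = 400, max_lc = 1000;

        /* Slopes scaled by 1000, mirroring update_sys_cpu_load(). */
        long inc1k_low = ((BOOST_RANGE >> 1) * 1000) / (avg_lc - min_lc);       /* 66 */
        long inc1k_high = ((BOOST_RANGE >> 1) * 1000) / (max_lc + 1 - avg_lc);  /* 33 */

        /* Case 1: lat_cri = 250 in [min_lc, avg_lc) -> positive priority. */
        long p1 = (BOOST_RANGE >> 1) - ((250 - min_lc) * inc1k_low + 500) / 1000;  /* 10 */

        /* Case 2: lat_cri = 1000 in [avg_lc, max_lc] -> negative priority. */
        long p2 = 0 - ((1000 - avg_lc) * inc1k_high + 500) / 1000;  /* -20 */

        printf("inc1k_low=%ld inc1k_high=%ld p1=%ld p2=%ld\n",
               inc1k_low, inc1k_high, p1, p2);
        return 0;
}

The average task maps to 0, the minimum to +BOOST_RANGE/2, and the maximum to roughly -BOOST_RANGE/2, matching the two cases in the comment above.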

static int boost_lat(struct task_struct *p, struct task_ctx *taskc,
                     bool is_wakeup)
                     struct cpu_ctx *cpuc, bool is_wakeup)
{
        u64 run_time_ft = 0, wait_freq_ft = 0, wake_freq_ft = 0;
        u64 lat_cri_raw = 0, cur_max = 0, lat_cri = 0;
        u64 lat_cri_raw = 0;
        u16 static_prio;
        int boost;

@@ -993,36 +1077,13 @@ static int boost_lat(struct task_struct *p, struct task_ctx *taskc,
         * conversion, we mitigate the exponentially skewed distribution to a
         * non-linear distribution.
         */
        lat_cri = bpf_log2l(lat_cri_raw + 1);

        /*
         * Update the global @max_lat_cri if necessary. Updating @max_lat_cri
         * is racy because it can be tested and updated concurrently from
         * multiple CPUs. So we use an atomic cmpxchg.
         *
         * Note that the bounded retry (@LAVD_MAX_CAS_RETRY) does *not*
         * guarantee the update of @max_lat_cri. However, that is unlikely to
         * happen, and even if it does, the incorrect value will be corrected
         * next time anyway. So let's just live with it.
         *
         * We decay @max_lat_cri periodically (at decay_max_lat_cri) to keep
         * @max_lat_cri up-to-date.
         */
        cur_max = max_lat_cri;
        for (int i = 0; lat_cri > cur_max && i < LAVD_MAX_CAS_RETRY; i++) {
                cur_max = __sync_val_compare_and_swap(&max_lat_cri, cur_max,
                                                      lat_cri);
        }
        taskc->lat_cri = bpf_log2l(lat_cri_raw + 1);

        /*
         * Convert @p's latency criticality to its boost priority linearly.
         */
        boost = (LAVD_BOOST_RANGE - ((lat_cri * LAVD_BOOST_RANGE) / cur_max)) -
                (LAVD_BOOST_RANGE >> 1);

        /*
         * When a task is waking up, boost its latency boost priority by 1.
         */
        boost = map_lat_cri_to_lat_prio(taskc->lat_cri);
        if (is_wakeup)
                boost -= LAVD_BOOST_WAKEUP_LAT;

@@ -1035,14 +1096,15 @@ out:
}
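For intuition on the bpf_log2l() step that this change keeps: the raw criticality is a product of run-time and wait/wake-frequency factors and can span many orders of magnitude. Taking the integer base-2 logarithm (assuming floor semantics) compresses it: a raw value around 10^3 yields lat_cri of about 9, 10^6 about 19, and 10^9 about 29. A million-fold spread in raw criticality thus becomes a difference of roughly 20 in lat_cri, which map_lat_cri_to_lat_prio() can then spread over the boost range without one huge task dwarfing everyone else.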

static u64 calc_latency_weight(struct task_struct *p, struct task_ctx *taskc,
                               bool is_wakeup)
                               struct cpu_ctx *cpuc, bool is_wakeup)
{
        boost_lat(p, taskc, is_wakeup);
        boost_lat(p, taskc, cpuc, is_wakeup);
        return sched_prio_to_latency_weight[taskc->lat_prio];
}

static u64 calc_virtual_dealine_delta(struct task_struct *p,
                                      struct task_ctx *taskc,
                                      struct cpu_ctx *cpuc,
                                      u64 enq_flags)
{
        u64 vdeadline_delta_ns, weight;
@@ -1062,7 +1124,7 @@ static u64 calc_virtual_dealine_delta(struct task_struct *p,
         * boost priority (and weight).
         */
        is_wakeup = is_wakeup_ef(enq_flags);
        weight = calc_latency_weight(p, taskc, is_wakeup);
        weight = calc_latency_weight(p, taskc, cpuc, is_wakeup);
        vdeadline_delta_ns = (LAVD_SLICE_MAX_NS * weight) / 1000;
        taskc->vdeadline_delta_ns = vdeadline_delta_ns;
        return vdeadline_delta_ns;
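Downstream, the latency weight selected by taskc->lat_prio turns into a virtual-deadline offset through another /1000 fixed-point step. As an illustration with assumed numbers only (the real LAVD_SLICE_MAX_NS and weight-table entries are defined elsewhere): if LAVD_SLICE_MAX_NS were 3,000,000 ns, a weight of 1000 would give vdeadline_delta_ns = 3,000,000 * 1000 / 1000 = 3,000,000 ns, while a task whose latency priority yielded a weight of 500 would get a 1,500,000 ns offset and therefore an earlier virtual deadline.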
@@ -1235,6 +1297,19 @@ static void update_stat_for_run(struct task_struct *p, struct task_ctx *taskc,
        interval = taskc->run_time_ns + wait_period;
        taskc->run_freq = calc_avg_freq(taskc->run_freq, interval);

        /*
         * Update the per-CPU latency criticality information for tasks
         * that have ever been scheduled.
         */
        if (have_scheduled(taskc)) {
                if (cpuc->max_lat_cri < taskc->lat_cri)
                        cpuc->max_lat_cri = taskc->lat_cri;
                if (cpuc->min_lat_cri > taskc->lat_cri)
                        cpuc->min_lat_cri = taskc->lat_cri;
                cpuc->sum_lat_cri += taskc->lat_cri;
                cpuc->sched_nr++;
        }

        /*
         * Update the task state when it starts running.
         */
@@ -1270,7 +1345,7 @@ static void update_stat_for_stop(struct task_struct *p, struct task_ctx *taskc,
}

static void calc_when_to_run(struct task_struct *p, struct task_ctx *taskc,
                             u64 enq_flags)
                             struct cpu_ctx *cpuc, u64 enq_flags)
{
        /*
         * Before enqueueing a task to a run queue, we should decide when a
@@ -1278,7 +1353,7 @@ static void calc_when_to_run(struct task_struct *p, struct task_ctx *taskc,
         * urgent it is - vdeadline_delta_ns - and when it becomes eligible if
         * overscheduled - eligible_time_ns.
         */
        calc_virtual_dealine_delta(p, taskc, enq_flags);
        calc_virtual_dealine_delta(p, taskc, cpuc, enq_flags);
        calc_eligible_delta(p, taskc);
}

@@ -1287,6 +1362,10 @@ static bool put_local_rq(struct task_struct *p, struct task_ctx *taskc,
{
        struct cpu_ctx *cpuc;

        cpuc = get_cpu_ctx();
        if (!cpuc)
                return false;

        /*
         * Calculate when a task can be scheduled. If a task cannot be
         * scheduled soonish (i.e., the task is ineligible since
@@ -1296,7 +1375,7 @@ static bool put_local_rq(struct task_struct *p, struct task_ctx *taskc,
         * Note that the task's time slice will be calculated and reassigned
         * right before running at ops.running().
         */
        calc_when_to_run(p, taskc, enq_flags);
        calc_when_to_run(p, taskc, cpuc, enq_flags);
        if (!is_eligible(taskc))
                return false;

@@ -1305,10 +1384,6 @@ static bool put_local_rq(struct task_struct *p, struct task_ctx *taskc,
         * rq. Statistics will be adjusted when more accurate statistics
         * become available (ops.running).
         */
        cpuc = get_cpu_ctx();
        if (!cpuc)
                return false;

        if (transit_task_stat(taskc, LAVD_TASK_STAT_ENQ))
                update_stat_for_enq(p, taskc, cpuc);

@@ -1329,20 +1404,21 @@ static bool put_global_rq(struct task_struct *p, struct task_ctx *taskc,
        struct cpu_ctx *cpuc;
        u64 vdeadline;

        cpuc = get_cpu_ctx();
        if (!cpuc)
                return false;

        /*
         * Calculate when a task can be scheduled.
         *
         * Note that the task's time slice will be calculated and reassigned
         * right before running at ops.running().
         */
        calc_when_to_run(p, taskc, enq_flags);
        calc_when_to_run(p, taskc, cpuc, enq_flags);

        /*
         * Reflect the task's load immediately.
         */
        cpuc = get_cpu_ctx();
        if (!cpuc)
                return false;
        if (transit_task_stat(taskc, LAVD_TASK_STAT_ENQ))
                update_stat_for_enq(p, taskc, cpuc);

@@ -170,9 +170,10 @@ impl<'a> Scheduler<'a> {
            "| {:9} | {:8} | {:17} \
             | {:4} | {:9} | {:9} \
             | {:10} | {:9} | {:8} \
             | {:12} | {:7} | {:9} \
             | {:7} | {:7} | {:7} \
             | {:7} | {:12} | {:7} \
             | {:9} | {:9} | {:9} \
             | {:9} | {:8} |",
             | {:9} | {:9} | {:8} |",
            "mseq",
            "pid",
            "comm",
@@ -182,6 +183,10 @@ impl<'a> Scheduler<'a> {
            "slice_ns",
            "grdy_rt",
            "lat_prio",
            "lat_cri",
            "min_lc",
            "avg_lc",
            "max_lc",
            "static_prio",
            "lat_bst",
            "slice_bst",
@@ -201,9 +206,10 @@ impl<'a> Scheduler<'a> {
            "| {:9} | {:8} | {:17} \
             | {:4} | {:9} | {:9} \
             | {:10} | {:9} | {:8} \
             | {:12} | {:7} | {:9} \
             | {:7} | {:7} | {:7} \
             | {:7} | {:12} | {:7} \
             | {:9} | {:9} | {:9} \
             | {:9} | {:8} | ",
             | {:9} | {:9} | {:8} | ",
            mseq,
            tx.pid,
            tx_comm,
@@ -213,6 +219,10 @@ impl<'a> Scheduler<'a> {
            tc.slice_ns,
            tc.greedy_ratio,
            tc.lat_prio,
            tc.lat_cri,
            tx.min_lat_cri,
            tx.avg_lat_cri,
            tx.max_lat_cri,
            tx.static_prio,
            tc.lat_boost_prio,
            tc.slice_boost_prio,