scx_lavd: Move load subtraction to quiescent state transition

scx_lavd tracks task state transitions and updates statistics on each valid
transition. However, there is an asymmetry between the runnable/running and
stopping/quiescent transitions. In the former, the runnable and running
transitions are accounted separately, in update_stat_for_enq() and
update_stat_for_run() respectively. In the latter, the two transitions are
combined in update_stat_for_stop().

This asymmetry leads to incorrect accounting. For example, a task's load
should be added to the cpu's load sum when the task becomes runnable
(enqueued) and subtracted when the task is no longer runnable (quiescent).
The former is accounted correctly in update_stat_for_enq(), but the latter
is currently done whenever the task stops. A task can transition between
running and stopping multiple times before becoming quiescent, so the
asymmetry can end up subtracting the load of a task that is still running
from the cpu's load sum.
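
To make the failure mode concrete, the user-space harness below replays that
sequence, a task that stops and runs once more before finally sleeping,
through both accounting schemes. It is purely illustrative: the struct and
event names mirror scx_lavd, but none of it is the scheduler's code.

    #include <stdio.h>
    #include <string.h>

    /* Hypothetical stand-ins for scx_lavd's contexts; only the fields
     * needed for the illustration. */
    struct cpu_ctx  { long load_sum; };
    struct task_ctx { long load; int stat; };

    enum { QUIESCENT, ENQ, RUNNING, STOPPING };

    /* Old scheme: add on enqueue, subtract on the RUNNING -> STOPPING
     * transition; all other transitions were ignored. */
    static void old_account(struct cpu_ctx *c, struct task_ctx *t, const char *ev)
    {
        if (!strcmp(ev, "enqueue")) {
            c->load_sum += t->load;
            t->stat = ENQ;
        } else if (!strcmp(ev, "running") && t->stat == ENQ) {
            t->stat = RUNNING;
        } else if (!strcmp(ev, "stopping") && t->stat == RUNNING) {
            c->load_sum -= t->load;  /* too early if the task is still runnable */
            t->stat = STOPPING;
        }
    }

    /* New scheme: the subtraction moves to the quiescent transition. */
    static void new_account(struct cpu_ctx *c, struct task_ctx *t, const char *ev)
    {
        if (!strcmp(ev, "enqueue"))
            c->load_sum += t->load;
        else if (!strcmp(ev, "quiescent"))
            c->load_sum -= t->load;
    }

    int main(void)
    {
        /* A task that stops and runs once more before finally sleeping. */
        const char *events[] = { "enqueue", "running", "stopping",
                                 "running", "stopping", "quiescent" };
        struct cpu_ctx old_cpu = {0}, new_cpu = {0};
        struct task_ctx old_task = { .load = 100, .stat = QUIESCENT };
        struct task_ctx new_task = { .load = 100 };

        for (int i = 0; i < 6; i++) {
            old_account(&old_cpu, &old_task, events[i]);
            new_account(&new_cpu, &new_task, events[i]);
            printf("%-9s  old_sum=%4ld  new_sum=%4ld\n",
                   events[i], old_cpu.load_sum, new_cpu.load_sum);
        }
        return 0;
    }

At the fourth event the task is running again, yet the old scheme's sum no
longer includes it; the new scheme holds the load in the sum until the task
actually becomes quiescent.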

This patch:

- introduces LAVD_TASK_STAT_QUIESCENT and updates transit_task_stat() so
  that it can handle all valid state transitions, including the repeated
  back-and-forth transitions within two pairs: QUIESCENT <-> ENQ and
  RUNNING <-> STOPPING (restated as a lookup table below).

- restores the symmetry by moving the load adjustment from
  update_stat_for_stop() to the new update_stat_for_quiescent().

This removes a good chunk of ignored transitions. The next patch will take
care of the rest.
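
For reference, the transition set accepted by the reworked
transit_task_stat() can be restated as a two-dimensional lookup table. This
is an illustrative sketch of the state machine only (NR_TASK_STATS is a
made-up sentinel); the patch itself encodes it as a switch statement, as the
diff below shows:

    #include <stdbool.h>

    enum task_stat {
        LAVD_TASK_STAT_QUIESCENT,  /* not runnable (sleeping) */
        LAVD_TASK_STAT_ENQ,        /* runnable, waiting on a queue */
        LAVD_TASK_STAT_RUNNING,    /* executing on a CPU */
        LAVD_TASK_STAT_STOPPING,   /* leaving a CPU */
        NR_TASK_STATS,
    };

    /* valid[src][tgt] is true iff transit_task_stat() accepts src -> tgt */
    static const bool valid[NR_TASK_STATS][NR_TASK_STATS] = {
        [LAVD_TASK_STAT_QUIESCENT][LAVD_TASK_STAT_ENQ]       = true,
        [LAVD_TASK_STAT_ENQ]      [LAVD_TASK_STAT_QUIESCENT] = true,
        [LAVD_TASK_STAT_ENQ]      [LAVD_TASK_STAT_RUNNING]   = true,
        [LAVD_TASK_STAT_RUNNING]  [LAVD_TASK_STAT_STOPPING]  = true,
        [LAVD_TASK_STAT_STOPPING] [LAVD_TASK_STAT_QUIESCENT] = true,
        [LAVD_TASK_STAT_STOPPING] [LAVD_TASK_STAT_RUNNING]   = true,
    };

Any pair not listed (ENQ -> ENQ or STOPPING -> STOPPING, for example) is
still rejected, and the caller skips the corresponding statistics update.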
Tejun Heo committed 2024-03-26 12:23:19 -10:00
parent dd40377f03
commit 625bb84bc4
2 changed files with 48 additions and 26 deletions


@@ -125,11 +125,12 @@ struct cpu_ctx {
 enum task_stat {
 	_LAVD_TASK_STAT_MIN = 0,
 
-	LAVD_TASK_STAT_STOPPING = _LAVD_TASK_STAT_MIN,
+	LAVD_TASK_STAT_QUIESCENT = _LAVD_TASK_STAT_MIN,
 	LAVD_TASK_STAT_ENQ,
 	LAVD_TASK_STAT_RUNNING,
+	LAVD_TASK_STAT_STOPPING,
 
-	_LAVD_TASK_STAT_MAX = LAVD_TASK_STAT_RUNNING,
+	_LAVD_TASK_STAT_MAX = LAVD_TASK_STAT_STOPPING,
 };
 
 struct task_ctx {


@@ -1247,29 +1247,41 @@ static bool transit_task_stat(struct task_ctx *taskc, int tgt_stat)
 	 *  -------------
 	 * |
 	 * \/
-	 * [STOPPING] --> [ENQ] --> [RUNNING]
-	 *  /\                        |
-	 *  |                         |
-	 *  +-------------------------+
+	 * [STOPPING] --> [QUIESCENT] <--> [ENQ] --> [RUNNING]
+	 *  /\                                        /\
+	 *  |                                          |
+	 *  +------------------------------------------+
 	 */
-	const static int valid_tgt_stat[] = {
-		[LAVD_TASK_STAT_STOPPING]	= LAVD_TASK_STAT_ENQ,
-		[LAVD_TASK_STAT_ENQ]		= LAVD_TASK_STAT_RUNNING,
-		[LAVD_TASK_STAT_RUNNING]	= LAVD_TASK_STAT_STOPPING,
-	};
 	int src_stat = taskc->stat;
+	bool valid;
 
 	if (src_stat < _LAVD_TASK_STAT_MIN || src_stat > _LAVD_TASK_STAT_MAX) {
 		scx_bpf_error("Invalid task state: %d", src_stat);
 		return false;
 	}
 
-	if (valid_tgt_stat[src_stat] == tgt_stat) {
-		taskc->stat = tgt_stat;
-		return true;
+	switch (src_stat) {
+	case LAVD_TASK_STAT_STOPPING:
+		valid = tgt_stat == LAVD_TASK_STAT_QUIESCENT ||
+			tgt_stat == LAVD_TASK_STAT_RUNNING;
+		break;
+	case LAVD_TASK_STAT_QUIESCENT:
+		valid = tgt_stat == LAVD_TASK_STAT_ENQ;
+		break;
+	case LAVD_TASK_STAT_ENQ:
+		valid = tgt_stat == LAVD_TASK_STAT_QUIESCENT ||
+			tgt_stat == LAVD_TASK_STAT_RUNNING;
+		break;
+	case LAVD_TASK_STAT_RUNNING:
+		valid = tgt_stat == LAVD_TASK_STAT_STOPPING;
+		break;
 	}
 
-	return false;
+	if (!valid)
+		return false;
+
+	taskc->stat = tgt_stat;
+	return true;
 }
static void update_stat_for_enq(struct task_struct *p, struct task_ctx *taskc, static void update_stat_for_enq(struct task_struct *p, struct task_ctx *taskc,
@@ -1323,13 +1335,6 @@ static void update_stat_for_stop(struct task_struct *p, struct task_ctx *taskc,
 	now = bpf_ktime_get_ns();
 
-	/*
-	 * When stopped, reduce the per-CPU task load. Per-CPU task load will
-	 * be aggregated periodically at update_sys_cpu_load().
-	 */
-	cpuc->load_actual -= taskc->load_actual;
-	cpuc->load_ideal -= get_task_load_ideal(p);
-
 	/*
 	 * Update task's run_time. If a task got slice-boosted -- in other
 	 * words, its time slices have been fully consumed multiple times,
@@ -1344,6 +1349,17 @@ static void update_stat_for_stop(struct task_struct *p, struct task_ctx *taskc,
 	taskc->last_stop_clk = now;
 }
 
+static void update_stat_for_quiescent(struct task_struct *p, struct task_ctx *taskc,
+				      struct cpu_ctx *cpuc)
+{
+	/*
+	 * When quiescent, reduce the per-CPU task load. Per-CPU task load will
+	 * be aggregated periodically at update_sys_cpu_load().
+	 */
+	cpuc->load_actual -= taskc->load_actual;
+	cpuc->load_ideal -= get_task_load_ideal(p);
+}
+
 static void calc_when_to_run(struct task_struct *p, struct task_ctx *taskc,
 			     struct cpu_ctx *cpuc, u64 enq_flags)
 {
@@ -1630,9 +1646,18 @@ void BPF_STRUCT_OPS(lavd_stopping, struct task_struct *p, bool runnable)
 void BPF_STRUCT_OPS(lavd_quiescent, struct task_struct *p, u64 deq_flags)
 {
+	struct cpu_ctx *cpuc;
 	struct task_ctx *taskc;
 	u64 now, interval;
 
+	cpuc = get_cpu_ctx();
+	taskc = get_task_ctx(p);
+	if (!cpuc || !taskc)
+		return;
+
+	if (transit_task_stat(taskc, LAVD_TASK_STAT_QUIESCENT))
+		update_stat_for_quiescent(p, taskc, cpuc);
+
 	/*
 	 * If a task @p is dequeued from a run queue for some other reason
 	 * other than going to sleep, it is an implementation-level side
@@ -1644,10 +1669,6 @@ void BPF_STRUCT_OPS(lavd_quiescent, struct task_struct *p, u64 deq_flags)
 	/*
 	 * When a task @p goes to sleep, its associated wait_freq is updated.
 	 */
-	taskc = get_task_ctx(p);
-	if (!taskc)
-		return;
-
 	now = bpf_ktime_get_ns();
 	interval = now - taskc->last_wait_clk;
 	taskc->wait_freq = calc_avg_freq(taskc->wait_freq, interval);