From dd40377f037bc18b805b2a28aa68c24830320d55 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 26 Mar 2024 10:04:35 -1000
Subject: [PATCH 1/4] scx_lavd: Drop unnecessary `extern crate`s

Since https://doc.rust-lang.org/edition-guide/rust-2018/path-changes.html,
extern crate declarations aren't necessary. Let's drop them.
---
 scheds/rust/scx_lavd/src/main.rs | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/scheds/rust/scx_lavd/src/main.rs b/scheds/rust/scx_lavd/src/main.rs
index 7f1d87e..e750e3d 100644
--- a/scheds/rust/scx_lavd/src/main.rs
+++ b/scheds/rust/scx_lavd/src/main.rs
@@ -10,10 +10,6 @@ pub use bpf_skel::*;
 pub mod bpf_intf;
 pub use bpf_intf::*;
 
-extern crate libc;
-extern crate plain;
-extern crate static_assertions;
-
 use std::mem;
 use std::sync::atomic::AtomicBool;
 use std::sync::atomic::Ordering;

From 625bb84bc42e11ae686ebf7cce56ae8fe6e81752 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 26 Mar 2024 12:23:19 -1000
Subject: [PATCH 2/4] scx_lavd: Move load subtraction to quiescent state transition

scx_lavd tracks task state transitions and updates statistics on each valid
transition. However, there's an asymmetry between the runnable/running and
stopping/quiescent transitions. In the former, the runnable and running
transitions are accounted separately in update_stat_for_enq() and
update_stat_for_run(), respectively. In the latter, however, the two
transitions are combined in update_stat_for_stop().

This asymmetry leads to incorrect accounting. For example, a task's load
should be added to the cpu's load sum when the task gets enqueued and
subtracted when the task is no longer runnable (quiescent). The former is
accounted correctly from update_stat_for_enq() but the latter is done
whenever the task stops. A task can transit between running and stopping
multiple times before becoming quiescent, so the asymmetry can end up
subtracting the load of a task which is still running from the cpu's load
sum.

This patch:

- introduces LAVD_TASK_STAT_QUIESCENT and updates transit_task_stat() so
  that it can handle all valid state transitions, including the multiple
  back-and-forth transitions between the two pairs - QUIESCENT <-> ENQ and
  RUNNING <-> STOPPING.

- restores the symmetry by moving the load adjustment part from
  update_stat_for_stop() to the new update_stat_for_quiescent().

This removes a good chunk of ignored transitions. The next patch will take
care of the rest.
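
To make the intended invariant concrete, here is a minimal userspace sketch
(not part of the patch; all names are local to the sketch, with cpu_load
standing in for cpuc->load_actual): the load is charged once when the task is
enqueued (later in this series the charge moves to ops.runnable()) and
uncharged once when it becomes quiescent, no matter how many running/stopping
bounces happen in between.

    #include <assert.h>

    static long cpu_load;                   /* stands in for cpuc->load_actual  */
    static const long task_load = 100;      /* stands in for taskc->load_actual */

    static void task_enqueued(void)  { cpu_load += task_load; }  /* charge once   */
    static void task_running(void)   { /* no load change */ }
    static void task_stopping(void)  { /* no load change */ }
    static void task_quiescent(void) { cpu_load -= task_load; }  /* uncharge once */

    int main(void)
    {
            task_enqueued();
            for (int i = 0; i < 3; i++) {   /* preempted and re-dispatched a few times */
                    task_running();
                    task_stopping();
            }
            task_quiescent();

            assert(cpu_load == 0);          /* balanced over the whole runnable period */
            return 0;
    }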
---
 scheds/rust/scx_lavd/src/bpf/intf.h     |  5 +-
 scheds/rust/scx_lavd/src/bpf/main.bpf.c | 69 ++++++++++++++++---------
 2 files changed, 48 insertions(+), 26 deletions(-)

diff --git a/scheds/rust/scx_lavd/src/bpf/intf.h b/scheds/rust/scx_lavd/src/bpf/intf.h
index c256086..30e1c46 100644
--- a/scheds/rust/scx_lavd/src/bpf/intf.h
+++ b/scheds/rust/scx_lavd/src/bpf/intf.h
@@ -125,11 +125,12 @@ struct cpu_ctx {
 enum task_stat {
         _LAVD_TASK_STAT_MIN = 0,
 
-        LAVD_TASK_STAT_STOPPING = _LAVD_TASK_STAT_MIN,
+        LAVD_TASK_STAT_QUIESCENT = _LAVD_TASK_STAT_MIN,
         LAVD_TASK_STAT_ENQ,
         LAVD_TASK_STAT_RUNNING,
+        LAVD_TASK_STAT_STOPPING,
 
-        _LAVD_TASK_STAT_MAX = LAVD_TASK_STAT_RUNNING,
+        _LAVD_TASK_STAT_MAX = LAVD_TASK_STAT_STOPPING,
 };
 
 struct task_ctx {
diff --git a/scheds/rust/scx_lavd/src/bpf/main.bpf.c b/scheds/rust/scx_lavd/src/bpf/main.bpf.c
index 0209ec0..58d8de3 100644
--- a/scheds/rust/scx_lavd/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_lavd/src/bpf/main.bpf.c
@@ -1247,29 +1247,41 @@ static bool transit_task_stat(struct task_ctx *taskc, int tgt_stat)
          * -------------
          *     |
          *     \/
-         * [STOPPING] --> [ENQ] --> [RUNNING]
-         *    /\                        |
-         *    |                         |
-         *    +-------------------------+
+         * [STOPPING] --> [QUIESCENT] <--> [ENQ] --> [RUNNING]
+         *    /\                                        /\
+         *    |                                          |
+         *    +------------------------------------------+
          */
-        const static int valid_tgt_stat[] = {
-                [LAVD_TASK_STAT_STOPPING] = LAVD_TASK_STAT_ENQ,
-                [LAVD_TASK_STAT_ENQ] = LAVD_TASK_STAT_RUNNING,
-                [LAVD_TASK_STAT_RUNNING] = LAVD_TASK_STAT_STOPPING,
-        };
         int src_stat = taskc->stat;
+        bool valid;
 
         if (src_stat < _LAVD_TASK_STAT_MIN || src_stat > _LAVD_TASK_STAT_MAX) {
                 scx_bpf_error("Invalid task state: %d", src_stat);
                 return false;
         }
 
-        if (valid_tgt_stat[src_stat] == tgt_stat) {
-                taskc->stat = tgt_stat;
-                return true;
+        switch (src_stat) {
+        case LAVD_TASK_STAT_STOPPING:
+                valid = tgt_stat == LAVD_TASK_STAT_QUIESCENT ||
+                        tgt_stat == LAVD_TASK_STAT_RUNNING;
+                break;
+        case LAVD_TASK_STAT_QUIESCENT:
+                valid = tgt_stat == LAVD_TASK_STAT_ENQ;
+                break;
+        case LAVD_TASK_STAT_ENQ:
+                valid = tgt_stat == LAVD_TASK_STAT_QUIESCENT ||
+                        tgt_stat == LAVD_TASK_STAT_RUNNING;
+                break;
+        case LAVD_TASK_STAT_RUNNING:
+                valid = tgt_stat == LAVD_TASK_STAT_STOPPING;
+                break;
         }
 
-        return false;
+        if (!valid)
+                return false;
+
+        taskc->stat = tgt_stat;
+        return true;
 }
 
 static void update_stat_for_enq(struct task_struct *p, struct task_ctx *taskc,
@@ -1323,13 +1335,6 @@ static void update_stat_for_stop(struct task_struct *p, struct task_ctx *taskc,
 
         now = bpf_ktime_get_ns();
 
-        /*
-         * When stopped, reduce the per-CPU task load. Per-CPU task load will
-         * be aggregated periodically at update_sys_cpu_load().
-         */
-        cpuc->load_actual -= taskc->load_actual;
-        cpuc->load_ideal -= get_task_load_ideal(p);
-
         /*
         * Update task's run_time. If a task got slice-boosted -- in other
         * words, its time slices have been fully consumed multiple times,
@@ -1344,6 +1349,17 @@ static void update_stat_for_stop(struct task_struct *p, struct task_ctx *taskc,
         taskc->last_stop_clk = now;
 }
 
+static void update_stat_for_quiescent(struct task_struct *p, struct task_ctx *taskc,
+                                      struct cpu_ctx *cpuc)
+{
+        /*
+         * When quiescent, reduce the per-CPU task load. Per-CPU task load will
+         * be aggregated periodically at update_sys_cpu_load().
+         */
+        cpuc->load_actual -= taskc->load_actual;
+        cpuc->load_ideal -= get_task_load_ideal(p);
+}
+
 static void calc_when_to_run(struct task_struct *p, struct task_ctx *taskc,
                              struct cpu_ctx *cpuc, u64 enq_flags)
 {
@@ -1630,9 +1646,18 @@ void BPF_STRUCT_OPS(lavd_stopping, struct task_struct *p, bool runnable)
 
 void BPF_STRUCT_OPS(lavd_quiescent, struct task_struct *p, u64 deq_flags)
 {
+        struct cpu_ctx *cpuc;
         struct task_ctx *taskc;
         u64 now, interval;
 
+        cpuc = get_cpu_ctx();
+        taskc = get_task_ctx(p);
+        if (!cpuc || !taskc)
+                return;
+
+        if (transit_task_stat(taskc, LAVD_TASK_STAT_QUIESCENT))
+                update_stat_for_quiescent(p, taskc, cpuc);
+
         /*
         * If a task @p is dequeued from a run queue for some other reason
         * other than going to sleep, it is an implementation-level side
@@ -1644,10 +1669,6 @@ void BPF_STRUCT_OPS(lavd_quiescent, struct task_struct *p, u64 deq_flags)
         /*
         * When a task @p goes to sleep, its associated wait_freq is updated.
         */
-        taskc = get_task_ctx(p);
-        if (!taskc)
-                return;
-
         now = bpf_ktime_get_ns();
         interval = now - taskc->last_wait_clk;
         taskc->wait_freq = calc_avg_freq(taskc->wait_freq, interval);

From d7ec05e0172213fc5b721ad1bc80248dd4fa3871 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 26 Mar 2024 12:23:19 -1000
Subject: [PATCH 3/4] scx_lavd: Call update_stat_for_enq() from lavd_runnable()

LAVD_TASK_STAT_ENQ is tracking a subset of runnable task state transitions -
the ones which end up calling ops.enqueue(). However, what it is trying to
track is a task becoming runnable so that its load can be added to the cpu's
load sum.

Move the LAVD_TASK_STAT_ENQ state transition and update_stat_for_enq()
invocation to ops.runnable(), which is called for all runnable transitions.

Note that when all the methods are invoked, the invocation order would be
ops.select_cpu(), runnable() and then enqueue(). So, this change moves the
update_stat_for_enq() invocation before calc_when_to_run() for
put_global_rq(). update_stat_for_enq() updates taskc->load_actual which is
consumed by calc_greedy_ratio() and thus affects calc_when_to_run(). Before
this patch, calc_greedy_ratio() would use load_actual which doesn't reflect
the last running period. After this patch, the latest running period will be
reflected when the task gets queued to the global queue. The difference is
unlikely to matter but it'd probably make sense to make it more consistent
(e.g. do it at the end of the quiescent transition).

After this change, transit_task_stat() doesn't detect any invalid
transitions.
---
 scheds/rust/scx_lavd/src/bpf/main.bpf.c | 40 ++++++++++++-------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/scheds/rust/scx_lavd/src/bpf/main.bpf.c b/scheds/rust/scx_lavd/src/bpf/main.bpf.c
index 58d8de3..db4d6ca 100644
--- a/scheds/rust/scx_lavd/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_lavd/src/bpf/main.bpf.c
@@ -1395,14 +1395,6 @@ static bool put_local_rq(struct task_struct *p, struct task_ctx *taskc,
         if (!is_eligible(taskc))
                 return false;
 
-        /*
-         * Add task load based on the current statistics regardless of a target
-         * rq. Statistics will be adjusted when more accurate statistics
-         * become available (ops.running).
-         */
-        if (transit_task_stat(taskc, LAVD_TASK_STAT_ENQ))
-                update_stat_for_enq(p, taskc, cpuc);
-
         /*
         * This task should be scheduled as soon as possible (e.g., wakened up)
         * so the deadline is no use and enqueued into a local DSQ, which
@@ -1432,12 +1424,6 @@ static bool put_global_rq(struct task_struct *p, struct task_ctx *taskc,
         */
         calc_when_to_run(p, taskc, cpuc, enq_flags);
 
-        /*
-         * Reflect task's load immediately.
-         */
-        if (transit_task_stat(taskc, LAVD_TASK_STAT_ENQ))
-                update_stat_for_enq(p, taskc, cpuc);
-
         /*
         * Enqueue the task to the global runqueue based on its virtual
         * deadline.
@@ -1527,10 +1513,24 @@ void BPF_STRUCT_OPS(lavd_dispatch, s32 cpu, struct task_struct *prev)
 
 void BPF_STRUCT_OPS(lavd_runnable, struct task_struct *p, u64 enq_flags)
 {
+        struct cpu_ctx *cpuc;
         struct task_struct *waker;
-        struct task_ctx *taskc;
+        struct task_ctx *p_taskc, *waker_taskc;
         u64 now, interval;
 
+        cpuc = get_cpu_ctx();
+        p_taskc = get_task_ctx(p);
+        if (!cpuc || !p_taskc)
+                return;
+
+        /*
+         * Add task load based on the current statistics regardless of a target
+         * rq. Statistics will be adjusted when more accurate statistics become
+         * available (ops.running).
+         */
+        if (transit_task_stat(p_taskc, LAVD_TASK_STAT_ENQ))
+                update_stat_for_enq(p, p_taskc, cpuc);
+
         /*
         * When a task @p is wakened up, the wake frequency of its waker task
         * is updated. The @current task is a waker and @p is a waiter, which
@@ -1540,8 +1540,8 @@ void BPF_STRUCT_OPS(lavd_runnable, struct task_struct *p, u64 enq_flags)
                 return;
 
         waker = bpf_get_current_task_btf();
-        taskc = try_get_task_ctx(waker);
-        if (!taskc) {
+        waker_taskc = try_get_task_ctx(waker);
+        if (!waker_taskc) {
                 /*
                 * In this case, the waker could be an idle task
                 * (swapper/_[_]), so we just ignore.
@@ -1550,9 +1550,9 @@ void BPF_STRUCT_OPS(lavd_runnable, struct task_struct *p, u64 enq_flags)
         }
 
         now = bpf_ktime_get_ns();
-        interval = now - taskc->last_wake_clk;
-        taskc->wake_freq = calc_avg_freq(taskc->wake_freq, interval);
-        taskc->last_wake_clk = now;
+        interval = now - waker_taskc->last_wake_clk;
+        waker_taskc->wake_freq = calc_avg_freq(waker_taskc->wake_freq, interval);
+        waker_taskc->last_wake_clk = now;
 }
 
 void BPF_STRUCT_OPS(lavd_running, struct task_struct *p)

From 129d99f5425c9591edc60a2083b01d390d1048f2 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 26 Mar 2024 12:23:19 -1000
Subject: [PATCH 4/4] scx_lavd: Remove custom task state tracking

transit_task_stat() is now tracking the same runnable, running, stopping,
quiescent transitions that sched_ext core already tracks and always returns
%true. Let's remove it.
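
As an illustration (not part of the patch; all names are local to the sketch),
the following userspace sketch replays the callback sequence the sched_ext
core guarantees - runnable, then any number of running/stopping pairs, then
quiescent - against the validity rules of the removed transit_task_stat();
the assertion never trips, which is why the helper always returns %true.

    #include <assert.h>
    #include <stdbool.h>

    enum stat { QUIESCENT, ENQ, RUNNING, STOPPING };

    /* the validity rules mirroring the removed transit_task_stat() */
    static bool valid(enum stat from, enum stat to)
    {
            switch (from) {
            case STOPPING:  return to == QUIESCENT || to == RUNNING;
            case QUIESCENT: return to == ENQ;
            case ENQ:       return to == QUIESCENT || to == RUNNING;
            case RUNNING:   return to == STOPPING;
            }
            return false;
    }

    int main(void)
    {
            /* one task lifecycle as driven by the core's callbacks */
            enum stat seq[] = { ENQ, RUNNING, STOPPING, RUNNING, STOPPING, QUIESCENT };
            enum stat cur = QUIESCENT;

            for (unsigned i = 0; i < sizeof(seq) / sizeof(seq[0]); i++) {
                    assert(valid(cur, seq[i]));     /* never trips */
                    cur = seq[i];
            }
            return 0;
    }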
---
 scheds/rust/scx_lavd/src/bpf/intf.h     | 15 -----
 scheds/rust/scx_lavd/src/bpf/main.bpf.c | 76 ++-----------------------
 2 files changed, 4 insertions(+), 87 deletions(-)

diff --git a/scheds/rust/scx_lavd/src/bpf/intf.h b/scheds/rust/scx_lavd/src/bpf/intf.h
index 30e1c46..0aa6f6f 100644
--- a/scheds/rust/scx_lavd/src/bpf/intf.h
+++ b/scheds/rust/scx_lavd/src/bpf/intf.h
@@ -119,20 +119,6 @@ struct cpu_ctx {
         volatile u64 sched_nr;  /* number of schedules */
 };
 
-/*
- * Per-task scheduling context
- */
-enum task_stat {
-        _LAVD_TASK_STAT_MIN = 0,
-
-        LAVD_TASK_STAT_QUIESCENT = _LAVD_TASK_STAT_MIN,
-        LAVD_TASK_STAT_ENQ,
-        LAVD_TASK_STAT_RUNNING,
-        LAVD_TASK_STAT_STOPPING,
-
-        _LAVD_TASK_STAT_MAX = LAVD_TASK_STAT_STOPPING,
-};
-
 struct task_ctx {
         /*
         * Essential task running statistics for latency criticality calculation
@@ -152,7 +138,6 @@ struct task_ctx {
         u64 slice_ns;
         u64 greedy_ratio;
         u64 lat_cri;
-        u16 stat;            /* NIL -> ENQ -> RUN -> STOP -> NIL ... */
         u16 slice_boost_prio;/* how many times a task fully consumed the slice */
         u16 lat_prio;        /* latency priority */
         s16 lat_boost_prio;  /* DEBUG */
diff --git a/scheds/rust/scx_lavd/src/bpf/main.bpf.c b/scheds/rust/scx_lavd/src/bpf/main.bpf.c
index db4d6ca..b7cba62 100644
--- a/scheds/rust/scx_lavd/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_lavd/src/bpf/main.bpf.c
@@ -1220,70 +1220,6 @@ static u64 calc_time_slice(struct task_struct *p, struct task_ctx *taskc)
         return slice;
 }
 
-static bool transit_task_stat(struct task_ctx *taskc, int tgt_stat)
-{
-        /*
-         * Update task loads only when the state transition is valid. So far,
-         * two types of invalid state transitions have been observed, and there
-         * are reasons for that. The two are as follows:
-         *
-         * - ENQ -> ENQ: This transition can happen because scx_lavd does not
-         *   provide ops.dequeue. When task attributes are updated (e.g., nice
-         *   level, allowed cpus and so on), the scx core will dequeue the task
-         *   and re-enqueue it (ENQ->DEQ->ENQ). However, When ops.dequeue() is
-         *   not provided, the dequeue operations is done by the scx core.
-         *   Hence, ignoring the dequeue operation is completely fine.
-         *
-         * - STOPPING -> RUNNING: This can happen because there are several
-         *   special cases where scx core skips enqueue including: 1) bypass
-         *   mode is turned on (this is turned on during both init and exit.
-         *   it's also used across suspend/resume operations. 2)
-         *   SCX_OPS_ENQ_EXITING is not set and an exiting task was woken up.
-         *   3) The associated CPU is not fully online. However, we avoid
-         *   collecting time & frequency statistics for such special cases for
-         *   accuracy.
-         *
-         * initial state
-         * -------------
-         *     |
-         *     \/
-         * [STOPPING] --> [QUIESCENT] <--> [ENQ] --> [RUNNING]
-         *    /\                                        /\
-         *    |                                          |
-         *    +------------------------------------------+
-         */
-        int src_stat = taskc->stat;
-        bool valid;
-
-        if (src_stat < _LAVD_TASK_STAT_MIN || src_stat > _LAVD_TASK_STAT_MAX) {
-                scx_bpf_error("Invalid task state: %d", src_stat);
-                return false;
-        }
-
-        switch (src_stat) {
-        case LAVD_TASK_STAT_STOPPING:
-                valid = tgt_stat == LAVD_TASK_STAT_QUIESCENT ||
-                        tgt_stat == LAVD_TASK_STAT_RUNNING;
-                break;
-        case LAVD_TASK_STAT_QUIESCENT:
-                valid = tgt_stat == LAVD_TASK_STAT_ENQ;
-                break;
-        case LAVD_TASK_STAT_ENQ:
-                valid = tgt_stat == LAVD_TASK_STAT_QUIESCENT ||
-                        tgt_stat == LAVD_TASK_STAT_RUNNING;
-                break;
-        case LAVD_TASK_STAT_RUNNING:
-                valid = tgt_stat == LAVD_TASK_STAT_STOPPING;
-                break;
-        }
-
-        if (!valid)
-                return false;
-
-        taskc->stat = tgt_stat;
-        return true;
-}
-
 static void update_stat_for_enq(struct task_struct *p, struct task_ctx *taskc,
                                 struct cpu_ctx *cpuc)
 {
@@ -1528,8 +1464,7 @@ void BPF_STRUCT_OPS(lavd_runnable, struct task_struct *p, u64 enq_flags)
         * rq. Statistics will be adjusted when more accurate statistics become
         * available (ops.running).
         */
-        if (transit_task_stat(p_taskc, LAVD_TASK_STAT_ENQ))
-                update_stat_for_enq(p, p_taskc, cpuc);
+        update_stat_for_enq(p, p_taskc, cpuc);
 
         /*
         * When a task @p is wakened up, the wake frequency of its waker task
@@ -1571,8 +1506,7 @@ void BPF_STRUCT_OPS(lavd_running, struct task_struct *p)
         if (!cpuc)
                 return;
 
-        if (transit_task_stat(taskc, LAVD_TASK_STAT_RUNNING))
-                update_stat_for_run(p, taskc, cpuc);
+        update_stat_for_run(p, taskc, cpuc);
 
         /*
         * Calcualte task's time slice based on updated load.
@@ -1635,8 +1569,7 @@ void BPF_STRUCT_OPS(lavd_stopping, struct task_struct *p, bool runnable)
         if (!taskc)
                 return;
 
-        if (transit_task_stat(taskc, LAVD_TASK_STAT_STOPPING))
-                update_stat_for_stop(p, taskc, cpuc);
+        update_stat_for_stop(p, taskc, cpuc);
 
         /*
         * Adjust slice boost for the task's next schedule.
@@ -1655,8 +1588,7 @@ void BPF_STRUCT_OPS(lavd_quiescent, struct task_struct *p, u64 deq_flags)
         if (!cpuc || !taskc)
                 return;
 
-        if (transit_task_stat(taskc, LAVD_TASK_STAT_QUIESCENT))
-                update_stat_for_quiescent(p, taskc, cpuc);
+        update_stat_for_quiescent(p, taskc, cpuc);
 
         /*
         * If a task @p is dequeued from a run queue for some other reason