scx_lavd: Move load subtraction to quiescent state transition

scx_lavd tracks task state transitions and updates statistics on each valid
transition. However, there is an asymmetry between the runnable/running and
stopping/quiescent transitions. In the former, the runnable and running
transitions are accounted separately, in update_stat_for_enq() and
update_stat_for_run() respectively. In the latter, the two transitions are
combined in update_stat_for_stop().

This asymmetry leads to incorrect accounting. For example, a task's load
should be added to the cpu's load sum when the task becomes runnable
(enqueued) and subtracted when the task is no longer runnable (quiescent).
The former is accounted correctly in update_stat_for_enq(), but the latter
is currently done whenever the task stops. A task can transition between
running and stopping multiple times before becoming quiescent, so the
asymmetry can end up subtracting the load of a task that is still running
from the cpu's load sum.
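
To make the failure mode concrete, the user-space harness below replays that
sequence, a task that stops and runs once more before finally sleeping,
through both accounting schemes. It is purely illustrative: the struct and
event names mirror scx_lavd, but none of it is the scheduler's code.

    #include <stdio.h>
    #include <string.h>

    /* Hypothetical stand-ins for scx_lavd's contexts; only the fields
     * needed for the illustration. */
    struct cpu_ctx  { long load_sum; };
    struct task_ctx { long load; int stat; };

    enum { QUIESCENT, ENQ, RUNNING, STOPPING };

    /* Old scheme: add on enqueue, subtract on the RUNNING -> STOPPING
     * transition; all other transitions were ignored. */
    static void old_account(struct cpu_ctx *c, struct task_ctx *t, const char *ev)
    {
        if (!strcmp(ev, "enqueue")) {
            c->load_sum += t->load;
            t->stat = ENQ;
        } else if (!strcmp(ev, "running") && t->stat == ENQ) {
            t->stat = RUNNING;
        } else if (!strcmp(ev, "stopping") && t->stat == RUNNING) {
            c->load_sum -= t->load;  /* too early if the task is still runnable */
            t->stat = STOPPING;
        }
    }

    /* New scheme: the subtraction moves to the quiescent transition. */
    static void new_account(struct cpu_ctx *c, struct task_ctx *t, const char *ev)
    {
        if (!strcmp(ev, "enqueue"))
            c->load_sum += t->load;
        else if (!strcmp(ev, "quiescent"))
            c->load_sum -= t->load;
    }

    int main(void)
    {
        /* A task that stops and runs once more before finally sleeping. */
        const char *events[] = { "enqueue", "running", "stopping",
                                 "running", "stopping", "quiescent" };
        struct cpu_ctx old_cpu = {0}, new_cpu = {0};
        struct task_ctx old_task = { .load = 100, .stat = QUIESCENT };
        struct task_ctx new_task = { .load = 100 };

        for (int i = 0; i < 6; i++) {
            old_account(&old_cpu, &old_task, events[i]);
            new_account(&new_cpu, &new_task, events[i]);
            printf("%-9s  old_sum=%4ld  new_sum=%4ld\n",
                   events[i], old_cpu.load_sum, new_cpu.load_sum);
        }
        return 0;
    }

At the fourth event the task is running again, yet the old scheme's sum no
longer includes it; the new scheme holds the load in the sum until the task
actually becomes quiescent.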

This patch:

- introduces LAVD_TASK_STAT_QUIESCENT and updates transit_task_stat() so
  that it can handle all valid state transitions, including the repeated
  back-and-forth transitions within two pairs: QUIESCENT <-> ENQ and
  RUNNING <-> STOPPING (restated as a lookup table below).

- restores the symmetry by moving the load adjustment from
  update_stat_for_stop() to the new update_stat_for_quiescent().

This removes a good chunk of ignored transitions. The next patch will take
care of the rest.
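
For reference, the transition set accepted by the reworked
transit_task_stat() can be restated as a two-dimensional lookup table. This
is an illustrative sketch of the state machine only (NR_TASK_STATS is a
made-up sentinel); the patch itself encodes it as a switch statement, as the
diff below shows:

    #include <stdbool.h>

    enum task_stat {
        LAVD_TASK_STAT_QUIESCENT,  /* not runnable (sleeping) */
        LAVD_TASK_STAT_ENQ,        /* runnable, waiting on a queue */
        LAVD_TASK_STAT_RUNNING,    /* executing on a CPU */
        LAVD_TASK_STAT_STOPPING,   /* leaving a CPU */
        NR_TASK_STATS,
    };

    /* valid[src][tgt] is true iff transit_task_stat() accepts src -> tgt */
    static const bool valid[NR_TASK_STATS][NR_TASK_STATS] = {
        [LAVD_TASK_STAT_QUIESCENT][LAVD_TASK_STAT_ENQ]       = true,
        [LAVD_TASK_STAT_ENQ]      [LAVD_TASK_STAT_QUIESCENT] = true,
        [LAVD_TASK_STAT_ENQ]      [LAVD_TASK_STAT_RUNNING]   = true,
        [LAVD_TASK_STAT_RUNNING]  [LAVD_TASK_STAT_STOPPING]  = true,
        [LAVD_TASK_STAT_STOPPING] [LAVD_TASK_STAT_QUIESCENT] = true,
        [LAVD_TASK_STAT_STOPPING] [LAVD_TASK_STAT_RUNNING]   = true,
    };

Any pair not listed (ENQ -> ENQ or STOPPING -> STOPPING, for example) is
still rejected, and the caller skips the corresponding statistics update.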
Tejun Heo committed 2024-03-26 12:23:19 -10:00
parent dd40377f03
commit 625bb84bc4
2 changed files with 48 additions and 26 deletions


@@ -125,11 +125,12 @@ struct cpu_ctx {
 enum task_stat {
 	_LAVD_TASK_STAT_MIN = 0,
 
-	LAVD_TASK_STAT_STOPPING = _LAVD_TASK_STAT_MIN,
+	LAVD_TASK_STAT_QUIESCENT = _LAVD_TASK_STAT_MIN,
 	LAVD_TASK_STAT_ENQ,
 	LAVD_TASK_STAT_RUNNING,
+	LAVD_TASK_STAT_STOPPING,
 
-	_LAVD_TASK_STAT_MAX = LAVD_TASK_STAT_RUNNING,
+	_LAVD_TASK_STAT_MAX = LAVD_TASK_STAT_STOPPING,
 };
 
 struct task_ctx {


@@ -1247,29 +1247,41 @@ static bool transit_task_stat(struct task_ctx *taskc, int tgt_stat)
 	 *  -------------
 	 * |
 	 * \/
-	 * [STOPPING] --> [ENQ] --> [RUNNING]
-	 *  /\                        |
-	 *  |                         |
-	 *  +-------------------------+
+	 * [STOPPING] --> [QUIESCENT] <--> [ENQ] --> [RUNNING]
+	 *  /\                                        /\
+	 *  |                                          |
+	 *  +------------------------------------------+
 	 */
-	const static int valid_tgt_stat[] = {
-		[LAVD_TASK_STAT_STOPPING]	= LAVD_TASK_STAT_ENQ,
-		[LAVD_TASK_STAT_ENQ]		= LAVD_TASK_STAT_RUNNING,
-		[LAVD_TASK_STAT_RUNNING]	= LAVD_TASK_STAT_STOPPING,
-	};
 	int src_stat = taskc->stat;
+	bool valid;
 
 	if (src_stat < _LAVD_TASK_STAT_MIN || src_stat > _LAVD_TASK_STAT_MAX) {
 		scx_bpf_error("Invalid task state: %d", src_stat);
 		return false;
 	}
 
-	if (valid_tgt_stat[src_stat] == tgt_stat) {
-		taskc->stat = tgt_stat;
-		return true;
+	switch (src_stat) {
+	case LAVD_TASK_STAT_STOPPING:
+		valid = tgt_stat == LAVD_TASK_STAT_QUIESCENT ||
+			tgt_stat == LAVD_TASK_STAT_RUNNING;
+		break;
+	case LAVD_TASK_STAT_QUIESCENT:
+		valid = tgt_stat == LAVD_TASK_STAT_ENQ;
+		break;
+	case LAVD_TASK_STAT_ENQ:
+		valid = tgt_stat == LAVD_TASK_STAT_QUIESCENT ||
+			tgt_stat == LAVD_TASK_STAT_RUNNING;
+		break;
+	case LAVD_TASK_STAT_RUNNING:
+		valid = tgt_stat == LAVD_TASK_STAT_STOPPING;
+		break;
 	}
 
-	return false;
+	if (!valid)
+		return false;
+
+	taskc->stat = tgt_stat;
+	return true;
 }
static void update_stat_for_enq(struct task_struct *p, struct task_ctx *taskc, static void update_stat_for_enq(struct task_struct *p, struct task_ctx *taskc,
@@ -1323,13 +1335,6 @@ static void update_stat_for_stop(struct task_struct *p, struct task_ctx *taskc,
 	now = bpf_ktime_get_ns();
 
-	/*
-	 * When stopped, reduce the per-CPU task load. Per-CPU task load will
-	 * be aggregated periodically at update_sys_cpu_load().
-	 */
-	cpuc->load_actual -= taskc->load_actual;
-	cpuc->load_ideal -= get_task_load_ideal(p);
-
 	/*
 	 * Update task's run_time. If a task got slice-boosted -- in other
 	 * words, its time slices have been fully consumed multiple times,
@@ -1344,6 +1349,17 @@ static void update_stat_for_stop(struct task_struct *p, struct task_ctx *taskc,
 	taskc->last_stop_clk = now;
 }
 
+static void update_stat_for_quiescent(struct task_struct *p, struct task_ctx *taskc,
+				      struct cpu_ctx *cpuc)
+{
+	/*
+	 * When quiescent, reduce the per-CPU task load. Per-CPU task load will
+	 * be aggregated periodically at update_sys_cpu_load().
+	 */
+	cpuc->load_actual -= taskc->load_actual;
+	cpuc->load_ideal -= get_task_load_ideal(p);
+}
+
 static void calc_when_to_run(struct task_struct *p, struct task_ctx *taskc,
 			     struct cpu_ctx *cpuc, u64 enq_flags)
 {
@@ -1630,9 +1646,18 @@ void BPF_STRUCT_OPS(lavd_stopping, struct task_struct *p, bool runnable)
 void BPF_STRUCT_OPS(lavd_quiescent, struct task_struct *p, u64 deq_flags)
 {
+	struct cpu_ctx *cpuc;
 	struct task_ctx *taskc;
 	u64 now, interval;
 
+	cpuc = get_cpu_ctx();
+	taskc = get_task_ctx(p);
+	if (!cpuc || !taskc)
+		return;
+
+	if (transit_task_stat(taskc, LAVD_TASK_STAT_QUIESCENT))
+		update_stat_for_quiescent(p, taskc, cpuc);
+
 	/*
 	 * If a task @p is dequeued from a run queue for some other reason
 	 * other than going to sleep, it is an implementation-level side
@@ -1644,10 +1669,6 @@ void BPF_STRUCT_OPS(lavd_quiescent, struct task_struct *p, u64 deq_flags)
 	/*
 	 * When a task @p goes to sleep, its associated wait_freq is updated.
 	 */
-	taskc = get_task_ctx(p);
-	if (!taskc)
-		return;
-
 	now = bpf_ktime_get_ns();
 	interval = now - taskc->last_wait_clk;
 	taskc->wait_freq = calc_avg_freq(taskc->wait_freq, interval);