scx_bpfland: keep tasks running on full-idle SMT cores

When a task is the last one running on a CPU and still wants to
continue, allow it to keep running and replenish its time slice only if
the CPU it is using is part of a fully idle SMT core.

Signed-off-by: Andrea Righi <andrea.righi@linux.dev>
Andrea Righi 2024-09-29 15:53:25 +02:00
parent c20a19c946
commit 6e24fcc7f0
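
For context on the check this commit introduces: sched_ext exports an idle SMT cpumask, and a CPU appears in it only while every hyperthread sibling of its physical core is idle, so testing the current CPU against that mask is exactly the "fully idle SMT core" condition described above. A minimal sketch of that check, using the scx_bpf_get_idle_smtmask()/scx_bpf_put_idle_cpumask() kfuncs that the patch relies on (the helper name cpu_on_full_idle_smt_core() is made up for illustration and is not part of the patch):

/*
 * Illustrative helper (not part of the patch): return true if @cpu belongs
 * to a physical core whose SMT siblings are all idle. The mask returned by
 * scx_bpf_get_idle_smtmask() must be released with scx_bpf_put_idle_cpumask().
 */
static bool cpu_on_full_idle_smt_core(s32 cpu)
{
        const struct cpumask *idle_smtmask = scx_bpf_get_idle_smtmask();
        bool idle = bpf_cpumask_test_cpu(cpu, idle_smtmask);

        scx_bpf_put_idle_cpumask(idle_smtmask);

        return idle;
}

With SMT disabled this mask is not meaningful, which is why the patch short-circuits that case and always refills the slice.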

@@ -411,22 +411,29 @@ static u64 nr_tasks_waiting(void)
}
/*
* Return the task's unused portion of its previously assigned time slice in
* the range [slice_ns_min .. slice_ns].
* Return a value inversely proportional to the task's weight.
*/
static inline u64 task_slice(struct task_struct *p)
static inline u64 scale_inverse_fair(struct task_struct *p, u64 value)
{
return value * 100 / p->scx.weight;
}
/*
* Evaluate task's time slice in function of the total amount of tasks that are
* waiting to be dispatched and the task's weight.
*/
static inline void task_refill_slice(struct task_struct *p)
{
u64 slice;
/*
* Refresh the amount of waiting tasks to get a more accurate scaling
* factor for the time slice.
*/
nr_waiting = (nr_waiting + nr_tasks_waiting()) / 2;
/*
* Scale the time slice based on the average number of waiting tasks
* (more waiting tasks result in a shorter time slice).
*/
return MAX(slice_ns / (nr_waiting + 1), slice_ns_min);
slice = slice_ns / (nr_waiting + 1);
p->scx.slice = CLAMP(slice, slice_ns_min, slice_ns);
}
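
To make the scaling concrete, here is a worked example with illustrative numbers (slice_ns and slice_ns_min are scx_bpfland tunables; the values below are assumptions for illustration, not taken from the patch):

/*
 * Worked example (illustrative values only):
 *
 *   slice_ns = 5ms, slice_ns_min = 0.5ms, nr_waiting averaging 9
 *     -> slice = 5ms / (9 + 1) = 0.5ms
 *     -> CLAMP(0.5ms, 0.5ms, 5ms) = 0.5ms
 *
 * scale_inverse_fair() charges vruntime inversely to the task's weight:
 * with the default weight of 100 the charge equals the raw runtime, while
 * a task with weight 200 is charged at half that rate, so higher-priority
 * tasks accumulate vruntime more slowly.
 */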
/*
@@ -817,7 +824,6 @@ s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p, s32 prev_cpu, u64
*/
void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
{
struct bpf_cpumask *primary;
struct task_ctx *tctx;
u64 deadline = task_deadline(p);
s32 cpu = scx_bpf_task_cpu(p);
@@ -1012,9 +1018,9 @@ void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
return;
/*
* If the current task expired its time slice, but no other task wants
* to run, simply replenish its time slice and let it run for another
* round on the same CPU.
* If the current task expired its time slice, its CPU is still a
* full-idle SMT core and no other task wants to run, simply replenish
* its time slice and let it run for another round on the same CPU.
*
* Note that bpfland_stopping() won't be called if we replenish the
* time slice here. As a result, the nvcsw statistics won't be updated,
@@ -1022,8 +1028,19 @@ void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
* when the system is overloaded, which isn't the case when there are
* no other tasks to run.
*/
if (prev && (prev->scx.flags & SCX_TASK_QUEUED))
prev->scx.slice = task_slice(prev);
if (prev && (prev->scx.flags & SCX_TASK_QUEUED)) {
const struct cpumask *idle_smtmask;
if (!smt_enabled) {
task_refill_slice(prev);
return;
}
idle_smtmask = scx_bpf_get_idle_smtmask();
if (bpf_cpumask_test_cpu(cpu, idle_smtmask))
task_refill_slice(prev);
scx_bpf_put_idle_cpumask(idle_smtmask);
}
}
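
Note that scx_bpf_get_idle_smtmask() returns a reference to the idle mask that must always be released with scx_bpf_put_idle_cpumask(), which is why the release happens whether or not the slice was refilled. When the core is not fully idle, the slice is simply left expired, so the previous task no longer gets the free extra round on that CPU described in the comment above.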
/*
@@ -1085,7 +1102,7 @@ void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
* Refresh task's time slice immediately before it starts to run on its
* assigned CPU.
*/
p->scx.slice = task_slice(p);
task_refill_slice(p);
/*
* Adjust target CPU frequency before the task starts to run.
@@ -1146,13 +1163,13 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
* Update task vruntime, charging the weighted used time slice.
*/
task_slice = p->se.sum_exec_runtime - tctx->sum_exec_runtime;
p->scx.dsq_vtime += task_slice * 100 / p->scx.weight;
p->scx.dsq_vtime += scale_inverse_fair(p, task_slice);
tctx->sum_exec_runtime = p->se.sum_exec_runtime;
/*
* Update global vruntime.
*/
vtime_now += task_slice * 100 / p->scx.weight;
vtime_now += scale_inverse_fair(p, task_slice);
/*
* Refresh voluntary context switch metrics.