scx_bpfland: boost batch workload when nvcsw_max_thresh is disabled

If nvcsw_max_thresh is disabled (via `-c 0`), interactivity becomes less
of a concern, allowing us to focus more on optimizing batch workloads.

A particularly effective approach is to prioritize per-CPU tasks and
sync wakeups by dispatching these tasks directly to their local CPU.

However, this comes at the cost of reduced fairness and increased
susceptibility to bouncy scheduling behavior, making this configuration
more suitable for batch-oriented workloads on systems that are not
massively overcommitted.

With this change, running `scx_bpfland -c 0` seems to consistently
improve parallel kernel build time by approximately 2-3%.
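
For reference, the "direct dispatch" used for this boost boils down to
queuing the task on its local per-CPU DSQ with the default time slice, as
the hunks below do. A minimal sketch (the helper name is illustrative and
the surrounding conditions are omitted):

/*
 * Illustrative sketch only: dispatch a task straight to the local DSQ,
 * mirroring the scx_bpf_dispatch() calls in the hunks below.
 */
static void dispatch_local(struct task_struct *p, u64 enq_flags)
{
        /* Queue the task on the local per-CPU DSQ with the default slice. */
        scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);

        /* Account the direct dispatch in the scheduler's counters. */
        __sync_fetch_and_add(&nr_direct_dispatches, 1);
}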

Signed-off-by: Andrea Righi <arighi@nvidia.com>
Author: Andrea Righi <arighi@nvidia.com>
Date: 2024-12-10 11:16:10 +01:00
Parent: c8b246ea63
Commit: 0bc6c5fb2d

@@ -651,7 +651,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
                 * block and release the current CPU).
                 */
                has_idle = bpf_cpumask_intersects(curr_l3_domain, idle_cpumask);
-               if (has_idle &&
+               if ((!nvcsw_max_thresh || has_idle) &&
                    bpf_cpumask_test_cpu(cpu, p_mask) &&
                    !(current->flags & PF_EXITING) &&
                    scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu) == 0) {
@@ -818,11 +818,25 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
                return;
        }
 
-       /*
-        * Per-CPU tasks didn't get the chance to be dispatched directly from
-        * ops.select_cpu(), so give them a chance here.
-        */
        if ((p->nr_cpus_allowed == 1) || p->migration_disabled) {
+               /*
+                * If nvcsw_max_thresh is disabled we don't care much about
+                * interactivity, so we can massively boost per-CPU tasks and
+                * always dispatch them directly on their CPU.
+                *
+                * This can help to improve I/O workloads (like large parallel
+                * builds).
+                */
+               if (!nvcsw_max_thresh) {
+                       scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
+                       __sync_fetch_and_add(&nr_direct_dispatches, 1);
+                       return;
+               }
+
+               /*
+                * Per-CPU tasks didn't get the chance to be dispatched directly from
+                * ops.select_cpu(), so give them a chance here.
+                */
                cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
                if (cpu >= 0) {
                        scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
@@ -916,7 +930,16 @@ static void update_cpuperf_target(struct task_struct *p, struct task_ctx *tctx)
         */
        delta_t = now - cctx->last_running;
        delta_runtime = cctx->tot_runtime - cctx->prev_runtime;
-       perf_lvl = MIN(delta_runtime * SCX_CPUPERF_ONE / delta_t, SCX_CPUPERF_ONE);
+       perf_lvl = delta_runtime * SCX_CPUPERF_ONE / delta_t;
+
+       /*
+        * If interactive tasks detection is disabled, always boost the
+        * frequency to make sure it's at least 50%, to prevent being too
+        * conservative.
+        */
+       if (!nvcsw_max_thresh)
+               perf_lvl += SCX_CPUPERF_ONE / 2;
+       perf_lvl = MIN(perf_lvl, SCX_CPUPERF_ONE);
 
        /*
         * Apply the dynamic cpuperf scaling factor.
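
To make the arithmetic of the new frequency boost concrete, here is a small
self-contained example (it assumes SCX_CPUPERF_ONE is the usual 1024
full-scale value; the helper is illustrative and not part of the scheduler):

#include <stdint.h>
#include <stdio.h>

#define SCX_CPUPERF_ONE 1024ULL /* assumed full-scale cpuperf value */

/* Illustrative re-derivation of the boosted performance level. */
static uint64_t boosted_perf_lvl(uint64_t delta_runtime, uint64_t delta_t,
                                 uint64_t nvcsw_max_thresh)
{
        /* Raw CPU utilization over the last activation window. */
        uint64_t perf_lvl = delta_runtime * SCX_CPUPERF_ONE / delta_t;

        /* With interactivity detection disabled, guarantee at least 50%. */
        if (!nvcsw_max_thresh)
                perf_lvl += SCX_CPUPERF_ONE / 2;

        /* Clamp to the maximum performance level. */
        return perf_lvl < SCX_CPUPERF_ONE ? perf_lvl : SCX_CPUPERF_ONE;
}

int main(void)
{
        /* 30% utilization: 307 -> boosted to 819 (~80% of 1024). */
        printf("%llu\n", (unsigned long long)boosted_perf_lvl(30, 100, 0));
        /* 70% utilization: 716 -> 1228, clamped to 1024 (100%). */
        printf("%llu\n", (unsigned long long)boosted_perf_lvl(70, 100, 0));
        return 0;
}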
@@ -987,6 +1010,22 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
        if (tctx->is_interactive)
                __sync_fetch_and_sub(&nr_interactive, 1);
 
+       /*
+        * Update task's average runtime.
+        */
+       slice = now - tctx->last_run_at;
+       tctx->sum_runtime += slice;
+       tctx->avg_runtime = calc_avg(tctx->avg_runtime, tctx->sum_runtime);
+
+       /*
+        * Update task vruntime charging the weighted used time slice.
+        */
+       p->scx.dsq_vtime += scale_inverse_fair(p, tctx, slice);
+       tctx->deadline = p->scx.dsq_vtime + task_deadline(p, tctx);
+
+       if (!nvcsw_max_thresh)
+               return;
+
        /*
         * If the time slice is not fully depleted, it means that the task
         * voluntarily relased the CPU, therefore update the voluntary context
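
The average-runtime update above goes through calc_avg(), which is not part
of this diff. A plausible sketch of such a helper, assuming a simple
exponentially weighted moving average (the actual scx_bpfland implementation
may weight samples differently):

/*
 * Hypothetical sketch of a running-average helper in the spirit of
 * calc_avg(): keep 3/4 of the old average and blend in 1/4 of the new
 * sample. Types and weights are assumptions, not taken from this diff.
 */
static __always_inline u64 calc_avg_sketch(u64 old_val, u64 new_val)
{
        return (old_val - (old_val >> 2)) + (new_val >> 2);
}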
@@ -1007,19 +1046,6 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
        if (p->scx.slice > 0)
                tctx->nvcsw++;
 
-       /*
-        * Update task's average runtime.
-        */
-       slice = now - tctx->last_run_at;
-       tctx->sum_runtime += slice;
-       tctx->avg_runtime = calc_avg(tctx->avg_runtime, tctx->sum_runtime);
-
-       /*
-        * Update task vruntime charging the weighted used time slice.
-        */
-       p->scx.dsq_vtime += scale_inverse_fair(p, tctx, slice);
-       tctx->deadline = p->scx.dsq_vtime + task_deadline(p, tctx);
-
        /*
         * Refresh voluntary context switch metrics.
         *