scx_layered: Open and grouped layers can handle tasks with custom affinities

The main reason why custom affinities are tricky for scx_layered is that if
we put a task which doesn't allow all CPUs into a layer's DSQ, it may not
get consumed for an indefinite amount of time. However, this is only true
for confined layers. Both open and grouped layers are always consumed from
all CPUs and thus don't have this risk.
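
To see why this is safe, here is a condensed sketch of the consumption side.
It is written in the style of scx_layered's BPF code but is not the actual
layered_dispatch(): the loop shape is simplified, the fallback DSQs are left
out, and the layers/nr_layers globals and the use of the layer index as its
DSQ id are assumptions about the surrounding code.

    void BPF_STRUCT_OPS(sketch_dispatch, s32 cpu, struct task_struct *prev)
    {
            int idx;

            /* HI_FALLBACK_DSQ / LO_FALLBACK_DSQ handling omitted */
            bpf_for(idx, 0, nr_layers) {
                    struct layer *layer = &layers[idx];
                    struct cpumask *layer_cpumask;

                    /* a confined layer is consumed only from its own CPUs */
                    if (!layer->open &&
                        (!(layer_cpumask = lookup_layer_cpumask(idx)) ||
                         !bpf_cpumask_test_cpu(cpu, layer_cpumask)))
                            continue;

                    /*
                     * Open and grouped layers reach this point on every CPU,
                     * so anything queued on their DSQs eventually gets
                     * consumed regardless of the task's cpumask.
                     */
                    if (scx_bpf_consume(idx))
                            return;
            }
    }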

Let's allow tasks with custom affinities in open and grouped layers.

- In select_cpu(), if the layer is open, don't count direct dispatching to a
  local DSQ as an affinity violation even when the target CPU is outside the
  layer's cpumask.

- In enqueue(), separate out the per-cpu kthread special case into its own
  block. Note that this only applies if the layer is not preempting, as a
  preempting layer has a higher priority than HI_FALLBACK_DSQ anyway.

- Trigger the LO_FALLBACK_DSQ path for other threads only if the layer is
  confined.

- The preemption path now also runs for tasks with a custom affinity in open
  and grouped layers. Update it so that it only considers the CPUs in the
  preempting task's allowed cpumask. A condensed sketch of the resulting
  enqueue ordering follows this list.
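
The resulting ordering on the enqueue side is summarized by the following
condensed sketch. It is not the full layered_enqueue(): vtime and stats
handling and the preemption scan are omitted, and the use of the layer index
as its DSQ id is an assumption about the surrounding code.

    static void sketch_enqueue_order(struct task_struct *p,
                                     struct task_ctx *tctx,
                                     struct layer *layer, u64 enq_flags)
    {
            /*
             * Per-cpu kthreads in non-preempting layers run right after the
             * preempting layers via HI_FALLBACK_DSQ.
             */
            if (!layer->preempt &&
                (p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) {
                    scx_bpf_dispatch(p, HI_FALLBACK_DSQ, slice_ns, enq_flags);
                    return;
            }

            /*
             * Only a confined layer needs the fallback for tasks with custom
             * affinities; open and grouped layer DSQs are consumed from all
             * CPUs anyway.
             */
            if (!layer->open && !tctx->all_cpus_allowed) {
                    scx_bpf_dispatch(p, LO_FALLBACK_DSQ, slice_ns, enq_flags);
                    return;
            }

            /* everything else is queued on the layer's own DSQ */
            scx_bpf_dispatch_vtime(p, tctx->layer, slice_ns, p->scx.dsq_vtime,
                                   enq_flags);
    }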

(cherry picked from commit 82d2f887a4608de61ddf5e15643c10e504a88f7b)
Author: Tejun Heo
Date:   2024-05-16 19:59:46 -10:00
parent 1ce23760b5
commit a576242b69

@@ -413,7 +413,8 @@ s32 BPF_STRUCT_OPS(layered_select_cpu, struct task_struct *p, s32 prev_cpu, u64
 	if (p->nr_cpus_allowed == 1) {
 		if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
 			lstat_inc(LSTAT_LOCAL, layer, cctx);
-			if (!bpf_cpumask_test_cpu(prev_cpu, layer_cpumask))
+			if (!layer->open &&
+			    !bpf_cpumask_test_cpu(prev_cpu, layer_cpumask))
 				lstat_inc(LSTAT_AFFN_VIOL, layer, cctx);
 			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0);
 		}
@@ -422,9 +423,8 @@ s32 BPF_STRUCT_OPS(layered_select_cpu, struct task_struct *p, s32 prev_cpu, u64
 
 	maybe_refresh_layered_cpumask(layered_cpumask, p, tctx, layer_cpumask);
 
-	if (!(idle_smtmask = scx_bpf_get_idle_smtmask())) {
+	if (!(idle_smtmask = scx_bpf_get_idle_smtmask()))
 		return prev_cpu;
-	}
 
 	/*
 	 * If CPU has SMT, any wholly idle CPU is likely a better pick than
@@ -476,16 +476,35 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 	if (vtime_before(vtime, layer->vtime_now - slice_ns))
 		vtime = layer->vtime_now - slice_ns;
 
-	if (!tctx->all_cpus_allowed) {
+	/*
+	 * Special-case per-cpu kthreads which aren't in a preempting layer so
+	 * that they run between preempting and non-preempting layers. This is
+	 * to give reasonable boost to per-cpu kthreads by default as they are
+	 * usually important for system performance and responsiveness.
+	 */
+	if (!layer->preempt &&
+	    (p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) {
+		struct cpumask *layer_cpumask;
+
+		if (!layer->open &&
+		    (layer_cpumask = lookup_layer_cpumask(tctx->layer)) &&
+		    !bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), layer_cpumask))
+			lstat_inc(LSTAT_AFFN_VIOL, layer, cctx);
+
+		scx_bpf_dispatch(p, HI_FALLBACK_DSQ, slice_ns, enq_flags);
+		return;
+	}
+
+	/*
+	 * As an open or grouped layer is consumed from all CPUs, a task which
+	 * belongs to such a layer can be safely put in the layer's DSQ
+	 * regardless of its cpumask. However, a task with custom cpumask in a
+	 * confined layer may fail to be consumed for an indefinite amount of
+	 * time. Queue them to the fallback DSQ.
+	 */
+	if (!layer->open && !tctx->all_cpus_allowed) {
 		lstat_inc(LSTAT_AFFN_VIOL, layer, cctx);
-		/*
-		 * Run kthread w/ modified affinities right after preempt
-		 * layers. User threads w/ modified affinities run last.
-		 */
-		if (p->flags & PF_KTHREAD)
-			scx_bpf_dispatch(p, HI_FALLBACK_DSQ, slice_ns, enq_flags);
-		else
-			scx_bpf_dispatch(p, LO_FALLBACK_DSQ, slice_ns, enq_flags);
+		scx_bpf_dispatch(p, LO_FALLBACK_DSQ, slice_ns, enq_flags);
 		return;
 	}
 
@@ -499,8 +518,7 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 		u32 cpu = (preempt_cursor + idx) % nr_possible_cpus;
 		s32 sib;
 
-		if (!all_cpumask ||
-		    !bpf_cpumask_test_cpu(cpu, (const struct cpumask *)all_cpumask))
+		if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
 			continue;
 
 		if (!(cand_cctx = lookup_cpu_ctx(cpu)) || cand_cctx->current_preempt)
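
For context, the condition changed in the last hunk sits inside the scan that
looks for a CPU to preempt. A condensed sketch of that scan follows; sibling
handling, stats and preempt_cursor bookkeeping are omitted, and the
SCX_KICK_PREEMPT kick is assumed to follow scx_layered's existing pattern
rather than copied from it.

    u32 idx;

    bpf_for(idx, 0, nr_possible_cpus) {
            u32 cpu = (preempt_cursor + idx) % nr_possible_cpus;
            struct cpu_ctx *cand_cctx;

            /* only consider CPUs the preempting task is allowed to run on */
            if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
                    continue;

            /* don't preempt a CPU already running a preempting task */
            if (!(cand_cctx = lookup_cpu_ctx(cpu)) || cand_cctx->current_preempt)
                    continue;

            /* found a candidate: make it reschedule and pull from the DSQs */
            scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT);
            break;
    }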