From 1ac9712d2e31c9bdfb04d18587b60e2cda76a2c2 Mon Sep 17 00:00:00 2001
From: Daniel Hodges
Date: Sun, 22 Sep 2024 16:50:12 -0400
Subject: [PATCH] scx_layered: Refactor preemption into a separate function

Signed-off-by: Daniel Hodges
---
 scheds/rust/scx_layered/src/bpf/main.bpf.c | 322 +++++++++++----------
 1 file changed, 171 insertions(+), 151 deletions(-)

diff --git a/scheds/rust/scx_layered/src/bpf/main.bpf.c b/scheds/rust/scx_layered/src/bpf/main.bpf.c
index a191c8a..9e4c7a6 100644
--- a/scheds/rust/scx_layered/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -658,8 +658,9 @@ bool pick_idle_cpu_and_kick(struct task_struct *p, s32 task_cpu,
 }
 
 static __always_inline
-bool try_preempt(s32 cand, struct task_struct *p, struct cpu_ctx *cctx,
-		 struct task_ctx *tctx, struct layer *layer, bool preempt_first)
+bool try_preempt_cpu(s32 cand, struct task_struct *p, struct cpu_ctx *cctx,
+		     struct task_ctx *tctx, struct layer *layer,
+		     bool preempt_first)
 {
 	struct cpu_ctx *cand_cctx, *sib_cctx = NULL;
 	s32 sib;
@@ -705,12 +706,174 @@ bool try_preempt(s32 cand, struct task_struct *p, struct cpu_ctx *cctx,
 	return true;
 }
 
+static __always_inline
+void try_preempt(s32 task_cpu, struct task_struct *p, struct task_ctx *tctx,
+		 bool preempt_first, u64 enq_flags)
+{
+	struct bpf_cpumask *attempted, *topo_cpus;
+	struct cache_ctx *cachec;
+	struct cpu_ctx *cctx;
+	struct layer *layer;
+	struct node_ctx *nodec;
+	u32 idx;
+
+	if (!(layer = lookup_layer(tctx->layer)) || !(cctx = lookup_cpu_ctx(-1)))
+		return;
+
+	if (preempt_first) {
+		/*
+		 * @p prefers to preempt its previous CPU even when there are
+		 * other idle CPUs.
+		 */
+		if (try_preempt_cpu(task_cpu, p, cctx, tctx, layer, true))
+			return;
+		/* we skipped idle CPU picking in select_cpu. Do it here. */
+		if (pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx, layer))
+			return;
+	} else {
+		/*
+		 * If we aren't in the wakeup path, layered_select_cpu() hasn't
+		 * run and thus we haven't looked for and kicked an idle CPU.
+		 * Let's do it now.
+		 */
+		if (!(enq_flags & SCX_ENQ_WAKEUP) &&
+		    pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx, layer))
+			return;
+		if (!layer->preempt)
+			return;
+		if (try_preempt_cpu(task_cpu, p, cctx, tctx, layer, false))
+			return;
+	}
+
+	if (!disable_topology) {
+		if (!(cachec = lookup_cache_ctx(cctx->cache_idx)) ||
+		    !(nodec = lookup_node_ctx(cctx->node_idx)))
+			return;
+
+		attempted = bpf_cpumask_create();
+		if (!attempted)
+			goto preempt_fail;
+
+		topo_cpus = bpf_cpumask_create();
+		if (!topo_cpus) {
+			bpf_cpumask_release(attempted);
+			goto preempt_fail;
+		}
+
+		if (!cachec->cpumask) {
+			bpf_cpumask_release(attempted);
+			bpf_cpumask_release(topo_cpus);
+			goto preempt_fail;
+		}
+
+		bpf_cpumask_copy(topo_cpus, cast_mask(cachec->cpumask));
+
+		/*
+		 * First try preempting in the local LLC
+		 */
+		bpf_for(idx, 0, cachec->nr_cpus) {
+			s32 preempt_cpu = bpf_cpumask_any_distribute(cast_mask(topo_cpus));
+			trace("PREEMPT attempt on cpu %d from cpu %d",
+			      preempt_cpu, bpf_get_smp_processor_id());
+			if (preempt_cpu > cachec->nr_cpus)
+				break;
+
+			if (try_preempt_cpu(preempt_cpu, p, cctx, tctx, layer, false)) {
+				bpf_cpumask_release(attempted);
+				bpf_cpumask_release(topo_cpus);
+				return;
+			}
+			bpf_cpumask_clear_cpu(preempt_cpu, topo_cpus);
+			bpf_cpumask_set_cpu(preempt_cpu, attempted);
+		}
+
+		/*
+		 * Next try node local LLC
+		 */
+		if (!nodec->cpumask) {
+			bpf_cpumask_release(attempted);
+			bpf_cpumask_release(topo_cpus);
+			goto preempt_fail;
+		}
+
+		bpf_cpumask_copy(topo_cpus, cast_mask(nodec->cpumask));
+		bpf_cpumask_xor(topo_cpus, cast_mask(attempted), cast_mask(topo_cpus));
+
+		bpf_for(idx, 0, nodec->nr_cpus) {
+			s32 preempt_cpu = bpf_cpumask_any_distribute(cast_mask(topo_cpus));
+			if (try_preempt_cpu(preempt_cpu, p, cctx, tctx, layer, false)) {
+				bpf_cpumask_release(attempted);
+				bpf_cpumask_release(topo_cpus);
+				lstat_inc(LSTAT_PREEMPT_XLLC, layer, cctx);
+				return;
+			}
+			bpf_cpumask_clear_cpu(preempt_cpu, topo_cpus);
+			bpf_cpumask_set_cpu(preempt_cpu, attempted);
+			if (bpf_cpumask_empty(cast_mask(topo_cpus)))
+				break;
+		}
+
+		/*
+		 * Finally try across nodes
+		 */
+		if (xnuma_preemption) {
+			if (!all_cpumask) {
+				bpf_cpumask_release(attempted);
+				bpf_cpumask_release(topo_cpus);
+				goto preempt_fail;
+			}
+			bpf_cpumask_copy(topo_cpus, cast_mask(all_cpumask));
+			bpf_cpumask_xor(topo_cpus, cast_mask(attempted), cast_mask(topo_cpus));
+
+			bpf_for(idx, 0, nr_possible_cpus) {
+				s32 preempt_cpu = bpf_cpumask_any_distribute(cast_mask(topo_cpus));
+				if (try_preempt_cpu(preempt_cpu, p, cctx, tctx, layer, false)) {
+					bpf_cpumask_release(attempted);
+					bpf_cpumask_release(topo_cpus);
+					lstat_inc(LSTAT_PREEMPT_XNUMA, layer, cctx);
+					return;
+				}
+				bpf_cpumask_clear_cpu(preempt_cpu, topo_cpus);
+				bpf_cpumask_set_cpu(preempt_cpu, attempted);
+				if (bpf_cpumask_empty(cast_mask(topo_cpus)))
+					break;
+			}
+		}
+		bpf_cpumask_release(attempted);
+		bpf_cpumask_release(topo_cpus);
+	} else {
+
+		bpf_for(idx, 0, nr_possible_cpus) {
+			s32 cand = (preempt_cursor + idx) % nr_possible_cpus;
+
+			if (try_preempt_cpu(cand, p, cctx, tctx, layer, false)) {
+				/*
+				 * Round-robining doesn't have to be strict. Let's
+				 * not bother with atomic ops on $preempt_cursor.
+				 */
+				preempt_cursor = (cand + 1) % nr_possible_cpus;
+				struct cpu_ctx *new_cctx;
+				if ((new_cctx = lookup_cpu_ctx(cand))) {
+					/* count cross-LLC and cross-node preemptions */
+					if (new_cctx->cache_idx != cctx->cache_idx)
+						lstat_inc(LSTAT_PREEMPT_XLLC, layer, cctx);
+					if (new_cctx->node_idx != cctx->node_idx)
+						lstat_inc(LSTAT_PREEMPT_XNUMA, layer, cctx);
+				}
+				return;
+			}
+		}
+	}
+
+preempt_fail:
+	lstat_inc(LSTAT_PREEMPT_FAIL, layer, cctx);
+}
+
 void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 {
 	struct cpu_ctx *cctx;
 	struct task_ctx *tctx;
-	struct cache_ctx *cachec;
-	struct node_ctx *nodec;
 	struct layer *layer;
 	s32 task_cpu = scx_bpf_task_cpu(p);
 	u64 vtime = p->scx.dsq_vtime;
@@ -763,7 +926,7 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 
 		idx = cpu_hi_fallback_dsq_id(task_cpu);
 		scx_bpf_dispatch(p, idx, slice_ns, enq_flags);
-		goto find_cpu;
+		goto preempt;
 	}
 
 	/*
@@ -788,7 +951,7 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 		 */
 		idx = cpu_hi_fallback_dsq_id(task_cpu);
 		scx_bpf_dispatch(p, idx, slice_ns, enq_flags);
-		goto find_cpu;
+		goto preempt;
 	}
 
 	if (disable_topology) {
@@ -800,151 +963,8 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 		scx_bpf_dispatch_vtime(p, idx, layer_slice_ns, vtime, enq_flags);
 	}
 
-find_cpu:
-	if (try_preempt_first) {
-		/*
-		 * @p prefers to preempt its previous CPU even when there are
-		 * other idle CPUs.
-		 */
-		if (try_preempt(task_cpu, p, cctx, tctx, layer, true))
-			return;
-		/* we skipped idle CPU picking in select_cpu. Do it here. */
-		if (pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx, layer))
-			return;
-	} else {
-		/*
-		 * If we aren't in the wakeup path, layered_select_cpu() hasn't
-		 * run and thus we haven't looked for and kicked an idle CPU.
-		 * Let's do it now.
-		 */
-		if (!(enq_flags & SCX_ENQ_WAKEUP) &&
-		    pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx, layer))
-			return;
-		if (!layer->preempt)
-			return;
-		if (try_preempt(task_cpu, p, cctx, tctx, layer, false))
-			return;
-	}
-
-	if (!disable_topology) {
-		if (!(cachec = lookup_cache_ctx(cctx->cache_idx)) ||
-		    !(nodec = lookup_node_ctx(cctx->node_idx)))
-			return;
-
-		struct bpf_cpumask *attempted = bpf_cpumask_create();
-		if (!attempted)
-			return;
-
-		struct bpf_cpumask *topo_cpus = bpf_cpumask_create();
-		if (!topo_cpus) {
-			bpf_cpumask_release(attempted);
-			return;
-		}
-
-		if (!cachec->cpumask) {
-			bpf_cpumask_release(attempted);
-			bpf_cpumask_release(topo_cpus);
-			return;
-		}
-
-		bpf_cpumask_copy(topo_cpus, cast_mask(cachec->cpumask));
-
-		/*
-		 * First try preempting in the local LLC
-		 */
-		bpf_for(idx, 0, cachec->nr_cpus) {
-			s32 preempt_cpu = bpf_cpumask_any_distribute(cast_mask(topo_cpus));
-			trace("PREEMPT attempt on cpu %d from cpu %d",
-			      preempt_cpu, bpf_get_smp_processor_id());
-			if (preempt_cpu > cachec->nr_cpus)
-				break;
-
-			if (try_preempt(preempt_cpu, p, cctx, tctx, layer, false)) {
-				bpf_cpumask_release(attempted);
-				bpf_cpumask_release(topo_cpus);
-				return;
-			}
-			bpf_cpumask_clear_cpu(preempt_cpu, topo_cpus);
-			bpf_cpumask_set_cpu(preempt_cpu, attempted);
-		}
-
-		/*
-		 * Next try node local LLC
-		 */
-		if (!nodec->cpumask) {
-			bpf_cpumask_release(attempted);
-			bpf_cpumask_release(topo_cpus);
-			return;
-		}
-		bpf_cpumask_copy(topo_cpus, cast_mask(nodec->cpumask));
-		bpf_cpumask_xor(topo_cpus, cast_mask(attempted), cast_mask(topo_cpus));
-
-		bpf_for(idx, 0, nodec->nr_cpus) {
-			s32 preempt_cpu = bpf_cpumask_any_distribute(cast_mask(topo_cpus));
-			if (try_preempt(preempt_cpu, p, cctx, tctx, layer, false)) {
-				bpf_cpumask_release(attempted);
-				bpf_cpumask_release(topo_cpus);
-				lstat_inc(LSTAT_PREEMPT_XLLC, layer, cctx);
-				return;
-			}
-			bpf_cpumask_clear_cpu(preempt_cpu, topo_cpus);
-			bpf_cpumask_set_cpu(preempt_cpu, attempted);
-			if (bpf_cpumask_empty(cast_mask(topo_cpus)))
-				break;
-		}
-
-		/*
-		 * Finally try across nodes
-		 */
-		if (xnuma_preemption) {
-			if (!all_cpumask) {
-				bpf_cpumask_release(attempted);
-				bpf_cpumask_release(topo_cpus);
-				return;
-			}
-			bpf_cpumask_copy(topo_cpus, cast_mask(all_cpumask));
-			bpf_cpumask_xor(topo_cpus, cast_mask(attempted), cast_mask(topo_cpus));
-
-			bpf_for(idx, 0, nr_possible_cpus) {
-				s32 preempt_cpu = bpf_cpumask_any_distribute(cast_mask(topo_cpus));
-				if (try_preempt(preempt_cpu, p, cctx, tctx, layer, false)) {
-					bpf_cpumask_release(attempted);
-					bpf_cpumask_release(topo_cpus);
-					lstat_inc(LSTAT_PREEMPT_XNUMA, layer, cctx);
-					return;
-				}
-				bpf_cpumask_clear_cpu(preempt_cpu, topo_cpus);
-				bpf_cpumask_set_cpu(preempt_cpu, attempted);
-				if (bpf_cpumask_empty(cast_mask(topo_cpus)))
-					break;
-			}
-		}
-		bpf_cpumask_release(attempted);
-		bpf_cpumask_release(topo_cpus);
-	} else {
-
-		bpf_for(idx, 0, nr_possible_cpus) {
-			s32 cand = (preempt_cursor + idx) % nr_possible_cpus;
-
-			if (try_preempt(cand, p, cctx, tctx, layer, false)) {
-				/*
-				 * Round-robining doesn't have to be strict. Let's
-				 * not bother with atomic ops on $preempt_cursor.
-				 */
-				preempt_cursor = (cand + 1) % nr_possible_cpus;
-				struct cpu_ctx *new_cctx;
-				if ((new_cctx = lookup_cpu_ctx(cand))) {
-					if (new_cctx->node_idx != nodec->id && new_cctx->node_idx == nodec->id)
-						lstat_inc(LSTAT_PREEMPT_XLLC, layer, cctx);
-					if (new_cctx->node_idx != nodec->id)
-						lstat_inc(LSTAT_PREEMPT_XLLC, layer, cctx);
-				}
-				return;
-			}
-		}
-	}
-
-	lstat_inc(LSTAT_PREEMPT_FAIL, layer, cctx);
+preempt:
+	try_preempt(task_cpu, p, tctx, try_preempt_first, enq_flags);
 }
 
 static bool keep_running(struct cpu_ctx *cctx, struct task_struct *p)