From 1293ae21fc9896516a0c05d25c2499a08d03e421 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 15 Nov 2024 11:46:00 -1000
Subject: [PATCH 1/6] scx_layered: Stat output format update

Rearrange things a bit so that lines are not too long.
---
 scheds/rust/scx_layered/src/stats.rs | 30 ++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/scheds/rust/scx_layered/src/stats.rs b/scheds/rust/scx_layered/src/stats.rs
index 3388bde..5a14701 100644
--- a/scheds/rust/scx_layered/src/stats.rs
+++ b/scheds/rust/scx_layered/src/stats.rs
@@ -170,7 +170,11 @@ impl LayerStats {
             }
         };
         let calc_frac = |a, b| {
-            if b != 0.0 { a / b * 100.0 } else { 0.0 }
+            if b != 0.0 {
+                a / b * 100.0
+            } else {
+                0.0
+            }
         };
 
         Self {
@@ -221,15 +225,22 @@ impl LayerStats {
     pub fn format<W: Write>(&self, w: &mut W, name: &str, header_width: usize) -> Result<()> {
         writeln!(
             w,
             " {:

Date: Fri, 15 Nov 2024 14:30:03 -1000
Subject: [PATCH 2/6] scx_layered: Encapsulate per-task layered cpumask caching
 and fix build warnings while at it.

Maybe we should drop const from cast_mask().
---
 scheds/rust/scx_layered/src/bpf/main.bpf.c | 98 ++++++++++++++--------
 1 file changed, 64 insertions(+), 34 deletions(-)

diff --git a/scheds/rust/scx_layered/src/bpf/main.bpf.c b/scheds/rust/scx_layered/src/bpf/main.bpf.c
index 320f9d0..6210013 100644
--- a/scheds/rust/scx_layered/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -325,7 +325,7 @@ static struct cpumask *lookup_layer_cpumask(int idx)
 	struct layer_cpumask_wrapper *cpumaskw;
 
 	if ((cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &idx))) {
-		return cast_mask(cpumaskw->cpumask);
+		return (struct cpumask *)cpumaskw->cpumask;
 	} else {
 		scx_bpf_error("no layer_cpumask");
 		return NULL;
@@ -400,15 +400,19 @@ int BPF_PROG(sched_tick_fentry)
 	return 0;
 }
 
+struct cached_cpus {
+	s64 id;
+	u64 seq;
+	struct bpf_cpumask __kptr *mask;
+};
+
 struct task_ctx {
 	int pid;
 	int last_cpu;
 	int layer;
 	pid_t last_waker;
 	bool refresh_layer;
-	u64 layer_cpus_seq;
-	struct bpf_cpumask __kptr *layered_cpumask;
-
+	struct cached_cpus layered_cpus;
 	bool all_cpus_allowed;
 	u64 runnable_at;
 	u64 running_at;
@@ -506,23 +510,40 @@ int BPF_PROG(tp_task_rename, struct task_struct *p, const char *buf)
 	return 0;
 }
 
-static void maybe_refresh_layered_cpumask(struct cpumask *layered_cpumask,
-					  struct task_struct *p, struct task_ctx *tctx,
-					  const struct cpumask *layer_cpumask)
+static bool should_refresh_cached_cpus(struct cached_cpus *ccpus, s64 id)
 {
-	u64 layer_seq = layers->cpus_seq;
+	return ccpus->id != id || ccpus->seq != READ_ONCE(layers->cpus_seq);
+}
 
-	if (tctx->layer_cpus_seq == layer_seq)
+static void refresh_cached_cpus(struct cached_cpus *ccpus, s64 id,
+				const struct cpumask *cpus_a,
+				const struct cpumask *cpus_b)
+{
+	u64 seq = READ_ONCE(layers->cpus_seq);
+
+	if (unlikely(!ccpus->mask || !cpus_a || !cpus_b)) {
+		scx_bpf_error("NULL ccpus->mask or cpus_a/b");
 		return;
+	}
 
 	/*
 	 * XXX - We're assuming that the updated @layer_cpumask matching the new
 	 * @layer_seq is visible which may not be true. For now, leave it as-is.
	 * Let's update once BPF grows enough memory ordering constructs.
	 */
-	bpf_cpumask_and((struct bpf_cpumask *)layered_cpumask, layer_cpumask, p->cpus_ptr);
-	tctx->layer_cpus_seq = layer_seq;
-	trace("%s[%d] cpumask refreshed to seq %llu", p->comm, p->pid, layer_seq);
+	bpf_cpumask_and((struct bpf_cpumask *)ccpus->mask, cpus_a, cpus_b);
+	ccpus->id = id;
+	ccpus->seq = seq;
+}
+
+static void maybe_refresh_layered_cpus(struct task_struct *p, struct task_ctx *tctx,
+				       const struct cpumask *layer_cpumask)
+{
+	if (should_refresh_cached_cpus(&tctx->layered_cpus, 0)) {
+		refresh_cached_cpus(&tctx->layered_cpus, 0, p->cpus_ptr, layer_cpumask);
+		trace("%s[%d] layered cpumask refreshed to seq=%llu",
+		      p->comm, p->pid, tctx->layered_cpus.seq);
+	}
 }
 
 static s32 pick_idle_cpu_from(const struct cpumask *cand_cpumask, s32 prev_cpu,
@@ -585,7 +606,7 @@ s32 pick_idle_no_topo(struct task_struct *p, s32 prev_cpu,
 	s32 cpu;
 
 	/* look up cpumasks */
-	if (!(layered_cpumask = cast_mask(tctx->layered_cpumask)) ||
+	if (!(layered_cpumask = (struct cpumask *)(tctx->layered_cpus.mask)) ||
 	    !(layer_cpumask = lookup_layer_cpumask(tctx->layer)))
 		return -1;
 
@@ -596,7 +617,7 @@ s32 pick_idle_no_topo(struct task_struct *p, s32 prev_cpu,
 		return prev_cpu;
 	}
 
-	maybe_refresh_layered_cpumask(layered_cpumask, p, tctx, layer_cpumask);
+	maybe_refresh_layered_cpus(p, tctx, layer_cpumask);
 
 	/*
 	 * If @p prefers to preempt @prev_cpu than finding an idle CPU and
@@ -640,14 +661,14 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
 	s32 cpu;
 
 	/* look up cpumasks */
-	if (!(layered_cpumask = cast_mask(tctx->layered_cpumask)) ||
+	if (!(layered_cpumask = (struct cpumask *)(tctx->layered_cpus.mask)) ||
 	    !(layer_cpumask = lookup_layer_cpumask(tctx->layer)) ||
 	    !(cachec = lookup_cache_ctx(cctx->cache_idx)) ||
 	    !(nodec = lookup_node_ctx(cctx->node_idx)))
 		return -1;
 
-	if (!(cache_cpumask = cast_mask(cachec->cpumask)) ||
-	    !(node_cpumask = cast_mask(nodec->cpumask)))
+	if (!(cache_cpumask = (struct cpumask *)(cachec->cpumask)) ||
+	    !(node_cpumask = (struct cpumask *)(nodec->cpumask)))
 		return -1;
 
 	/* not much to do if bound to a single CPU */
@@ -657,7 +678,7 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
 		return prev_cpu;
 	}
 
-	maybe_refresh_layered_cpumask(layered_cpumask, p, tctx, layer_cpumask);
+	maybe_refresh_layered_cpus(p, tctx, layer_cpumask);
 
 	/*
 	 * If @p prefers to preempt @prev_cpu than finding an idle CPU and
@@ -914,7 +935,6 @@ void try_preempt_no_topo(s32 task_cpu, struct task_struct *p,
 
 	lstat_inc(LSTAT_PREEMPT_FAIL, layer, cctx);
 
-preempt_fail:
 	lstat_inc(LSTAT_PREEMPT_FAIL, layer, cctx);
 }
 
@@ -1855,7 +1875,7 @@ static void maybe_refresh_layer(struct task_struct *p, struct task_ctx *tctx)
 		struct layer *layer = &layers[idx];
 
 		tctx->layer = idx;
-		tctx->layer_cpus_seq = layer->cpus_seq - 1;
+		tctx->layered_cpus.seq = layer->cpus_seq - 1;
 		__sync_fetch_and_add(&layer->nr_tasks, 1);
 		/*
 		 * XXX - To be correct, we'd need to calculate the vtime
@@ -2223,11 +2243,28 @@ void BPF_STRUCT_OPS(layered_cpu_release, s32 cpu,
 	scx_bpf_reenqueue_local();
 }
 
+static int init_cached_cpus(struct cached_cpus *ccpus)
+{
+	struct bpf_cpumask *cpumask;
+
+	ccpus->id = -1;
+
+	if (!(cpumask = bpf_cpumask_create()))
+		return -ENOMEM;
+	if ((cpumask = bpf_kptr_xchg(&ccpus->mask, cpumask))) {
+		/* Should never happen as we just inserted it above. */
+		bpf_cpumask_release(cpumask);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p,
 		   struct scx_init_task_args *args)
 {
 	struct task_ctx *tctx;
-	struct bpf_cpumask *cpumask;
+	s32 ret;
 
 	/*
 	 * XXX - We want BPF_NOEXIST but bpf_map_delete_elem() in .disable() may
@@ -2241,16 +2278,9 @@ s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p,
 		return -ENOMEM;
 	}
 
-	cpumask = bpf_cpumask_create();
-	if (!cpumask)
-		return -ENOMEM;
-
-	cpumask = bpf_kptr_xchg(&tctx->layered_cpumask, cpumask);
-	if (cpumask) {
-		/* Should never happen as we just inserted it above. */
-		bpf_cpumask_release(cpumask);
-		return -EINVAL;
-	}
+	ret = init_cached_cpus(&tctx->layered_cpus);
+	if (ret)
+		return ret;
 
 	tctx->pid = p->pid;
 	tctx->last_cpu = -1;
@@ -2500,10 +2530,10 @@ u64 antistall_set(u64 dsq_id, u64 jiffies_now)
 	first_pass = true;
 look_for_cpu:
 	bpf_for(cpu, 0, nr_possible_cpus) {
-		if (!tctx->layered_cpumask)
+		if (!tctx->layered_cpus.mask)
 			goto unlock;
 
-		if (!bpf_cpumask_test_cpu(cpu, cast_mask(tctx->layered_cpumask)))
+		if (!bpf_cpumask_test_cpu(cpu, cast_mask(tctx->layered_cpus.mask)))
 			continue;
 
 		antistall_dsq = bpf_map_lookup_percpu_elem(&antistall_cpu_dsq, &zero, cpu);
@@ -2608,7 +2638,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 	struct bpf_cpumask *cpumask, *tmp_big_cpumask;
 	struct cpu_ctx *cctx;
 	int i, j, k, nr_online_cpus, ret;
-	u64 *init_antistall_delay, *init_antistall_dsq;
+	u64 *init_antistall_dsq;
 	u32 zero;
 
 	zero = 0;

From 75dd81e3e657bf9250dc3ed11b2c5e25dc61dd8a Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 15 Nov 2024 16:29:47 -1000
Subject: [PATCH 3/6] scx_layered: Improve topology aware select_cpu()

- Cache llc and node masked cpumasks instead of calculating them each time.
  They're recalculated only when the task has migrated across the matching
  boundary and recalculation is necessary.

- llc and node masks should be taken from the wakee's previous CPU, not the
  waker's CPU.

- idle_smtmask is already considered by scx_bpf_pick_idle_cpu(). No need to
  AND it in manually.

- big_cpumask code updated to be simpler. This should also be converted to
  use cached cpumask. big_cpumask portion is not tested.

This brings down CPU utilization of select_cpu() from ~2.7% to ~1.7% while
running rd-hashd at saturation on Ryzen 3900x.
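The caching above boils down to a tag-plus-generation check: each per-task
cached cpumask remembers which LLC or node it was computed for (id) and which
layers->cpus_seq it was computed against (seq), and is rebuilt only when
either changes. Below is a minimal, hypothetical userspace sketch of just that
rule -- a plain integer bitmask stands in for the BPF cpumask kptr, and none
of the names are the scheduler's actual API:

/*
 * Illustration only: the (id, seq) invalidation rule used for the per-task
 * cached cpumasks.  A u64 bitmask stands in for the BPF cpumask kptr.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct cached_set {
	int64_t id;	/* LLC/node the cache was built for, -1 = never built */
	uint64_t seq;	/* generation counter the cache was built against */
	uint64_t mask;	/* cached intersection */
};

static uint64_t cpus_seq;	/* bumped whenever layer cpumasks change */

static bool should_refresh(const struct cached_set *cs, int64_t id)
{
	return cs->id != id || cs->seq != cpus_seq;
}

static void refresh(struct cached_set *cs, int64_t id,
		    uint64_t task_allowed, uint64_t domain_cpus)
{
	cs->mask = task_allowed & domain_cpus;	/* the AND the BPF code performs */
	cs->id = id;
	cs->seq = cpus_seq;
}

int main(void)
{
	struct cached_set llc = { .id = -1 };

	if (should_refresh(&llc, 0))		/* first use: recompute */
		refresh(&llc, 0, 0xff, 0x0f);
	printf("cached mask 0x%llx\n", (unsigned long long)llc.mask);

	if (!should_refresh(&llc, 0))		/* same LLC, same seq: reuse */
		printf("reused without recomputing\n");

	cpus_seq++;				/* layer cpumasks changed */
	if (should_refresh(&llc, 0))
		printf("seq bumped: recompute\n");
	return 0;
}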
---
 scheds/rust/scx_layered/src/bpf/main.bpf.c | 193 ++++++++++++---------
 1 file changed, 107 insertions(+), 86 deletions(-)

diff --git a/scheds/rust/scx_layered/src/bpf/main.bpf.c b/scheds/rust/scx_layered/src/bpf/main.bpf.c
index 6210013..9527918 100644
--- a/scheds/rust/scx_layered/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -226,7 +226,8 @@ static struct node_ctx *lookup_node_ctx(u32 node)
 {
 	struct node_ctx *nodec;
 
-	nodec = bpf_map_lookup_elem(&node_data, &node);
+	if (!(nodec = bpf_map_lookup_elem(&node_data, &node)))
+		scx_bpf_error("no node_ctx");
 	return nodec;
 }
 
@@ -242,7 +243,8 @@ static struct cache_ctx *lookup_cache_ctx(u32 cache_idx)
 {
 	struct cache_ctx *cachec;
 
-	cachec = bpf_map_lookup_elem(&cache_data, &cache_idx);
+	if (!(cachec = bpf_map_lookup_elem(&cache_data, &cache_idx)))
+		scx_bpf_error("no cache_ctx");
 	return cachec;
 }
 
@@ -413,6 +415,8 @@ struct task_ctx {
 	pid_t last_waker;
 	bool refresh_layer;
 	struct cached_cpus layered_cpus;
+	struct cached_cpus layered_cpus_llc;
+	struct cached_cpus layered_cpus_node;
 	bool all_cpus_allowed;
 	u64 runnable_at;
 	u64 running_at;
@@ -510,17 +514,15 @@ int BPF_PROG(tp_task_rename, struct task_struct *p, const char *buf)
 	return 0;
 }
 
-static bool should_refresh_cached_cpus(struct cached_cpus *ccpus, s64 id)
+static bool should_refresh_cached_cpus(struct cached_cpus *ccpus, s64 id, u64 cpus_seq)
 {
-	return ccpus->id != id || ccpus->seq != READ_ONCE(layers->cpus_seq);
+	return ccpus->id != id || ccpus->seq != cpus_seq;
 }
 
-static void refresh_cached_cpus(struct cached_cpus *ccpus, s64 id,
+static void refresh_cached_cpus(struct cached_cpus *ccpus, s64 id, u64 cpus_seq,
 				const struct cpumask *cpus_a,
 				const struct cpumask *cpus_b)
 {
-	u64 seq = READ_ONCE(layers->cpus_seq);
-
 	if (unlikely(!ccpus->mask || !cpus_a || !cpus_b)) {
 		scx_bpf_error("NULL ccpus->mask or cpus_a/b");
 		return;
@@ -533,19 +535,55 @@ static void refresh_cached_cpus(struct cached_cpus *ccpus, s64 id,
 	 */
 	bpf_cpumask_and((struct bpf_cpumask *)ccpus->mask, cpus_a, cpus_b);
 	ccpus->id = id;
-	ccpus->seq = seq;
+	ccpus->seq = cpus_seq;
 }
 
 static void maybe_refresh_layered_cpus(struct task_struct *p, struct task_ctx *tctx,
-				       const struct cpumask *layer_cpumask)
+				       const struct cpumask *layer_cpumask,
+				       u64 cpus_seq)
 {
-	if (should_refresh_cached_cpus(&tctx->layered_cpus, 0)) {
-		refresh_cached_cpus(&tctx->layered_cpus, 0, p->cpus_ptr, layer_cpumask);
+	if (should_refresh_cached_cpus(&tctx->layered_cpus, 0, cpus_seq)) {
+		refresh_cached_cpus(&tctx->layered_cpus, 0, cpus_seq,
+				    p->cpus_ptr, layer_cpumask);
 		trace("%s[%d] layered cpumask refreshed to seq=%llu",
 		      p->comm, p->pid, tctx->layered_cpus.seq);
 	}
 }
 
+static void maybe_refresh_layered_cpus_llc(struct task_struct *p, struct task_ctx *tctx,
+					   const struct cpumask *layer_cpumask,
+					   s32 llc_id, u64 cpus_seq)
+{
+	if (should_refresh_cached_cpus(&tctx->layered_cpus_llc, llc_id, cpus_seq)) {
+		struct cache_ctx *cachec;
+
+		if (!(cachec = lookup_cache_ctx(llc_id)))
+			return;
+		refresh_cached_cpus(&tctx->layered_cpus_llc, llc_id, cpus_seq,
+				    cast_mask(tctx->layered_cpus.mask),
+				    cast_mask(cachec->cpumask));
+		trace("%s[%d] layered llc cpumask refreshed to llc=%d seq=%llu",
+		      p->comm, p->pid, tctx->layered_cpus_llc.id, tctx->layered_cpus_llc.seq);
+	}
+}
+
+static void maybe_refresh_layered_cpus_node(struct task_struct *p, struct task_ctx *tctx,
+					    const struct cpumask *layer_cpumask,
+					    s32 node_id, u64 cpus_seq)
+{
+	if (should_refresh_cached_cpus(&tctx->layered_cpus_node, node_id, cpus_seq)) {
+		struct node_ctx *nodec;
+
+		if (!(nodec = lookup_node_ctx(node_id)))
+			return;
+		refresh_cached_cpus(&tctx->layered_cpus_node, node_id, cpus_seq,
+				    cast_mask(tctx->layered_cpus.mask),
+				    cast_mask(nodec->cpumask));
+		trace("%s[%d] layered node cpumask refreshed to node=%d seq=%llu",
+		      p->comm, p->pid, tctx->layered_cpus_node.id, tctx->layered_cpus_node.seq);
+	}
+}
+
 static s32 pick_idle_cpu_from(const struct cpumask *cand_cpumask, s32 prev_cpu,
 			      const struct cpumask *idle_smtmask, bool pref_idle_smt)
 {
@@ -601,7 +639,7 @@ s32 pick_idle_no_topo(struct task_struct *p, s32 prev_cpu,
 		      struct cpu_ctx *cctx, struct task_ctx *tctx,
 		      struct layer *layer, bool from_selcpu)
 {
-	const struct cpumask *idle_cpumask;
+	const struct cpumask *idle_smtmask;
 	struct cpumask *layer_cpumask, *layered_cpumask;
 	s32 cpu;
 
@@ -617,7 +655,7 @@ s32 pick_idle_no_topo(struct task_struct *p, s32 prev_cpu,
 		return prev_cpu;
 	}
 
-	maybe_refresh_layered_cpus(p, tctx, layer_cpumask);
+	maybe_refresh_layered_cpus(p, tctx, layer_cpumask, READ_ONCE(layers->cpus_seq));
 
 	/*
 	 * If @p prefers to preempt @prev_cpu than finding an idle CPU and
@@ -634,14 +672,14 @@ s32 pick_idle_no_topo(struct task_struct *p, s32 prev_cpu,
 	 * If CPU has SMT, any wholly idle CPU is likely a better pick than
 	 * partially idle @prev_cpu.
 	 */
-	idle_cpumask = scx_bpf_get_idle_smtmask();
+	idle_smtmask = scx_bpf_get_idle_smtmask();
 	if ((cpu = pick_idle_cpu_from(layered_cpumask, prev_cpu,
-				      idle_cpumask,
+				      idle_smtmask,
 				      layer->idle_smt)) >= 0)
 		goto out_put;
 
 out_put:
-	scx_bpf_put_idle_cpumask(idle_cpumask);
+	scx_bpf_put_idle_cpumask(idle_smtmask);
 	return cpu;
 }
 
@@ -653,22 +691,12 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
 	if (disable_topology)
 		return pick_idle_no_topo(p, prev_cpu, cctx, tctx, layer, from_selcpu);
 
-	const struct cpumask *idle_cpumask;
-	struct cache_ctx *cachec;
-	struct node_ctx *nodec;
-	struct bpf_cpumask *pref_idle_cpumask;
-	struct cpumask *layer_cpumask, *layered_cpumask, *cache_cpumask, *node_cpumask;
+	const struct cpumask *idle_smtmask, *layer_cpumask, *cpumask;
+	struct cpu_ctx *prev_cctx;
+	u64 cpus_seq;
 	s32 cpu;
 
-	/* look up cpumasks */
-	if (!(layered_cpumask = (struct cpumask *)(tctx->layered_cpus.mask)) ||
-	    !(layer_cpumask = lookup_layer_cpumask(tctx->layer)) ||
-	    !(cachec = lookup_cache_ctx(cctx->cache_idx)) ||
-	    !(nodec = lookup_node_ctx(cctx->node_idx)))
-		return -1;
-
-	if (!(cache_cpumask = (struct cpumask *)(cachec->cpumask)) ||
-	    !(node_cpumask = (struct cpumask *)(nodec->cpumask)))
+	if (!(layer_cpumask = lookup_layer_cpumask(tctx->layer)))
 		return -1;
 
 	/* not much to do if bound to a single CPU */
@@ -678,7 +706,7 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
 		return prev_cpu;
 	}
 
-	maybe_refresh_layered_cpus(p, tctx, layer_cpumask);
+	cpus_seq = READ_ONCE(layers->cpus_seq);
 
 	/*
 	 * If @p prefers to preempt @prev_cpu than finding an idle CPU and
@@ -686,59 +714,52 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
 	 * @prev_cpu. The enqueue path will also retry to find an idle CPU if
	 * the preemption attempt fails.
	 */
-	if (from_selcpu && should_try_preempt_first(prev_cpu, layer, layered_cpumask)) {
+	maybe_refresh_layered_cpus(p, tctx, layer_cpumask, cpus_seq);
+	if (!(cpumask = cast_mask(tctx->layered_cpus.mask)))
+		return -1;
+	if (from_selcpu && should_try_preempt_first(prev_cpu, layer, cpumask)) {
 		cctx->try_preempt_first = true;
 		return -1;
 	}
 
-	if (layer->idle_smt) {
-		idle_cpumask = scx_bpf_get_idle_smtmask();
-	} else {
-		idle_cpumask = scx_bpf_get_idle_cpumask();
-	}
-
-	pref_idle_cpumask = bpf_cpumask_create();
-
-	if (!pref_idle_cpumask || !idle_cpumask) {
-		cpu = -1;
-		goto out_put;
-	}
+	if (!(prev_cctx = lookup_cpu_ctx(prev_cpu)) ||
+	    !(idle_smtmask = scx_bpf_get_idle_smtmask()))
		return -1;
 
 	/*
 	 * Try a CPU in the current LLC
 	 */
-	bpf_cpumask_copy(pref_idle_cpumask, idle_cpumask);
-	bpf_cpumask_and(pref_idle_cpumask, cache_cpumask,
-			cast_mask(pref_idle_cpumask));
-	bpf_cpumask_and(pref_idle_cpumask, layer_cpumask,
-			cast_mask(pref_idle_cpumask));
-	if ((cpu = pick_idle_cpu_from(cast_mask(pref_idle_cpumask),
-				      prev_cpu, idle_cpumask,
-				      layer->idle_smt)) >= 0)
-		goto out_put;
+	if (nr_llcs > 1) {
+		maybe_refresh_layered_cpus_llc(p, tctx, layer_cpumask,
+					       prev_cctx->cache_idx, cpus_seq);
+		if (!(cpumask = cast_mask(tctx->layered_cpus_llc.mask))) {
+			cpu = -1;
+			goto out_put;
+		}
+		if ((cpu = pick_idle_cpu_from(cpumask, prev_cpu, idle_smtmask,
+					      layer->idle_smt)) >= 0)
+			goto out_put;
+	}
 
 	/*
-	 * If the layer uses BigLittle growth algo try a big cpu
+	 * If the layer uses BigLittle growth algo try a big cpu.
+	 * TODO - Cache layered_cpus.mask & big_cpumask.
 	 */
-	if (has_little_cores
-	    && big_cpumask
-	    && layer->growth_algo == GROWTH_ALGO_BIG_LITTLE)
-	{
-		if (!pref_idle_cpumask || !big_cpumask) {
-			cpu = -1;
-			goto out_put;
-		}
-		bpf_cpumask_copy(pref_idle_cpumask, idle_cpumask);
-		if (!pref_idle_cpumask || !big_cpumask) {
-			cpu = -1;
-			goto out_put;
-		}
-		bpf_cpumask_and(pref_idle_cpumask, cast_mask(big_cpumask),
-				cast_mask(pref_idle_cpumask));
+	if (has_little_cores && big_cpumask &&
+	    layer->growth_algo == GROWTH_ALGO_BIG_LITTLE) {
+		struct bpf_cpumask *tmp_cpumask;
 
-		if ((cpu = pick_idle_cpu_from(cast_mask(pref_idle_cpumask),
-					      prev_cpu, idle_cpumask,
-					      layer->idle_smt)) >= 0)
+		if (!(tmp_cpumask = bpf_cpumask_create())) {
+			cpu = -1;
+			goto out_put;
+		}
+		bpf_cpumask_and(tmp_cpumask, cast_mask(tctx->layered_cpus.mask),
+				cast_mask(big_cpumask));
+		cpu = pick_idle_cpu_from(cast_mask(tmp_cpumask),
+					 prev_cpu, idle_smtmask,
+					 layer->idle_smt);
+		bpf_cpumask_release(tmp_cpumask);
+		if (cpu >= 0)
 			goto out_put;
 	}
 
@@ -746,17 +767,13 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
 	 * Next try a CPU in the current node
 	 */
 	if (nr_nodes > 1) {
-		if (!pref_idle_cpumask || !idle_cpumask) {
+		maybe_refresh_layered_cpus_node(p, tctx, layer_cpumask,
+						prev_cctx->node_idx, cpus_seq);
+		if (!(cpumask = cast_mask(tctx->layered_cpus_node.mask))) {
 			cpu = -1;
 			goto out_put;
 		}
-		bpf_cpumask_copy(pref_idle_cpumask, idle_cpumask);
-		bpf_cpumask_and(pref_idle_cpumask, node_cpumask,
-				cast_mask(pref_idle_cpumask));
-		bpf_cpumask_and(pref_idle_cpumask, layer_cpumask,
-				cast_mask(pref_idle_cpumask));
-		if ((cpu = pick_idle_cpu_from(cast_mask(pref_idle_cpumask),
-					      prev_cpu, idle_cpumask,
+		if ((cpu = pick_idle_cpu_from(cpumask, prev_cpu, idle_smtmask,
 					      layer->idle_smt)) >= 0)
 			goto out_put;
 	}
 
@@ -765,8 +782,7 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
	 * If the layer is an open one, we can try the whole machine.
	 */
 	if (layer->kind != LAYER_KIND_CONFINED &&
-	    ((cpu = pick_idle_cpu_from(p->cpus_ptr, prev_cpu,
-				       idle_cpumask,
+	    ((cpu = pick_idle_cpu_from(p->cpus_ptr, prev_cpu, idle_smtmask,
 				       layer->idle_smt)) >= 0)) {
 		lstat_inc(LSTAT_OPEN_IDLE, layer, cctx);
 		goto out_put;
@@ -775,9 +791,7 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
 	cpu = -1;
 
 out_put:
-	if (pref_idle_cpumask)
-		bpf_cpumask_release(pref_idle_cpumask);
-	scx_bpf_put_idle_cpumask(idle_cpumask);
+	scx_bpf_put_idle_cpumask(idle_smtmask);
 	return cpu;
 }
 
@@ -987,7 +1001,6 @@ void try_preempt(s32 task_cpu, struct task_struct *p, struct task_ctx *tctx,
 	if (!(cachec = lookup_cache_ctx(cctx->cache_idx)) ||
 	    !(nodec = lookup_node_ctx(cctx->node_idx)) ||
 	    !cachec->cpumask) {
-		scx_bpf_error("can't happen");
 		return;
 	}
 
@@ -1876,6 +1889,8 @@ static void maybe_refresh_layer(struct task_struct *p, struct task_ctx *tctx)
 
 		tctx->layer = idx;
 		tctx->layered_cpus.seq = layer->cpus_seq - 1;
+		tctx->layered_cpus_llc.seq = layer->cpus_seq - 1;
+		tctx->layered_cpus_node.seq = layer->cpus_seq - 1;
 		__sync_fetch_and_add(&layer->nr_tasks, 1);
 		/*
 		 * XXX - To be correct, we'd need to calculate the vtime
@@ -2279,6 +2294,12 @@ s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p,
 	}
 
 	ret = init_cached_cpus(&tctx->layered_cpus);
+	if (ret)
+		return ret;
+	ret = init_cached_cpus(&tctx->layered_cpus_llc);
+	if (ret)
+		return ret;
+	ret = init_cached_cpus(&tctx->layered_cpus_node);
 	if (ret)
 		return ret;

From 51d4945d6931e7798790e02e1506c63a2a02a024 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Sat, 16 Nov 2024 05:45:44 -1000
Subject: [PATCH 4/6] scx_layered: Don't call scx_bpf_cpuperf_set() unnecessarily

layered_running() is calling scx_bpf_cpuperf_set() whenever a task of a layer
with a cpuperf setting starts running, which can be every task switch. There's
no reason to repeatedly call it with the same value. Remember the last value
and only call it when the new value is different.

This reduces the bpftop-reported CPU consumption of scx_bpf_cpuperf_set() from
~1.2% to ~0.7% while running rd-hashd at full CPU saturation on Ryzen 3900x.
---
 scheds/rust/scx_layered/src/bpf/intf.h     | 1 +
 scheds/rust/scx_layered/src/bpf/main.bpf.c | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/scheds/rust/scx_layered/src/bpf/intf.h b/scheds/rust/scx_layered/src/bpf/intf.h
index 7ca7be9..927e5c3 100644
--- a/scheds/rust/scx_layered/src/bpf/intf.h
+++ b/scheds/rust/scx_layered/src/bpf/intf.h
@@ -115,6 +115,7 @@ struct cpu_ctx {
 	u32 layer_idx;
 	u32 cache_idx;
 	u32 node_idx;
+	u32 perf;
 };
 
 struct cache_ctx {
diff --git a/scheds/rust/scx_layered/src/bpf/main.bpf.c b/scheds/rust/scx_layered/src/bpf/main.bpf.c
index 9527918..cab67cf 100644
--- a/scheds/rust/scx_layered/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -2136,8 +2136,10 @@ void BPF_STRUCT_OPS(layered_running, struct task_struct *p)
 		}
 	}
 
-	if (layer->perf > 0)
+	if (layer->perf > 0 && cctx->perf != layer->perf) {
 		scx_bpf_cpuperf_set(task_cpu, layer->perf);
+		cctx->perf = layer->perf;
+	}
 
 	cctx->maybe_idle = false;
 }

From 93a0bc9969c9a29798266d0becee81df9295e477 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Sun, 17 Nov 2024 05:54:03 -1000
Subject: [PATCH 5/6] scx_layered: Fix consume_preempting() when --local-llc-iteration

consume_preempting() wasn't testing layer->preempt when --local-llc-iterations
is used, ending up treating all layers as preempting layers and often leading
to HI fallback starvations under saturation. Fix it.
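The fix itself is the single added test shown in the diff below. For clarity,
here is a hypothetical userspace sketch of the loop shape consume_preempting()
should have -- stand-in types and a fake consume helper, not the real BPF DSQ
API:

/*
 * Illustration only: skip non-preempting layers before consuming anything
 * from their per-LLC queues.
 */
#include <stdbool.h>
#include <stdio.h>

struct layer {
	bool preempt;	/* is this a preempting layer? */
	int budget;	/* stand-in for has_budget() */
};

/* stand-in for scx_bpf_consume(layer_dsq_id(idx, llc_id)) */
static bool consume_dsq(int idx, int llc_id)
{
	printf("consumed layer %d on llc %d\n", idx, llc_id);
	return true;
}

static int consume_preempting(struct layer *layers, int nr_layers, int llc_id)
{
	for (int i = 0; i < nr_layers; i++) {
		struct layer *layer = &layers[i];

		/* the missing check: only preempting layers belong here */
		if (!layer->preempt || layer->budget == 0)
			continue;
		if (consume_dsq(i, llc_id))
			return 0;
	}
	return -1;
}

int main(void)
{
	struct layer layers[] = {
		{ .preempt = false, .budget = 1 },	/* now skipped */
		{ .preempt = true,  .budget = 1 },	/* consumed */
	};

	return consume_preempting(layers, 2, 0) ? 1 : 0;
}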
---
 scheds/rust/scx_layered/src/bpf/main.bpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scheds/rust/scx_layered/src/bpf/main.bpf.c b/scheds/rust/scx_layered/src/bpf/main.bpf.c
index cab67cf..42f14e9 100644
--- a/scheds/rust/scx_layered/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -1515,7 +1515,7 @@ __weak int consume_preempting(struct cost *costc, u32 my_llc_id)
 			return -EINVAL;
 		}
 		layer = MEMBER_VPTR(layers, [layer_idx]);
-		if (has_budget(costc, layer) == 0)
+		if (!layer->preempt || has_budget(costc, layer) == 0)
 			continue;
 		dsq_id = layer_dsq_id(layer_idx, llc_id);
 		if (scx_bpf_consume(dsq_id))

From 56e0dae81d6f459e6dc56c6b13a5274620f23f77 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Sun, 17 Nov 2024 06:03:30 -1000
Subject: [PATCH 6/6] scx_layered: Fix linter disagreement

---
 scheds/rust/scx_layered/src/stats.rs | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/scheds/rust/scx_layered/src/stats.rs b/scheds/rust/scx_layered/src/stats.rs
index 5a14701..4ef4d0e 100644
--- a/scheds/rust/scx_layered/src/stats.rs
+++ b/scheds/rust/scx_layered/src/stats.rs
@@ -170,11 +170,7 @@ impl LayerStats {
             }
         };
         let calc_frac = |a, b| {
-            if b != 0.0 {
-                a / b * 100.0
-            } else {
-                0.0
-            }
+            if b != 0.0 { a / b * 100.0 } else { 0.0 }
         };
 
         Self {