Merge 56e0dae81d into a7fcda82cc

2024-11-21 18:41:47 +00:00 · 2024-11-18 10:27:48 +01:00 · 2024-11-18 10:27:48 +01:00 · fdca0059fc
commit fdca0059fc
parent a7fcda82cc 56e0dae81d
3 changed files with 181 additions and 113 deletions
--- a/scheds/rust/scx_layered/src/bpf/intf.h
+++ b/scheds/rust/scx_layered/src/bpf/intf.h
@ -115,6 +115,7 @@ struct cpu_ctx {
 	u32			layer_idx;
 	u32			cache_idx;
 	u32			node_idx;
+	u32			perf;
 };

 struct cache_ctx {
--- a/scheds/rust/scx_layered/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_layered/src/bpf/main.bpf.c
@ -226,7 +226,8 @@ static struct node_ctx *lookup_node_ctx(u32 node)
 {
 	struct node_ctx *nodec;

-	nodec = bpf_map_lookup_elem(&node_data, &node);
+	if (!(nodec = bpf_map_lookup_elem(&node_data, &node)))
+		scx_bpf_error("no node_ctx");
 	return nodec;
 }

@ -242,7 +243,8 @@ static struct cache_ctx *lookup_cache_ctx(u32 cache_idx)
 {
 	struct cache_ctx *cachec;

-	cachec = bpf_map_lookup_elem(&cache_data, &cache_idx);
+	if (!(cachec = bpf_map_lookup_elem(&cache_data, &cache_idx)))
+		scx_bpf_error("no cache_ctx");
 	return cachec;
 }

@ -325,7 +327,7 @@ static struct cpumask *lookup_layer_cpumask(int idx)
 	struct layer_cpumask_wrapper *cpumaskw;

 	if ((cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &idx))) {
-		return cast_mask(cpumaskw->cpumask);
+		return (struct cpumask *)cpumaskw->cpumask;
 	} else {
 		scx_bpf_error("no layer_cpumask");
 		return NULL;
@ -400,15 +402,21 @@ int BPF_PROG(sched_tick_fentry)
 	return 0;
 }

+struct cached_cpus {
+	s64			id;
+	u64			seq;
+	struct bpf_cpumask __kptr *mask;
+};
+
 struct task_ctx {
 	int			pid;
 	int			last_cpu;
 	int			layer;
 	pid_t			last_waker;
 	bool			refresh_layer;
-	u64			layer_cpus_seq;
-	struct bpf_cpumask __kptr *layered_cpumask;
-
+	struct cached_cpus	layered_cpus;
+	struct cached_cpus	layered_cpus_llc;
+	struct cached_cpus	layered_cpus_node;
 	bool			all_cpus_allowed;
 	u64			runnable_at;
 	u64			running_at;
@ -506,23 +514,74 @@ int BPF_PROG(tp_task_rename, struct task_struct *p, const char *buf)
 	return 0;
 }

-static void maybe_refresh_layered_cpumask(struct cpumask *layered_cpumask,
-					  struct task_struct *p, struct task_ctx *tctx,
-					  const struct cpumask *layer_cpumask)
+static bool should_refresh_cached_cpus(struct cached_cpus *ccpus, s64 id, u64 cpus_seq)
 {
-	u64 layer_seq = layers->cpus_seq;
+	return ccpus->id != id || ccpus->seq != cpus_seq;
+}

-	if (tctx->layer_cpus_seq == layer_seq)
+static void refresh_cached_cpus(struct cached_cpus *ccpus, s64 id, u64 cpus_seq,
+				const struct cpumask *cpus_a,
+				const struct cpumask *cpus_b)
+{
+	if (unlikely(!ccpus->mask || !cpus_a || !cpus_b)) {
+		scx_bpf_error("NULL ccpus->mask or cpus_a/b");
 		return;
+	}

 	/*
 	 * XXX - We're assuming that the updated @layer_cpumask matching the new
 	 * @layer_seq is visible which may not be true. For now, leave it as-is.
 	 * Let's update once BPF grows enough memory ordering constructs.
 	 */
-	bpf_cpumask_and((struct bpf_cpumask *)layered_cpumask, layer_cpumask, p->cpus_ptr);
-	tctx->layer_cpus_seq = layer_seq;
-	trace("%s[%d] cpumask refreshed to seq %llu", p->comm, p->pid, layer_seq);
+	bpf_cpumask_and((struct bpf_cpumask *)ccpus->mask, cpus_a, cpus_b);
+	ccpus->id = id;
+	ccpus->seq = cpus_seq;
+}
+
+static void maybe_refresh_layered_cpus(struct task_struct *p, struct task_ctx *tctx,
+				       const struct cpumask *layer_cpumask,
+				       u64 cpus_seq)
+{
+	if (should_refresh_cached_cpus(&tctx->layered_cpus, 0, cpus_seq)) {
+		refresh_cached_cpus(&tctx->layered_cpus, 0, cpus_seq,
+				    p->cpus_ptr, layer_cpumask);
+		trace("%s[%d] layered cpumask refreshed to seq=%llu",
+		      p->comm, p->pid, tctx->layered_cpus.seq);
+	}
+}
+
+static void maybe_refresh_layered_cpus_llc(struct task_struct *p, struct task_ctx *tctx,
+					   const struct cpumask *layer_cpumask,
+					   s32 llc_id, u64 cpus_seq)
+{
+	if (should_refresh_cached_cpus(&tctx->layered_cpus_llc, llc_id, cpus_seq)) {
+		struct cache_ctx *cachec;
+
+		if (!(cachec = lookup_cache_ctx(llc_id)))
+			return;
+		refresh_cached_cpus(&tctx->layered_cpus_llc, llc_id, cpus_seq,
+				    cast_mask(tctx->layered_cpus.mask),
+				    cast_mask(cachec->cpumask));
+		trace("%s[%d] layered llc cpumask refreshed to llc=%d seq=%llu",
+		      p->comm, p->pid, tctx->layered_cpus_llc.id, tctx->layered_cpus_llc.seq);
+	}
+}
+
+static void maybe_refresh_layered_cpus_node(struct task_struct *p, struct task_ctx *tctx,
+					    const struct cpumask *layer_cpumask,
+					    s32 node_id, u64 cpus_seq)
+{
+	if (should_refresh_cached_cpus(&tctx->layered_cpus_node, node_id, cpus_seq)) {
+		struct node_ctx *nodec;
+
+		if (!(nodec = lookup_node_ctx(node_id)))
+			return;
+		refresh_cached_cpus(&tctx->layered_cpus_node, node_id, cpus_seq,
+				    cast_mask(tctx->layered_cpus.mask),
+				    cast_mask(nodec->cpumask));
+		trace("%s[%d] layered node cpumask refreshed to node=%d seq=%llu",
+		      p->comm, p->pid, tctx->layered_cpus_node.id, tctx->layered_cpus_node.seq);
+	}
 }

 static s32 pick_idle_cpu_from(const struct cpumask *cand_cpumask, s32 prev_cpu,
@ -580,12 +639,12 @@ s32 pick_idle_no_topo(struct task_struct *p, s32 prev_cpu,
 		      struct cpu_ctx *cctx, struct task_ctx *tctx,
 		      struct layer *layer, bool from_selcpu)
 {
-	const struct cpumask *idle_cpumask;
+	const struct cpumask *idle_smtmask;
 	struct cpumask *layer_cpumask, *layered_cpumask;
 	s32 cpu;

 	/* look up cpumasks */
-	if (!(layered_cpumask = cast_mask(tctx->layered_cpumask)) ||
+	if (!(layered_cpumask = (struct cpumask *)(tctx->layered_cpus.mask)) ||
 	    !(layer_cpumask = lookup_layer_cpumask(tctx->layer)))
 			return -1;

@ -596,7 +655,7 @@ s32 pick_idle_no_topo(struct task_struct *p, s32 prev_cpu,
 		return prev_cpu;
 	}

-	maybe_refresh_layered_cpumask(layered_cpumask, p, tctx, layer_cpumask);
+	maybe_refresh_layered_cpus(p, tctx, layer_cpumask, READ_ONCE(layers->cpus_seq));

 	/*
 	 * If @p prefers to preempt @prev_cpu than finding an idle CPU and
@ -613,14 +672,14 @@ s32 pick_idle_no_topo(struct task_struct *p, s32 prev_cpu,
 	 * If CPU has SMT, any wholly idle CPU is likely a better pick than
 	 * partially idle @prev_cpu.
 	 */
-	idle_cpumask = scx_bpf_get_idle_smtmask();
+	idle_smtmask = scx_bpf_get_idle_smtmask();
 	if ((cpu = pick_idle_cpu_from(layered_cpumask, prev_cpu,
-				      idle_cpumask,
+				      idle_smtmask,
 				      layer->idle_smt)) >= 0)
 		goto out_put;

 out_put:
-	scx_bpf_put_idle_cpumask(idle_cpumask);
+	scx_bpf_put_idle_cpumask(idle_smtmask);
 	return cpu;
 }

@ -632,22 +691,12 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
 	if (disable_topology)
 		return pick_idle_no_topo(p, prev_cpu, cctx, tctx, layer, from_selcpu);

-	const struct cpumask *idle_cpumask;
-	struct cache_ctx *cachec;
-	struct node_ctx *nodec;
-	struct bpf_cpumask *pref_idle_cpumask;
-	struct cpumask *layer_cpumask, *layered_cpumask, *cache_cpumask, *node_cpumask;
+	const struct cpumask *idle_smtmask, *layer_cpumask, *cpumask;
+	struct cpu_ctx *prev_cctx;
+	u64 cpus_seq;
 	s32 cpu;

-	/* look up cpumasks */
-	if (!(layered_cpumask = cast_mask(tctx->layered_cpumask)) ||
-	    !(layer_cpumask = lookup_layer_cpumask(tctx->layer)) ||
-	    !(cachec = lookup_cache_ctx(cctx->cache_idx)) ||
-	    !(nodec = lookup_node_ctx(cctx->node_idx)))
-			return -1;
-
-	if (!(cache_cpumask = cast_mask(cachec->cpumask)) ||
-	    !(node_cpumask = cast_mask(nodec->cpumask)))
+	if (!(layer_cpumask = lookup_layer_cpumask(tctx->layer)))
 		return -1;

 	/* not much to do if bound to a single CPU */
@ -657,7 +706,7 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
 		return prev_cpu;
 	}

-	maybe_refresh_layered_cpumask(layered_cpumask, p, tctx, layer_cpumask);
+	cpus_seq = READ_ONCE(layers->cpus_seq);

 	/*
 	 * If @p prefers to preempt @prev_cpu than finding an idle CPU and
@ -665,59 +714,52 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
 	 * @prev_cpu. The enqueue path will also retry to find an idle CPU if
 	 * the preemption attempt fails.
 	 */
-	if (from_selcpu && should_try_preempt_first(prev_cpu, layer, layered_cpumask)) {
+	maybe_refresh_layered_cpus(p, tctx, layer_cpumask, cpus_seq);
+	if (!(cpumask = cast_mask(tctx->layered_cpus.mask)))
+		return -1;
+	if (from_selcpu && should_try_preempt_first(prev_cpu, layer, cpumask)) {
 		cctx->try_preempt_first = true;
 		return -1;
 	}

-	if (layer->idle_smt) {
-		idle_cpumask = scx_bpf_get_idle_smtmask();
-	} else {
-		idle_cpumask = scx_bpf_get_idle_cpumask();
-	}
-
-	pref_idle_cpumask = bpf_cpumask_create();
-
-	if (!pref_idle_cpumask || !idle_cpumask) {
-		cpu = -1;
-		goto out_put;
-	}
+	if (!(prev_cctx = lookup_cpu_ctx(prev_cpu)) ||
+	    !(idle_smtmask = scx_bpf_get_idle_smtmask()))
+		return -1;

 	/*
 	 * Try a CPU in the current LLC
 	 */
-	bpf_cpumask_copy(pref_idle_cpumask, idle_cpumask);
-	bpf_cpumask_and(pref_idle_cpumask, cache_cpumask,
-			cast_mask(pref_idle_cpumask));
-	bpf_cpumask_and(pref_idle_cpumask, layer_cpumask,
-			cast_mask(pref_idle_cpumask));
-	if ((cpu = pick_idle_cpu_from(cast_mask(pref_idle_cpumask),
-				      prev_cpu, idle_cpumask,
+	if (nr_llcs > 1) {
+		maybe_refresh_layered_cpus_llc(p, tctx, layer_cpumask,
+					       prev_cctx->cache_idx, cpus_seq);
+		if (!(cpumask = cast_mask(tctx->layered_cpus_llc.mask))) {
+			cpu = -1;
+			goto out_put;
+		}
+		if ((cpu = pick_idle_cpu_from(cpumask, prev_cpu, idle_smtmask,
 					      layer->idle_smt)) >= 0)
 			goto out_put;
+	}

 	/*
-	 * If the layer uses BigLittle growth algo try a big cpu
+	 * If the layer uses BigLittle growth algo try a big cpu.
+	 * TODO - Cache layered_cpus.mask & big_cpumask.
 	 */
-	if (has_little_cores
-	    && big_cpumask
-	    && layer->growth_algo == GROWTH_ALGO_BIG_LITTLE)
-	{
-		if (!pref_idle_cpumask || !big_cpumask) {
-			cpu = -1;
-			goto out_put;
-		}
-		bpf_cpumask_copy(pref_idle_cpumask, idle_cpumask);
-		if (!pref_idle_cpumask || !big_cpumask) {
-			cpu = -1;
-			goto out_put;
-		}
-		bpf_cpumask_and(pref_idle_cpumask, cast_mask(big_cpumask),
-				cast_mask(pref_idle_cpumask));
+	if (has_little_cores && big_cpumask &&
+	    layer->growth_algo == GROWTH_ALGO_BIG_LITTLE) {
+		struct bpf_cpumask *tmp_cpumask;

-		if ((cpu = pick_idle_cpu_from(cast_mask(pref_idle_cpumask),
-					      prev_cpu, idle_cpumask,
-					      layer->idle_smt)) >= 0)
+		if (!(tmp_cpumask = bpf_cpumask_create())) {
+			cpu = -1;
+			goto out_put;
+		}
+		bpf_cpumask_and(tmp_cpumask, cast_mask(tctx->layered_cpus.mask),
+				cast_mask(big_cpumask));
+		cpu = pick_idle_cpu_from(cast_mask(tmp_cpumask),
+					 prev_cpu, idle_smtmask,
+					 layer->idle_smt);
+		bpf_cpumask_release(tmp_cpumask);
+		if (cpu >= 0)
 			goto out_put;
 	}

@ -725,17 +767,13 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
 	 * Next try a CPU in the current node
 	 */
 	if (nr_nodes > 1) {
-		if (!pref_idle_cpumask || !idle_cpumask) {
+		maybe_refresh_layered_cpus_node(p, tctx, layer_cpumask,
+						prev_cctx->node_idx, cpus_seq);
+		if (!(cpumask = cast_mask(tctx->layered_cpus_node.mask))) {
 			cpu = -1;
 			goto out_put;
 		}
-		bpf_cpumask_copy(pref_idle_cpumask, idle_cpumask);
-		bpf_cpumask_and(pref_idle_cpumask, node_cpumask,
-				cast_mask(pref_idle_cpumask));
-		bpf_cpumask_and(pref_idle_cpumask, layer_cpumask,
-				cast_mask(pref_idle_cpumask));
-		if ((cpu = pick_idle_cpu_from(cast_mask(pref_idle_cpumask),
-					      prev_cpu, idle_cpumask,
+		if ((cpu = pick_idle_cpu_from(cpumask, prev_cpu, idle_smtmask,
 					      layer->idle_smt)) >= 0)
 			goto out_put;
 	}
@ -744,8 +782,7 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
 	 * If the layer is an open one, we can try the whole machine.
 	 */
 	if (layer->kind != LAYER_KIND_CONFINED &&
-	    ((cpu = pick_idle_cpu_from(p->cpus_ptr, prev_cpu,
-				       idle_cpumask,
+	    ((cpu = pick_idle_cpu_from(p->cpus_ptr, prev_cpu, idle_smtmask,
 				       layer->idle_smt)) >= 0)) {
 		lstat_inc(LSTAT_OPEN_IDLE, layer, cctx);
 		goto out_put;
@ -754,9 +791,7 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
 	cpu = -1;

 out_put:
-	if (pref_idle_cpumask)
-		bpf_cpumask_release(pref_idle_cpumask);
-	scx_bpf_put_idle_cpumask(idle_cpumask);
+	scx_bpf_put_idle_cpumask(idle_smtmask);
 	return cpu;
 }

@ -914,7 +949,6 @@ void try_preempt_no_topo(s32 task_cpu, struct task_struct *p,

 	lstat_inc(LSTAT_PREEMPT_FAIL, layer, cctx);

-preempt_fail:
 	lstat_inc(LSTAT_PREEMPT_FAIL, layer, cctx);
 }

@ -967,7 +1001,6 @@ void try_preempt(s32 task_cpu, struct task_struct *p, struct task_ctx *tctx,
 	if (!(cachec = lookup_cache_ctx(cctx->cache_idx)) ||
 	    !(nodec = lookup_node_ctx(cctx->node_idx)) ||
 	    !cachec->cpumask) {
-		scx_bpf_error("can't happen");
 		return;
 	}

@ -1482,7 +1515,7 @@ __weak int consume_preempting(struct cost *costc, u32 my_llc_id)
 					return -EINVAL;
 				}
 				layer = MEMBER_VPTR(layers, [layer_idx]);
-				if (has_budget(costc, layer) == 0)
+				if (!layer->preempt || has_budget(costc, layer) == 0)
 					continue;
 				dsq_id = layer_dsq_id(layer_idx, llc_id);
 				if (scx_bpf_consume(dsq_id))
@ -1855,7 +1888,9 @@ static void maybe_refresh_layer(struct task_struct *p, struct task_ctx *tctx)
 		struct layer *layer = &layers[idx];

 		tctx->layer = idx;
-		tctx->layer_cpus_seq = layer->cpus_seq - 1;
+		tctx->layered_cpus.seq = layer->cpus_seq - 1;
+		tctx->layered_cpus_llc.seq = layer->cpus_seq - 1;
+		tctx->layered_cpus_node.seq = layer->cpus_seq - 1;
 		__sync_fetch_and_add(&layer->nr_tasks, 1);
 		/*
 		 * XXX - To be correct, we'd need to calculate the vtime
@ -2101,8 +2136,10 @@ void BPF_STRUCT_OPS(layered_running, struct task_struct *p)
 		}
 	}

-	if (layer->perf > 0)
+	if (layer->perf > 0 && cctx->perf != layer->perf) {
 		scx_bpf_cpuperf_set(task_cpu, layer->perf);
+		cctx->perf = layer->perf;
+	}

 	cctx->maybe_idle = false;
 }
@ -2223,11 +2260,28 @@ void BPF_STRUCT_OPS(layered_cpu_release, s32 cpu,
 	scx_bpf_reenqueue_local();
 }

+static int init_cached_cpus(struct cached_cpus *ccpus)
+{
+	struct bpf_cpumask *cpumask;
+
+	ccpus->id = -1;
+
+	if (!(cpumask = bpf_cpumask_create()))
+		return -ENOMEM;
+	if ((cpumask = bpf_kptr_xchg(&ccpus->mask, cpumask))) {
+		/* Should never happen as we just inserted it above. */
+		bpf_cpumask_release(cpumask);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p,
 		   struct scx_init_task_args *args)
 {
 	struct task_ctx *tctx;
-	struct bpf_cpumask *cpumask;
+	s32 ret;

 	/*
 	 * XXX - We want BPF_NOEXIST but bpf_map_delete_elem() in .disable() may
@ -2241,16 +2295,15 @@ s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p,
 		return -ENOMEM;
 	}

-	cpumask = bpf_cpumask_create();
-	if (!cpumask)
-		return -ENOMEM;
-
-	cpumask = bpf_kptr_xchg(&tctx->layered_cpumask, cpumask);
-	if (cpumask) {
-		/* Should never happen as we just inserted it above. */
-		bpf_cpumask_release(cpumask);
-		return -EINVAL;
-	}
+	ret = init_cached_cpus(&tctx->layered_cpus);
+	if (ret)
+		return ret;
+	ret = init_cached_cpus(&tctx->layered_cpus_llc);
+	if (ret)
+		return ret;
+	ret = init_cached_cpus(&tctx->layered_cpus_node);
+	if (ret)
+		return ret;

 	tctx->pid = p->pid;
 	tctx->last_cpu = -1;
@ -2500,10 +2553,10 @@ u64 antistall_set(u64 dsq_id, u64 jiffies_now)
 		first_pass = true;
 look_for_cpu:
 		bpf_for(cpu, 0, nr_possible_cpus) {
-			if (!tctx->layered_cpumask)
+			if (!tctx->layered_cpus.mask)
 				goto unlock;

-			if (!bpf_cpumask_test_cpu(cpu,   cast_mask(tctx->layered_cpumask)))
+			if (!bpf_cpumask_test_cpu(cpu, cast_mask(tctx->layered_cpus.mask)))
 				continue;

 			antistall_dsq = bpf_map_lookup_percpu_elem(&antistall_cpu_dsq, &zero, cpu);
@ -2608,7 +2661,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 	struct bpf_cpumask *cpumask, *tmp_big_cpumask;
 	struct cpu_ctx *cctx;
 	int i, j, k, nr_online_cpus, ret;
-	u64 *init_antistall_delay, *init_antistall_dsq;
+	u64 *init_antistall_dsq;
 	u32 zero;

 	zero = 0;
--- a/scheds/rust/scx_layered/src/stats.rs
+++ b/scheds/rust/scx_layered/src/stats.rs
@ -221,15 +221,22 @@ impl LayerStats {
    pub fn format<W: Write>(&self, w: &mut W, name: &str, header_width: usize) -> Result<()> {
        writeln!(
            w,
-            "  {:<width$}: util/dcycle/frac/{:5.1}/{:5.1}/{:7.1} load/load_frac_adj/frac={:9.2}/{:2.2}/{:5.1} tasks={:6}",
+            "  {:<width$}: util/dcycle/frac={:5.1}/{:5.1}/{:7.1} tasks={:6}",
            name,
            self.util,
            self.dcycle,
            self.util_frac,
+            self.tasks,
+            width = header_width,
+        )?;
+
+        writeln!(
+            w,
+            "  {:<width$}  load/load_frac_adj/frac={:9.2}/{:2.2}/{:5.1}",
+            "",
            self.load,
            self.load_frac_adj,
            self.load_frac,
-            self.tasks,
            width = header_width,
        )?;

@ -281,7 +288,7 @@ impl LayerStats {

        writeln!(
            w,
-            "  {:<width$}  preempt/first/xllc/xnuma/idle/fail={}/{}/{}/{}/{}/{} min_exec={}/{:7.2}ms, slice={}ms",
+            "  {:<width$}  preempt/first/xllc/xnuma/idle/fail={}/{}/{}/{}/{}/{}",
            "",
            fmt_pct(self.preempt),
            fmt_pct(self.preempt_first),
@ -289,10 +296,17 @@ impl LayerStats {
            fmt_pct(self.preempt_xnuma),
            fmt_pct(self.preempt_idle),
            fmt_pct(self.preempt_fail),
+            width = header_width,
+        )?;
+
+        writeln!(
+            w,
+            "  {:<width$}  slice={}ms min_exec={}/{:7.2}ms",
+            "",
+            self.slice_us as f64 / 1000.0,
            fmt_pct(self.min_exec),
            self.min_exec_us as f64 / 1000.0,
-            self.slice_us as f64 / 1000.0,
-            width = header_width,
+            width = header_width
        )?;

        let mut cpus = self