Merge pull request #934 from sched-ext/htejun/layered-updates

scx_layered: Cleanups around topology handling
Tejun Heo, 2024-11-18 23:12:48 +00:00, committed by GitHub
commit 88c7d47314
4 changed files with 89 additions and 136 deletions


@ -290,18 +290,7 @@ pub struct Topology {
}
impl Topology {
/// Build a complete host Topology
pub fn new() -> Result<Topology> {
let span = cpus_online()?;
// If the kernel is compiled with CONFIG_NUMA, then build a topology
// from the NUMA hierarchy in sysfs. Otherwise, just make a single
// default node of ID 0 which contains all cores.
let nodes = if Path::new("/sys/devices/system/node").exists() {
create_numa_nodes(&span)?
} else {
create_default_node(&span)?
};
fn instantiate(span: Cpumask, nodes: Vec<Node>) -> Result<Self> {
// For convenient and efficient lookup from the root topology object,
// create two BTreeMaps to the full set of Core and Cpu objects on the
// system. We clone the objects that are located further down in the
@ -332,6 +321,27 @@ impl Topology {
})
}
/// Build a complete host Topology
pub fn new() -> Result<Topology> {
let span = cpus_online()?;
// If the kernel is compiled with CONFIG_NUMA, then build a topology
// from the NUMA hierarchy in sysfs. Otherwise, just make a single
// default node of ID 0 which contains all cores.
let nodes = if Path::new("/sys/devices/system/node").exists() {
create_numa_nodes(&span)?
} else {
create_default_node(&span, false)?
};
Self::instantiate(span, nodes)
}
pub fn with_flattened_llc_node() -> Result<Topology> {
let span = cpus_online()?;
let nodes = create_default_node(&span, true)?;
Self::instantiate(span, nodes)
}
/// Get a slice of the NUMA nodes on the host.
pub fn nodes(&self) -> &[Node] {
&self.nodes
@ -518,6 +528,7 @@ fn create_insert_cpu(
node: &mut Node,
online_mask: &Cpumask,
avg_cpu_freq: Option<(usize, usize)>,
flatten_llc: bool,
) -> Result<()> {
// CPU is offline. The Topology hierarchy is read-only, and assumes
// that hotplug will cause the scheduler to restart. Thus, we can
@ -542,7 +553,7 @@ fn create_insert_cpu(
let l2_id = read_file_usize(&cache_path.join(format!("index{}", 2)).join("id")).unwrap_or(0);
let l3_id = read_file_usize(&cache_path.join(format!("index{}", 3)).join("id")).unwrap_or(0);
// Assume that LLC is always 3.
let llc_id = l3_id;
let llc_id = if flatten_llc { 0 } else { l3_id };
// Min and max frequencies. If the kernel is not compiled with
// CONFIG_CPU_FREQ, just assume 0 for both frequencies.
@ -647,7 +658,7 @@ fn avg_cpu_freq() -> Option<(usize, usize)> {
Some((avg_base_freq / nr_cpus, top_max_freq))
}
fn create_default_node(online_mask: &Cpumask) -> Result<Vec<Node>> {
fn create_default_node(online_mask: &Cpumask, flatten_llc: bool) -> Result<Vec<Node>> {
let mut nodes: Vec<Node> = Vec::with_capacity(1);
let mut node = Node {
@ -678,7 +689,7 @@ fn create_default_node(online_mask: &Cpumask) -> Result<Vec<Node>> {
let avg_cpu_freq = avg_cpu_freq();
let cpu_ids = read_cpu_ids()?;
for cpu_id in cpu_ids.iter() {
create_insert_cpu(*cpu_id, &mut node, &online_mask, avg_cpu_freq)?;
create_insert_cpu(*cpu_id, &mut node, &online_mask, avg_cpu_freq, flatten_llc)?;
}
nodes.push(node);
@ -734,7 +745,7 @@ fn create_numa_nodes(online_mask: &Cpumask) -> Result<Vec<Node>> {
}
};
create_insert_cpu(cpu_id, &mut node, &online_mask, avg_cpu_freq)?;
create_insert_cpu(cpu_id, &mut node, &online_mask, avg_cpu_freq, false)?;
}
nodes.push(node);

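The new with_flattened_llc_node() constructor builds the same single default node as the non-NUMA path, but with every CPU folded into LLC 0. Below is a minimal sketch of how a caller might choose between the two constructors; the build_topology helper is hypothetical and it assumes the crate re-exports Topology as scx_utils::Topology.

use anyhow::Result;
use scx_utils::Topology;

// Hypothetical helper: pick the topology the way scx_layered's userspace
// does further down in this diff.
fn build_topology(disable_topology: bool) -> Result<Topology> {
    if disable_topology {
        // Single default node with a single LLC; the BPF side then
        // sees nr_llcs == 1.
        Topology::with_flattened_llc_node()
    } else {
        // Full NUMA/LLC hierarchy read from sysfs.
        Topology::new()
    }
}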

@ -112,6 +112,7 @@ struct cpu_ctx {
u64 gstats[NR_GSTATS];
u64 lstats[MAX_LAYERS][NR_LSTATS];
u64 ran_current_for;
u64 hi_fallback_dsq_id;
u32 layer_idx;
u32 cache_idx;
u32 node_idx;


@ -115,15 +115,15 @@ u32 rotate_llc_id(u32 base_llc_id, u32 rotation)
// return the dsq id for the layer based on the LLC id.
static __noinline u64 layer_dsq_id(u32 layer_id, u32 llc_id)
{
return (layer_id * nr_llcs) + llc_id;
if (nr_llcs == 1)
return layer_id;
else
return (layer_id * nr_llcs) + llc_id;
}
// XXX - cpu_to_llc_id() must not be inlined to not blow past ins limit when
// topo is enabled but older kernels get confused by RCU state when subprogs are
// called from sleepable functions. Use __always_inline variant from
// layered_init() and __noinline from everywhere else. Remove this once we can
// ignore the older kernels.
static __always_inline u32 __cpu_to_llc_id(s32 cpu_id)
// XXX - older kernels get confused by RCU state when subprogs are called from
// sleepable functions. Use __always_inline.
static __always_inline u32 cpu_to_llc_id(s32 cpu_id)
{
const volatile u32 *llc_ptr;
@ -135,11 +135,6 @@ static __always_inline u32 __cpu_to_llc_id(s32 cpu_id)
return *llc_ptr;
}
static __noinline u32 cpu_to_llc_id(u32 cpu_id)
{
return __cpu_to_llc_id(cpu_id);
}
u32 llc_node_id(u32 llc_id)
{
const volatile u32 *llc_ptr;
@ -162,33 +157,6 @@ static inline bool is_fallback_dsq(u64 dsq_id)
return dsq_id > HI_FALLBACK_DSQ_BASE && dsq_id <= LO_FALLBACK_DSQ;
}
static u64 llc_hi_fallback_dsq_iter_offset(int llc_offset, int idx)
{
int offset = llc_offset + idx;
if (offset >= nr_llcs)
return llc_hi_fallback_dsq_id(offset - nr_llcs);
return llc_hi_fallback_dsq_id(idx + llc_offset);
}
static int llc_iter_cpu_offset(int idx, s32 cpu)
{
int offset;
if (cpu <= 0)
return idx;
offset = (cpu % nr_llcs) + idx;
return offset >= nr_llcs ? offset - nr_llcs : offset;
}
static u64 cpu_hi_fallback_dsq_id(s32 cpu)
{
return llc_hi_fallback_dsq_id(cpu_to_llc_id(cpu));
}
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, u32);
@ -1108,16 +1076,15 @@ preempt_fail:
void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
{
struct cpu_ctx *cctx;
struct cpu_ctx *cctx, *task_cctx;
struct task_ctx *tctx;
struct layer *layer;
s32 task_cpu = scx_bpf_task_cpu(p);
u64 vtime = p->scx.dsq_vtime;
bool try_preempt_first;
u32 idx;
if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) ||
!(layer = lookup_layer(tctx->layer)))
if (!(cctx = lookup_cpu_ctx(-1)) || !(task_cctx = lookup_cpu_ctx(task_cpu)) ||
!(tctx = lookup_task_ctx(p)) || !(layer = lookup_layer(tctx->layer)))
return;
try_preempt_first = cctx->try_preempt_first;
@ -1160,9 +1127,8 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
!bpf_cpumask_test_cpu(task_cpu, layer_cpumask))
lstat_inc(LSTAT_AFFN_VIOL, layer, cctx);
idx = cpu_hi_fallback_dsq_id(task_cpu);
tctx->last_dsq = idx;
scx_bpf_dispatch(p, idx, slice_ns, enq_flags);
tctx->last_dsq = task_cctx->hi_fallback_dsq_id;
scx_bpf_dispatch(p, tctx->last_dsq, slice_ns, enq_flags);
goto preempt;
}
@ -1186,22 +1152,13 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
* to the LLC local HI_FALLBACK_DSQ to avoid this starvation
* issue.
*/
idx = cpu_hi_fallback_dsq_id(task_cpu);
scx_bpf_dispatch(p, idx, slice_ns, enq_flags);
tctx->last_dsq = idx;
tctx->last_dsq = task_cctx->hi_fallback_dsq_id;
scx_bpf_dispatch(p, tctx->last_dsq, slice_ns, enq_flags);
goto preempt;
}
if (disable_topology) {
tctx->last_dsq = tctx->layer;
scx_bpf_dispatch_vtime(p, tctx->layer, slice_ns, vtime, enq_flags);
} else {
u32 llc_id = cpu_to_llc_id(tctx->last_cpu >= 0 ? tctx->last_cpu :
bpf_get_smp_processor_id());
idx = layer_dsq_id(layer->idx, llc_id);
tctx->last_dsq = idx;
scx_bpf_dispatch_vtime(p, idx, slice_ns, vtime, enq_flags);
}
tctx->last_dsq = layer_dsq_id(layer->idx, task_cctx->cache_idx);
scx_bpf_dispatch_vtime(p, tctx->last_dsq, slice_ns, vtime, enq_flags);
preempt:
try_preempt(task_cpu, p, tctx, try_preempt_first, enq_flags);
@ -1247,21 +1204,11 @@ static bool keep_running(struct cpu_ctx *cctx, struct task_struct *p)
* have tasks waiting, keep running it. If there are multiple
* competing preempting layers, this won't work well.
*/
if (disable_topology) {
if (!scx_bpf_dsq_nr_queued(layer->idx)) {
p->scx.slice = slice_ns;
lstat_inc(LSTAT_KEEP, layer, cctx);
return true;
}
} else {
u32 dsq_id = cpu_to_llc_id(tctx->last_cpu >= 0 ?
tctx->last_cpu :
bpf_get_smp_processor_id());
if (!scx_bpf_dsq_nr_queued(dsq_id)) {
p->scx.slice = slice_ns;
lstat_inc(LSTAT_KEEP, layer, cctx);
return true;
}
u32 dsq_id = layer_dsq_id(layer->idx, cctx->cache_idx);
if (!scx_bpf_dsq_nr_queued(dsq_id)) {
p->scx.slice = slice_ns;
lstat_inc(LSTAT_KEEP, layer, cctx);
return true;
}
} else {
const struct cpumask *idle_cpumask = scx_bpf_get_idle_cpumask();
@ -1465,8 +1412,7 @@ void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
return;
}
dsq_id = cpu_hi_fallback_dsq_id(cpu);
if (scx_bpf_consume(dsq_id))
if (scx_bpf_consume(cctx->hi_fallback_dsq_id))
return;
/* consume !open layers second */
@ -1725,8 +1671,6 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
return;
}
u32 my_llc_id = cpu_to_llc_id(cpu);
/*
* If one of the fallback DSQs has the most budget then consume from it
* to prevent starvation.
@ -1741,19 +1685,18 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
}
/* consume preempting layers first */
if (consume_preempting(costc, my_llc_id) == 0)
if (consume_preempting(costc, cctx->cache_idx) == 0)
return;
dsq_id = cpu_hi_fallback_dsq_id(cpu);
if (scx_bpf_consume(dsq_id))
if (scx_bpf_consume(cctx->hi_fallback_dsq_id))
return;
/* consume !open layers second */
if (consume_non_open(costc, cpu, my_llc_id) == 0)
if (consume_non_open(costc, cpu, cctx->cache_idx) == 0)
return;
/* consume !preempting open layers */
if (consume_open_no_preempt(costc, my_llc_id) == 0)
if (consume_open_no_preempt(costc, cctx->cache_idx) == 0)
return;
scx_bpf_consume(LO_FALLBACK_DSQ);
@ -2036,13 +1979,14 @@ static s32 create_cache(u32 cache_id)
return -ENOENT;
}
llc_id = __cpu_to_llc_id(cpu);
llc_id = cpu_to_llc_id(cpu);
if (llc_id != cache_id)
continue;
bpf_cpumask_set_cpu(cpu, cpumask);
cachec->nr_cpus++;
cctx->cache_idx = cache_id;
cctx->hi_fallback_dsq_id = llc_hi_fallback_dsq_id(cache_id);
}
dbg("CFG creating cache %d with %d cpus", cache_id, cachec->nr_cpus);
@ -2496,23 +2440,16 @@ void BPF_STRUCT_OPS(layered_dump, struct scx_dump_ctx *dctx)
continue;
}
if (disable_topology) {
scx_bpf_dump("LAYER[%d][%s] nr_cpus=%u nr_queued=%d -%llums cpus=",
i, layer->name, layer->nr_cpus,
scx_bpf_dsq_nr_queued(i),
dsq_first_runnable_for_ms(i, now));
} else {
bpf_for(j, 0, nr_llcs) {
if (!(layer->cache_mask & (1 << j)))
continue;
bpf_for(j, 0, nr_llcs) {
if (!(layer->cache_mask & (1 << j)))
continue;
idx = layer_dsq_id(layer->idx, j);
scx_bpf_dump("LAYER[%d][%s]DSQ[%d] nr_cpus=%u nr_queued=%d -%llums cpus=",
i, layer->name, idx, layer->nr_cpus,
scx_bpf_dsq_nr_queued(idx),
dsq_first_runnable_for_ms(idx, now));
scx_bpf_dump("\n");
}
idx = layer_dsq_id(layer->idx, j);
scx_bpf_dump("LAYER[%d][%s]DSQ[%d] nr_cpus=%u nr_queued=%d -%llums cpus=",
i, layer->name, idx, layer->nr_cpus,
scx_bpf_dsq_nr_queued(idx),
dsq_first_runnable_for_ms(idx, now));
scx_bpf_dump("\n");
}
dump_layer_cpumask(i);
scx_bpf_dump("\n");
@ -2638,8 +2575,8 @@ unlock:
*/
static bool antistall_scan(void)
{
s32 cpu;
u64 dsq_id;
s32 llc;
u64 layer_id;
u64 jiffies_now;
if (!enable_antistall)
@ -2647,14 +2584,12 @@ static bool antistall_scan(void)
jiffies_now = bpf_jiffies64();
bpf_for(dsq_id, 0, nr_layers) {
antistall_set(dsq_id, jiffies_now);
}
bpf_for(layer_id, 0, nr_layers)
bpf_for(llc, 0, nr_llcs)
antistall_set(layer_dsq_id(layer_id, llc), jiffies_now);
bpf_for(cpu, 0, nr_possible_cpus) {
dsq_id = cpu_hi_fallback_dsq_id(cpu);
antistall_set(dsq_id, jiffies_now);
}
bpf_for(llc, 0, nr_llcs)
antistall_set(llc_hi_fallback_dsq_id(llc), jiffies_now);
antistall_set(LO_FALLBACK_DSQ, jiffies_now);

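Throughout the BPF side, per-layer DSQs are numbered layer_id * nr_llcs + llc_id, and a flattened (single-LLC) topology degenerates to one DSQ per layer whose id equals layer_id, which is why the separate disable_topology branches above could be dropped. The following Rust rendering of that numbering is for illustration only; it mirrors the C helper layer_dsq_id() and is not part of the commit.

fn layer_dsq_id(layer_id: u64, llc_id: u64, nr_llcs: u64) -> u64 {
    if nr_llcs == 1 {
        // Flattened topology: one DSQ per layer, matching the ids the old
        // disable_topology path dispatched to.
        layer_id
    } else {
        // Topology-aware: one DSQ per (layer, LLC) pair.
        layer_id * nr_llcs + llc_id
    }
}

fn main() {
    // Layer 2 on LLC 1 of a 4-LLC machine maps to DSQ 9; the same layer on
    // a flattened single-LLC topology maps to DSQ 2.
    assert_eq!(layer_dsq_id(2, 1, 4), 9);
    assert_eq!(layer_dsq_id(2, 0, 1), 2);
}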

@ -1336,7 +1336,7 @@ impl<'a> Scheduler<'a> {
layer.perf = u32::try_from(*perf)?;
layer.node_mask = nodemask_from_nodes(nodes) as u64;
for topo_node in topo.nodes() {
if !nodes.contains(&topo_node.id()) {
if !nodes.is_empty() && !nodes.contains(&topo_node.id()) {
continue;
}
layer.cache_mask |= cachemask_from_llcs(&topo_node.llcs()) as u64;
@ -1396,24 +1396,30 @@ impl<'a> Scheduler<'a> {
open_object: &'a mut MaybeUninit<OpenObject>,
) -> Result<Self> {
let nr_layers = layer_specs.len();
let topo = Topology::new()?;
let cpu_pool = CpuPool::new(&topo)?;
let mut disable_topology = opts.disable_topology.unwrap_or(false);
let disable_topology = if let Some(val) = opts.disable_topology {
val
let topo = if disable_topology {
Topology::with_flattened_llc_node()?
} else {
let val = if topo.nodes().len() > 1 {
false
} else {
topo.nodes().iter().all(|n| n.llcs().len() <= 1)
Topology::new()?
};
if !disable_topology {
if topo.nodes().len() == 1 && topo.nodes()[0].llcs().len() == 1 {
disable_topology = true;
};
info!(
"Topology awareness not specified, selecting {} based on hardware",
if val { "disabled" } else { "enabled" }
if disable_topology {
"disabled"
} else {
"enabled"
}
);
val
};
let cpu_pool = CpuPool::new(&topo)?;
// If disabling topology awareness clear out any set NUMA/LLC configs and
// it will fallback to using all cores.
let layer_specs: Vec<_> = if disable_topology {