mirror of https://github.com/sched-ext/scx.git
scx_layered: Add weighted layer DSQ iteration
Add a flag to control DSQ iteration across layers by layer weight. This
helps prevent starvation by iterating over layers with the lowest weight
first.

Signed-off-by: Daniel Hodges <hodges.daniel.scott@gmail.com>
commit f3b3d4f19c
parent bd75ac8dbf
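For context on the mechanism: userspace sorts the layer indices by ascending weight and writes the result into the layer_iteration_order array, which the BPF side reads through iter_layer_weight_ctx() whenever layer_weight_dsq_iter is set. The standalone Rust sketch below mirrors only that sorting step; the function name weighted_iteration_order and the example weights are illustrative and not part of the patch.

    // Standalone sketch of the weight-ordered iteration index (illustrative only).
    // Lowest-weight layers come first, matching the sort the patch performs before
    // populating skel.maps.rodata_data.layer_iteration_order.
    fn weighted_iteration_order(layer_weights: &[usize]) -> Vec<u32> {
        let mut order: Vec<usize> = (0..layer_weights.len()).collect();
        // Sort layer indices by ascending weight so low-weight layers are consumed first.
        order.sort_by(|i, j| layer_weights[*i].cmp(&layer_weights[*j]));
        order.into_iter().map(|i| i as u32).collect()
    }

    fn main() {
        // Hypothetical weights for three layers; not taken from any real config.
        let layer_weights = vec![100, 10, 50];
        // Prints [1, 2, 0]: layer 1 (weight 10) first, then layer 2 (50), then layer 0 (100).
        println!("{:?}", weighted_iteration_order(&layer_weights));
    }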
@@ -32,8 +32,10 @@ const volatile u32 nr_llcs = 32; /* !0 for veristat, set during init */
 const volatile bool smt_enabled = true;
 const volatile bool disable_topology = false;
 const volatile bool xnuma_preemption = false;
+const volatile bool layer_weight_dsq_iter = false;
 const volatile s32 __sibling_cpu[MAX_CPUS];
 const volatile unsigned char all_cpus[MAX_CPUS_U8];
+const volatile u32 layer_iteration_order[MAX_LAYERS];
 
 private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask;
 struct layer layers[MAX_LAYERS];
@@ -79,6 +81,12 @@ static u32 cpu_ctx_layer_idx_inc(struct cpu_ctx *cctx)
 	return cctx->layer_idx;
 }
 
+// Returns the iterator index of a layer ordered by weight.
+static u32 iter_layer_weight_ctx(int idx)
+{
+	return *MEMBER_VPTR(layer_iteration_order, [idx]);
+}
+
 static __noinline u32 iter_layer_cpu_ctx(u32 layer_idx, int idx)
 {
 	u32 offset;
@@ -1160,14 +1168,16 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 
 	/* consume preempting layers first */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = iter_layer_cpu_ctx(cctx->layer_idx, idx);
 		if (disable_topology) {
-			if (MEMBER_VPTR(layers, [layer_idx].preempt) && scx_bpf_consume(layer_idx))
+			if (MEMBER_VPTR(layers, [idx].preempt) && scx_bpf_consume(idx))
 				return;
 		} else {
+			layer_idx = layer_weight_dsq_iter ? iter_layer_weight_ctx(idx) :
+				    iter_layer_cpu_ctx(cctx->layer_idx, idx);
 			bpf_for(llc_id, 0, nr_llcs) {
 				dsq_id = layer_dsq_id(layer_idx, llc_id);
-				if (MEMBER_VPTR(layers, [layer_idx].preempt) && scx_bpf_consume(dsq_id))
+				if (MEMBER_VPTR(layers, [layer_idx].preempt) &&
+				    scx_bpf_consume(dsq_id))
 					return;
 			}
 		}
@@ -1179,21 +1189,23 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 
 	/* consume !open layers second */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = iter_layer_cpu_ctx(cctx->layer_idx, idx);
 		if (disable_topology) {
-			struct layer *layer = &layers[layer_idx];
+			layer_idx = idx;
+			struct layer *layer = &layers[idx];
 			struct cpumask *layer_cpumask;
 
 			/* consume matching layers */
-			if (!(layer_cpumask = lookup_layer_cpumask(layer_idx)))
+			if (!(layer_cpumask = lookup_layer_cpumask(idx)))
 				return;
 
 			if (bpf_cpumask_test_cpu(cpu, layer_cpumask) ||
 			    (cpu == fallback_cpu && layer->nr_cpus == 0)) {
-				if (scx_bpf_consume(layer_idx))
+				if (scx_bpf_consume(idx))
 					return;
 			}
 		} else {
+			layer_idx = layer_weight_dsq_iter ? iter_layer_weight_ctx(idx) :
+				    iter_layer_cpu_ctx(cctx->layer_idx, idx);
 			bpf_for(llc_id, 0, nr_llcs) {
 				struct layer *layer = &layers[layer_idx];
 				struct cpumask *layer_cpumask;
@@ -1205,7 +1217,7 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 
 				if (bpf_cpumask_test_cpu(cpu, layer_cpumask) ||
 				    (cpu <= nr_possible_cpus && cpu == fallback_cpu &&
-				     layer->nr_cpus == 0)) {
+				     MEMBER_VPTR(layer, ->nr_cpus) == 0)) {
 					if (scx_bpf_consume(dsq_id))
 						return;
 				}
@@ -1215,12 +1227,13 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 
 	/* consume !preempting open layers */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = iter_layer_cpu_ctx(cctx->layer_idx, idx);
 		if (disable_topology) {
-			if (!layers[layer_idx].preempt && layers[layer_idx].open &&
-			    scx_bpf_consume(layer_idx))
+			if (!layers[idx].preempt && layers[idx].open &&
+			    scx_bpf_consume(idx))
 				return;
 		} else {
+			layer_idx = layer_weight_dsq_iter ? iter_layer_weight_ctx(idx) :
+				    iter_layer_cpu_ctx(cctx->layer_idx, idx);
 			bpf_for(llc_id, 0, nr_llcs) {
 				dsq_id = layer_dsq_id(layer_idx, llc_id);
 
@@ -1885,6 +1898,14 @@ void BPF_STRUCT_OPS(layered_dump, struct scx_dump_ctx *dctx)
		     dsq_first_runnable_for_ms(LO_FALLBACK_DSQ, now));
 }
 
+static void print_iter_order() {
+	int i;
+
+	bpf_for(i, 0, nr_layers) {
+		trace("ITER order i: %d %d\n", i, *MEMBER_VPTR(layer_iteration_order, [i]));
+	}
+}
+
 s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 {
 	struct bpf_cpumask *cpumask;
@@ -2059,6 +2080,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 		}
 	}
 
+	print_iter_order();
+
 	return 0;
 }
 
@@ -414,6 +414,12 @@ struct Opts {
     #[clap(long, default_value = "0.0")]
     layer_growth_weight_disable: f64,
 
+    /// When iterating over layer DSQs use the weight of the layer for iteration
+    /// order. The default iteration order is semi-random except when topology
+    /// awareness is disabled.
+    #[clap(long)]
+    layer_weight_dsq_iter: bool,
+
     /// Enable stats monitoring with the specified interval.
     #[clap(long)]
     stats: Option<f64>,
@@ -1555,6 +1561,9 @@ impl<'a, 'b> Scheduler<'a, 'b> {
         skel.maps.rodata_data.nr_layers = specs.len() as u32;
         let mut perf_set = false;
 
+        let mut layer_iteration_order = (0..specs.len()).collect::<Vec<_>>();
+        let mut layer_weights: Vec<usize> = vec![];
+
         for (spec_i, spec) in specs.iter().enumerate() {
             let layer = &mut skel.maps.bss_data.layers[spec_i];
 
@@ -1675,6 +1684,7 @@ impl<'a, 'b> Scheduler<'a, 'b> {
             } else {
                 DEFAULT_LAYER_WEIGHT
             };
+            layer_weights.push(layer.weight.try_into().unwrap());
             layer.perf = u32::try_from(*perf)?;
             layer.node_mask = nodemask_from_nodes(nodes) as u64;
             for topo_node in topo.nodes() {
@@ -1696,6 +1706,11 @@ impl<'a, 'b> Scheduler<'a, 'b> {
             perf_set |= layer.perf > 0;
         }
 
+        layer_iteration_order.sort_by(|i, j| layer_weights[*i].cmp(&layer_weights[*j]));
+        for (idx, layer_idx) in layer_iteration_order.iter().enumerate() {
+            skel.maps.rodata_data.layer_iteration_order[idx] = *layer_idx as u32;
+        }
+
         if perf_set && !compat::ksym_exists("scx_bpf_cpuperf_set")? {
             warn!("cpufreq support not available, ignoring perf configurations");
         }
@@ -1775,6 +1790,7 @@ impl<'a, 'b> Scheduler<'a, 'b> {
         skel.maps.rodata_data.smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores;
         skel.maps.rodata_data.disable_topology = opts.disable_topology;
         skel.maps.rodata_data.xnuma_preemption = opts.xnuma_preemption;
+        skel.maps.rodata_data.layer_weight_dsq_iter = opts.layer_weight_dsq_iter;
         for (cpu, sib) in cpu_pool.sibling_cpu.iter().enumerate() {
             skel.maps.rodata_data.__sibling_cpu[cpu] = *sib;
         }