scx_layered: Implement min_exec_us option

which can be used to penalize tasks which wake up very frequently without
doing much.
Tejun Heo 2024-03-11 22:13:11 -10:00
parent 0c62b24993
commit be2102775b
3 changed files with 89 additions and 15 deletions
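
As a quick orientation before the diff, here is a minimal Rust sketch of the accounting rule this commit introduces (function and variable names are illustrative only; the authoritative logic is the BPF change to layered_stopping() below): when a task is scheduled out, the CPU time charged to its layer is raised to at least the layer's min_exec_ns, and the amount of inflation is tracked by the new LSTAT_MIN_EXEC / LSTAT_MIN_EXEC_NS counters.

// Illustrative sketch only; the real accounting lives in the BPF code below.
fn charge_exec_time(used_ns: u64, min_exec_ns: u64) -> (u64, u64) {
    // Returns (charged_ns, inflated_by_ns). Tasks that ran for less than the
    // layer's minimum are charged the minimum, which penalizes tasks that
    // wake up very frequently without doing much work.
    if used_ns < min_exec_ns {
        (min_exec_ns, min_exec_ns - used_ns)
    } else {
        (used_ns, 0)
    }
}

fn main() {
    // A task that ran 150us in a layer with min_exec_us=1000 is charged 1000us.
    let (charged_ns, inflated_ns) = charge_exec_time(150_000, 1_000_000);
    assert_eq!(charged_ns, 1_000_000);
    assert_eq!(inflated_ns, 850_000);
}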


@@ -46,6 +46,8 @@ enum global_stat_idx {
enum layer_stat_idx {
LSTAT_LOCAL,
LSTAT_GLOBAL,
LSTAT_MIN_EXEC,
LSTAT_MIN_EXEC_NS,
LSTAT_OPEN_IDLE,
LSTAT_AFFN_VIOL,
LSTAT_PREEMPT,
@@ -90,6 +92,7 @@ struct layer {
struct layer_match_ands matches[MAX_LAYER_MATCH_ORS];
unsigned int nr_match_ors;
unsigned int idx;
u64 min_exec_ns;
bool open;
bool preempt;
bool exclusive;


@@ -45,7 +45,7 @@ static inline s32 prio_to_nice(s32 static_prio)
static inline s32 sibling_cpu(s32 cpu)
{
s32 *sib;
const volatile s32 *sib;
sib = MEMBER_VPTR(__sibling_cpu, [cpu]);
if (sib)
@@ -89,16 +89,23 @@ static void gstat_inc(enum global_stat_idx idx, struct cpu_ctx *cctx)
cctx->gstats[idx]++;
}
static void lstat_inc(enum layer_stat_idx idx, struct layer *layer, struct cpu_ctx *cctx)
static void lstat_add(enum layer_stat_idx idx, struct layer *layer,
struct cpu_ctx *cctx, s64 delta)
{
u64 *vptr;
if ((vptr = MEMBER_VPTR(*cctx, .lstats[layer->idx][idx])))
(*vptr)++;
(*vptr) += delta;
else
scx_bpf_error("invalid layer or stat idxs: %d, %d", idx, layer->idx);
}
static void lstat_inc(enum layer_stat_idx idx, struct layer *layer,
struct cpu_ctx *cctx)
{
lstat_add(idx, layer, cctx, 1);
}
struct lock_wrapper {
struct bpf_spin_lock lock;
};
@@ -738,20 +745,25 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
{
struct cpu_ctx *cctx;
struct task_ctx *tctx;
struct layer *layer;
s32 lidx;
u64 used;
u32 layer;
if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)))
return;
layer = tctx->layer;
if (layer >= nr_layers) {
scx_bpf_error("invalid layer %u", layer);
lidx = tctx->layer;
if (!(layer = lookup_layer(lidx)))
return;
}
used = bpf_ktime_get_ns() - tctx->started_running_at;
cctx->layer_cycles[layer] += used;
if (used < layer->min_exec_ns) {
lstat_inc(LSTAT_MIN_EXEC, layer, cctx);
lstat_add(LSTAT_MIN_EXEC_NS, layer, cctx, layer->min_exec_ns - used);
used = layer->min_exec_ns;
}
cctx->layer_cycles[lidx] += used;
cctx->current_preempt = false;
cctx->prev_exclusive = cctx->current_exclusive;
cctx->current_exclusive = false;
@@ -914,8 +926,9 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
bpf_for(i, 0, nr_layers) {
struct layer *layer = &layers[i];
dbg("CFG LAYER[%d] open=%d preempt=%d exclusive=%d",
i, layer->open, layer->preempt, layer->exclusive);
dbg("CFG LAYER[%d] min_exec_ns=%lu open=%d preempt=%d exclusive=%d",
i, layer->min_exec_ns, layer->open, layer->preempt,
layer->exclusive);
if (layer->nr_match_ors > MAX_LAYER_MATCH_ORS) {
scx_bpf_error("too many ORs");


@@ -154,6 +154,12 @@ lazy_static::lazy_static! {
/// layers. Tasks in this group will spill into occupied CPUs if there are
/// no unoccupied idle CPUs.
///
/// All layers take the following options:
///
/// - min_exec_us: Minimum execution time in microseconds. Whenever a task
/// is scheduled in, this is the minimum CPU time that it's charged no
/// matter how short the actual execution time may be.
///
/// Both Grouped and Open layers also accept the following options:
///
/// - preempt: If true, tasks in the layer will preempt tasks which belong
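
To make the min_exec_us option documented above concrete, here is a hedged Rust sketch of a layer kind carrying the field, loosely modeled on the LayerKind changes later in this diff (the enum is trimmed to the fields relevant here and is not the full scx_layered type):

#[allow(dead_code)]
enum LayerKind {
    Confined { util_range: (f64, f64), min_exec_us: u64 },
    Open { min_exec_us: u64, preempt: bool },
}

// The BPF side consumes nanoseconds, so the userspace microsecond value is
// scaled by 1000 when the layer config is loaded.
fn min_exec_ns(kind: &LayerKind) -> u64 {
    match kind {
        LayerKind::Confined { min_exec_us, .. }
        | LayerKind::Open { min_exec_us, .. } => min_exec_us * 1000,
    }
}

fn main() {
    // A confined layer that charges at least 1ms of CPU time per scheduling-in.
    let kind = LayerKind::Confined { util_range: (0.8, 0.9), min_exec_us: 1000 };
    assert_eq!(min_exec_ns(&kind), 1_000_000);
}
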
@@ -297,14 +303,17 @@ enum LayerKind {
Confined {
cpus_range: Option<(usize, usize)>,
util_range: (f64, f64),
min_exec_us: u64,
},
Grouped {
cpus_range: Option<(usize, usize)>,
util_range: (f64, f64),
min_exec_us: u64,
preempt: bool,
exclusive: bool,
},
Open {
min_exec_us: u64,
preempt: bool,
exclusive: bool,
},
@@ -852,6 +861,7 @@ impl Layer {
LayerKind::Confined {
cpus_range,
util_range,
..
} => {
let cpus_range = cpus_range.unwrap_or((0, std::usize::MAX));
if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 {
@@ -1080,6 +1090,8 @@ struct OpenMetricsStats {
l_tasks: Family<Vec<(String, String)>, Gauge<i64, AtomicI64>>,
l_total: Family<Vec<(String, String)>, Gauge<i64, AtomicI64>>,
l_local: Family<Vec<(String, String)>, Gauge<f64, AtomicU64>>,
l_min_exec: Family<Vec<(String, String)>, Gauge<f64, AtomicU64>>,
l_min_exec_us: Family<Vec<(String, String)>, Gauge<i64, AtomicI64>>,
l_open_idle: Family<Vec<(String, String)>, Gauge<f64, AtomicU64>>,
l_preempt: Family<Vec<(String, String)>, Gauge<f64, AtomicU64>>,
l_affn_viol: Family<Vec<(String, String)>, Gauge<f64, AtomicU64>>,
@@ -1148,6 +1160,14 @@ impl OpenMetricsStats {
register!(l_tasks, "Number of tasks in the layer");
register!(l_total, "Number of scheduling events in the layer");
register!(l_local, "% of scheduling events directly into an idle CPU");
register!(
l_min_exec,
"Number of times execution duration was shorter than min_exec_us"
);
register!(
l_min_exec_us,
"Total execution duration extended due to min_exec_us"
);
register!(
l_open_idle,
"% of scheduling events into idle CPUs occupied by other layers"
@@ -1238,15 +1258,24 @@ impl<'a> Scheduler<'a> {
layer.nr_match_ors = spec.matches.len() as u32;
match &spec.kind {
LayerKind::Open { preempt, exclusive }
LayerKind::Confined { min_exec_us, .. } => layer.min_exec_ns = min_exec_us * 1000,
LayerKind::Open {
min_exec_us,
preempt,
exclusive,
..
}
| LayerKind::Grouped {
preempt, exclusive, ..
min_exec_us,
preempt,
exclusive,
..
} => {
layer.open = true;
layer.min_exec_ns = min_exec_us * 1000;
layer.preempt = *preempt;
layer.exclusive = *exclusive;
}
_ => {}
}
}
@@ -1353,6 +1382,7 @@ impl<'a> Scheduler<'a> {
LayerKind::Confined {
cpus_range,
util_range,
..
}
| LayerKind::Grouped {
cpus_range,
@@ -1557,6 +1587,14 @@ impl<'a> Scheduler<'a> {
let l_tasks = set!(l_tasks, stats.nr_layer_tasks[lidx] as i64);
let l_total = set!(l_total, ltotal as i64);
let l_local = set!(l_local, lstat_pct(bpf_intf::layer_stat_idx_LSTAT_LOCAL));
let l_min_exec = set!(
l_min_exec,
lstat_pct(bpf_intf::layer_stat_idx_LSTAT_MIN_EXEC)
);
let l_min_exec_us = set!(
l_min_exec_us,
(lstat(bpf_intf::layer_stat_idx_LSTAT_MIN_EXEC_NS) / 1000) as i64
);
let l_open_idle = set!(
l_open_idle,
lstat_pct(bpf_intf::layer_stat_idx_LSTAT_OPEN_IDLE)
@@ -1598,6 +1636,22 @@ impl<'a> Scheduler<'a> {
l_affn_viol.get(),
width = header_width,
);
match &layer.kind {
LayerKind::Confined { min_exec_us, .. }
| LayerKind::Grouped { min_exec_us, .. }
| LayerKind::Open { min_exec_us, .. }
if *min_exec_us > 0 =>
{
info!(
" {:<width$} min_exec={:5.2} min_exec_ms={:7.2}",
"",
l_min_exec.get(),
l_min_exec_us.get() as f64 / 1000.0,
width = header_width,
);
}
_ => {}
}
match &layer.kind {
LayerKind::Grouped { exclusive, .. } | LayerKind::Open { exclusive, .. }
if *exclusive =>
@@ -1608,7 +1662,7 @@ impl<'a> Scheduler<'a> {
l_excl_collision.get(),
l_excl_preempt.get(),
width = header_width,
)
);
}
_ => (),
}
@@ -1689,6 +1743,7 @@ fn write_example_file(path: &str) -> Result<()> {
kind: LayerKind::Confined {
cpus_range: Some((0, 16)),
util_range: (0.8, 0.9),
min_exec_us: 1000,
},
},
LayerSpec {
@@ -1699,6 +1754,7 @@ fn write_example_file(path: &str) -> Result<()> {
LayerMatch::NiceBelow(0),
]],
kind: LayerKind::Open {
min_exec_us: 100,
preempt: true,
exclusive: true,
},
@@ -1710,6 +1766,7 @@ fn write_example_file(path: &str) -> Result<()> {
kind: LayerKind::Grouped {
cpus_range: None,
util_range: (0.5, 0.6),
min_exec_us: 200,
preempt: false,
exclusive: false,
},
@@ -1782,6 +1839,7 @@ fn verify_layer_specs(specs: &[LayerSpec]) -> Result<()> {
LayerKind::Confined {
cpus_range,
util_range,
..
}
| LayerKind::Grouped {
cpus_range,