From 0096c0632b75638edf675eb1eda2b24b65cf2cbf Mon Sep 17 00:00:00 2001
From: Daniel Hodges
Date: Fri, 8 Nov 2024 14:06:45 -0800
Subject: [PATCH] scx_layered: Fix cost accounting for dsqs

Fix cost accounting for fallback DSQs so that their budgets get
refilled appropriately on refresh. Add helper functions for converting
between a DSQ id and an LLC budget id. During preemption a layer should
check the budget of the layer it is trying to preempt and only proceed
when the preempting layer has more budget.

Signed-off-by: Daniel Hodges
---
Sketches illustrating the budget id layout, the preemption check, and
the budget refill math are appended after the diff.

 scheds/rust/scx_layered/src/bpf/cost.bpf.c | 153 ++++++++++++++-------
 scheds/rust/scx_layered/src/bpf/cost.bpf.h |   4 +-
 scheds/rust/scx_layered/src/bpf/main.bpf.c |  26 ++--
 3 files changed, 124 insertions(+), 59 deletions(-)

diff --git a/scheds/rust/scx_layered/src/bpf/cost.bpf.c b/scheds/rust/scx_layered/src/bpf/cost.bpf.c
index b2a2e85..7ccd936 100644
--- a/scheds/rust/scx_layered/src/bpf/cost.bpf.c
+++ b/scheds/rust/scx_layered/src/bpf/cost.bpf.c
@@ -15,7 +15,19 @@ static __always_inline int fallback_dsq_cost_id(u64 fallback_dsq)
 		scx_bpf_error("invalid fallback dsq");
 		return 0;
 	}
-	return (int)fallback_dsq - HI_FALLBACK_DSQ_BASE;
+	return nr_layers + (int)fallback_dsq - HI_FALLBACK_DSQ_BASE;
+}
+
+/*
+ * Converts an LLC id to a budget id for accessing a cost struct.
+ */
+static __always_inline int fallback_llc_cost_id(int fallback_llc)
+{
+	if (fallback_llc >= MAX_LLCS) {
+		scx_bpf_error("invalid fallback llc");
+		return 0;
+	}
+	return nr_layers + fallback_llc;
 }
 
 /*
@@ -24,16 +36,8 @@ static __always_inline int fallback_dsq_cost_id(u64 fallback_dsq)
 static u64 budget_id_to_fallback_dsq(u32 budget_id)
 {
 	if (budget_id == MAX_GLOBAL_BUDGETS)
-		return LO_FALLBACK_DSQ;
-	return HI_FALLBACK_DSQ_BASE + budget_id;
-}
-
-/*
- * Returns true if the cost has preferred fallback DSQ budget
- */
-static bool has_pref_fallback_budget(struct cost *costc)
-{
-	return costc->pref_budget > nr_layers && costc->pref_budget <= MAX_GLOBAL_BUDGETS;
+		return (u64)LO_FALLBACK_DSQ;
+	return (u64)HI_FALLBACK_DSQ_BASE + (u64)budget_id - nr_layers;
 }
 
 /*
@@ -125,7 +129,7 @@ static struct cost *initialize_cost(u32 cost_idx, u32 parent_idx,
 static __noinline void initialize_budget(struct cost *costc, u32 budget_id,
 					 s64 capacity)
 {
-	if (budget_id >= MAX_GLOBAL_BUDGETS) {
+	if (budget_id > MAX_GLOBAL_BUDGETS) {
 		scx_bpf_error("invalid budget id %d", budget_id);
 		return;
 	}
@@ -140,11 +144,10 @@ static void calc_preferred_cost(struct cost *costc)
 {
 	u32 layer_id, id, budget_id, pref_budget = 0, max_layer = 0;
 	s64 max_budget = 0;
-	u64 dsq_id;
 	u32 rotation = bpf_get_smp_processor_id() % nr_layers;
 
 	bpf_for(id, 0, nr_layers) {
-		/* 
+		/*
 		 * If there is two equally weighted layers that have the same
 		 * budget we rely on rotating the layers based on the cpu. This
 		 * may not work well on low core machines.
@@ -157,14 +160,16 @@ static void calc_preferred_cost(struct cost *costc)
 		if (costc->budget[layer_id] > max_budget) {
 			max_budget = costc->budget[layer_id];
 			max_layer = layer_id;
-			pref_budget = max_layer;
+			pref_budget = layer_id;
 		}
 	}
 	// Hi fallback DSQs
 	bpf_for(id, 0, nr_llcs) {
-		if (costc->budget[id] > max_budget) {
-			max_budget = costc->budget[id];
-			pref_budget = id;
+		budget_id = fallback_llc_cost_id(id);
+		if (costc->budget[budget_id] >= max_budget) {
+			max_budget = costc->budget[budget_id];
+			pref_budget = budget_id;
+			trace("COST pref fallback %d", budget_id);
 		}
 	}
 	budget_id = fallback_dsq_cost_id(LO_FALLBACK_DSQ);
@@ -178,34 +183,85 @@ static void calc_preferred_cost(struct cost *costc)
 
 	costc->pref_layer = max_layer;
 	costc->pref_budget = pref_budget;
+	if (costc->idx == 0 && pref_budget > nr_layers)
+		trace("COST pref_layer %d pref_budget %d budget %lld",
+		      max_layer, pref_budget, costc->budget[pref_budget]);
 
 	return;
 }
 
+/*
+ * Returns true if the cost has a preferred fallback DSQ budget.
+ */
+static bool has_pref_fallback_budget(struct cost *costc)
+{
+	return costc->pref_budget >= nr_layers &&
+	       costc->pref_budget <= MAX_GLOBAL_BUDGETS;
+}
+
+/*
+ * Returns whether a budget is allowed to preempt another budget. In general,
+ * if the preempting budget is greater than the running budget then it is
+ * allowed to preempt.
+ */
+static __always_inline bool has_preempt_budget(struct cost *costc,
+					       u32 cur_budget, u32 budget_id)
+{
+	if (cur_budget >= MAX_GLOBAL_BUDGETS ||
+	    budget_id >= MAX_GLOBAL_BUDGETS)
+		return false;
+
+	/*
+	 * Fallback DSQs are always allowed to preempt
+	 */
+	if (budget_id >= nr_layers)
+		return true;
+
+	return costc->budget[budget_id] > costc->budget[cur_budget];
+}
+
 /*
  * Refreshes the budget of a cost.
  */
 int refresh_budget(int cost_id)
 {
 	struct cost *costc;
+	s64 capacity;
 
 	if (!(costc = lookup_cost(cost_id))) {
 		scx_bpf_error("failed to lookup cost %d", cost_id);
 		return 0;
 	}
 
-	u32 layer_id, id;
+	u32 budget_id, id;
 	u32 rotation = bpf_get_smp_processor_id() % nr_layers;
 	bpf_for(id, 0, nr_layers) {
-		layer_id = rotate_layer_id(id, rotation);
-		if (layer_id > nr_layers) {
-			scx_bpf_error("invalid layer");
+		budget_id = rotate_layer_id(id, rotation);
+		if (budget_id > nr_layers) {
+			scx_bpf_error("invalid budget id");
 			return 0;
 		}
-		s64 capacity = costc->capacity[layer_id];
-		__sync_lock_test_and_set(MEMBER_VPTR(*costc, .budget[layer_id]),
+		capacity = costc->capacity[budget_id];
+		__sync_lock_test_and_set(MEMBER_VPTR(*costc, .budget[budget_id]),
 					 capacity);
 	}
+	// Hi fallback DSQs
+	bpf_for(id, 0, nr_llcs) {
+		budget_id = fallback_llc_cost_id(id);
+		capacity = costc->capacity[budget_id];
+		__sync_lock_test_and_set(MEMBER_VPTR(*costc, .budget[budget_id]),
+					 capacity);
+	}
+	budget_id = fallback_dsq_cost_id(LO_FALLBACK_DSQ);
+	if (budget_id > MAX_GLOBAL_BUDGETS) {
+		scx_bpf_error("invalid budget");
+		return 0;
+	}
+	capacity = costc->capacity[budget_id];
+	__sync_lock_test_and_set(MEMBER_VPTR(*costc, .budget[budget_id]),
+				 capacity);
+
+	trace("COST refreshed budget %d", cost_id);
 
 	return 0;
 }
@@ -223,11 +279,11 @@ int refresh_budgets(void)
 /*
  * Acquires a budget from a parent cost account.
  */
-s64 acquire_budget(struct cost *costc, u32 layer_id, s64 amount)
+s64 acquire_budget(struct cost *costc, u32 budget_id, s64 amount)
 {
 	s64 budget = 0;
 
-	if (layer_id >= MAX_LAYERS || layer_id < 0) {
+	if (budget_id >= MAX_GLOBAL_BUDGETS) {
 		scx_bpf_error("invalid parent cost");
 		return budget;
 	}
@@ -235,14 +291,16 @@ s64 acquire_budget(struct cost *costc, u32 layer_id, s64 amount)
 	if (!costc || !costc->has_parent)
 		return budget;
 
 	struct cost *parent_cost;
-	if (!(parent_cost = lookup_cost(costc->idx)))
+	if (!(parent_cost = lookup_cost(costc->idx))) {
+		scx_bpf_error("failed to find parent");
 		return budget;
+	}
 
-	__sync_fetch_and_sub(&parent_cost->budget[layer_id], amount);
+	__sync_fetch_and_sub(&parent_cost->budget[budget_id], amount);
 
-	if (parent_cost->budget[layer_id] < 0)
+	if (parent_cost->budget[budget_id] <= 0)
 		refresh_budgets();
 
 	return amount;
 }
@@ -253,7 +310,7 @@ s64 acquire_budget(struct cost *costc, u32 layer_id, s64 amount)
  * acquire budget by either retrieving budget from the global context or
  * refreshing all budgets.
  */
-int record_cpu_cost(struct cost *costc, u32 budget_id, s64 amount)
+int record_cpu_cost(struct cost *costc, u32 budget_id, s64 amount, u64 slice_ns)
 {
 	if (budget_id > MAX_GLOBAL_BUDGETS || !costc) {
 		scx_bpf_error("invalid budget %d", budget_id);
@@ -262,10 +319,11 @@ int record_cpu_cost(struct cost *costc, u32 budget_id, s64 amount)
 
 	__sync_fetch_and_sub(&costc->budget[budget_id], amount);
 
-	if (costc->budget[budget_id] <= 0) {
+	if (costc->budget[budget_id] <= 0 ||
+	    costc->budget[budget_id] < slice_ns) {
 		if (costc->has_parent) {
-			s64 budget = acquire_budget(costc, budget_id,
-						    costc->capacity[budget_id] + amount);
+			s64 req_budget = costc->capacity[budget_id] - costc->budget[budget_id];
+			s64 budget = acquire_budget(costc, budget_id, req_budget);
 			if (budget > 0) {
 				__sync_fetch_and_add(&costc->budget[budget_id],
 						     costc->capacity[budget_id]);
@@ -296,7 +354,7 @@ __weak int has_budget(struct cost *costc, struct layer *layer)
 	s64 budget = *MEMBER_VPTR(*costc, .budget[layer_id]);
 	u64 layer_slice_ns = layer->slice_ns > 0 ? layer->slice_ns : slice_ns;
 
-	if (budget > layer_slice_ns)
+	if (budget >= layer_slice_ns)
 		return slice_ns;
 
 	return 0;
@@ -310,7 +368,7 @@ static void initialize_budgets(u64 refresh_intvl_ns)
 	struct layer *layer;
 	struct cost *costc, *global_costc;
 	int layer_id, llc_id;
-	u64 dsq_id, layer_weight_dur, layer_weight_sum = 0;
+	u64 layer_weight_dur, layer_weight_sum = 0;
 	s32 cpu;
 	u32 budget_id;
 
@@ -339,7 +397,7 @@ static void initialize_budgets(u64 refresh_intvl_ns)
 			return;
 		}
 
-		layer_weight_dur = (layer->weight * ((u64)refresh_intvl_ns * slice_ns * nr_possible_cpus)) /
+		layer_weight_dur = (layer->weight * ((u64)refresh_intvl_ns * nr_possible_cpus)) /
 			layer_weight_sum;
 		initialize_budget(global_costc, layer_id, (s64)layer_weight_dur);
 		trace("COST GLOBAL[%d][%s] budget %lld",
@@ -354,7 +412,7 @@ static void initialize_budgets(u64 refresh_intvl_ns)
 			scx_bpf_error("failed to cpu budget: %d", cpu);
 			return;
 		}
-		layer_weight_dur = (layer->weight * slice_ns * refresh_intvl_ns) /
+		layer_weight_dur = (layer->weight * refresh_intvl_ns) /
 			layer_weight_sum;
 		initialize_budget(costc, layer_id, (s64)layer_weight_dur);
 		if (cpu == 0)
@@ -367,16 +425,15 @@ static void initialize_budgets(u64 refresh_intvl_ns)
 	 * XXX: since any task from any layer can get kicked to the fallback
 	 * DSQ we use the default slice to calculate the default budget.
 	 */
-	layer_weight_dur = (LO_FALLBACK_DSQ_WEIGHT * slice_ns * refresh_intvl_ns * nr_possible_cpus) /
+	layer_weight_dur = (LO_FALLBACK_DSQ_WEIGHT * refresh_intvl_ns * nr_possible_cpus) /
 		layer_weight_sum;
-	initialize_budget(global_costc, fallback_dsq_cost_id(LO_FALLBACK_DSQ),
-			  (s64)layer_weight_dur);
+	budget_id = fallback_dsq_cost_id(LO_FALLBACK_DSQ);
+	initialize_budget(global_costc, budget_id, (s64)layer_weight_dur);
 
 	bpf_for(llc_id, 0, nr_llcs) {
-		dsq_id = llc_hi_fallback_dsq_id(llc_id);
-		budget_id = fallback_dsq_cost_id(dsq_id);
+		budget_id = fallback_llc_cost_id(llc_id);
 
-		layer_weight_dur = (HI_FALLBACK_DSQ_WEIGHT * slice_ns * refresh_intvl_ns * nr_possible_cpus) /
+		layer_weight_dur = (HI_FALLBACK_DSQ_WEIGHT * refresh_intvl_ns * nr_possible_cpus) /
 			layer_weight_sum;
 		initialize_budget(global_costc, budget_id, (s64)layer_weight_dur);
 
@@ -390,14 +447,14 @@ static void initialize_budgets(u64 refresh_intvl_ns)
 		// On first iteration always setup the lo fallback dsq budget.
 		if (llc_id == 0) {
 			budget_id = fallback_dsq_cost_id(LO_FALLBACK_DSQ);
-			layer_weight_dur = (LO_FALLBACK_DSQ_WEIGHT * slice_ns * refresh_intvl_ns) /
+			layer_weight_dur = (LO_FALLBACK_DSQ_WEIGHT * refresh_intvl_ns) /
 				layer_weight_sum;
-			initialize_budget(costc, budget_id,
-					  (s64)layer_weight_dur);
+			initialize_budget(costc, budget_id, (s64)layer_weight_dur);
 		}
 
-		layer_weight_dur = (HI_FALLBACK_DSQ_WEIGHT * slice_ns * refresh_intvl_ns) /
+		layer_weight_dur = (HI_FALLBACK_DSQ_WEIGHT * refresh_intvl_ns) /
 			layer_weight_sum;
+		budget_id = fallback_llc_cost_id(llc_id);
 		initialize_budget(costc, budget_id, (s64)layer_weight_dur);
 		if (cpu == 0 && llc_id == 0 && budget_id < MAX_GLOBAL_BUDGETS)
 			trace("COST CPU DSQ[%d][%d] budget %lld",
diff --git a/scheds/rust/scx_layered/src/bpf/cost.bpf.h b/scheds/rust/scx_layered/src/bpf/cost.bpf.h
index 2bfb06d..59e4818 100644
--- a/scheds/rust/scx_layered/src/bpf/cost.bpf.h
+++ b/scheds/rust/scx_layered/src/bpf/cost.bpf.h
@@ -15,8 +15,8 @@
 enum cost_consts {
 	COST_GLOBAL_KEY		= 0,
 
-	HI_FALLBACK_DSQ_WEIGHT	= 50,
-	LO_FALLBACK_DSQ_WEIGHT	= 10,
+	HI_FALLBACK_DSQ_WEIGHT	= 95,
+	LO_FALLBACK_DSQ_WEIGHT	= 85,
 
 	/*
 	 * Max global budgets map fallback DSQs (per LLC) as well as layers.
diff --git a/scheds/rust/scx_layered/src/bpf/main.bpf.c b/scheds/rust/scx_layered/src/bpf/main.bpf.c
index b2275d7..4efe2b5 100644
--- a/scheds/rust/scx_layered/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -822,8 +822,12 @@ bool try_preempt_cpu(s32 cand, struct task_struct *p, struct cpu_ctx *cctx,
 	if (!(cand_cctx = lookup_cpu_ctx(cand)) || cand_cctx->current_preempt)
 		return false;
 
-	if (!(costc = lookup_cpu_cost(cand)) || has_budget(costc, layer) == 0)
+	if (!(costc = lookup_cpu_cost(cand)) ||
+	    has_budget(costc, layer) == 0 ||
+	    !has_preempt_budget(costc, cand_cctx->layer_idx, tctx->layer)) {
+		trace("COST layer %s not enough budget to preempt", layer->name);
 		return false;
+	}
 
 	/*
 	 * If exclusive, we want to make sure the sibling CPU, if there's
@@ -1604,11 +1608,14 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 	u32 my_llc_id = cpu_to_llc_id(cpu);
 
 	/*
-	 * If one of the fallback DSQs has the most budget then consume from
-	 * it to prevent starvation.
+	 * If one of the fallback DSQs has the most budget then consume from it
+	 * to prevent starvation.
 	 */
 	if (has_pref_fallback_budget(costc)) {
 		dsq_id = budget_id_to_fallback_dsq(costc->pref_budget);
+		trace("COST consuming fallback %lld", dsq_id);
+		if (dsq_id > LO_FALLBACK_DSQ)
+			scx_bpf_error("invalid fallback dsq %lld", dsq_id);
 		if (scx_bpf_consume(dsq_id))
 			return;
 	}
@@ -2058,13 +2065,14 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
 	} else {
 		budget_id = layer->idx;
 	}
-	record_cpu_cost(costc, budget_id, (s64)used);
+
+	u64 slice_ns = layer_slice_ns(layer);
+	record_cpu_cost(costc, budget_id, (s64)used, slice_ns);
 	cctx->layer_cycles[lidx] += used;
 	cctx->current_preempt = false;
 	cctx->prev_exclusive = cctx->current_exclusive;
 	cctx->current_exclusive = false;
 
-	u64 slice_ns = layer_slice_ns(layer);
 
 	/* scale the execution time by the inverse of the weight and charge */
 	if (cctx->yielding && used < slice_ns)
@@ -2279,7 +2287,7 @@ int dump_cost(void)
 	bpf_for(i, 0, nr_llcs) {
 		u64 dsq_id = llc_hi_fallback_dsq_id(i);
 		u32 budget_id = fallback_dsq_cost_id(dsq_id);
-		scx_bpf_dump("COST FALLBACK[%d][%d] budget=%lld capacity=%lld\n",
+		scx_bpf_dump("COST FALLBACK[%llu][%d] budget=%lld capacity=%lld\n",
			     dsq_id, budget_id,
			     costc->budget[budget_id], costc->capacity[budget_id]);
 	}
@@ -2305,8 +2313,8 @@ int dump_cost(void)
 			u32 budget_id = fallback_dsq_cost_id(dsq_id);
 			if (budget_id >= MAX_GLOBAL_BUDGETS)
 				continue;
-			scx_bpf_dump("COST CPU[%d]FALLBACK[%d][%d] budget=%lld capacity=%lld\n",
-				     i, j, dsq_id, budget_id,
+			scx_bpf_dump("COST CPU[%d]FALLBACK[%llu][%d] budget=%lld capacity=%lld\n",
+				     i, dsq_id, budget_id,
				     costc->budget[budget_id], costc->capacity[budget_id]);
 		}
 	}
@@ -2725,7 +2733,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 			}
 		}
 	}
-	initialize_budgets(1000LLU * NSEC_PER_MSEC);
+	initialize_budgets(15LLU * NSEC_PER_SEC);
 	ret = start_layered_timers();
 	if (ret < 0)
 		return ret;
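
Appendix 1: a standalone sketch of the budget id layout this patch
establishes. Layer budgets occupy [0, nr_layers), the per-LLC hi fallback
budgets occupy [nr_layers, nr_layers + nr_llcs), and the lo fallback budget
sits at MAX_GLOBAL_BUDGETS. The constant values and the example
configuration below are stand-ins, not the values from cost.bpf.h; only the
index relationships mirror the patch.

/* budget_ids.c: build with `cc -o budget_ids budget_ids.c` */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_LLCS		16	/* stand-in value */
#define MAX_GLOBAL_BUDGETS	64	/* stand-in value */
#define HI_FALLBACK_DSQ_BASE	0x8000	/* stand-in value */
#define LO_FALLBACK_DSQ		0x9000	/* stand-in value */

static const int nr_layers = 4;		/* example configuration */
static const int nr_llcs = 2;		/* example configuration */

/* LLC index -> budget id, mirroring fallback_llc_cost_id() */
static int llc_to_budget(int llc)
{
	assert(llc >= 0 && llc < MAX_LLCS);
	return nr_layers + llc;
}

/* budget id -> fallback DSQ id, mirroring budget_id_to_fallback_dsq() */
static uint64_t budget_to_dsq(uint32_t budget_id)
{
	if (budget_id == MAX_GLOBAL_BUDGETS)
		return LO_FALLBACK_DSQ;
	return HI_FALLBACK_DSQ_BASE + budget_id - nr_layers;
}

/* hi fallback DSQ id -> budget id, mirroring fallback_dsq_cost_id() */
static int dsq_to_budget(uint64_t dsq)
{
	return nr_layers + (int)(dsq - HI_FALLBACK_DSQ_BASE);
}

int main(void)
{
	for (int llc = 0; llc < nr_llcs; llc++) {
		uint64_t dsq = HI_FALLBACK_DSQ_BASE + llc;
		int id = llc_to_budget(llc);

		/* both helpers agree, and the mapping round-trips */
		assert(id == dsq_to_budget(dsq));
		assert(budget_to_dsq(id) == dsq);
		printf("llc %d -> budget id %d -> dsq 0x%llx\n",
		       llc, id, (unsigned long long)dsq);
	}
	/* the lo fallback budget is pinned at MAX_GLOBAL_BUDGETS */
	assert(budget_to_dsq(MAX_GLOBAL_BUDGETS) == LO_FALLBACK_DSQ);
	return 0;
}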
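
Appendix 2: the preemption rule added in has_preempt_budget() reduces to
two cases: fallback budget ids (>= nr_layers) may always preempt, while a
layer may only preempt when its remaining budget exceeds the running
layer's. A minimal model, with made-up budget values:

/* preempt_rule.c: build with `cc -o preempt_rule preempt_rule.c` */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_GLOBAL_BUDGETS	64	/* stand-in value */

static const uint32_t nr_layers = 3;	/* example configuration */
static int64_t budget[MAX_GLOBAL_BUDGETS + 1];

/* mirrors has_preempt_budget(): cur is the running budget id, next is
 * the budget id of the task attempting to preempt */
static bool can_preempt(uint32_t cur, uint32_t next)
{
	if (cur >= MAX_GLOBAL_BUDGETS || next >= MAX_GLOBAL_BUDGETS)
		return false;
	if (next >= nr_layers)	/* fallback budgets may always preempt */
		return true;
	return budget[next] > budget[cur];
}

int main(void)
{
	budget[0] = 1000;	/* running layer's remaining budget */
	budget[1] = 5000;	/* richer layer: may preempt */
	budget[2] = 500;	/* poorer layer: may not */

	printf("layer 1 vs layer 0: %d\n", can_preempt(0, 1));	/* 1 */
	printf("layer 2 vs layer 0: %d\n", can_preempt(0, 2));	/* 0 */
	printf("hi fallback vs layer 0: %d\n",
	       can_preempt(0, nr_layers));			/* 1 */
	return 0;
}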
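
Appendix 3: dropping slice_ns from the capacity formula makes a budget a
plain weighted share of CPU time per refresh interval, and
record_cpu_cost() now requests capacity - budget from the parent when a
budget runs dry instead of capacity + amount. Rough arithmetic with
illustrative numbers, assuming for simplicity that layer_weight_sum
includes the fallback weights:

/* budget_math.c: build with `cc -o budget_math budget_math.c` */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* example machine/config; illustrative values only */
	const uint64_t refresh_intvl_ns = 15000000000ULL; /* 15s, as set in layered_init */
	const uint64_t nr_possible_cpus = 8;
	const uint64_t weights[] = { 100, 95, 85 };	/* a layer, hi fallback, lo fallback */
	const char *names[] = { "layer", "hi fallback", "lo fallback" };
	uint64_t weight_sum = 0;

	for (int i = 0; i < 3; i++)
		weight_sum += weights[i];

	for (int i = 0; i < 3; i++) {
		/* global capacity: weighted share of all-CPU time per interval */
		uint64_t capacity = weights[i] * refresh_intvl_ns *
				    nr_possible_cpus / weight_sum;
		/* a budget drained to 25% requests the other 75% back */
		int64_t remaining = (int64_t)(capacity / 4);
		int64_t req = (int64_t)capacity - remaining;

		printf("%-12s capacity %llu ns, refill request %lld ns\n",
		       names[i], (unsigned long long)capacity,
		       (long long)req);
	}
	return 0;
}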