Merge pull request #896 from hodgesds/layered-dsq-cost

scx_layered: Add fallback DSQ cost accounting
Daniel Hodges 2024-11-07 13:35:04 +00:00 committed by GitHub
commit d6ba3b79d7
3 changed files with 227 additions and 73 deletions

cost.bpf.c

@ -1,23 +1,40 @@
/* Copyright (c) Meta Platforms, Inc. and affiliates. */
#include "cost.bpf.h"
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
/*
* Cost accounting struct that is used in both the per CPU and global context.
* Budgets are allowed to recurse to parent structs.
*/
struct cost {
s64 budget[MAX_LAYERS];
s64 capacity[MAX_LAYERS];
u32 pref_layer;
u32 idx;
bool overflow;
bool has_parent;
bool drain_fallback;
};
/*
* Converts a fallback DSQ to a cost id for accessing a cost struct.
*/
static __always_inline int fallback_dsq_cost_id(u64 fallback_dsq)
{
if (fallback_dsq < HI_FALLBACK_DSQ_BASE) {
scx_bpf_error("invalid fallback dsq");
return 0;
}
return (int)fallback_dsq - HI_FALLBACK_DSQ_BASE;
}
/*
* Returns the fallback DSQ id for a budget id.
*/
static u64 budget_id_to_fallback_dsq(u32 budget_id)
{
if (budget_id == MAX_GLOBAL_BUDGETS)
return LO_FALLBACK_DSQ;
return HI_FALLBACK_DSQ_BASE + budget_id;
}
/*
* Returns true if the cost's preferred budget is a fallback DSQ budget
*/
static bool has_pref_fallback_budget(struct cost *costc)
{
return costc->pref_budget > nr_layers && costc->pref_budget <= MAX_GLOBAL_BUDGETS;
}
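Taken together, the converters above are meant to round-trip between fallback DSQ ids and budget ids, with the lo fallback DSQ special-cased to the last budget slot. A minimal sketch of the intended mapping, not part of the commit, assuming hi fallback DSQs are numbered HI_FALLBACK_DSQ_BASE + llc_id as llc_hi_fallback_dsq_id() does in main.bpf.c:

	/* Illustration only: round-tripping fallback DSQ ids and budget ids. */
	static __always_inline void fallback_id_round_trip_example(void)
	{
		u32 llc_id = 2;
		u64 hi_dsq = HI_FALLBACK_DSQ_BASE + llc_id;

		/* hi fallback DSQ -> budget id -> same DSQ */
		u32 budget_id = fallback_dsq_cost_id(hi_dsq);      /* == llc_id */
		u64 dsq_id = budget_id_to_fallback_dsq(budget_id); /* == hi_dsq */

		/* the lo fallback DSQ maps to the last budget slot */
		u64 lo_dsq = budget_id_to_fallback_dsq(MAX_GLOBAL_BUDGETS); /* == LO_FALLBACK_DSQ */

		(void)dsq_id; (void)lo_dsq;
	}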
/*
* Map used for global cost accounting. Can be extended to support NUMA nodes.
@ -26,7 +43,7 @@ struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, struct cost);
__uint(max_entries, MAX_NUMA_NODES + 1);
__uint(max_entries, 1);
__uint(map_flags, 0);
} cost_data SEC(".maps");
@ -72,6 +89,7 @@ static __always_inline struct cost *lookup_cpu_cost(s32 cpu)
return costc;
}
/*
* Initializes a cost.
*/
@ -102,39 +120,65 @@ static struct cost *initialize_cost(u32 cost_idx, u32 parent_idx,
}
/*
* Initializes the cost of a layer.
* Initializes a budget.
*/
static void initialize_cost_layer(struct cost *costc, u32 layer_id, s64 capacity)
static void initialize_budget(struct cost *costc, u32 budget_id, s64 capacity)
{
costc->capacity[layer_id] = capacity;
costc->budget[layer_id] = capacity;
if (budget_id >= MAX_GLOBAL_BUDGETS) {
scx_bpf_error("invalid budget id %d", budget_id);
return;
}
costc->capacity[budget_id] = capacity;
costc->budget[budget_id] = capacity;
}
/*
* Returns the preferred layer based on the layer with the maximum budget.
* Calculates the preferred budget and layer based on the maximum budget.
*/
static u32 preferred_cost(struct cost *costc)
static void calc_preferred_cost(struct cost *costc)
{
u32 layer_id, id, max_layer = 0;
u32 layer_id, id, budget_id, pref_budget = 0, max_layer = 0;
s64 max_budget = 0;
u64 dsq_id;
u32 rotation = bpf_get_smp_processor_id() % nr_layers;
bpf_for(id, 0, nr_layers) {
// If there is two equally weighted layers that have the same
// budget we rely on rotating the layers based on the cpu. This
// may not work well on low core machines.
/*
* If there are two equally weighted layers that have the same
* budget, we rely on rotating the layers based on the cpu. This
* may not work well on low core machines.
*/
layer_id = rotate_layer_id(id, rotation);
if (layer_id > nr_layers) {
scx_bpf_error("invalid layer");
return 0;
return;
}
if (costc->budget[layer_id] > max_budget) {
max_budget = costc->budget[layer_id];
max_layer = layer_id;
pref_budget = max_layer;
}
}
// Hi fallback DSQs
bpf_for(id, 0, nr_llcs) {
if (costc->budget[id] > max_budget) {
max_budget = costc->budget[id];
pref_budget = id;
}
}
budget_id = fallback_dsq_cost_id(LO_FALLBACK_DSQ);
if (budget_id > MAX_GLOBAL_BUDGETS) {
scx_bpf_error("invalid budget");
return;
}
if (costc->budget[budget_id] > max_budget) {
pref_budget = budget_id;
}
return max_layer;
costc->pref_layer = max_layer;
costc->pref_budget = pref_budget;
return;
}
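The rotation mentioned in the comment above staggers the scan's starting layer per CPU so that ties between equally budgeted layers don't always resolve to the lowest layer id. A sketch of the idea, assuming rotate_layer_id() is a simple modular rotation (its real definition lives elsewhere in the scheduler and may differ):

	/* Sketch only: rotation staggers the starting layer per CPU. */
	static __always_inline u32 rotate_layer_id_sketch(u32 id, u32 rotation)
	{
		/* e.g. nr_layers == 4, rotation == 1: scan order 1, 2, 3, 0 */
		return (id + rotation) % nr_layers;
	}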
/*
@ -208,34 +252,26 @@ s64 acquire_budget(struct cost *costc, u32 layer_id, s64 amount)
* acquire budget by either retrieving budget from the global context or
* refreshing all budgets.
*/
static int record_cpu_cost(struct cost *costc, u32 layer_id, s64 amount)
int record_cpu_cost(struct cost *costc, u32 budget_id, s64 amount)
{
if (layer_id >= MAX_LAYERS || !costc) {
scx_bpf_error("invalid layer %d", layer_id);
if (budget_id > MAX_GLOBAL_BUDGETS || !costc) {
scx_bpf_error("invalid budget %d", budget_id);
return 0;
}
__sync_fetch_and_sub(&costc->budget[layer_id], amount);
__sync_fetch_and_sub(&costc->budget[budget_id], amount);
if (costc->budget[layer_id] <= 0) {
costc->drain_fallback = true;
if (costc->budget[budget_id] <= 0) {
if (costc->has_parent) {
s64 budget = acquire_budget(costc, layer_id,
costc->capacity[layer_id] + amount);
s64 budget = acquire_budget(costc, budget_id,
costc->capacity[budget_id] + amount);
if (budget > 0) {
__sync_fetch_and_add(MEMBER_VPTR(*costc, .budget[layer_id]),
costc->capacity[layer_id]);
__sync_fetch_and_add(&costc->budget[budget_id],
costc->capacity[budget_id]);
}
}
}
u32 pref_layer = preferred_cost(costc);
if (pref_layer > nr_layers) {
scx_bpf_error("invalid pref_layer");
return 0;
}
costc->pref_layer = pref_layer;
calc_preferred_cost(costc);
return 0;
}
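In other words, a per-CPU budget drains locally and only reaches for the shared global cost once it goes negative. A worked example with hypothetical numbers: with a per-CPU capacity of 1,000,000 and 300,000 budget remaining, charging 500,000 leaves the slot at -200,000; the CPU then asks acquire_budget() for capacity + amount = 1,500,000 from its parent, and if that succeeds it adds the full capacity back, ending at 800,000.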
@ -271,11 +307,11 @@ int has_budget(struct cost *costc, struct layer *layer)
static void initialize_budgets(u64 refresh_intvl_ns)
{
struct layer *layer;
struct cost *costc;
int layer_id;
u64 layer_weight_dur, layer_weight_sum = 0;
struct cost *costc, *global_costc;
int layer_id, llc_id;
u64 dsq_id, layer_weight_dur, layer_weight_sum = 0;
s32 cpu;
u32 global = 0;
u32 budget_id;
bpf_for(layer_id, 0, nr_layers) {
layer = &layers[layer_id];
@ -285,41 +321,86 @@ static void initialize_budgets(u64 refresh_intvl_ns)
}
layer_weight_sum += layer->weight;
}
layer_weight_sum += HI_FALLBACK_DSQ_WEIGHT;
layer_weight_sum += LO_FALLBACK_DSQ_WEIGHT;
bpf_for(layer_id, 0, nr_layers) {
costc = initialize_cost(global, global, false, false, false);
if (!costc) {
global_costc = initialize_cost(COST_GLOBAL_KEY, COST_GLOBAL_KEY,
false, false, false);
if (!global_costc) {
scx_bpf_error("failed to initialize global budget");
return;
}
bpf_for(layer_id, 0, nr_layers) {
layer = &layers[layer_id];
if (!layer) {
scx_bpf_error("failed to lookup layer %d", layer_id);
return;
}
u64 layer_slice_ns = layer->slice_ns > 0 ? layer->slice_ns : slice_ns;
layer_weight_dur = (layer->weight * ((u64)refresh_intvl_ns * nr_possible_cpus)) /
layer_weight_dur = (layer->weight * ((u64)refresh_intvl_ns * slice_ns * nr_possible_cpus)) /
layer_weight_sum;
initialize_cost_layer(costc, layer_id, (s64)layer_weight_dur);
initialize_budget(global_costc, layer_id, (s64)layer_weight_dur);
trace("COST GLOBAL[%d][%s] budget %lld",
layer_id, layer->name, costc->budget[layer_id]);
layer_id, layer->name, global_costc->budget[layer_id]);
// TODO: add L3 budgets for topology awareness
bpf_for(cpu, 0, nr_possible_cpus) {
costc = initialize_cost(cpu, global, true,
costc = initialize_cost(cpu, COST_GLOBAL_KEY, true,
true, false);
if (!costc) {
scx_bpf_error("failed to cpu budget: %d", cpu);
return;
}
layer_weight_dur = (layer->weight * layer_slice_ns * refresh_intvl_ns) /
layer_weight_dur = (layer->weight * slice_ns * refresh_intvl_ns) /
layer_weight_sum;
initialize_cost_layer(costc, layer_id, (s64)layer_weight_dur);
initialize_budget(costc, layer_id, (s64)layer_weight_dur);
if (cpu == 0)
trace("COST CPU[%d][%d][%s] budget %lld",
cpu, layer_id, layer->name, costc->budget[layer_id]);
}
}
/*
* XXX: since any task from any layer can get kicked to the fallback
* DSQ we use the default slice to calculate the default budget.
*/
layer_weight_dur = (LO_FALLBACK_DSQ_WEIGHT * slice_ns * refresh_intvl_ns * nr_possible_cpus) /
layer_weight_sum;
initialize_budget(global_costc, fallback_dsq_cost_id(LO_FALLBACK_DSQ),
(s64)layer_weight_dur);
bpf_for(llc_id, 0, nr_llcs) {
dsq_id = llc_hi_fallback_dsq_id(llc_id);
budget_id = fallback_dsq_cost_id(dsq_id);
layer_weight_dur = (HI_FALLBACK_DSQ_WEIGHT * slice_ns * refresh_intvl_ns * nr_possible_cpus) /
layer_weight_sum;
initialize_budget(global_costc, budget_id, (s64)layer_weight_dur);
bpf_for(cpu, 0, nr_possible_cpus) {
costc = lookup_cpu_cost(cpu);
if (!costc) {
scx_bpf_error("failed to cpu budget: %d", cpu);
return;
}
// On the first iteration, always set up the lo fallback DSQ budget.
if (llc_id == 0) {
budget_id = fallback_dsq_cost_id(LO_FALLBACK_DSQ);
layer_weight_dur = (LO_FALLBACK_DSQ_WEIGHT * slice_ns * refresh_intvl_ns) /
layer_weight_sum;
initialize_budget(costc, budget_id,
(s64)layer_weight_dur);
}
layer_weight_dur = (HI_FALLBACK_DSQ_WEIGHT * slice_ns * refresh_intvl_ns) /
layer_weight_sum;
initialize_budget(costc, budget_id, (s64)layer_weight_dur);
if (cpu == 0 && llc_id == 0)
trace("COST CPU DSQ[%d][%d] budget %lld",
cpu, budget_id, costc->budget[budget_id]);
}
}
}
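The split above is purely weight-proportional, with the two fallback weights counted in the same sum as the layers. For example, with two layers of weight 100 each (hypothetical values) plus HI_FALLBACK_DSQ_WEIGHT = 50 and LO_FALLBACK_DSQ_WEIGHT = 10, layer_weight_sum is 260, so each layer receives 100/260 of the budget pool per refresh interval, each LLC's hi fallback DSQ receives 50/260, and the lo fallback DSQ receives 10/260.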

cost.bpf.h

@ -0,0 +1,46 @@
/* Copyright (c) Meta Platforms, Inc. and affiliates. */
#ifndef __LAYERED_COST_H
#define __LAYERED_COST_H
#ifdef LSP
#define __bpf__
#ifndef LSP_INC
#include "../../../../include/scx/common.bpf.h"
#endif
#endif
#include "intf.h"
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
enum cost_consts {
COST_GLOBAL_KEY = 0,
HI_FALLBACK_DSQ_WEIGHT = 50,
LO_FALLBACK_DSQ_WEIGHT = 10,
/*
* Max global budgets covers fallback DSQs (one per LLC) as well as
* layers. This is so that budgets can easily be calculated across
* fallback DSQs and layer weights. The cost accounting could instead
* be done at the DSQ level, which would simplify some things at the
* cost of growing the cost struct.
*/
MAX_GLOBAL_BUDGETS = MAX_LLCS + MAX_LAYERS + 1,
};
/*
* Cost accounting struct that is used in both the per CPU and global context.
* Budgets are allowed to recurse to parent structs.
*/
struct cost {
s64 budget[MAX_GLOBAL_BUDGETS];
s64 capacity[MAX_GLOBAL_BUDGETS];
u32 pref_budget; // the budget id with the most budget
u32 pref_layer; // the layer with the most budget
u32 idx;
bool overflow;
bool has_parent;
};
#endif /* __LAYERED_COST_H */
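The same struct serves both contexts: the global entry sits at COST_GLOBAL_KEY in the cost_data map, while per-CPU entries point at it as their parent so record_cpu_cost() can refill a drained slot from the global budget. A minimal usage sketch, illustrative only (the helper name and used_ns value are hypothetical):

	/* Sketch only: charging run time against the current CPU's cost struct. */
	static __always_inline void charge_runtime_example(u32 budget_id, u64 used_ns)
	{
		struct cost *costc = lookup_cpu_cost(bpf_get_smp_processor_id());

		if (!costc)
			return;
		/* drains the per-CPU slot; refills from the parent cost if it goes negative */
		record_cpu_cost(costc, budget_id, (s64)used_ns);
	}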

main.bpf.c

@ -133,6 +133,11 @@ static u64 llc_hi_fallback_dsq_id(u32 llc_id)
return HI_FALLBACK_DSQ_BASE + llc_id;
}
static inline bool is_fallback_dsq(u64 dsq_id)
{
return dsq_id > HI_FALLBACK_DSQ_BASE && dsq_id <= LO_FALLBACK_DSQ;
}
static u64 llc_hi_fallback_dsq_iter_offset(int llc_offset, int idx)
{
int offset = llc_offset + idx;
@ -383,6 +388,7 @@ struct task_ctx {
bool all_cpus_allowed;
u64 runnable_at;
u64 running_at;
u64 last_dsq;
};
struct {
@ -1076,6 +1082,7 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
lstat_inc(LSTAT_AFFN_VIOL, layer, cctx);
idx = cpu_hi_fallback_dsq_id(task_cpu);
tctx->last_dsq = idx;
scx_bpf_dispatch(p, idx, slice_ns, enq_flags);
goto preempt;
}
@ -1102,15 +1109,18 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
*/
idx = cpu_hi_fallback_dsq_id(task_cpu);
scx_bpf_dispatch(p, idx, slice_ns, enq_flags);
tctx->last_dsq = idx;
goto preempt;
}
if (disable_topology) {
tctx->last_dsq = tctx->layer;
scx_bpf_dispatch_vtime(p, tctx->layer, layer_slice_ns, vtime, enq_flags);
} else {
u32 llc_id = cpu_to_llc_id(tctx->last_cpu >= 0 ? tctx->last_cpu :
bpf_get_smp_processor_id());
idx = layer_dsq_id(layer->idx, llc_id);
tctx->last_dsq = idx;
scx_bpf_dispatch_vtime(p, idx, layer_slice_ns, vtime, enq_flags);
}
@ -1247,6 +1257,16 @@ void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
return;
}
/*
* If one of the fallback DSQs has the most budget then consume from
* it to prevent starvation.
*/
if (has_pref_fallback_budget(costc)) {
dsq_id = budget_id_to_fallback_dsq(costc->pref_budget);
if (scx_bpf_consume(dsq_id))
return;
}
/* consume preempting layers first */
bpf_for(idx, 0, nr_layers) {
layer_idx = rotate_layer_id(costc->pref_layer, idx);
@ -1444,20 +1464,18 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
return;
}
u32 my_llc_id = cpu_to_llc_id(cpu);
/*
* Fallback DSQs don't have cost accounting. When the budget runs out
* for a layer we do an extra consume of the fallback DSQ to ensure
* that it doesn't stall out when the system is being saturated.
* If one of the fallback DSQs has the most budget then consume from
* it to prevent starvation.
*/
if (costc->drain_fallback) {
costc->drain_fallback = false;
dsq_id = cpu_hi_fallback_dsq_id(cpu);
if (has_pref_fallback_budget(costc)) {
dsq_id = budget_id_to_fallback_dsq(costc->pref_budget);
if (scx_bpf_consume(dsq_id))
return;
}
u32 my_llc_id = cpu_to_llc_id(cpu);
/* consume preempting layers first */
if (consume_preempting(costc, my_llc_id) == 0)
return;
@ -1878,6 +1896,7 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
struct task_ctx *tctx;
struct layer *layer;
struct cost *costc;
u32 budget_id;
s32 lidx;
u64 used;
@ -1895,7 +1914,15 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
used = layer->min_exec_ns;
}
record_cpu_cost(costc, layer->idx, (s64)used);
// If the task ran on the hi fallback dsq then the cost should be
// charged to it.
if (is_fallback_dsq(tctx->last_dsq)) {
budget_id = fallback_dsq_cost_id(tctx->last_dsq);
} else {
budget_id = layer->idx;
}
record_cpu_cost(costc, budget_id, (s64)used);
cctx->layer_cycles[lidx] += used;
cctx->current_preempt = false;
cctx->prev_exclusive = cctx->current_exclusive;
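Taken as a whole, the flow is: layered_enqueue() records the DSQ a task is dispatched to in tctx->last_dsq, layered_stopping() charges the consumed run time to the matching budget via record_cpu_cost(), and both dispatch paths check pref_budget so that a fallback DSQ holding the most budget is consumed ahead of the layers, preventing it from stalling out.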