Compare commits

...

7 Commits

Author SHA1 Message Date
I-Hsin Cheng
f3c7d01e7e
Merge 72ecf3c8e3 into 7d14df8ca2 2024-11-30 14:45:50 +01:00
Changwoo Min
7d14df8ca2
Merge pull request #1000 from multics69/lavd-load-balancing
scx_lavd: Load balancing across compute domains
2024-11-30 12:10:04 +09:00
Changwoo Min
047e8c81e9 scx_lavd: Perform load balancing at consume_task()
Upon ops.dispatch, perform load balancing based on the plan set up earlier,
stealing a task from a stealee domain into a stealer domain. To avoid the
thundering herd problem of concurrent stealers, a stealer steals a task
probabilistically. Also, to minimize the task migration distance, the
stealing probability decreases exponentially with each hop of distance.
Finally, in every stat cycle (50 ms), a stealer migrates only one task
from a stealee for gradual load balancing.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
2024-11-30 12:09:43 +09:00
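
For intuition, a minimal user-space sketch of the policy described in this commit follows: the go/no-go gate plus the per-hop exponential back-off. The helper names one_out_of(), pick_steal_distance(), and HOP_MAX are assumptions for illustration only; the actual BPF implementation lives in try_to_steal_task() and prob_x_out_of_y() in the diffs below.

/* Illustrative sketch only; not the scheduler's BPF code.
 * HOP_MAX and the helper names are assumed for this example. */
#include <stdbool.h>
#include <stdlib.h>

#define HOP_MAX 4 /* assumed number of distance levels */

/* Make a 1-out-of-y random decision, mirroring prob_x_out_of_y(1, y). */
static bool one_out_of(unsigned int y)
{
    return ((unsigned int)rand() % y) == 0;
}

/*
 * Return the farthest hop distance a stealer will search this time,
 * or -1 to skip stealing entirely. The initial gate succeeds with
 * probability 1 / (nr_cpus * prob_ft), which throttles concurrent
 * stealers; each additional hop is reached only with a further
 * 1/prob_ft chance, so the willingness to steal decays exponentially
 * with distance.
 */
static int pick_steal_distance(unsigned int nr_cpus, unsigned int prob_ft)
{
    if (!one_out_of(nr_cpus * prob_ft))
        return -1; /* no-go this round */

    for (int hop = 0; hop < HOP_MAX; hop++) {
        if (!one_out_of(prob_ft))
            return hop; /* stop searching at this distance */
    }
    return HOP_MAX - 1;
}

In the actual change, a successful steal also clears the stealer and stealee marks, which is what limits a stealer to migrating one task per stat cycle.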
Changwoo Min
4f1ffc1bc6 scx_lavd: Refactor consume_task()
Remove unnecessary variables and arguments and
factor out force_to_steal_task().

Signed-off-by: Changwoo Min <changwoo@igalia.com>
2024-11-30 12:09:43 +09:00
Changwoo Min
7991266773 scx_lavd: Decide load balancing plan across compute domains
The goal of load balancing is to maintain a nearly equal number of
queued tasks per CPU across compute domains. To this end, we first
decide which compute domains are under-utilized (i.e., their queue
length per CPU is below average) and which are over-utilized (i.e.,
their queue length per CPU is above average). We call an
under-utilized domain a stealer domain and an over-utilized domain
a stealee domain.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
2024-11-30 12:09:43 +09:00
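
As a worked illustration (numbers assumed; the ±12.5% band comes from LAVD_CPDOM_MIGRATION_SHIFT = 3 in the diff below): if the domains average 4.0 queued tasks per CPU (4000 in the x1000 fixed-point scale), the band is [3500, 4500]. A domain at 3200 is marked a stealer, a domain at 4800 a stealee, and a domain at 4200 is left alone for this round.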
Changwoo Min
ed14a4ca91 scx_lavd: Log out the number of cross-domain task migration
Collect and log the number of task migrations across compute domains.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
2024-11-30 12:09:43 +09:00
I Hsin Cheng
72ecf3c8e3 scx_rusty: Temporary fix of duplicate active tptr
Under a severe load-imbalance scenario, such as a mixture of CPU-intensive
and I/O-intensive workloads, the same tptr may be written into a domain's
dom_active_tptrs array more than once.

This leads to load-balancer failures: when the task behind that tptr carries
a large enough load, it tends to be selected repeatedly, so warnings about
the same tptr being set in "lb_data" keep popping up.

As a workaround for now, keep a HashSet in userspace that records the tptrs
currently active in a domain, and do not generate the same task repeatedly.

Signed-off-by: I Hsin Cheng <richard120310@gmail.com>
2024-11-19 22:03:18 +08:00
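
For a concrete reading of the fix (see the scx_rusty diff at the end of this comparison): if the same tptr lands at two indices of dom_active_tptrs.tptrs, the load balancer would insert the same task into dom.tasks twice; because a heavily loaded task tends to be picked for migration, the duplicate keeps getting selected and the "same tptr set in lb_data" warning repeats. Recording already-seen tptrs in a per-domain HashSet makes the second occurrence a no-op.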
8 changed files with 252 additions and 84 deletions

View File

@ -78,12 +78,14 @@ struct sys_stat {
volatile u32 max_perf_cri; /* maximum performance criticality */
volatile u32 thr_perf_cri; /* performance criticality threshold */
volatile u32 nr_stealee; /* number of compute domains to be migrated */
volatile u32 nr_violation; /* number of utilization violation */
volatile u32 nr_active; /* number of active cores */
volatile u64 nr_sched; /* total scheduling so far */
volatile u64 nr_perf_cri; /* number of performance-critical tasks scheduled */
volatile u64 nr_lat_cri; /* number of latency-critical tasks scheduled */
volatile u64 nr_x_migration; /* number of cross domain migration */
volatile u64 nr_big; /* scheduled on big core */
volatile u64 nr_pc_on_big; /* performance-critical tasks scheduled on big core */
volatile u64 nr_lc_on_big; /* latency-critical tasks scheduled on big core */

View File

@ -51,6 +51,9 @@ enum consts_internal {
performance mode when cpu util > 40% */
LAVD_CPDOM_STARV_NS = (2 * LAVD_SLICE_MAX_NS_DFL),
LAVD_CPDOM_MIGRATION_SHIFT = 3, /* 1/2**3 = +/- 12.5% */
LAVD_CPDOM_X_PROB_FT = (LAVD_SYS_STAT_INTERVAL_NS /
(2 * LAVD_SLICE_MAX_NS_DFL)), /* roughly twice per interval */
};
/*
@ -58,12 +61,15 @@ enum consts_internal {
* - system > numa node > llc domain > compute domain per core type (P or E)
*/
struct cpdom_ctx {
u64 last_consume_clk; /* when the associated DSQ was consumed */
u64 id; /* id of this compute domain (== dsq_id) */
u64 alt_id; /* id of the closest compute domain of alternative type (== dsq id) */
u8 node_id; /* numa domain id */
u8 is_big; /* is it a big core or little core? */
u8 is_active; /* if this compute domain is active */
u8 is_stealer; /* this domain should steal tasks from others */
u8 is_stealee; /* stealer domain should steal tasks from this domain */
u16 nr_cpus; /* the number of CPUs in this compute domain */
u32 nr_q_tasks_per_cpu; /* the number of queued tasks per CPU in this domain (x1000) */
u8 nr_neighbors[LAVD_CPDOM_MAX_DIST]; /* number of neighbors per distance */
u64 neighbor_bits[LAVD_CPDOM_MAX_DIST]; /* bitmask of neighbors per distance */
u64 __cpumask[LAVD_CPU_ID_MAX/64]; /* cpumasks belongs to this compute domain */
@ -129,6 +135,7 @@ struct cpu_ctx {
/*
* Information for statistics.
*/
volatile u32 nr_x_migration;
volatile u32 nr_perf_cri;
volatile u32 nr_lat_cri;

View File

@ -1108,7 +1108,7 @@ void BPF_STRUCT_OPS(lavd_enqueue, struct task_struct *p, u64 enq_flags)
}
}
static bool consume_dsq(s32 cpu, u64 dsq_id, u64 now)
static bool consume_dsq(u64 dsq_id)
{
struct cpdom_ctx *cpdomc;
@ -1120,7 +1120,6 @@ static bool consume_dsq(s32 cpu, u64 dsq_id, u64 now)
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
return false;
}
WRITE_ONCE(cpdomc->last_consume_clk, now);
/*
* Try to consume a task on the associated DSQ.
@ -1130,81 +1129,110 @@ static bool consume_dsq(s32 cpu, u64 dsq_id, u64 now)
return false;
}
static bool consume_starving_task(s32 cpu, struct cpu_ctx *cpuc, u64 now)
static bool try_to_steal_task(struct cpdom_ctx *cpdomc)
{
struct cpdom_ctx *cpdomc;
u64 dsq_id = cpuc->cpdom_poll_pos;
u64 dl;
bool ret = false;
int i;
if (nr_cpdoms == 1)
return false;
bpf_for(i, 0, nr_cpdoms) {
if (i >= LAVD_CPDOM_MAX_NR)
break;
dsq_id = (dsq_id + i) % LAVD_CPDOM_MAX_NR;
if (dsq_id == cpuc->cpdom_id)
continue;
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (!cpdomc) {
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
goto out;
}
if (cpdomc->is_active) {
dl = READ_ONCE(cpdomc->last_consume_clk) + LAVD_CPDOM_STARV_NS;
if (dl < now) {
ret = consume_dsq(cpu, dsq_id, now);
}
goto out;
}
}
out:
cpuc->cpdom_poll_pos = (dsq_id + 1) % LAVD_CPDOM_MAX_NR;
return ret;
}
static bool consume_task(s32 cpu, struct cpu_ctx *cpuc, u64 now)
{
struct cpdom_ctx *cpdomc, *cpdomc_pick;
u64 dsq_id, nr_nbr;
struct cpdom_ctx *cpdomc_pick;
u64 nr_nbr, dsq_id;
s64 nuance;
/*
* If there is a starving DSQ, try to consume it first.
* If not all CPUs are in use -- i.e., the system is under-utilized --
* there is no point in load balancing. It is better to make an
* effort to increase system utilization.
*/
if (consume_starving_task(cpu, cpuc, now))
return true;
/*
* Try to consume from CPU's associated DSQ.
*/
dsq_id = cpuc->cpdom_id;
if (consume_dsq(cpu, dsq_id, now))
return true;
/*
* If there is no task in the associated DSQ, traverse neighbor
* compute domains in distance order -- task stealing.
*/
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (!cpdomc) {
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
if (!use_full_cpus())
return false;
}
/*
* Probabilistically make a go or no go decision to avoid the
* thundering herd problem. In other words, one out of nr_cpus
* will try to steal a task at a moment.
*/
if (!prob_x_out_of_y(1, cpdomc->nr_cpus * LAVD_CPDOM_X_PROB_FT))
return false;
/*
* Traverse neighbor compute domains in distance order.
*/
nuance = bpf_get_prandom_u32();
for (int i = 0; i < LAVD_CPDOM_MAX_DIST; i++) {
nr_nbr = min(cpdomc->nr_neighbors[i], LAVD_CPDOM_MAX_NR);
if (nr_nbr == 0)
break;
nuance = bpf_get_prandom_u32();
for (int j = 0; j < LAVD_CPDOM_MAX_NR; j++, nuance = dsq_id + 1) {
/*
* Traverse neighbors at the same distance in arbitrary order.
*/
for (int j = 0; j < LAVD_CPDOM_MAX_NR; j++, nuance++) {
if (j >= nr_nbr)
break;
dsq_id = pick_any_bit(cpdomc->neighbor_bits[i], nuance);
if (dsq_id == -ENOENT)
continue;
cpdomc_pick = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (!cpdomc_pick) {
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
return false;
}
if (!cpdomc_pick->is_stealee || !cpdomc_pick->is_active)
continue;
/*
* If task stealing is successful, mark the stealer
* and the stealee's job done. By marking done,
* those compute domains would not be involved in
* load balancing until the end of this round,
* so this helps gradual migration. Note that multiple
* stealers can steal tasks from the same stealee.
* However, we don't coordinate concurrent stealing
* because the chance is low and there is no harm
* in slight over-stealing.
*/
if (consume_dsq(dsq_id)) {
WRITE_ONCE(cpdomc_pick->is_stealee, false);
WRITE_ONCE(cpdomc->is_stealer, false);
return true;
}
}
/*
* Now, we need to steal a task from a farther neighbor
* for load balancing. Since task migration from a farther
* neighbor is more expensive (e.g., crossing a NUMA boundary),
* we do it with a lot of hesitation. The chance of stealing
* from a farther neighbor decreases exponentially with
* distance, which in turn favors stealing from a closer
* neighbor.
*/
if (!prob_x_out_of_y(1, LAVD_CPDOM_X_PROB_FT))
break;
}
return false;
}
static bool force_to_steal_task(struct cpdom_ctx *cpdomc)
{
struct cpdom_ctx *cpdomc_pick;
u64 nr_nbr, dsq_id;
s64 nuance;
/*
* Traverse neighbor compute domains in distance order.
*/
nuance = bpf_get_prandom_u32();
for (int i = 0; i < LAVD_CPDOM_MAX_DIST; i++) {
nr_nbr = min(cpdomc->nr_neighbors[i], LAVD_CPDOM_MAX_NR);
if (nr_nbr == 0)
break;
/*
* Traverse neighbors at the same distance in arbitrary order.
*/
for (int j = 0; j < LAVD_CPDOM_MAX_NR; j++, nuance++) {
if (j >= nr_nbr)
break;
@ -1221,7 +1249,7 @@ static bool consume_task(s32 cpu, struct cpu_ctx *cpuc, u64 now)
if (!cpdomc_pick->is_active)
continue;
if (consume_dsq(cpu, dsq_id, now))
if (consume_dsq(dsq_id))
return true;
}
}
@ -1229,9 +1257,51 @@ static bool consume_task(s32 cpu, struct cpu_ctx *cpuc, u64 now)
return false;
}
static bool consume_task(struct cpu_ctx *cpuc)
{
struct cpdom_ctx *cpdomc;
u64 dsq_id;
dsq_id = cpuc->cpdom_id;
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (!cpdomc) {
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
return false;
}
/*
* If the current compute domain is a stealer, try to steal
* a task from any of stealee domains probabilistically.
*/
if (cpdomc->is_stealer && try_to_steal_task(cpdomc))
goto x_domain_migration_out;
/*
* Try to consume a task from CPU's associated DSQ.
*/
if (consume_dsq(dsq_id))
return true;
/*
* If there is no task in the associated DSQ, traverse neighbor
* compute domains in distance order -- task stealing.
*/
if (force_to_steal_task(cpdomc))
goto x_domain_migration_out;
return false;
/*
* Task migration across compute domains happens.
* Update the statistics.
*/
x_domain_migration_out:
cpuc->nr_x_migration++;
return true;
}
void BPF_STRUCT_OPS(lavd_dispatch, s32 cpu, struct task_struct *prev)
{
u64 now = bpf_ktime_get_ns();
struct cpu_ctx *cpuc;
struct task_ctx *taskc;
struct bpf_cpumask *active, *ovrflw;
@ -1365,10 +1435,7 @@ consume_out:
/*
* Consume a task if requested.
*/
if (!try_consume)
return;
if (consume_task(cpu, cpuc, now))
if (try_consume && consume_task(cpuc))
return;
/*
@ -1805,8 +1872,6 @@ static s32 init_cpdoms(u64 now)
if (!cpdomc->is_active)
continue;
WRITE_ONCE(cpdomc->last_consume_clk, now);
/*
* Create an associated DSQ on its associated NUMA domain.
*/
@ -2024,6 +2089,7 @@ static s32 init_per_cpu_ctx(u64 now)
}
cpuc->cpdom_id = cpdomc->id;
cpuc->cpdom_alt_id = cpdomc->alt_id;
cpdomc->nr_cpus++;
}
}
}

View File

@ -38,6 +38,8 @@ struct sys_stat_ctx {
u32 nr_sched;
u32 nr_perf_cri;
u32 nr_lat_cri;
u32 nr_x_migration;
u32 nr_stealee;
u32 nr_big;
u32 nr_pc_on_big;
u32 nr_lc_on_big;
@ -62,10 +64,66 @@ static void init_sys_stat_ctx(struct sys_stat_ctx *c)
c->stat_next->last_update_clk = c->now;
}
static void plan_x_cpdom_migration(struct sys_stat_ctx *c)
{
struct cpdom_ctx *cpdomc;
u64 dsq_id;
u32 avg_nr_q_tasks_per_cpu = 0, nr_q_tasks, x_mig_delta;
u32 stealer_threshold, stealee_threshold;
/*
* Calculate the average number of queued tasks per CPU per compute domain.
*/
bpf_for(dsq_id, 0, nr_cpdoms) {
if (dsq_id >= LAVD_CPDOM_MAX_NR)
break;
nr_q_tasks = scx_bpf_dsq_nr_queued(dsq_id);
c->nr_queued_task += nr_q_tasks;
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
cpdomc->nr_q_tasks_per_cpu = (nr_q_tasks * 1000) / cpdomc->nr_cpus;
avg_nr_q_tasks_per_cpu += cpdomc->nr_q_tasks_per_cpu;
}
avg_nr_q_tasks_per_cpu /= nr_cpdoms;
/*
* Determine stealer and stealee domains.
*
* A stealer domain, whose per-CPU queue length is shorter than
* the average, will steal a task from any of the stealee domains,
* whose per-CPU queue length is longer than the average.
* Compute domains around the average do not do anything.
*/
x_mig_delta = avg_nr_q_tasks_per_cpu >> LAVD_CPDOM_MIGRATION_SHIFT;
stealer_threshold = avg_nr_q_tasks_per_cpu - x_mig_delta;
stealee_threshold = avg_nr_q_tasks_per_cpu + x_mig_delta;
bpf_for(dsq_id, 0, nr_cpdoms) {
if (dsq_id >= LAVD_CPDOM_MAX_NR)
break;
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (cpdomc->nr_q_tasks_per_cpu < stealer_threshold) {
WRITE_ONCE(cpdomc->is_stealer, true);
WRITE_ONCE(cpdomc->is_stealee, false);
}
else if (cpdomc->nr_q_tasks_per_cpu > stealee_threshold) {
WRITE_ONCE(cpdomc->is_stealer, false);
WRITE_ONCE(cpdomc->is_stealee, true);
c->nr_stealee++;
}
else {
WRITE_ONCE(cpdomc->is_stealer, false);
WRITE_ONCE(cpdomc->is_stealee, false);
}
}
}
static void collect_sys_stat(struct sys_stat_ctx *c)
{
u64 dsq_id;
int cpu, nr;
int cpu;
bpf_for(cpu, 0, nr_cpu_ids) {
struct cpu_ctx *cpuc = get_cpu_ctx_id(cpu);
@ -94,6 +152,9 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
c->nr_lat_cri += cpuc->nr_lat_cri;
cpuc->nr_lat_cri = 0;
c->nr_x_migration += cpuc->nr_x_migration;
cpuc->nr_x_migration = 0;
/*
* Accumulate the task's latency criticality information.
*
@ -169,12 +230,6 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
c->idle_total += cpuc->idle_total;
cpuc->idle_total = 0;
}
bpf_for(dsq_id, 0, LAVD_CPDOM_MAX_NR) {
nr = scx_bpf_dsq_nr_queued(dsq_id);
if (nr > 0)
c->nr_queued_task += nr;
}
}
static void calc_sys_stat(struct sys_stat_ctx *c)
@ -239,6 +294,8 @@ static void update_sys_stat_next(struct sys_stat_ctx *c)
c->stat_cur->thr_perf_cri; /* will be updated later */
}
stat_next->nr_stealee = c->nr_stealee;
stat_next->nr_violation =
calc_avg32(stat_cur->nr_violation, c->nr_violation);
@ -260,6 +317,7 @@ static void update_sys_stat_next(struct sys_stat_ctx *c)
stat_next->nr_sched >>= 1;
stat_next->nr_perf_cri >>= 1;
stat_next->nr_lat_cri >>= 1;
stat_next->nr_x_migration >>= 1;
stat_next->nr_big >>= 1;
stat_next->nr_pc_on_big >>= 1;
stat_next->nr_lc_on_big >>= 1;
@ -272,6 +330,7 @@ static void update_sys_stat_next(struct sys_stat_ctx *c)
stat_next->nr_sched += c->nr_sched;
stat_next->nr_perf_cri += c->nr_perf_cri;
stat_next->nr_lat_cri += c->nr_lat_cri;
stat_next->nr_x_migration += c->nr_x_migration;
stat_next->nr_big += c->nr_big;
stat_next->nr_pc_on_big += c->nr_pc_on_big;
stat_next->nr_lc_on_big += c->nr_lc_on_big;
@ -287,6 +346,7 @@ static void do_update_sys_stat(void)
* Collect and prepare the next version of stat.
*/
init_sys_stat_ctx(&c);
plan_x_cpdom_migration(&c);
collect_sys_stat(&c);
calc_sys_stat(&c);
update_sys_stat_next(&c);

View File

@ -299,3 +299,14 @@ static void set_on_core_type(struct task_ctx *taskc,
WRITE_ONCE(taskc->on_big, on_big);
WRITE_ONCE(taskc->on_little, on_little);
}
static bool prob_x_out_of_y(u32 x, u32 y)
{
/*
* [0, r, y)
* ---- x?
*/
u32 r = bpf_get_prandom_u32() % y;
return r < x;
}
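
A rough sanity check of the "roughly twice per interval" note on LAVD_CPDOM_X_PROB_FT, assuming each busy CPU reaches the dispatch path about once per LAVD_SLICE_MAX_NS_DFL: the gate in try_to_steal_task() succeeds with probability 1 / (nr_cpus * FT), where FT = LAVD_SYS_STAT_INTERVAL_NS / (2 * LAVD_SLICE_MAX_NS_DFL). A stealer domain with nr_cpus CPUs makes about nr_cpus * (interval / slice) gate rolls per stat interval, so the expected number of successful rolls is nr_cpus * (interval / slice) * 1 / (nr_cpus * interval / (2 * slice)) = 2, i.e., about two steal attempts per domain per interval.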

View File

@ -711,6 +711,8 @@ impl<'a> Scheduler<'a> {
let nr_sched = st.nr_sched;
let pc_pc = Self::get_pc(st.nr_perf_cri, nr_sched);
let pc_lc = Self::get_pc(st.nr_lat_cri, nr_sched);
let pc_x_migration = Self::get_pc(st.nr_x_migration, nr_sched);
let nr_stealee = st.nr_stealee;
let nr_big = st.nr_big;
let pc_big = Self::get_pc(nr_big, nr_sched);
let pc_pc_on_big = Self::get_pc(st.nr_pc_on_big, nr_big);
@ -730,6 +732,8 @@ impl<'a> Scheduler<'a> {
nr_sched,
pc_pc,
pc_lc,
pc_x_migration,
nr_stealee,
pc_big,
pc_pc_on_big,
pc_lc_on_big,

View File

@ -37,6 +37,12 @@ pub struct SysStats {
#[stat(desc = "% of latency-critical tasks")]
pub pc_lc: f64,
#[stat(desc = "% of cross domain task migration")]
pub pc_x_migration: f64,
#[stat(desc = "Number of stealee domains")]
pub nr_stealee: u32,
#[stat(desc = "% of tasks scheduled on big cores")]
pub pc_big: f64,
@ -63,13 +69,15 @@ impl SysStats {
pub fn format_header<W: Write>(w: &mut W) -> Result<()> {
writeln!(
w,
"\x1b[93m| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |\x1b[0m",
"\x1b[93m| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |\x1b[0m",
"MSEQ",
"# Q TASK",
"# ACT CPU",
"# SCHED",
"PERF-CR%",
"LAT-CR%",
"X-MIG%",
"# STLEE",
"BIG%",
"PC/BIG%",
"LC/BIG%",
@ -88,13 +96,15 @@ impl SysStats {
writeln!(
w,
"| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |",
"| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |",
self.mseq,
self.nr_queued_task,
self.nr_active,
self.nr_sched,
GPoint(self.pc_pc),
GPoint(self.pc_lc),
GPoint(self.pc_x_migration),
self.nr_stealee,
GPoint(self.pc_big),
GPoint(self.pc_pc_on_big),
GPoint(self.pc_lc_on_big),

View File

@ -133,6 +133,7 @@
use core::cmp::Ordering;
use std::cell::Cell;
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::collections::VecDeque;
use std::fmt;
use std::sync::Arc;
@ -343,6 +344,7 @@ struct Domain {
queried_tasks: bool,
load: LoadEntity,
tasks: SortedVec<TaskInfo>,
active_tptr_set: HashSet<u64>,
}
impl Domain {
@ -362,6 +364,7 @@ impl Domain {
load_avg,
),
tasks: SortedVec::new(),
active_tptr_set: HashSet::new(),
}
}
@ -680,6 +683,10 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
let tptr = active_tptrs.tptrs[(idx % MAX_TPTRS) as usize];
let key = unsafe { std::mem::transmute::<u64, [u8; 8]>(tptr) };
if dom.active_tptr_set.contains(&tptr) {
continue;
}
if let Some(task_data_elem) = task_data.lookup(&key, libbpf_rs::MapFlags::ANY)? {
let task_ctx =
unsafe { &*(task_data_elem.as_slice().as_ptr() as *const bpf_intf::task_ctx) };
@ -705,6 +712,7 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
};
load *= weight;
dom.active_tptr_set.insert(tptr);
dom.tasks.insert(TaskInfo {
tptr,
load: OrderedFloat(load),