scx_lavd: Decide load balancing plan across compute domains

The goal of load balancing is to keep the number of queued tasks
per CPU nearly equal across compute domains. To this end, we first
decide which compute domains are under-utilized (i.e., their queue
length per CPU is below average) and which are over-utilized
(i.e., their queue length per CPU is above average). We call an
under-utilized domain a stealer domain and an over-utilized
domain a stealee domain.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
This commit is contained in:
Changwoo Min 2024-11-27 16:10:25 +09:00
parent ed14a4ca91
commit 7991266773
6 changed files with 78 additions and 59 deletions

View File

@ -78,6 +78,7 @@ struct sys_stat {
volatile u32 max_perf_cri; /* maximum performance criticality */
volatile u32 thr_perf_cri; /* performance criticality threshold */
volatile u32 nr_stealee; /* number of compute domains to be migrated */
volatile u32 nr_violation; /* number of utilization violation */
volatile u32 nr_active; /* number of active cores */

View File

@ -51,6 +51,7 @@ enum consts_internal {
performance mode when cpu util > 40% */
LAVD_CPDOM_STARV_NS = (2 * LAVD_SLICE_MAX_NS_DFL),
LAVD_CPDOM_MIGRATION_SHIFT = 2, /* 1/2**2 = +/- 25% */
};
/*
@ -58,12 +59,15 @@ enum consts_internal {
* - system > numa node > llc domain > compute domain per core type (P or E)
*/
struct cpdom_ctx {
u64 last_consume_clk; /* when the associated DSQ was consumed */
u64 id; /* id of this compute domain (== dsq_id) */
u64 alt_id; /* id of the closest compute domain of alternative type (== dsq id) */
u8 node_id; /* numa domain id */
u8 is_big; /* is it a big core or little core? */
u8 is_active; /* if this compute domain is active */
u8 is_stealer; /* this domain should steal tasks from others */
u8 is_stealee; /* stealer domain should steal tasks from this domain */
u16 nr_cpus; /* the number of CPUs in this compute domain */
u32 nr_q_tasks_per_cpu; /* the number of queued tasks per CPU in this domain (x1000) */
u8 nr_neighbors[LAVD_CPDOM_MAX_DIST]; /* number of neighbors per distance */
u64 neighbor_bits[LAVD_CPDOM_MAX_DIST]; /* bitmask of neighbor bitmask per distance */
u64 __cpumask[LAVD_CPU_ID_MAX/64]; /* cpumasks belongs to this compute domain */

View File

@ -1120,7 +1120,6 @@ static bool consume_dsq(s32 cpu, u64 dsq_id, u64 now)
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
return false;
}
WRITE_ONCE(cpdomc->last_consume_clk, now);
/*
* Try to consume a task on the associated DSQ.
@ -1130,57 +1129,12 @@ static bool consume_dsq(s32 cpu, u64 dsq_id, u64 now)
return false;
}
/*
 * consume_starving_task - Try to run a task from another compute domain
 * whose DSQ has not been consumed for longer than LAVD_CPDOM_STARV_NS.
 *
 * @cpu:  CPU id, passed through to consume_dsq().
 * @cpuc: context of that CPU; supplies the polling start position
 *        (cpdom_poll_pos) and the CPU's own domain id (cpdom_id).
 * @now:  current timestamp, compared against last_consume_clk + the
 *        starvation period.
 *
 * Returns the result of consume_dsq() when the probed domain is found to
 * be starving, and false in every other case (single-domain system, lookup
 * failure, no active foreign domain found, or the domain is not starving).
 *
 * Except for the single-domain early return, cpuc->cpdom_poll_pos is always
 * advanced to one past the last probed id before returning.
 */
static bool consume_starving_task(s32 cpu, struct cpu_ctx *cpuc, u64 now)
{
struct cpdom_ctx *cpdomc;
u64 dsq_id = cpuc->cpdom_poll_pos;
u64 dl;
bool ret = false;
int i;
/* With only one compute domain there is no other domain to probe. */
if (nr_cpdoms == 1)
return false;
bpf_for(i, 0, nr_cpdoms) {
if (i >= LAVD_CPDOM_MAX_NR)
break;
/*
 * NOTE(review): the loop index is added to the running dsq_id, so the
 * step grows each iteration (+0, +1, +2, ...) instead of visiting ids
 * sequentially, and the wrap-around uses LAVD_CPDOM_MAX_NR rather than
 * nr_cpdoms -- confirm this probe order is intended.
 */
dsq_id = (dsq_id + i) % LAVD_CPDOM_MAX_NR;
/* Skip the CPU's own compute domain. */
if (dsq_id == cpuc->cpdom_id)
continue;
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (!cpdomc) {
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
goto out;
}
/*
 * Only the first active foreign domain encountered is examined; the
 * scan stops there whether or not that domain turned out to be starving.
 */
if (cpdomc->is_active) {
/* dl is the time after which this domain's DSQ counts as starving. */
dl = READ_ONCE(cpdomc->last_consume_clk) + LAVD_CPDOM_STARV_NS;
if (dl < now) {
ret = consume_dsq(cpu, dsq_id, now);
}
goto out;
}
}
out:
/* Remember where to resume polling on the next call. */
cpuc->cpdom_poll_pos = (dsq_id + 1) % LAVD_CPDOM_MAX_NR;
return ret;
}
static bool consume_task(s32 cpu, struct cpu_ctx *cpuc, u64 now)
{
struct cpdom_ctx *cpdomc, *cpdomc_pick;
u64 dsq_id, nr_nbr;
s64 nuance;
/*
* If there is a starving DSQ, try to consume it first.
*/
if (consume_starving_task(cpu, cpuc, now))
goto x_domain_migration_out;
/*
* Try to consume from CPU's associated DSQ.
*/
@ -1813,8 +1767,6 @@ static s32 init_cpdoms(u64 now)
if (!cpdomc->is_active)
continue;
WRITE_ONCE(cpdomc->last_consume_clk, now);
/*
* Create an associated DSQ on its associated NUMA domain.
*/
@ -2032,6 +1984,7 @@ static s32 init_per_cpu_ctx(u64 now)
}
cpuc->cpdom_id = cpdomc->id;
cpuc->cpdom_alt_id = cpdomc->alt_id;
cpdomc->nr_cpus++;
}
}
}

View File

@ -39,6 +39,7 @@ struct sys_stat_ctx {
u32 nr_perf_cri;
u32 nr_lat_cri;
u32 nr_x_migration;
u32 nr_stealee;
u32 nr_big;
u32 nr_pc_on_big;
u32 nr_lc_on_big;
@ -63,10 +64,66 @@ static void init_sys_stat_ctx(struct sys_stat_ctx *c)
c->stat_next->last_update_clk = c->now;
}
/*
 * plan_x_cpdom_migration - Classify compute domains as stealer or stealee.
 *
 * @c: system statistics context for the current update round. The total
 *     number of queued tasks is accumulated into c->nr_queued_task and the
 *     number of stealee domains into c->nr_stealee.
 *
 * For every compute domain the per-CPU queue length (scaled by 1000) is
 * stored in cpdomc->nr_q_tasks_per_cpu. A domain whose per-CPU queue length
 * is more than 1/2^LAVD_CPDOM_MIGRATION_SHIFT below the average becomes a
 * stealer; one that is more than the same margin above the average becomes
 * a stealee; domains within the band are neither.
 *
 * Domains without any CPU are excluded from the average and are never
 * marked as stealer or stealee.
 */
static void plan_x_cpdom_migration(struct sys_stat_ctx *c)
{
	struct cpdom_ctx *cpdomc;
	u64 dsq_id;
	u32 avg_nr_q_tasks_per_cpu = 0, nr_q_tasks, x_mig_delta;
	u32 stealer_threshold, stealee_threshold;
	u32 nr_populated = 0;
	int nr;

	/*
	 * Calculate average queued tasks per CPU per compute domain.
	 */
	bpf_for(dsq_id, 0, nr_cpdoms) {
		if (dsq_id >= LAVD_CPDOM_MAX_NR)
			break;

		cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
		if (!cpdomc) {
			scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
			return;
		}

		/*
		 * scx_bpf_dsq_nr_queued() returns a signed value; keep it in a
		 * signed local first so that a negative (error) return is
		 * treated as an empty queue rather than wrapping around to a
		 * huge unsigned count.
		 */
		nr = scx_bpf_dsq_nr_queued(dsq_id);
		nr_q_tasks = nr > 0 ? nr : 0;
		c->nr_queued_task += nr_q_tasks;

		/*
		 * A domain without CPUs has no meaningful per-CPU queue length.
		 * Keep it out of the average and avoid dividing by zero.
		 */
		if (!cpdomc->nr_cpus) {
			cpdomc->nr_q_tasks_per_cpu = 0;
			continue;
		}

		cpdomc->nr_q_tasks_per_cpu = (nr_q_tasks * 1000) / cpdomc->nr_cpus;
		avg_nr_q_tasks_per_cpu += cpdomc->nr_q_tasks_per_cpu;
		nr_populated++;
	}
	if (nr_populated)
		avg_nr_q_tasks_per_cpu /= nr_populated;

	/*
	 * Determine stealer and stealee domains.
	 *
	 * A stealer domain, whose per-CPU queue length is shorter than
	 * the average, will steal a task from any of stealee domain,
	 * whose per-CPU queue length is longer than the average.
	 * Compute domain around average will not do anything.
	 */
	x_mig_delta = avg_nr_q_tasks_per_cpu >> LAVD_CPDOM_MIGRATION_SHIFT;
	stealer_threshold = avg_nr_q_tasks_per_cpu - x_mig_delta;
	stealee_threshold = avg_nr_q_tasks_per_cpu + x_mig_delta;

	bpf_for(dsq_id, 0, nr_cpdoms) {
		if (dsq_id >= LAVD_CPDOM_MAX_NR)
			break;

		cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
		if (!cpdomc) {
			scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
			return;
		}

		/* A domain without CPUs can neither steal nor be stolen from. */
		if (!cpdomc->nr_cpus) {
			WRITE_ONCE(cpdomc->is_stealer, false);
			WRITE_ONCE(cpdomc->is_stealee, false);
			continue;
		}

		if (cpdomc->nr_q_tasks_per_cpu < stealer_threshold) {
			WRITE_ONCE(cpdomc->is_stealer, true);
			WRITE_ONCE(cpdomc->is_stealee, false);
		}
		else if (cpdomc->nr_q_tasks_per_cpu > stealee_threshold) {
			WRITE_ONCE(cpdomc->is_stealer, false);
			WRITE_ONCE(cpdomc->is_stealee, true);
			c->nr_stealee++;
		}
		else {
			WRITE_ONCE(cpdomc->is_stealer, false);
			WRITE_ONCE(cpdomc->is_stealee, false);
		}
	}
}
static void collect_sys_stat(struct sys_stat_ctx *c)
{
u64 dsq_id;
int cpu, nr;
int cpu;
bpf_for(cpu, 0, nr_cpu_ids) {
struct cpu_ctx *cpuc = get_cpu_ctx_id(cpu);
@ -173,12 +230,6 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
c->idle_total += cpuc->idle_total;
cpuc->idle_total = 0;
}
bpf_for(dsq_id, 0, LAVD_CPDOM_MAX_NR) {
nr = scx_bpf_dsq_nr_queued(dsq_id);
if (nr > 0)
c->nr_queued_task += nr;
}
}
static void calc_sys_stat(struct sys_stat_ctx *c)
@ -243,6 +294,8 @@ static void update_sys_stat_next(struct sys_stat_ctx *c)
c->stat_cur->thr_perf_cri; /* will be updated later */
}
stat_next->nr_stealee = c->nr_stealee;
stat_next->nr_violation =
calc_avg32(stat_cur->nr_violation, c->nr_violation);
@ -293,6 +346,7 @@ static void do_update_sys_stat(void)
* Collect and prepare the next version of stat.
*/
init_sys_stat_ctx(&c);
plan_x_cpdom_migration(&c);
collect_sys_stat(&c);
calc_sys_stat(&c);
update_sys_stat_next(&c);

View File

@ -712,6 +712,7 @@ impl<'a> Scheduler<'a> {
let pc_pc = Self::get_pc(st.nr_perf_cri, nr_sched);
let pc_lc = Self::get_pc(st.nr_lat_cri, nr_sched);
let pc_x_migration = Self::get_pc(st.nr_x_migration, nr_sched);
let nr_stealee = st.nr_stealee;
let nr_big = st.nr_big;
let pc_big = Self::get_pc(nr_big, nr_sched);
let pc_pc_on_big = Self::get_pc(st.nr_pc_on_big, nr_big);
@ -732,6 +733,7 @@ impl<'a> Scheduler<'a> {
pc_pc,
pc_lc,
pc_x_migration,
nr_stealee,
pc_big,
pc_pc_on_big,
pc_lc_on_big,

View File

@ -40,6 +40,9 @@ pub struct SysStats {
#[stat(desc = "% of cross domain task migration")]
pub pc_x_migration: f64,
#[stat(desc = "Number of stealee domains")]
pub nr_stealee: u32,
#[stat(desc = "% of tasks scheduled on big cores")]
pub pc_big: f64,
@ -66,7 +69,7 @@ impl SysStats {
pub fn format_header<W: Write>(w: &mut W) -> Result<()> {
writeln!(
w,
"\x1b[93m| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |\x1b[0m",
"\x1b[93m| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |\x1b[0m",
"MSEQ",
"# Q TASK",
"# ACT CPU",
@ -74,6 +77,7 @@ impl SysStats {
"PERF-CR%",
"LAT-CR%",
"X-MIG%",
"# STLEE",
"BIG%",
"PC/BIG%",
"LC/BIG%",
@ -92,7 +96,7 @@ impl SysStats {
writeln!(
w,
"| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |",
"| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |",
self.mseq,
self.nr_queued_task,
self.nr_active,
@ -100,6 +104,7 @@ impl SysStats {
GPoint(self.pc_pc),
GPoint(self.pc_lc),
GPoint(self.pc_x_migration),
self.nr_stealee,
GPoint(self.pc_big),
GPoint(self.pc_pc_on_big),
GPoint(self.pc_lc_on_big),