mirror of https://github.com/sched-ext/scx.git
synced 2024-12-04 16:27:12 +00:00
scx_lavd: Decide load balancing plan across compute domains
The goal of load balancing is to keep the number of queued tasks per CPU roughly equal across compute domains. To this end, we first decide which compute domains are under-utilized (i.e., their per-CPU queue length is below the average) and which are over-utilized (i.e., their per-CPU queue length is above the average). An under-utilized domain is called a stealer domain, and an over-utilized domain a stealee domain.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
This commit is contained in:
parent
ed14a4ca91
commit
7991266773
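To make the plan concrete, below is a minimal user-space sketch of the stealer/stealee decision described in the commit message. It uses made-up per-domain queue lengths and a hypothetical MIGRATION_SHIFT constant standing in for LAVD_CPDOM_MIGRATION_SHIFT; the actual in-kernel logic is plan_x_cpdom_migration in the diff below.

#include <stdbool.h>
#include <stdio.h>

#define MIGRATION_SHIFT 2	/* hypothetical stand-in for LAVD_CPDOM_MIGRATION_SHIFT: 1/2^2 = +/- 25% */

int main(void)
{
	/* made-up queued tasks per CPU, scaled by 1000, for four example domains */
	unsigned int nr_q_tasks_per_cpu[4] = { 500, 1500, 1000, 3000 };
	unsigned int avg = 0, delta, stealer_threshold, stealee_threshold;
	int i;

	for (i = 0; i < 4; i++)
		avg += nr_q_tasks_per_cpu[i];
	avg /= 4;				/* 1500 */

	delta = avg >> MIGRATION_SHIFT;		/* 375, i.e., 25% of the average */
	stealer_threshold = avg - delta;	/* 1125 */
	stealee_threshold = avg + delta;	/* 1875 */

	for (i = 0; i < 4; i++) {
		bool is_stealer = nr_q_tasks_per_cpu[i] < stealer_threshold;
		bool is_stealee = nr_q_tasks_per_cpu[i] > stealee_threshold;

		printf("domain %d: %4u -> %s\n", i, nr_q_tasks_per_cpu[i],
		       is_stealer ? "stealer" : is_stealee ? "stealee" : "balanced");
	}
	return 0;
}

With an average of 1500 (x1000 scale), the +/- 25% band is [1125, 1875]: domains below 1125 become stealers, domains above 1875 become stealees, and everything in between is left alone.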
@@ -78,6 +78,7 @@ struct sys_stat {
	volatile u32 max_perf_cri;	/* maximum performance criticality */
	volatile u32 thr_perf_cri;	/* performance criticality threshold */
	volatile u32 nr_stealee;	/* number of compute domains to be migrated */
	volatile u32 nr_violation;	/* number of utilization violation */
	volatile u32 nr_active;		/* number of active cores */
@@ -51,6 +51,7 @@ enum consts_internal {
					   performance mode when cpu util > 40% */

	LAVD_CPDOM_STARV_NS		= (2 * LAVD_SLICE_MAX_NS_DFL),
	LAVD_CPDOM_MIGRATION_SHIFT	= 2, /* 1/2**2 = +/- 25% */
};

/*
@@ -58,12 +59,15 @@ enum consts_internal {
 * - system > numa node > llc domain > compute domain per core type (P or E)
 */
struct cpdom_ctx {
	u64 last_consume_clk;	/* when the associated DSQ was consumed */
	u64 id;			/* id of this compute domain (== dsq_id) */
	u64 alt_id;		/* id of the closest compute domain of alternative type (== dsq id) */
	u8 node_id;		/* numa domain id */
	u8 is_big;		/* is it a big core or little core? */
	u8 is_active;		/* if this compute domain is active */
	u8 is_stealer;		/* this domain should steal tasks from others */
	u8 is_stealee;		/* stealer domain should steal tasks from this domain */
	u16 nr_cpus;		/* the number of CPUs in this compute domain */
	u32 nr_q_tasks_per_cpu;	/* the number of queued tasks per CPU in this domain (x1000) */
	u8 nr_neighbors[LAVD_CPDOM_MAX_DIST];	/* number of neighbors per distance */
	u64 neighbor_bits[LAVD_CPDOM_MAX_DIST];	/* bitmask of neighbor domains per distance */
	u64 __cpumask[LAVD_CPU_ID_MAX/64];	/* cpumasks belonging to this compute domain */
@@ -1120,7 +1120,6 @@ static bool consume_dsq(s32 cpu, u64 dsq_id, u64 now)
		scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
		return false;
	}
	WRITE_ONCE(cpdomc->last_consume_clk, now);

	/*
	 * Try to consume a task on the associated DSQ.
@@ -1130,57 +1129,12 @@ static bool consume_dsq(s32 cpu, u64 dsq_id, u64 now)
		return false;
	}

static bool consume_starving_task(s32 cpu, struct cpu_ctx *cpuc, u64 now)
{
	struct cpdom_ctx *cpdomc;
	u64 dsq_id = cpuc->cpdom_poll_pos;
	u64 dl;
	bool ret = false;
	int i;

	if (nr_cpdoms == 1)
		return false;

	bpf_for(i, 0, nr_cpdoms) {
		if (i >= LAVD_CPDOM_MAX_NR)
			break;

		dsq_id = (dsq_id + i) % LAVD_CPDOM_MAX_NR;

		if (dsq_id == cpuc->cpdom_id)
			continue;

		cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
		if (!cpdomc) {
			scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
			goto out;
		}

		if (cpdomc->is_active) {
			dl = READ_ONCE(cpdomc->last_consume_clk) + LAVD_CPDOM_STARV_NS;
			if (dl < now) {
				ret = consume_dsq(cpu, dsq_id, now);
			}
			goto out;
		}
	}
out:
	cpuc->cpdom_poll_pos = (dsq_id + 1) % LAVD_CPDOM_MAX_NR;
	return ret;
}

static bool consume_task(s32 cpu, struct cpu_ctx *cpuc, u64 now)
{
	struct cpdom_ctx *cpdomc, *cpdomc_pick;
	u64 dsq_id, nr_nbr;
	s64 nuance;

	/*
	 * If there is a starving DSQ, try to consume it first.
	 */
	if (consume_starving_task(cpu, cpuc, now))
		goto x_domain_migration_out;

	/*
	 * Try to consume from CPU's associated DSQ.
	 */
@@ -1813,8 +1767,6 @@ static s32 init_cpdoms(u64 now)
		if (!cpdomc->is_active)
			continue;

		WRITE_ONCE(cpdomc->last_consume_clk, now);

		/*
		 * Create an associated DSQ on its associated NUMA domain.
		 */
@@ -2032,6 +1984,7 @@ static s32 init_per_cpu_ctx(u64 now)
			}
			cpuc->cpdom_id = cpdomc->id;
			cpuc->cpdom_alt_id = cpdomc->alt_id;
			cpdomc->nr_cpus++;
		}
	}
}
@@ -39,6 +39,7 @@ struct sys_stat_ctx {
	u32	nr_perf_cri;
	u32	nr_lat_cri;
	u32	nr_x_migration;
	u32	nr_stealee;
	u32	nr_big;
	u32	nr_pc_on_big;
	u32	nr_lc_on_big;
@@ -63,10 +64,66 @@ static void init_sys_stat_ctx(struct sys_stat_ctx *c)
	c->stat_next->last_update_clk = c->now;
}

static void plan_x_cpdom_migration(struct sys_stat_ctx *c)
{
	struct cpdom_ctx *cpdomc;
	u64 dsq_id;
	u32 avg_nr_q_tasks_per_cpu = 0, nr_q_tasks, x_mig_delta;
	u32 stealer_threshold, stealee_threshold;

	/*
	 * Calculate the average number of queued tasks per CPU per compute domain.
	 */
	bpf_for(dsq_id, 0, nr_cpdoms) {
		if (dsq_id >= LAVD_CPDOM_MAX_NR)
			break;

		nr_q_tasks = scx_bpf_dsq_nr_queued(dsq_id);
		c->nr_queued_task += nr_q_tasks;

		cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
		cpdomc->nr_q_tasks_per_cpu = (nr_q_tasks * 1000) / cpdomc->nr_cpus;
		avg_nr_q_tasks_per_cpu += cpdomc->nr_q_tasks_per_cpu;
	}
	avg_nr_q_tasks_per_cpu /= nr_cpdoms;

	/*
	 * Determine stealer and stealee domains.
	 *
	 * A stealer domain, whose per-CPU queue length is shorter than
	 * the average, will steal a task from any stealee domain,
	 * whose per-CPU queue length is longer than the average.
	 * Compute domains around the average do nothing.
	 */
	x_mig_delta = avg_nr_q_tasks_per_cpu >> LAVD_CPDOM_MIGRATION_SHIFT;
	stealer_threshold = avg_nr_q_tasks_per_cpu - x_mig_delta;
	stealee_threshold = avg_nr_q_tasks_per_cpu + x_mig_delta;

	bpf_for(dsq_id, 0, nr_cpdoms) {
		if (dsq_id >= LAVD_CPDOM_MAX_NR)
			break;

		cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);

		if (cpdomc->nr_q_tasks_per_cpu < stealer_threshold) {
			WRITE_ONCE(cpdomc->is_stealer, true);
			WRITE_ONCE(cpdomc->is_stealee, false);
		}
		else if (cpdomc->nr_q_tasks_per_cpu > stealee_threshold) {
			WRITE_ONCE(cpdomc->is_stealer, false);
			WRITE_ONCE(cpdomc->is_stealee, true);
			c->nr_stealee++;
		}
		else {
			WRITE_ONCE(cpdomc->is_stealer, false);
			WRITE_ONCE(cpdomc->is_stealee, false);
		}
	}
}

static void collect_sys_stat(struct sys_stat_ctx *c)
{
	u64 dsq_id;
	int cpu, nr;
	int cpu;

	bpf_for(cpu, 0, nr_cpu_ids) {
		struct cpu_ctx *cpuc = get_cpu_ctx_id(cpu);
@@ -173,12 +230,6 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
		c->idle_total += cpuc->idle_total;
		cpuc->idle_total = 0;
	}

	bpf_for(dsq_id, 0, LAVD_CPDOM_MAX_NR) {
		nr = scx_bpf_dsq_nr_queued(dsq_id);
		if (nr > 0)
			c->nr_queued_task += nr;
	}
}

static void calc_sys_stat(struct sys_stat_ctx *c)
@@ -243,6 +294,8 @@ static void update_sys_stat_next(struct sys_stat_ctx *c)
			c->stat_cur->thr_perf_cri; /* will be updated later */
	}

	stat_next->nr_stealee = c->nr_stealee;

	stat_next->nr_violation =
		calc_avg32(stat_cur->nr_violation, c->nr_violation);
@@ -293,6 +346,7 @@ static void do_update_sys_stat(void)
	 * Collect and prepare the next version of stat.
	 */
	init_sys_stat_ctx(&c);
	plan_x_cpdom_migration(&c);
	collect_sys_stat(&c);
	calc_sys_stat(&c);
	update_sys_stat_next(&c);
@@ -712,6 +712,7 @@ impl<'a> Scheduler<'a> {
        let pc_pc = Self::get_pc(st.nr_perf_cri, nr_sched);
        let pc_lc = Self::get_pc(st.nr_lat_cri, nr_sched);
        let pc_x_migration = Self::get_pc(st.nr_x_migration, nr_sched);
        let nr_stealee = st.nr_stealee;
        let nr_big = st.nr_big;
        let pc_big = Self::get_pc(nr_big, nr_sched);
        let pc_pc_on_big = Self::get_pc(st.nr_pc_on_big, nr_big);

@@ -732,6 +733,7 @@ impl<'a> Scheduler<'a> {
            pc_pc,
            pc_lc,
            pc_x_migration,
            nr_stealee,
            pc_big,
            pc_pc_on_big,
            pc_lc_on_big,
@@ -40,6 +40,9 @@ pub struct SysStats {
    #[stat(desc = "% of cross domain task migration")]
    pub pc_x_migration: f64,

    #[stat(desc = "Number of stealee domains")]
    pub nr_stealee: u32,

    #[stat(desc = "% of tasks scheduled on big cores")]
    pub pc_big: f64,
@@ -66,7 +69,7 @@ impl SysStats {
    pub fn format_header<W: Write>(w: &mut W) -> Result<()> {
        writeln!(
            w,
            "\x1b[93m| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |\x1b[0m",
            "\x1b[93m| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |\x1b[0m",
            "MSEQ",
            "# Q TASK",
            "# ACT CPU",

@@ -74,6 +77,7 @@ impl SysStats {
            "PERF-CR%",
            "LAT-CR%",
            "X-MIG%",
            "# STLEE",
            "BIG%",
            "PC/BIG%",
            "LC/BIG%",

@@ -92,7 +96,7 @@ impl SysStats {

        writeln!(
            w,
            "| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |",
            "| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |",
            self.mseq,
            self.nr_queued_task,
            self.nr_active,

@@ -100,6 +104,7 @@ impl SysStats {
            GPoint(self.pc_pc),
            GPoint(self.pc_lc),
            GPoint(self.pc_x_migration),
            self.nr_stealee,
            GPoint(self.pc_big),
            GPoint(self.pc_pc_on_big),
            GPoint(self.pc_lc_on_big),