mirror of
https://github.com/sched-ext/scx.git
synced 2024-11-28 13:40:28 +00:00
scx: Implement solution to infeasible weights problem
As described in [0], there is an open problem in load balancing called the "infeasible weights" problem. Essentially, the problem boils down to the fact that a task with disproportionately high load can be granted more CPU time than they can actually consume per their duty cycle. This patch implements a solution to that problem, wherein we apply the algorithm described in this paper to adjust all infeasible weights in the system down to a feasible wight that gives them their full duty cycle, while allowing the remaining feasible tasks on the system to share the remaining compute capacity on the machine. [0]: https://drive.google.com/file/d/1fAoWUlmW-HTp6akuATVpMxpUpvWcGSAv/view?usp=drive_link Signed-off-by: David Vernet <void@manifault.com>
This commit is contained in:
parent
c574598dc7
commit
e627176d90
@ -19,6 +19,7 @@ log = "0.4.17"
|
||||
ordered-float = "3.4.0"
|
||||
scx_utils = { path = "../../../rust/scx_utils", version = "0.6" }
|
||||
simplelog = "0.12.0"
|
||||
static_assertions = "1.1.0"
|
||||
|
||||
[build-dependencies]
|
||||
scx_utils = { path = "../../../rust/scx_utils", version = "0.6" }
|
||||
|
@ -26,6 +26,12 @@ enum consts {
|
||||
MAX_DOMS = 64, /* limited to avoid complex bitmask ops */
|
||||
CACHELINE_SIZE = 64,
|
||||
|
||||
LB_DEFAULT_WEIGHT = 100,
|
||||
LB_MIN_WEIGHT = 1,
|
||||
LB_MAX_WEIGHT = 10000,
|
||||
LB_LOAD_BUCKETS = 100, /* Must be a factor of LB_MAX_WEIGHT */
|
||||
LB_WEIGHT_PER_BUCKET = LB_MAX_WEIGHT / LB_LOAD_BUCKETS,
|
||||
|
||||
/*
|
||||
* When userspace load balancer is trying to determine the tasks to push
|
||||
* out from an overloaded domain, it looks at the the following number
|
||||
@ -84,14 +90,18 @@ struct task_ctx {
|
||||
struct ravg_data dcyc_rd;
|
||||
};
|
||||
|
||||
struct bucket_ctx {
|
||||
u64 dcycle;
|
||||
struct ravg_data rd;
|
||||
};
|
||||
|
||||
struct dom_ctx {
|
||||
u64 vtime_now;
|
||||
struct bpf_cpumask __kptr *cpumask;
|
||||
struct bpf_cpumask __kptr *direct_greedy_cpumask;
|
||||
|
||||
u64 load;
|
||||
struct ravg_data load_rd;
|
||||
u64 dbg_load_printed_at;
|
||||
u64 dbg_dcycle_printed_at;
|
||||
struct bucket_ctx buckets[LB_LOAD_BUCKETS];
|
||||
};
|
||||
|
||||
#endif /* __INTF_H */
|
||||
|
@ -104,9 +104,9 @@ struct {
|
||||
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||
__type(key, u32);
|
||||
__type(value, struct lock_wrapper);
|
||||
__uint(max_entries, MAX_DOMS);
|
||||
__uint(max_entries, MAX_DOMS * LB_LOAD_BUCKETS);
|
||||
__uint(map_flags, 0);
|
||||
} dom_load_locks SEC(".maps");
|
||||
} dom_dcycle_locks SEC(".maps");
|
||||
|
||||
struct dom_active_pids {
|
||||
u64 gen;
|
||||
@ -119,128 +119,6 @@ struct dom_active_pids dom_active_pids[MAX_DOMS];
|
||||
|
||||
const u64 ravg_1 = 1 << RAVG_FRAC_BITS;
|
||||
|
||||
static void dom_load_adj(u32 dom_id, s64 adj, u64 now)
|
||||
{
|
||||
struct dom_ctx *domc;
|
||||
struct lock_wrapper *lockw;
|
||||
|
||||
domc = bpf_map_lookup_elem(&dom_data, &dom_id);
|
||||
lockw = bpf_map_lookup_elem(&dom_load_locks, &dom_id);
|
||||
|
||||
if (!domc || !lockw) {
|
||||
scx_bpf_error("dom_ctx / lock lookup failed");
|
||||
return;
|
||||
}
|
||||
|
||||
bpf_spin_lock(&lockw->lock);
|
||||
domc->load += adj;
|
||||
ravg_accumulate(&domc->load_rd, domc->load, now, load_half_life);
|
||||
bpf_spin_unlock(&lockw->lock);
|
||||
|
||||
if (adj < 0 && (s64)domc->load < 0)
|
||||
scx_bpf_error("cpu%d dom%u load underflow (load=%lld adj=%lld)",
|
||||
bpf_get_smp_processor_id(), dom_id, domc->load, adj);
|
||||
|
||||
if (debug >=2 &&
|
||||
(!domc->dbg_load_printed_at || now - domc->dbg_load_printed_at >= 1000000000)) {
|
||||
bpf_printk("LOAD ADJ dom=%u adj=%lld load=%llu",
|
||||
dom_id,
|
||||
adj,
|
||||
ravg_read(&domc->load_rd, now, load_half_life) >> RAVG_FRAC_BITS);
|
||||
domc->dbg_load_printed_at = now;
|
||||
}
|
||||
}
|
||||
|
||||
static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc,
|
||||
u32 from_dom_id, u32 to_dom_id, u64 now)
|
||||
{
|
||||
struct dom_ctx *from_domc, *to_domc;
|
||||
struct lock_wrapper *from_lockw, *to_lockw;
|
||||
struct ravg_data task_load_rd;
|
||||
u64 from_load[2], to_load[2], task_load;
|
||||
|
||||
from_domc = bpf_map_lookup_elem(&dom_data, &from_dom_id);
|
||||
from_lockw = bpf_map_lookup_elem(&dom_load_locks, &from_dom_id);
|
||||
to_domc = bpf_map_lookup_elem(&dom_data, &to_dom_id);
|
||||
to_lockw = bpf_map_lookup_elem(&dom_load_locks, &to_dom_id);
|
||||
if (!from_domc || !from_lockw || !to_domc || !to_lockw) {
|
||||
scx_bpf_error("dom_ctx / lock lookup failed");
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* @p is moving from @from_dom_id to @to_dom_id. Its load contribution
|
||||
* should be moved together. We only track duty cycle for tasks. Scale
|
||||
* it by weight to get load_rd.
|
||||
*/
|
||||
ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life);
|
||||
task_load_rd = taskc->dcyc_rd;
|
||||
ravg_scale(&task_load_rd, p->scx.weight, 0);
|
||||
|
||||
if (debug >= 2)
|
||||
task_load = ravg_read(&task_load_rd, now, load_half_life);
|
||||
|
||||
/* transfer out of @from_dom_id */
|
||||
bpf_spin_lock(&from_lockw->lock);
|
||||
if (taskc->runnable)
|
||||
from_domc->load -= p->scx.weight;
|
||||
|
||||
if (debug >= 2)
|
||||
from_load[0] = ravg_read(&from_domc->load_rd, now, load_half_life);
|
||||
|
||||
ravg_transfer(&from_domc->load_rd, from_domc->load,
|
||||
&task_load_rd, taskc->runnable, load_half_life, false);
|
||||
|
||||
if (debug >= 2)
|
||||
from_load[1] = ravg_read(&from_domc->load_rd, now, load_half_life);
|
||||
|
||||
bpf_spin_unlock(&from_lockw->lock);
|
||||
|
||||
/* transfer into @to_dom_id */
|
||||
bpf_spin_lock(&to_lockw->lock);
|
||||
if (taskc->runnable)
|
||||
to_domc->load += p->scx.weight;
|
||||
|
||||
if (debug >= 2)
|
||||
to_load[0] = ravg_read(&to_domc->load_rd, now, load_half_life);
|
||||
|
||||
ravg_transfer(&to_domc->load_rd, to_domc->load,
|
||||
&task_load_rd, taskc->runnable, load_half_life, true);
|
||||
|
||||
if (debug >= 2)
|
||||
to_load[1] = ravg_read(&to_domc->load_rd, now, load_half_life);
|
||||
|
||||
bpf_spin_unlock(&to_lockw->lock);
|
||||
|
||||
if (debug >= 2)
|
||||
bpf_printk("XFER dom%u->%u task=%lu from=%lu->%lu to=%lu->%lu",
|
||||
from_dom_id, to_dom_id,
|
||||
task_load >> RAVG_FRAC_BITS,
|
||||
from_load[0] >> RAVG_FRAC_BITS,
|
||||
from_load[1] >> RAVG_FRAC_BITS,
|
||||
to_load[0] >> RAVG_FRAC_BITS,
|
||||
to_load[1] >> RAVG_FRAC_BITS);
|
||||
}
|
||||
|
||||
/*
|
||||
* Statistics
|
||||
*/
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
|
||||
__uint(key_size, sizeof(u32));
|
||||
__uint(value_size, sizeof(u64));
|
||||
__uint(max_entries, RUSTY_NR_STATS);
|
||||
} stats SEC(".maps");
|
||||
|
||||
static inline void stat_add(enum stat_idx idx, u64 addend)
|
||||
{
|
||||
u32 idx_v = idx;
|
||||
|
||||
u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v);
|
||||
if (cnt_p)
|
||||
(*cnt_p) += addend;
|
||||
}
|
||||
|
||||
/* Map pid -> task_ctx */
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
@ -263,6 +141,183 @@ struct task_ctx *lookup_task_ctx(struct task_struct *p)
|
||||
}
|
||||
}
|
||||
|
||||
static inline u32 weight_to_bucket_idx(u32 weight)
|
||||
{
|
||||
/* Weight is calculated linearly, and is within range of [1, 10000] */
|
||||
return weight * LB_LOAD_BUCKETS / LB_MAX_WEIGHT;
|
||||
}
|
||||
|
||||
static void task_load_adj(struct task_struct *p, struct task_ctx *taskc,
|
||||
u64 now, bool runnable)
|
||||
{
|
||||
taskc->runnable = runnable;
|
||||
ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life);
|
||||
}
|
||||
|
||||
static struct bucket_ctx *lookup_dom_bucket(struct dom_ctx *dom_ctx,
|
||||
u32 weight, u32 *bucket_id)
|
||||
{
|
||||
u32 idx = weight_to_bucket_idx(weight);
|
||||
struct bucket_ctx *bucket;
|
||||
|
||||
*bucket_id = idx;
|
||||
bucket = MEMBER_VPTR(dom_ctx->buckets, [idx]);
|
||||
if (bucket)
|
||||
return bucket;
|
||||
|
||||
scx_bpf_error("Failed to lookup dom bucket");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct lock_wrapper *lookup_dom_lock(u32 dom_id, u32 weight)
|
||||
{
|
||||
u32 idx = dom_id * LB_LOAD_BUCKETS + weight_to_bucket_idx(weight);
|
||||
struct lock_wrapper *lockw;
|
||||
|
||||
lockw = bpf_map_lookup_elem(&dom_dcycle_locks, &idx);
|
||||
if (lockw)
|
||||
return lockw;
|
||||
|
||||
scx_bpf_error("Failed to lookup dom lock");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void dom_dcycle_adj(u32 dom_id, u32 weight, u64 now, bool runnable)
|
||||
{
|
||||
struct dom_ctx *domc;
|
||||
struct bucket_ctx *bucket;
|
||||
struct lock_wrapper *lockw;
|
||||
s64 adj = runnable ? 1 : -1;
|
||||
u32 bucket_idx = 0;
|
||||
|
||||
domc = bpf_map_lookup_elem(&dom_data, &dom_id);
|
||||
if (!domc) {
|
||||
scx_bpf_error("Failed to lookup dom_ctx");
|
||||
return;
|
||||
}
|
||||
|
||||
bucket = lookup_dom_bucket(domc, weight, &bucket_idx);
|
||||
lockw = lookup_dom_lock(dom_id, weight);
|
||||
|
||||
if (!bucket || !lockw)
|
||||
return;
|
||||
|
||||
bpf_spin_lock(&lockw->lock);
|
||||
bucket->dcycle += adj;
|
||||
ravg_accumulate(&bucket->rd, bucket->dcycle, now, load_half_life);
|
||||
bpf_spin_unlock(&lockw->lock);
|
||||
|
||||
if (adj < 0 && (s64)bucket->dcycle < 0)
|
||||
scx_bpf_error("cpu%d dom%u bucket%u load underflow (dcycle=%lld adj=%lld)",
|
||||
bpf_get_smp_processor_id(), dom_id, bucket_idx,
|
||||
bucket->dcycle, adj);
|
||||
|
||||
if (debug >=2 &&
|
||||
(!domc->dbg_dcycle_printed_at || now - domc->dbg_dcycle_printed_at >= 1000000000)) {
|
||||
bpf_printk("DCYCLE ADJ dom=%u bucket=%u adj=%lld dcycle=%u avg_dcycle=%llu",
|
||||
dom_id, bucket_idx, adj, bucket->dcycle,
|
||||
ravg_read(&bucket->rd, now, load_half_life) >> RAVG_FRAC_BITS);
|
||||
domc->dbg_dcycle_printed_at = now;
|
||||
}
|
||||
}
|
||||
|
||||
static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc,
|
||||
u32 from_dom_id, u32 to_dom_id, u64 now)
|
||||
{
|
||||
struct bucket_ctx *from_bucket, *to_bucket;
|
||||
u32 idx = 0, weight = taskc->weight;
|
||||
struct dom_ctx *from_domc, *to_domc;
|
||||
struct lock_wrapper *from_lockw, *to_lockw;
|
||||
struct ravg_data task_dcyc_rd;
|
||||
u64 from_dcycle[2], to_dcycle[2], task_dcycle;
|
||||
|
||||
from_domc = bpf_map_lookup_elem(&dom_data, &from_dom_id);
|
||||
from_lockw = lookup_dom_lock(from_dom_id, weight);
|
||||
to_domc = bpf_map_lookup_elem(&dom_data, &to_dom_id);
|
||||
to_lockw = lookup_dom_lock(to_dom_id, weight);
|
||||
if (!from_domc || !from_lockw || !to_domc || !to_lockw) {
|
||||
scx_bpf_error("dom_ctx / lock lookup failed");
|
||||
return;
|
||||
}
|
||||
|
||||
from_bucket = lookup_dom_bucket(from_domc, weight, &idx);
|
||||
to_bucket = lookup_dom_bucket(to_domc, weight, &idx);
|
||||
if (!from_bucket || !to_bucket)
|
||||
return;
|
||||
|
||||
/*
|
||||
* @p is moving from @from_dom_id to @to_dom_id. Its duty cycle
|
||||
* contribution in the relevant bucket of @from_dom_id should be moved
|
||||
* together to the corresponding bucket in @to_dom_id. We only track
|
||||
* duty cycle from BPF. Load is computed in user space when performing
|
||||
* load balancing.
|
||||
*/
|
||||
ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life);
|
||||
task_dcyc_rd = taskc->dcyc_rd;
|
||||
if (debug >= 2)
|
||||
task_dcycle = ravg_read(&task_dcyc_rd, now, load_half_life);
|
||||
|
||||
/* transfer out of @from_dom_id */
|
||||
bpf_spin_lock(&from_lockw->lock);
|
||||
if (taskc->runnable)
|
||||
from_bucket->dcycle--;
|
||||
|
||||
if (debug >= 2)
|
||||
from_dcycle[0] = ravg_read(&from_bucket->rd, now, load_half_life);
|
||||
|
||||
ravg_transfer(&from_bucket->rd, from_bucket->dcycle,
|
||||
&task_dcyc_rd, taskc->runnable, load_half_life, false);
|
||||
|
||||
if (debug >= 2)
|
||||
from_dcycle[1] = ravg_read(&from_bucket->rd, now, load_half_life);
|
||||
|
||||
bpf_spin_unlock(&from_lockw->lock);
|
||||
|
||||
/* transfer into @to_dom_id */
|
||||
bpf_spin_lock(&to_lockw->lock);
|
||||
if (taskc->runnable)
|
||||
to_bucket->dcycle++;
|
||||
|
||||
if (debug >= 2)
|
||||
to_dcycle[0] = ravg_read(&to_bucket->rd, now, load_half_life);
|
||||
|
||||
ravg_transfer(&to_bucket->rd, to_bucket->dcycle,
|
||||
&task_dcyc_rd, taskc->runnable, load_half_life, true);
|
||||
|
||||
if (debug >= 2)
|
||||
to_dcycle[1] = ravg_read(&to_bucket->rd, now, load_half_life);
|
||||
|
||||
bpf_spin_unlock(&to_lockw->lock);
|
||||
|
||||
if (debug >= 2)
|
||||
bpf_printk("XFER dom%u->%u task=%lu from=%lu->%lu to=%lu->%lu",
|
||||
from_dom_id, to_dom_id,
|
||||
task_dcycle >> RAVG_FRAC_BITS,
|
||||
from_dcycle[0] >> RAVG_FRAC_BITS,
|
||||
from_dcycle[1] >> RAVG_FRAC_BITS,
|
||||
to_dcycle[0] >> RAVG_FRAC_BITS,
|
||||
to_dcycle[1] >> RAVG_FRAC_BITS);
|
||||
}
|
||||
|
||||
/*
|
||||
* Statistics
|
||||
*/
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
|
||||
__uint(key_size, sizeof(u32));
|
||||
__uint(value_size, sizeof(u64));
|
||||
__uint(max_entries, RUSTY_NR_STATS);
|
||||
} stats SEC(".maps");
|
||||
|
||||
static inline void stat_add(enum stat_idx idx, u64 addend)
|
||||
{
|
||||
u32 idx_v = idx;
|
||||
|
||||
u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v);
|
||||
if (cnt_p)
|
||||
(*cnt_p) += addend;
|
||||
}
|
||||
|
||||
/*
|
||||
* This is populated from userspace to indicate which pids should be reassigned
|
||||
* to new doms.
|
||||
@ -788,11 +843,10 @@ void BPF_STRUCT_OPS(rusty_runnable, struct task_struct *p, u64 enq_flags)
|
||||
if (!(taskc = lookup_task_ctx(p)))
|
||||
return;
|
||||
|
||||
taskc->runnable = true;
|
||||
taskc->is_kworker = p->flags & PF_WQ_WORKER;
|
||||
|
||||
ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life);
|
||||
dom_load_adj(taskc->dom_id, p->scx.weight, now);
|
||||
task_load_adj(p, taskc, now, true);
|
||||
dom_dcycle_adj(taskc->dom_id, taskc->weight, now, true);
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(rusty_running, struct task_struct *p)
|
||||
@ -875,19 +929,22 @@ void BPF_STRUCT_OPS(rusty_quiescent, struct task_struct *p, u64 deq_flags)
|
||||
if (!(taskc = lookup_task_ctx(p)))
|
||||
return;
|
||||
|
||||
taskc->runnable = false;
|
||||
|
||||
ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life);
|
||||
dom_load_adj(taskc->dom_id, -(s64)p->scx.weight, now);
|
||||
task_load_adj(p, taskc, now, false);
|
||||
dom_dcycle_adj(taskc->dom_id, taskc->weight, now, false);
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(rusty_set_weight, struct task_struct *p, u32 weight)
|
||||
{
|
||||
struct task_ctx *taskc;
|
||||
u64 now = bpf_ktime_get_ns();
|
||||
|
||||
if (!(taskc = lookup_task_ctx(p)))
|
||||
return;
|
||||
|
||||
if (debug >= 2)
|
||||
bpf_printk("%s[%d]: SET_WEIGHT %u -> %u", p->comm, p->pid,
|
||||
taskc->weight, weight);
|
||||
|
||||
taskc->weight = weight;
|
||||
}
|
||||
|
||||
@ -970,6 +1027,9 @@ s32 BPF_STRUCT_OPS(rusty_init_task, struct task_struct *p,
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (debug >= 2)
|
||||
bpf_printk("%s[%d]: INIT (weight %u))", p->comm, p->pid, p->scx.weight);
|
||||
|
||||
/*
|
||||
* Read the entry from the map immediately so we can add the cpumask
|
||||
* with bpf_kptr_xchg().
|
||||
@ -1019,26 +1079,25 @@ void BPF_STRUCT_OPS(rusty_exit_task, struct task_struct *p,
|
||||
|
||||
static s32 create_dom(u32 dom_id)
|
||||
{
|
||||
struct dom_ctx domc_init = {}, *domc;
|
||||
struct dom_ctx *domc;
|
||||
struct bpf_cpumask *cpumask;
|
||||
u32 cpu;
|
||||
s32 ret;
|
||||
|
||||
if (dom_id >= MAX_DOMS) {
|
||||
scx_bpf_error("Max dom ID %u exceeded (%u)", MAX_DOMS, dom_id);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ret = scx_bpf_create_dsq(dom_id, -1);
|
||||
if (ret < 0) {
|
||||
scx_bpf_error("Failed to create dsq %u (%d)", dom_id, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = bpf_map_update_elem(&dom_data, &dom_id, &domc_init, 0);
|
||||
if (ret) {
|
||||
scx_bpf_error("Failed to add dom_ctx entry %u (%d)", dom_id, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
domc = bpf_map_lookup_elem(&dom_data, &dom_id);
|
||||
if (!domc) {
|
||||
/* Should never happen, we just inserted it above. */
|
||||
/* Should never happen, it's created statically at load time. */
|
||||
scx_bpf_error("No dom%u", dom_id);
|
||||
return -ENOENT;
|
||||
}
|
||||
|
@ -6,6 +6,9 @@ mod bpf_skel;
|
||||
pub use bpf_skel::*;
|
||||
pub mod bpf_intf;
|
||||
|
||||
#[macro_use]
|
||||
extern crate static_assertions;
|
||||
|
||||
use std::cell::Cell;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::BTreeSet;
|
||||
@ -409,6 +412,7 @@ struct Tuner {
|
||||
kick_greedy_under: f64,
|
||||
proc_reader: procfs::ProcReader,
|
||||
prev_cpu_stats: BTreeMap<u32, procfs::CpuStat>,
|
||||
lb_apply_weight: bool,
|
||||
dom_utils: Vec<f64>,
|
||||
}
|
||||
|
||||
@ -425,6 +429,7 @@ impl Tuner {
|
||||
proc_reader,
|
||||
prev_cpu_stats,
|
||||
dom_utils: vec![0.0; top.nr_doms],
|
||||
lb_apply_weight: false,
|
||||
top,
|
||||
})
|
||||
}
|
||||
@ -435,10 +440,10 @@ impl Tuner {
|
||||
.read_stat()?
|
||||
.cpus_map
|
||||
.ok_or_else(|| anyhow!("Expected cpus_map to exist"))?;
|
||||
let ti = &mut skel.bss_mut().tune_input;
|
||||
let mut dom_nr_cpus = vec![0; self.top.nr_doms];
|
||||
let mut dom_util_sum = vec![0.0; self.top.nr_doms];
|
||||
|
||||
let mut avg_util = 0.0f64;
|
||||
for cpu in 0..self.top.nr_cpus {
|
||||
let cpu32 = cpu as u32;
|
||||
// None domain indicates the CPU was offline during
|
||||
@ -449,11 +454,16 @@ impl Tuner {
|
||||
curr_cpu_stats.get(&cpu32),
|
||||
self.prev_cpu_stats.get(&cpu32),
|
||||
) {
|
||||
let util = calc_util(curr, prev)?;
|
||||
dom_nr_cpus[dom] += 1;
|
||||
dom_util_sum[dom] += calc_util(curr, prev)?;
|
||||
dom_util_sum[dom] += util;
|
||||
avg_util += util;
|
||||
}
|
||||
}
|
||||
avg_util /= self.top.nr_cpus as f64;
|
||||
self.lb_apply_weight = approx_ge(avg_util, 0.99999);
|
||||
|
||||
let ti = &mut skel.bss_mut().tune_input;
|
||||
for dom in 0..self.top.nr_doms {
|
||||
// Calculate the domain avg util. If there are no active CPUs,
|
||||
// it doesn't really matter. Go with 0.0 as that's less likely
|
||||
@ -496,6 +506,14 @@ impl Tuner {
|
||||
}
|
||||
}
|
||||
|
||||
fn approx_eq(a: f64, b: f64) -> bool {
|
||||
(a - b).abs() <= 0.0001f64
|
||||
}
|
||||
|
||||
fn approx_ge(a: f64, b: f64) -> bool {
|
||||
a > b || approx_eq(a, b)
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct TaskInfo {
|
||||
pid: i32,
|
||||
@ -509,6 +527,9 @@ struct LoadBalancer<'a, 'b, 'c> {
|
||||
top: Arc<Topology>,
|
||||
skip_kworkers: bool,
|
||||
|
||||
lb_apply_weight: bool,
|
||||
infeas_threshold: f64,
|
||||
|
||||
tasks_by_load: Vec<Option<BTreeMap<OrderedFloat<f64>, TaskInfo>>>,
|
||||
load_avg: f64,
|
||||
dom_loads: Vec<f64>,
|
||||
@ -520,9 +541,14 @@ struct LoadBalancer<'a, 'b, 'c> {
|
||||
nr_lb_data_errors: &'c mut u64,
|
||||
}
|
||||
|
||||
// Verify that the number of buckets is a factor of the maximum weight to
|
||||
// ensure that the range of weight can be split evenly amongst every bucket.
|
||||
const_assert_eq!(bpf_intf::consts_LB_MAX_WEIGHT % bpf_intf::consts_LB_LOAD_BUCKETS, 0);
|
||||
|
||||
|
||||
impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> {
|
||||
// If imbalance gets higher than this ratio, try to balance the loads.
|
||||
const LOAD_IMBAL_HIGH_RATIO: f64 = 0.10;
|
||||
const LOAD_IMBAL_HIGH_RATIO: f64 = 0.05;
|
||||
|
||||
// Aim to transfer this fraction of the imbalance on each round. We want
|
||||
// to be gradual to avoid unnecessary oscillations. While this can delay
|
||||
@ -543,12 +569,16 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> {
|
||||
skel: &'a mut BpfSkel<'b>,
|
||||
top: Arc<Topology>,
|
||||
skip_kworkers: bool,
|
||||
lb_apply_weight: &bool,
|
||||
nr_lb_data_errors: &'c mut u64,
|
||||
) -> Self {
|
||||
Self {
|
||||
skel,
|
||||
skip_kworkers,
|
||||
|
||||
lb_apply_weight: lb_apply_weight.clone(),
|
||||
infeas_threshold: bpf_intf::consts_LB_MAX_WEIGHT as f64,
|
||||
|
||||
tasks_by_load: (0..top.nr_doms).map(|_| None).collect(),
|
||||
load_avg: 0f64,
|
||||
dom_loads: vec![0.0; top.nr_doms],
|
||||
@ -563,46 +593,263 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> {
|
||||
}
|
||||
}
|
||||
|
||||
fn bucket_range(&self, bucket: u64) -> (f64, f64) {
|
||||
const MAX_WEIGHT: u64 = bpf_intf::consts_LB_MAX_WEIGHT as u64;
|
||||
const NUM_BUCKETS: u64 = bpf_intf::consts_LB_LOAD_BUCKETS as u64;
|
||||
const WEIGHT_PER_BUCKET: u64 = MAX_WEIGHT / NUM_BUCKETS;
|
||||
|
||||
if bucket >= NUM_BUCKETS {
|
||||
panic!("Invalid bucket {}, max {}", bucket, NUM_BUCKETS);
|
||||
}
|
||||
|
||||
// w_x = [1 + (10000 * x) / N, 10000 * (x + 1) / N]
|
||||
let min_w = 1 + (MAX_WEIGHT * bucket) / NUM_BUCKETS;
|
||||
let max_w = min_w + WEIGHT_PER_BUCKET - 1;
|
||||
|
||||
(min_w as f64, max_w as f64)
|
||||
}
|
||||
|
||||
fn bucket_weight(&self, bucket: u64) -> f64 {
|
||||
const WEIGHT_PER_BUCKET: f64 = bpf_intf::consts_LB_WEIGHT_PER_BUCKET as f64;
|
||||
let (min_weight, _) = self.bucket_range(bucket);
|
||||
|
||||
// Use the mid-point of the bucket when determining weight
|
||||
min_weight + (WEIGHT_PER_BUCKET / 2.0f64)
|
||||
}
|
||||
|
||||
fn apply_infeas_threshold(&mut self,
|
||||
doms_dcycles_buckets: &[f64],
|
||||
infeas_thrsh: f64) {
|
||||
const NUM_BUCKETS: u64 = bpf_intf::consts_LB_LOAD_BUCKETS as u64;
|
||||
|
||||
self.infeas_threshold = infeas_thrsh;
|
||||
let mut global_load_sum = 0.0f64;
|
||||
for dom in 0..self.top.nr_doms {
|
||||
let dom_offset = (dom as u64) * NUM_BUCKETS;
|
||||
let mut dom_load_sum = 0.0f64;
|
||||
for i in 0..NUM_BUCKETS {
|
||||
let weight = self.bucket_weight(i).min(self.infeas_threshold);
|
||||
let dcycle = doms_dcycles_buckets[(dom_offset + i) as usize];
|
||||
|
||||
dom_load_sum += dcycle * weight;
|
||||
}
|
||||
self.dom_loads[dom] = dom_load_sum;
|
||||
global_load_sum += dom_load_sum;
|
||||
}
|
||||
|
||||
self.load_avg = global_load_sum / self.top.nr_doms as f64;
|
||||
}
|
||||
|
||||
fn adjust_infeas_weights(&mut self,
|
||||
bucket_dcycles: &[f64],
|
||||
doms_dcycle_buckets: &[f64],
|
||||
global_load_sum: f64) -> Result<()> {
|
||||
// At this point we have the following data points:
|
||||
//
|
||||
// P : The number of cores on the system
|
||||
// L : The total load sum of the system before any adjustments for
|
||||
// infeasibility
|
||||
// Lf: The load sum of all feasible tasks
|
||||
// D : The total sum of duty cycles across all domains in the system
|
||||
// Di: The duty cycle sum of all infeasible tasks
|
||||
//
|
||||
// We need to find a weight lambda_x such that every infeasible task in
|
||||
// the system will be granted a CPU allocation equal to their duty
|
||||
// cycle, and all the remaining compute capacity in the system will be
|
||||
// divided fairly amongst the feasible tasks according to their load.
|
||||
// Our goal is to find a value lambda_x such that every infeasible task
|
||||
// is allocated its duty cycle, and the remaining compute capacity is
|
||||
// shared fairly amongst the feasible tasks on the system.
|
||||
//
|
||||
// If L' is the load sum on the system after clamping all weights
|
||||
// w_x > lambda_x to lambda_x, then lambda_x can be defined as follows:
|
||||
//
|
||||
// lambda_x = L' / P
|
||||
//
|
||||
// => L' = lambda_x * Di + Lf
|
||||
// => lambda_x * P' = lambda_x * Di + Lf
|
||||
// => lambda_x (P' - D_I) = Lf
|
||||
// => lambda_x = Lf / (P' - Di)
|
||||
//
|
||||
// Thus, need to iterate over different values of x (i.e. over buckets)
|
||||
// until we find a lambda_x such that:
|
||||
//
|
||||
// w_x >= lambda_x >= w_x+1
|
||||
//
|
||||
// Once we find a lambda_x, we need to:
|
||||
//
|
||||
// 1. Adjust the maximum weights of any w_x > lambda_x -> lambda_x
|
||||
// 2. Subtract (w_i - lambda_x) from the load sums that the buckets were
|
||||
// contributing to
|
||||
// 3. Re-calculate the per-domain load, and the global load average.
|
||||
//
|
||||
// Note that we should always find a lambda_x at this point, as we
|
||||
// verified in the caller that there is at least one infeasible bucket
|
||||
// in the system.
|
||||
//
|
||||
// All of this is described and proven in detail in the following pdf:
|
||||
//
|
||||
// https://drive.google.com/file/d/1fAoWUlmW-HTp6akuATVpMxpUpvWcGSAv
|
||||
const NUM_BUCKETS: u64 = bpf_intf::consts_LB_LOAD_BUCKETS as u64;
|
||||
let p = self.top.nr_cpus as f64;
|
||||
let mut curr_dcycle_sum = 0.0f64;
|
||||
let mut curr_load_sum = global_load_sum;
|
||||
let mut lambda_x = curr_load_sum / p;
|
||||
|
||||
for bucket in (0..NUM_BUCKETS).filter(|bucket| !approx_eq(bucket_dcycles[*bucket as usize], 0f64)).rev() {
|
||||
let weight = self.bucket_weight(bucket);
|
||||
let dcycles = bucket_dcycles[bucket as usize];
|
||||
|
||||
if approx_ge(lambda_x, weight) {
|
||||
self.apply_infeas_threshold(doms_dcycle_buckets, lambda_x);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
curr_dcycle_sum += dcycles;
|
||||
curr_load_sum -= weight * dcycles;
|
||||
lambda_x = curr_load_sum / (p - curr_dcycle_sum);
|
||||
}
|
||||
|
||||
// We can fail to find an infeasible weight if the host is
|
||||
// under-utilized. In this case, just fall back to using weights. If
|
||||
// this is happening due to a stale system-wide util value due to the
|
||||
// tuner not having run recently enough, it is a condition that should
|
||||
// self-correct soon. If it is the result of the user configuring us to
|
||||
// use weights even when the system is under-utilized, they were warned
|
||||
// when the scheduler was launched.
|
||||
self.load_avg = global_load_sum / self.top.nr_doms as f64;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn read_dom_loads(&mut self) -> Result<()> {
|
||||
let now_mono = now_monotonic();
|
||||
let load_half_life = self.skel.rodata().load_half_life;
|
||||
let maps = self.skel.maps();
|
||||
let dom_data = maps.dom_data();
|
||||
let mut load_sum = 0.0f64;
|
||||
const NUM_BUCKETS: u64 = bpf_intf::consts_LB_LOAD_BUCKETS as u64;
|
||||
|
||||
for i in 0..self.top.nr_doms {
|
||||
let key = unsafe { std::mem::transmute::<u32, [u8; 4]>(i as u32) };
|
||||
// Sum of dcycle and load for each bucket, aggregated across domains.
|
||||
let mut global_bucket_dcycle = vec![0.0f64; NUM_BUCKETS as usize];
|
||||
|
||||
// Global dcycle and load sums.
|
||||
let mut global_dcycle_sum = 0.0f64;
|
||||
let mut global_load_sum = 0.0f64;
|
||||
|
||||
// dcycle values stored in every bucket. Recorded here so we don't have
|
||||
// to do another ravg read later when testing and adjusting for
|
||||
// infeasibility.
|
||||
let mut doms_dcycle_buckets = vec![0.064; NUM_BUCKETS as usize * self.top.nr_doms];
|
||||
|
||||
// Sum of dcycle for each domain. Used if we're going to do load
|
||||
// balancing based on just dcycle to avoid having to do two iterations.
|
||||
let mut doms_dcycle_sums = vec![0.064; self.top.nr_doms];
|
||||
|
||||
// Track maximum weight so we can test for infeasibility below.
|
||||
let mut max_weight = 0.0f64;
|
||||
|
||||
// Accumulate dcycle and load across all domains and buckets. If we're
|
||||
// under-utilized, or there are no infeasible weights, this is
|
||||
// sufficient to collect all of the data we need for load balancing.
|
||||
for dom in 0..self.top.nr_doms {
|
||||
let dom_key = unsafe { std::mem::transmute::<u32, [u8; 4]>(dom as u32) };
|
||||
|
||||
let dom_offset = dom as u64 * NUM_BUCKETS;
|
||||
let mut dom_dcycle_sum = 0.0f64;
|
||||
let mut dom_load_sum = 0.0f64;
|
||||
|
||||
if let Some(dom_ctx_map_elem) = dom_data
|
||||
.lookup(&key, libbpf_rs::MapFlags::ANY)
|
||||
.lookup(&dom_key, libbpf_rs::MapFlags::ANY)
|
||||
.context("Failed to lookup dom_ctx")?
|
||||
{
|
||||
let dom_ctx =
|
||||
unsafe { &*(dom_ctx_map_elem.as_slice().as_ptr() as *const bpf_intf::dom_ctx) };
|
||||
|
||||
let rd = &dom_ctx.load_rd;
|
||||
self.dom_loads[i] = ravg_read(
|
||||
rd.val,
|
||||
rd.val_at,
|
||||
rd.old,
|
||||
rd.cur,
|
||||
now_mono,
|
||||
load_half_life,
|
||||
RAVG_FRAC_BITS,
|
||||
);
|
||||
for bucket in 0..NUM_BUCKETS {
|
||||
let bucket_ctx = dom_ctx.buckets[bucket as usize];
|
||||
let rd = &bucket_ctx.rd;
|
||||
let duty_cycle = ravg_read(
|
||||
rd.val,
|
||||
rd.val_at,
|
||||
rd.old,
|
||||
rd.cur,
|
||||
now_mono,
|
||||
load_half_life,
|
||||
RAVG_FRAC_BITS,
|
||||
);
|
||||
|
||||
load_sum += self.dom_loads[i];
|
||||
if approx_eq(0.0, duty_cycle) {
|
||||
continue;
|
||||
}
|
||||
|
||||
dom_dcycle_sum += duty_cycle;
|
||||
global_bucket_dcycle[bucket as usize] += duty_cycle;
|
||||
doms_dcycle_buckets[(dom_offset + bucket) as usize] = duty_cycle;
|
||||
|
||||
let weight = self.bucket_weight(bucket);
|
||||
let load = weight * duty_cycle;
|
||||
dom_load_sum += load;
|
||||
if weight > max_weight {
|
||||
max_weight = weight;
|
||||
}
|
||||
}
|
||||
|
||||
global_dcycle_sum += dom_dcycle_sum;
|
||||
doms_dcycle_sums[dom] = dom_dcycle_sum;
|
||||
|
||||
global_load_sum += dom_load_sum;
|
||||
self.dom_loads[dom] = dom_load_sum;
|
||||
}
|
||||
}
|
||||
|
||||
self.load_avg = load_sum / self.top.nr_doms as f64;
|
||||
if !self.lb_apply_weight {
|
||||
// System is under-utilized, so just use dcycle instead of load.
|
||||
self.load_avg = global_dcycle_sum / self.top.nr_doms as f64;
|
||||
self.dom_loads = doms_dcycle_sums;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// If the sum of duty cycle on the system is >= P, any weight w_x of a
|
||||
// task that exceeds L / P is guaranteed to be infeasible. Furthermore,
|
||||
// if any weight w_x == L / P then we know that task t_x can get its
|
||||
// full duty cycle, as:
|
||||
//
|
||||
// c_x = P * (w_x * d_x) / L
|
||||
// = P * (L/P * d_x) / L
|
||||
// = d_x / L / L
|
||||
// = d_x
|
||||
//
|
||||
// If there is no bucket whose weight exceeds L / P that has a nonzero
|
||||
// duty cycle, then all weights are feasible and we can use the data we
|
||||
// collected above without having to adjust for infeasibility.
|
||||
// Otherwise, we have at least one infeasible weight.
|
||||
//
|
||||
// See the function header for adjust_infeas_weights() for a more
|
||||
// comprehensive description of the algorithm for adjusting for
|
||||
// infeasible weights.
|
||||
let infeasible_thresh = global_load_sum / self.top.nr_cpus as f64;
|
||||
if approx_ge(max_weight, infeasible_thresh) {
|
||||
debug!("max_weight={} infeasible_threshold= {}",
|
||||
max_weight, infeasible_thresh);
|
||||
return self.adjust_infeas_weights(&global_bucket_dcycle,
|
||||
&doms_dcycle_buckets,
|
||||
global_load_sum);
|
||||
}
|
||||
|
||||
self.load_avg = global_load_sum / self.top.nr_doms as f64;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// To balance dom loads, identify doms with lower and higher load than
|
||||
/// average.
|
||||
fn calculate_dom_load_balance(&mut self) -> Result<()> {
|
||||
let mode = if self.lb_apply_weight {
|
||||
"weighted"
|
||||
} else {
|
||||
"dcycle"
|
||||
};
|
||||
|
||||
debug!("mode= {} load_avg= {:.2} infeasible_thresh= {:.2}",
|
||||
mode, self.load_avg, self.infeas_threshold);
|
||||
for (dom, dom_load) in self.dom_loads.iter().enumerate() {
|
||||
let imbal = dom_load - self.load_avg;
|
||||
if imbal.abs() >= self.load_avg * Self::LOAD_IMBAL_HIGH_RATIO {
|
||||
@ -658,13 +905,14 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> {
|
||||
if let Some(task_data_elem) = task_data.lookup(&key, libbpf_rs::MapFlags::ANY)? {
|
||||
let task_ctx =
|
||||
unsafe { &*(task_data_elem.as_slice().as_ptr() as *const bpf_intf::task_ctx) };
|
||||
|
||||
if task_ctx.dom_id != dom {
|
||||
continue;
|
||||
}
|
||||
|
||||
let weight = (task_ctx.weight as f64).min(self.infeas_threshold);
|
||||
|
||||
let rd = &task_ctx.dcyc_rd;
|
||||
let load = task_ctx.weight as f64
|
||||
let load = weight
|
||||
* ravg_read(
|
||||
rd.val,
|
||||
rd.val_at,
|
||||
@ -1136,7 +1384,7 @@ impl<'a> Scheduler<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
fn lb_step(&mut self) -> Result<()> {
|
||||
fn lb_step(&mut self, lb_apply_weight: bool) -> Result<()> {
|
||||
let started_at = Instant::now();
|
||||
let bpf_stats = self.read_bpf_stats()?;
|
||||
let cpu_busy = self.get_cpu_busy()?;
|
||||
@ -1145,6 +1393,7 @@ impl<'a> Scheduler<'a> {
|
||||
&mut self.skel,
|
||||
self.top.clone(),
|
||||
self.balanced_kworkers,
|
||||
&lb_apply_weight,
|
||||
&mut self.nr_lb_data_errors,
|
||||
);
|
||||
|
||||
@ -1187,9 +1436,10 @@ impl<'a> Scheduler<'a> {
|
||||
next_tune_at = now + self.tune_interval;
|
||||
}
|
||||
}
|
||||
let lb_apply_weight = self.tuner.lb_apply_weight;
|
||||
|
||||
if now >= next_sched_at {
|
||||
self.lb_step()?;
|
||||
self.lb_step(lb_apply_weight)?;
|
||||
next_sched_at += self.sched_interval;
|
||||
if next_sched_at < now {
|
||||
next_sched_at = now + self.sched_interval;
|
||||
|
Loading…
Reference in New Issue
Block a user