Mirror of https://github.com/sched-ext/scx.git, synced 2024-11-24 20:00:22 +00:00

Merge pull request #178 from sched-ext/multi_numa_rusty

rusty: Implement NUMA-aware load balancing

Commit 91cb5ce8ab
@@ -207,7 +207,14 @@ impl Cpumask {

 impl fmt::Display for Cpumask {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "{}:<{}>", self.nr_cpus, self.mask)
+        let slice = self.as_raw_slice();
+        let mut remaining_width = self.nr_cpus + 2;
+        write!(f, "{:#0width$b}", slice[0], width = remaining_width.min(66))?;
+        for submask in &slice[1..] {
+            remaining_width -= 64;
+            write!(f, "{:0width$b}", submask, width = remaining_width.min(64))?;
+        }
+        Ok(())
     }
 }
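The new Display impl renders the mask as zero-padded binary, one u64 word at a time, with a "0b" prefix on the first word only (hence the 66-character cap: 64 bits plus the two prefix characters). A minimal standalone sketch of the width arithmetic, using a hypothetical 4-CPU mask:

    fn main() {
        // Mirror the formatting above for nr_cpus = 4 and a raw slice of
        // [0b0101]: "#" adds the "0b" prefix, "0" zero-pads, and width
        // counts the prefix plus one binary digit per CPU.
        let nr_cpus = 4;
        let word: u64 = 0b0101;
        let width = (nr_cpus + 2).min(66);
        println!("{:#0width$b}", word, width = width); // prints "0b0101"
    }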
@@ -181,7 +181,7 @@ impl Node {

 #[derive(Debug)]
 pub struct Topology {
     nodes: Vec<Node>,
-    cores: BTreeMap<usize, Core>,
+    cores: Vec<Core>,
     cpus: BTreeMap<usize, Cpu>,
     nr_cpus: usize,
     span: Cpumask,
@@ -199,14 +199,12 @@ impl Topology {
         // system. We clone the objects that are located further down in the
         // hierarchy rather than dealing with references, as the entire
         // Topology is read-only anyway.
-        let mut cores = BTreeMap::new();
+        let mut cores = Vec::new();
         let mut cpus = BTreeMap::new();
         for node in nodes.iter() {
-            for (_, llc) in node.llcs.iter() {
-                for (core_id, core) in llc.cores.iter() {
-                    if let Some(_) = cores.insert(*core_id, core.clone()) {
-                        bail!("Found duplicate core ID {}", core_id);
-                    }
+            for llc in node.llcs.values() {
+                for core in llc.cores.values() {
+                    cores.push(core.clone());
                     for (cpu_id, cpu) in core.cpus.iter() {
                         if let Some(_) = cpus.insert(*cpu_id, cpu.clone()) {
                             bail!("Found duplicate CPU ID {}", cpu_id);
@@ -219,13 +217,13 @@ impl Topology {
         Ok(Topology { nodes, nr_cpus, cores, cpus, span })
     }

-    /// Get a slice of the NUMA nodes on the host
+    /// Get a slice of the NUMA nodes on the host.
     pub fn nodes(&self) -> &[Node] {
         &self.nodes
     }

-    /// Get a hashmap of <core ID, Core> for all Cores on the host.
-    pub fn cores(&self) -> &BTreeMap<usize, Core> {
+    /// Get a slice of all Cores on the host.
+    pub fn cores(&self) -> &[Core] {
         &self.cores
     }
@@ -296,7 +296,7 @@ impl<'a> Scheduler<'a> {
         let mut idle_cpu_count = 0;

         // Count the number of cores where all the CPUs are idle.
-        for (_, core) in self.topo.cores().iter() {
+        for core in self.topo.cores().iter() {
             let mut all_idle = true;
             for (cpu_id, _) in core.cpus().iter() {
                 if self.bpf.get_cpu_pid(*cpu_id as i32) != 0 {
@@ -669,14 +669,14 @@ impl<'a> Scheduler<'a> {
             Err(_) => -1,
         };
         info!("Running tasks:");
-        for (core_id, core) in self.topo.cores().iter() {
+        for core in self.topo.cores().iter() {
             for (cpu_id, _) in core.cpus().iter() {
                 let pid = if *cpu_id as i32 == sched_cpu {
                     "[self]".to_string()
                 } else {
                     self.bpf.get_cpu_pid(*cpu_id as i32).to_string()
                 };
-                info!(" core {:2} cpu {:2} pid={}", core_id, cpu_id, pid);
+                info!(" core {:2} cpu {:2} pid={}", core.id(), cpu_id, pid);
             }
         }
@@ -17,6 +17,7 @@ log = "0.4.17"
 ordered-float = "3.4.0"
 scx_utils = { path = "../../../rust/scx_utils", version = "0.6" }
 simplelog = "0.12.0"
+sorted-vec = "0.8.3"
 static_assertions = "1.1.0"

 [build-dependencies]
@@ -24,6 +24,7 @@ typedef unsigned long long u64;
 enum consts {
	MAX_CPUS = 512,
	MAX_DOMS = 64,			/* limited to avoid complex bitmask ops */
+	MAX_NUMA_NODES = MAX_DOMS,	/* Assume at least 1 domain per NUMA node */
	CACHELINE_SIZE = 64,

	LB_DEFAULT_WEIGHT = 100,
@@ -54,7 +55,8 @@ enum stat_idx {
	RUSTY_STAT_DIRECT_GREEDY,
	RUSTY_STAT_DIRECT_GREEDY_FAR,
	RUSTY_STAT_DSQ_DISPATCH,
-	RUSTY_STAT_GREEDY,
+	RUSTY_STAT_GREEDY_LOCAL,
+	RUSTY_STAT_GREEDY_XNUMA,

	/* Extra stats that don't contribute to total */
	RUSTY_STAT_REPATRIATE,
@@ -72,6 +74,7 @@ struct task_ctx {
	u64 dom_mask;

	struct bpf_cpumask __kptr *cpumask;
+	struct bpf_cpumask __kptr *tmp_cpumask;
	u32 dom_id;
	u32 weight;
	bool runnable;
@@ -99,9 +102,15 @@ struct dom_ctx {
	u64 vtime_now;
	struct bpf_cpumask __kptr *cpumask;
	struct bpf_cpumask __kptr *direct_greedy_cpumask;
+	struct bpf_cpumask __kptr *node_cpumask;
+	u32 node_id;

	u64 dbg_dcycle_printed_at;
	struct bucket_ctx buckets[LB_LOAD_BUCKETS];
 };

+struct node_ctx {
+	struct bpf_cpumask __kptr *cpumask;
+};
+
 #endif /* __INTF_H */
@@ -59,15 +59,20 @@ struct user_exit_info uei;
  * Domains and cpus
  */
 const volatile u32 nr_doms = 32;	/* !0 for veristat, set during init */
+const volatile u32 nr_nodes = 32;	/* !0 for veristat, set during init */
 const volatile u32 nr_cpus = 64;	/* !0 for veristat, set during init */
 const volatile u32 cpu_dom_id_map[MAX_CPUS];
+const volatile u32 dom_numa_id_map[MAX_DOMS];
 const volatile u64 dom_cpumasks[MAX_DOMS][MAX_CPUS / 64];
+const volatile u64 numa_cpumasks[MAX_NUMA_NODES][MAX_CPUS / 64];
 const volatile u32 load_half_life = 1000000000	/* 1s */;

 const volatile bool kthreads_local;
 const volatile bool fifo_sched;
 const volatile bool switch_partial;
+const volatile bool direct_greedy_numa;
 const volatile u32 greedy_threshold;
+const volatile u32 greedy_threshold_x_numa;
 const volatile u32 debug;

 /* base slice duration */
@@ -78,13 +83,27 @@ const volatile u64 slice_ns = SCX_SLICE_DFL;
  */
 struct pcpu_ctx {
	u32 dom_rr_cur;	/* used when scanning other doms */
+	u32 dom_id;
+	u32 nr_node_doms;
+	u32 node_doms[MAX_DOMS];

	/* libbpf-rs does not respect the alignment, so pad out the struct explicitly */
-	u8 _padding[CACHELINE_SIZE - sizeof(u32)];
+	u8 _padding[CACHELINE_SIZE - ((3 + MAX_DOMS) * sizeof(u32) % CACHELINE_SIZE)];
 } __attribute__((aligned(CACHELINE_SIZE)));

 struct pcpu_ctx pcpu_ctx[MAX_CPUS];

+/*
+ * Numa node context
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, u32);
+	__type(value, struct node_ctx);
+	__uint(max_entries, MAX_NUMA_NODES);
+	__uint(map_flags, 0);
+} node_data SEC(".maps");
+
 /*
  * Domain context
  */
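A quick sanity check on the new padding expression, as a sketch (the constants mirror the enum in intf.h): the three scalar u32 fields plus the 64-entry node_doms array occupy 268 bytes, so 52 bytes of padding round the struct up to a multiple of the 64-byte cacheline.

    // Sketch verifying the _padding size above, assuming MAX_DOMS = 64
    // and CACHELINE_SIZE = 64 as defined in intf.h.
    const MAX_DOMS: usize = 64;
    const CACHELINE_SIZE: usize = 64;
    // dom_rr_cur + dom_id + nr_node_doms + node_doms[MAX_DOMS]
    const PAYLOAD: usize = (3 + MAX_DOMS) * std::mem::size_of::<u32>();
    const PADDING: usize = CACHELINE_SIZE - (PAYLOAD % CACHELINE_SIZE);

    fn main() {
        assert_eq!(PAYLOAD, 268);
        assert_eq!(PADDING, 52);
        assert_eq!((PAYLOAD + PADDING) % CACHELINE_SIZE, 0);
    }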
@@ -467,7 +486,7 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,
 {
	const struct cpumask *idle_smtmask = scx_bpf_get_idle_smtmask();
	struct task_ctx *taskc;
-	struct bpf_cpumask *p_cpumask;
+	struct bpf_cpumask *p_cpumask, *tmp_cpumask = NULL;
	bool prev_domestic, has_idle_cores;
	s32 cpu;
@@ -599,19 +618,51 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,

	/*
	 * Domestic domain is fully booked. If there are CPUs which are idle and
-	 * under-utilized, ignore domain boundaries and push the task there. Try
-	 * to find an idle core first.
+	 * under-utilized, ignore domain boundaries (while still respecting NUMA
+	 * boundaries) and push the task there. Try to find an idle core first.
	 */
	if (taskc->all_cpus && direct_greedy_cpumask &&
	    !bpf_cpumask_empty((const struct cpumask *)direct_greedy_cpumask)) {
+		u32 dom_id = cpu_to_dom_id(prev_cpu);
+		struct dom_ctx *domc;
+		struct bpf_cpumask *tmp_direct_greedy, *node_mask;
+
+		if (!(domc = bpf_map_lookup_elem(&dom_data, &dom_id))) {
+			scx_bpf_error("Failed to lookup dom[%u]", dom_id);
+			goto enoent;
+		}
+
+		tmp_direct_greedy = direct_greedy_cpumask;
+		if (!tmp_direct_greedy) {
+			scx_bpf_error("Failed to lookup direct_greedy mask");
+			goto enoent;
+		}
+		/*
+		 * By default, only look for an idle core in the current NUMA
+		 * node when looking for direct greedy CPUs outside of the
+		 * current domain. Stealing work temporarily is fine when
+		 * you're going across domain boundaries, but it may be less
+		 * desirable when crossing NUMA boundaries as the task's
+		 * working set may end up spanning multiple NUMA nodes.
+		 */
+		if (!direct_greedy_numa) {
+			node_mask = domc->node_cpumask;
+			if (!node_mask) {
+				scx_bpf_error("Failed to lookup node mask");
+				goto enoent;
+			}
+
+			tmp_cpumask = bpf_kptr_xchg(&taskc->tmp_cpumask, NULL);
+			if (!tmp_cpumask) {
+				scx_bpf_error("Failed to lookup tmp cpumask");
+				goto enoent;
+			}
+			bpf_cpumask_and(tmp_cpumask,
+					(const struct cpumask *)node_mask,
+					(const struct cpumask *)tmp_direct_greedy);
+			tmp_direct_greedy = tmp_cpumask;
+		}
+
		/* Try to find an idle core in the previous and then any domain */
		if (has_idle_cores) {
			if (domc->direct_greedy_cpumask) {
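The policy described in the comment above boils down to a mask intersection. A hedged sketch of the idea (the mask values are hypothetical, and plain u64 bitmasks stand in for bpf_cpumask operations):

    // When crossing NUMA nodes is not allowed, candidate CPUs are the
    // intersection of the direct-greedy set and the local node's CPUs.
    fn narrow_to_node(direct_greedy: u64, node: u64, cross_numa_ok: bool) -> u64 {
        if cross_numa_ok { direct_greedy } else { direct_greedy & node }
    }

    fn main() {
        let node0 = 0x0000_00ff;  // CPUs 0-7, hypothetical layout
        let greedy = 0x0000_0f0f; // idle, under-utilized CPUs
        assert_eq!(narrow_to_node(greedy, node0, false), 0x0000_000f);
    }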
@@ -626,7 +677,7 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,

		if (direct_greedy_cpumask) {
			cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)
-						    direct_greedy_cpumask,
+						    tmp_direct_greedy,
						    SCX_PICK_IDLE_CORE);
			if (cpu >= 0) {
				stat_add(RUSTY_STAT_DIRECT_GREEDY_FAR, 1);
@@ -649,7 +700,7 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,

		if (direct_greedy_cpumask) {
			cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)
-						    direct_greedy_cpumask, 0);
+						    tmp_direct_greedy, 0);
			if (cpu >= 0) {
				stat_add(RUSTY_STAT_DIRECT_GREEDY_FAR, 1);
				goto direct;
|
||||
else
|
||||
cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0);
|
||||
|
||||
if (tmp_cpumask) {
|
||||
tmp_cpumask = bpf_kptr_xchg(&taskc->tmp_cpumask, tmp_cpumask);
|
||||
if (tmp_cpumask)
|
||||
bpf_cpumask_release(tmp_cpumask);
|
||||
}
|
||||
scx_bpf_put_idle_cpumask(idle_smtmask);
|
||||
return cpu;
|
||||
|
||||
direct:
|
||||
if (tmp_cpumask) {
|
||||
tmp_cpumask = bpf_kptr_xchg(&taskc->tmp_cpumask, tmp_cpumask);
|
||||
if (tmp_cpumask)
|
||||
bpf_cpumask_release(tmp_cpumask);
|
||||
}
|
||||
taskc->dispatch_local = true;
|
||||
scx_bpf_put_idle_cpumask(idle_smtmask);
|
||||
return cpu;
|
||||
@@ -797,24 +858,43 @@ static bool cpumask_intersects_domain(const struct cpumask *cpumask, u32 dom_id)
 static u32 dom_rr_next(s32 cpu)
 {
	struct pcpu_ctx *pcpuc;
-	u32 dom_id;
+	u32 idx, *dom_id;

	pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]);
-	if (!pcpuc)
+	if (!pcpuc || !pcpuc->nr_node_doms)
		return 0;

-	dom_id = (pcpuc->dom_rr_cur + 1) % nr_doms;
+	idx = (pcpuc->dom_rr_cur + 1) % pcpuc->nr_node_doms;
+	dom_id = MEMBER_VPTR(pcpuc->node_doms, [idx]);
+	if (!dom_id) {
+		scx_bpf_error("Failed to lookup dom for %d", cpu);
+		return 0;
+	}

-	if (dom_id == cpu_to_dom_id(cpu))
-		dom_id = (dom_id + 1) % nr_doms;
+	if (*dom_id == cpu_to_dom_id(cpu))
+		scx_bpf_error("%d found current dom in node_doms array", cpu);

-	pcpuc->dom_rr_cur = dom_id;
-	return dom_id;
+	pcpuc->dom_rr_cur++;
+	return *dom_id;
 }

+u32 dom_node_id(u32 dom_id)
+{
+	const volatile u32 *nid_ptr;
+
+	nid_ptr = MEMBER_VPTR(dom_numa_id_map, [dom_id]);
+	if (!nid_ptr) {
+		scx_bpf_error("Couldn't look up node ID for %u", dom_id);
+		return 0;
+	}
+	return *nid_ptr;
+}
+
 void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev)
 {
	u32 dom = cpu_to_dom_id(cpu);
+	struct pcpu_ctx *pcpuc;
+	u32 node_doms, my_node, i;

	if (scx_bpf_consume(dom)) {
		stat_add(RUSTY_STAT_DSQ_DISPATCH, 1);
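In userspace terms, dom_rr_next() now rotates only through the same-node remote domains recorded in node_doms, advancing a per-CPU cursor. A hedged Rust sketch of that rotation (PcpuCtx and the domain IDs here are illustrative):

    struct PcpuCtx {
        dom_rr_cur: u32,
        node_doms: Vec<u32>, // remote domain IDs on this CPU's NUMA node
    }

    fn dom_rr_next(pcpuc: &mut PcpuCtx) -> Option<u32> {
        if pcpuc.node_doms.is_empty() {
            return None;
        }
        let idx = ((pcpuc.dom_rr_cur + 1) as usize) % pcpuc.node_doms.len();
        pcpuc.dom_rr_cur += 1;
        Some(pcpuc.node_doms[idx])
    }

    fn main() {
        let mut ctx = PcpuCtx { dom_rr_cur: 0, node_doms: vec![2, 5, 7] };
        // Cycles 5, 7, 2, 5, ... and never yields the CPU's own domain,
        // because the home domain is excluded from node_doms at init time.
        assert_eq!(dom_rr_next(&mut ctx), Some(5));
        assert_eq!(dom_rr_next(&mut ctx), Some(7));
        assert_eq!(dom_rr_next(&mut ctx), Some(2));
    }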
@@ -824,13 +904,35 @@ void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev)
	if (!greedy_threshold)
		return;

-	bpf_repeat(nr_doms - 1) {
-		u32 dom_id = dom_rr_next(cpu);
+	pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]);
+	if (!pcpuc) {
+		scx_bpf_error("Failed to get PCPU context");
+		return;
+	}
+	node_doms = pcpuc->nr_node_doms;

-		if (scx_bpf_dsq_nr_queued(dom_id) >= greedy_threshold &&
-		    scx_bpf_consume(dom_id)) {
-			stat_add(RUSTY_STAT_GREEDY, 1);
-			break;
+	/* try to steal a task from domains on the current NUMA node */
+	bpf_for(i, 0, node_doms) {
+		dom = (pcpuc->dom_rr_cur + 1 + i) % node_doms;
+		if (scx_bpf_consume(dom)) {
+			stat_add(RUSTY_STAT_GREEDY_LOCAL, 1);
+			return;
+		}
+	}
+
+	if (!greedy_threshold_x_numa || nr_nodes == 1)
+		return;
+
+	/* try to steal a task from domains on other NUMA nodes */
+	my_node = dom_node_id(pcpuc->dom_id);
+	bpf_repeat(nr_doms - 1) {
+		dom = (pcpuc->dom_rr_cur + 1) % nr_doms;
+		pcpuc->dom_rr_cur++;
+		if (dom_node_id(dom) != my_node &&
+		    scx_bpf_dsq_nr_queued(dom) >= greedy_threshold_x_numa &&
+		    scx_bpf_consume(dom)) {
+			stat_add(RUSTY_STAT_GREEDY_XNUMA, 1);
+			return;
+		}
	}
 }
@@ -1053,6 +1155,18 @@ s32 BPF_STRUCT_OPS(rusty_init_task, struct task_struct *p,
		return -EINVAL;
	}

+	cpumask = bpf_cpumask_create();
+	if (!cpumask) {
+		scx_bpf_error("Failed to create BPF cpumask for task");
+		return -ENOMEM;
+	}
+	cpumask = bpf_kptr_xchg(&map_value->tmp_cpumask, cpumask);
+	if (cpumask) {
+		scx_bpf_error("%s[%d] tmp_cpumask already present", p->comm, p->pid);
+		bpf_cpumask_release(cpumask);
+		return -EEXIST;
+	}
+
	task_pick_and_set_domain(map_value, p, p->cpus_ptr, true);

	return 0;
@@ -1077,11 +1191,53 @@ void BPF_STRUCT_OPS(rusty_exit_task, struct task_struct *p,
	}
 }

+static s32 create_node(u32 node_id)
+{
+	u32 cpu;
+	struct bpf_cpumask *cpumask;
+	struct node_ctx *nodec;
+
+	nodec = bpf_map_lookup_elem(&node_data, &node_id);
+	if (!nodec) {
+		/* Should never happen, it's created statically at load time. */
+		scx_bpf_error("No node%u", node_id);
+		return -ENOENT;
+	}
+
+	cpumask = bpf_cpumask_create();
+	if (!cpumask)
+		return -ENOMEM;
+
+	for (cpu = 0; cpu < MAX_CPUS; cpu++) {
+		const volatile u64 *nmask;
+
+		nmask = MEMBER_VPTR(numa_cpumasks, [node_id][cpu / 64]);
+		if (!nmask) {
+			scx_bpf_error("array index error");
+			bpf_cpumask_release(cpumask);
+			return -ENOENT;
+		}
+
+		if (*nmask & (1LLU << (cpu % 64)))
+			bpf_cpumask_set_cpu(cpu, cpumask);
+	}
+
+	cpumask = bpf_kptr_xchg(&nodec->cpumask, cpumask);
+	if (cpumask) {
+		scx_bpf_error("Node %u cpumask already present", node_id);
+		bpf_cpumask_release(cpumask);
+		return -EEXIST;
+	}
+
+	return 0;
+}
+
 static s32 create_dom(u32 dom_id)
 {
	struct dom_ctx *domc;
-	struct bpf_cpumask *cpumask;
-	u32 cpu;
+	struct node_ctx *nodec;
+	struct bpf_cpumask *cpumask, *node_mask;
+	u32 cpu, node_id;
	s32 ret;

	if (dom_id >= MAX_DOMS) {
|
||||
dom_id);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
cpumask = bpf_kptr_xchg(&domc->direct_greedy_cpumask, cpumask);
|
||||
if (cpumask) {
|
||||
scx_bpf_error("Domain %u direct_greedy_cpumask already present",
|
||||
@@ -1150,6 +1305,99 @@ static s32 create_dom(u32 dom_id)
		return -EEXIST;
	}

+	node_id = dom_node_id(dom_id);
+	nodec = bpf_map_lookup_elem(&node_data, &node_id);
+	if (!nodec) {
+		/* Should never happen, it's created statically at load time. */
+		scx_bpf_error("No node%u", node_id);
+		return -ENOENT;
+	}
+	bpf_rcu_read_lock();
+	node_mask = nodec->cpumask;
+	if (!node_mask) {
+		bpf_rcu_read_unlock();
+		scx_bpf_error("NUMA %u mask not found for domain %u",
+			      node_id, dom_id);
+		return -ENOENT;
+	}
+	cpumask = bpf_cpumask_create();
+	if (!cpumask) {
+		bpf_rcu_read_unlock();
+		scx_bpf_error("Failed to create BPF cpumask for domain %u",
+			      dom_id);
+		return -ENOMEM;
+	}
+	bpf_cpumask_copy(cpumask, (const struct cpumask *)node_mask);
+	bpf_rcu_read_unlock();
+	cpumask = bpf_kptr_xchg(&domc->node_cpumask, cpumask);
+	if (cpumask) {
+		scx_bpf_error("Domain %u node_cpumask already present",
+			      dom_id);
+		bpf_cpumask_release(cpumask);
+		return -EEXIST;
+	}
+
	return 0;
 }

+static s32 initialize_cpu(s32 cpu)
+{
+	struct bpf_cpumask *cpumask;
+	struct dom_ctx *domc;
+	int i, j = 0;
+	struct pcpu_ctx *pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]);
+	u32 *dom_nodes;
+
+	if (!pcpuc) {
+		scx_bpf_error("Failed to lookup pcpu ctx %d", cpu);
+		return -ENOENT;
+	}
+
+	pcpuc->dom_rr_cur = cpu;
+	bpf_for(i, 0, nr_doms) {
+		domc = bpf_map_lookup_elem(&dom_data, &i);
+		if (!domc) {
+			scx_bpf_error("Failed to lookup dom_ctx");
+			return -ENOENT;
+		}
+		bpf_rcu_read_lock();
+		cpumask = domc->node_cpumask;
+		if (!cpumask) {
+			bpf_rcu_read_unlock();
+			scx_bpf_error("Failed to lookup dom node cpumask");
+			return -ENOENT;
+		}
+
+		if (bpf_cpumask_test_cpu(cpu, (const struct cpumask *)cpumask)) {
+			cpumask = domc->cpumask;
+			if (!cpumask) {
+				bpf_rcu_read_unlock();
+				scx_bpf_error("Failed to lookup dom cpumask");
+				return -ENOENT;
+			}
+			/*
+			 * Only record the remote domains in this array, as
+			 * we'll only ever consume from them on the greedy
+			 * threshold path.
+			 */
+			if (!bpf_cpumask_test_cpu(cpu,
+						  (const struct cpumask *)cpumask)) {
+				dom_nodes = MEMBER_VPTR(pcpuc->node_doms, [j]);
+				if (!dom_nodes) {
+					bpf_rcu_read_unlock();
+					scx_bpf_error("Failed to lookup doms ptr");
+					return -EINVAL;
+				}
+				*dom_nodes = i;
+				j++;
+			} else {
+				pcpuc->dom_id = i;
+			}
+		}
+		bpf_rcu_read_unlock();
+	}
+	pcpuc->nr_node_doms = j;
+
+	return 0;
+}
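Per the comment in initialize_cpu(), each CPU records its home domain separately from the remote same-node domains it may later steal from. A hedged sketch of that classification (the domain masks are hypothetical):

    // For one CPU, split the domains on its NUMA node into the home
    // domain (the one containing the CPU) and remote steal candidates.
    fn classify(cpu: usize, node_doms: &[(u32, u64)]) -> (Option<u32>, Vec<u32>) {
        let bit = 1u64 << cpu;
        let mut home = None;
        let mut remote = Vec::new();
        for &(dom_id, dom_mask) in node_doms {
            if dom_mask & bit != 0 {
                home = Some(dom_id); // CPU belongs to this domain
            } else {
                remote.push(dom_id); // candidate on the greedy path
            }
        }
        (home, remote)
    }

    fn main() {
        // Hypothetical node with two 4-CPU domains.
        let doms = [(0, 0x0fu64), (1, 0xf0u64)];
        assert_eq!(classify(2, &doms), (Some(0), vec![1]));
        assert_eq!(classify(6, &doms), (Some(1), vec![0]));
    }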
@@ -1182,14 +1430,22 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init)
	if (!switch_partial)
		scx_bpf_switch_all();

+	bpf_for(i, 0, nr_nodes) {
+		ret = create_node(i);
+		if (ret)
+			return ret;
+	}
	bpf_for(i, 0, nr_doms) {
		ret = create_dom(i);
		if (ret)
			return ret;
	}

-	bpf_for(i, 0, nr_cpus)
-		pcpu_ctx[i].dom_rr_cur = i;
+	bpf_for(i, 0, nr_cpus) {
+		ret = initialize_cpu(i);
+		if (ret)
+			return ret;
+	}

	return 0;
 }
scheds/rust/scx_rusty/src/domain.rs (new file, 112 lines):
// Copyright (c) Meta Platforms, Inc. and affiliates.

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
use std::collections::BTreeMap;
use std::sync::Arc;

use anyhow::Result;

use scx_utils::Cpumask;
use scx_utils::Topology;

#[derive(Clone, Debug)]
pub struct Domain {
    id: usize,
    mask: Cpumask,
}

impl Domain {
    /// Get the Domain's ID.
    pub fn id(&self) -> usize {
        self.id
    }

    /// Get a copy of the domain's cpumask.
    pub fn mask(&self) -> Cpumask {
        self.mask.clone()
    }

    /// Get a raw slice of the domain's cpumask as a set of one or more u64
    /// variables whose bits represent CPUs in the mask.
    pub fn mask_slice(&self) -> &[u64] {
        self.mask.as_raw_slice()
    }

    /// The number of CPUs in the domain.
    pub fn weight(&self) -> usize {
        self.mask.len()
    }
}

#[derive(Debug)]
pub struct DomainGroup {
    doms: BTreeMap<usize, Domain>,
    cpu_dom_map: BTreeMap<usize, usize>,
    dom_numa_map: BTreeMap<usize, usize>,
    num_numa_nodes: usize,
}

impl DomainGroup {
    pub fn new(top: Arc<Topology>, cpumasks: &[String]) -> Result<Self> {
        let mut dom_numa_map = BTreeMap::new();
        let (doms, num_numa_nodes) = if !cpumasks.is_empty() {
            // Explicit cpumasks override the topology: every domain is
            // treated as belonging to a single logical node.
            let mut doms: BTreeMap<usize, Domain> = BTreeMap::new();
            let mut id = 0;
            for mask_str in cpumasks.iter() {
                let mask = Cpumask::from_str(&mask_str)?;
                doms.insert(id, Domain { id, mask });
                dom_numa_map.insert(id, 0);
                id += 1;
            }
            (doms, 1)
        } else {
            // Default: one domain per LLC, grouped by NUMA node.
            let mut doms: BTreeMap<usize, Domain> = BTreeMap::new();
            for (node_id, node) in top.nodes().iter().enumerate() {
                for (id, llc) in node.llcs().iter() {
                    let mask = llc.span();
                    doms.insert(*id, Domain { id: *id, mask });
                    dom_numa_map.insert(*id, node_id);
                }
            }
            (doms, top.nodes().len())
        };

        let mut cpu_dom_map = BTreeMap::new();
        for (id, dom) in doms.iter() {
            for cpu in dom.mask.clone().into_iter() {
                cpu_dom_map.insert(cpu, *id);
            }
        }

        Ok(Self { doms, cpu_dom_map, dom_numa_map, num_numa_nodes })
    }

    pub fn numa_doms(&self, numa_id: &usize) -> Vec<Domain> {
        let mut numa_doms = Vec::new();
        for (d_id, n_id) in self.dom_numa_map.iter() {
            if n_id == numa_id {
                let dom = self.doms.get(d_id).unwrap();
                numa_doms.push(dom.clone());
            }
        }

        numa_doms
    }

    pub fn nr_doms(&self) -> usize {
        self.doms.len()
    }

    pub fn nr_nodes(&self) -> usize {
        self.num_numa_nodes
    }

    pub fn cpu_dom_id(&self, cpu: &usize) -> Option<usize> {
        self.cpu_dom_map.get(cpu).copied()
    }

    pub fn dom_numa_id(&self, dom_id: &usize) -> Option<usize> {
        self.dom_numa_map.get(dom_id).copied()
    }
}
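A hedged usage sketch of DomainGroup; Topology::new() stands in for however the caller obtains the host topology from scx_utils, so treat that call as an assumption:

    // Hypothetical caller inside scx_rusty.
    use std::sync::Arc;
    use anyhow::Result;
    use scx_utils::Topology;

    fn example() -> Result<()> {
        let top = Arc::new(Topology::new()?);
        // No explicit cpumasks: one domain per LLC, grouped by NUMA node.
        let doms = DomainGroup::new(top.clone(), &[])?;
        println!("{} domains across {} NUMA nodes", doms.nr_doms(), doms.nr_nodes());
        for dom in doms.numa_doms(&0) {
            println!("dom {} has {} CPUs", dom.id(), dom.weight());
        }
        Ok(())
    }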
scheds/rust/scx_rusty/src/load_balance.rs (new file, 1097 lines): diff suppressed because it is too large.
scheds/rust/scx_rusty/src/tuner.rs (new file, 183 lines):
// Copyright (c) Meta Platforms, Inc. and affiliates.

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
use std::collections::BTreeMap;
use std::sync::Arc;

use crate::sub_or_zero;
use crate::BpfSkel;
use crate::DomainGroup;

use ::fb_procfs as procfs;
use anyhow::anyhow;
use anyhow::bail;
use anyhow::Result;

use scx_utils::Topology;

fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result<f64> {
    match (curr, prev) {
        (
            procfs::CpuStat {
                user_usec: Some(curr_user),
                nice_usec: Some(curr_nice),
                system_usec: Some(curr_system),
                idle_usec: Some(curr_idle),
                iowait_usec: Some(curr_iowait),
                irq_usec: Some(curr_irq),
                softirq_usec: Some(curr_softirq),
                stolen_usec: Some(curr_stolen),
                ..
            },
            procfs::CpuStat {
                user_usec: Some(prev_user),
                nice_usec: Some(prev_nice),
                system_usec: Some(prev_system),
                idle_usec: Some(prev_idle),
                iowait_usec: Some(prev_iowait),
                irq_usec: Some(prev_irq),
                softirq_usec: Some(prev_softirq),
                stolen_usec: Some(prev_stolen),
                ..
            },
        ) => {
            let idle_usec = sub_or_zero(curr_idle, prev_idle);
            let iowait_usec = sub_or_zero(curr_iowait, prev_iowait);
            let user_usec = sub_or_zero(curr_user, prev_user);
            let system_usec = sub_or_zero(curr_system, prev_system);
            let nice_usec = sub_or_zero(curr_nice, prev_nice);
            let irq_usec = sub_or_zero(curr_irq, prev_irq);
            let softirq_usec = sub_or_zero(curr_softirq, prev_softirq);
            let stolen_usec = sub_or_zero(curr_stolen, prev_stolen);

            let busy_usec =
                user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec;
            let total_usec = idle_usec + busy_usec + iowait_usec;
            if total_usec > 0 {
                Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0))
            } else {
                Ok(1.0)
            }
        }
        _ => {
            bail!("Missing stats in cpustat");
        }
    }
}

pub struct Tuner {
    top: Arc<Topology>,
    dom_group: Arc<DomainGroup>,
    direct_greedy_under: f64,
    kick_greedy_under: f64,
    proc_reader: procfs::ProcReader,
    prev_cpu_stats: BTreeMap<u32, procfs::CpuStat>,
    pub fully_utilized: bool,
    dom_utils: Vec<f64>,
}

impl Tuner {
    pub fn new(top: Arc<Topology>,
               dom_group: Arc<DomainGroup>,
               direct_greedy_under: f64,
               kick_greedy_under: f64) -> Result<Self> {
        let proc_reader = procfs::ProcReader::new();
        let prev_cpu_stats = proc_reader
            .read_stat()?
            .cpus_map
            .ok_or_else(|| anyhow!("Expected cpus_map to exist"))?;

        Ok(Self {
            direct_greedy_under: direct_greedy_under / 100.0,
            kick_greedy_under: kick_greedy_under / 100.0,
            proc_reader,
            prev_cpu_stats,
            dom_utils: vec![0.0; dom_group.nr_doms()],
            fully_utilized: false,
            top,
            dom_group,
        })
    }

    pub fn dom_util(&self, dom: usize) -> f64 {
        self.dom_utils[dom]
    }

    /// Apply a step in the Tuner by:
    ///
    /// 1. Recording CPU stats from procfs
    /// 2. Calculating current per-domain and host-wide utilization
    /// 3. Updating direct_greedy_under and kick_greedy_under cpumasks
    ///    according to the observed utilization
    pub fn step(&mut self, skel: &mut BpfSkel) -> Result<()> {
        let curr_cpu_stats = self
            .proc_reader
            .read_stat()?
            .cpus_map
            .ok_or_else(|| anyhow!("Expected cpus_map to exist"))?;
        let mut dom_nr_cpus = vec![0; self.dom_group.nr_doms()];
        let mut dom_util_sum = vec![0.0; self.dom_group.nr_doms()];

        let mut avg_util = 0.0f64;
        for cpu in 0..self.top.nr_cpus() {
            let cpu32 = cpu as u32;
            // None domain indicates the CPU was offline during
            // initialization and None CpuStat indicates the CPU has gone
            // down since then. Ignore both.
            if let (Some(dom), Some(curr), Some(prev)) = (
                self.dom_group.cpu_dom_id(&cpu),
                curr_cpu_stats.get(&cpu32),
                self.prev_cpu_stats.get(&cpu32),
            ) {
                let util = calc_util(curr, prev)?;
                dom_nr_cpus[dom] += 1;
                dom_util_sum[dom] += util;
                avg_util += util;
            }
        }
        avg_util /= self.top.nr_cpus() as f64;
        self.fully_utilized = avg_util >= 0.99999;

        let ti = &mut skel.bss_mut().tune_input;
        for dom in 0..self.dom_group.nr_doms() {
            // Calculate the domain avg util. If there are no active CPUs,
            // it doesn't really matter. Go with 0.0 as that's less likely
            // to confuse users.
            let util = match dom_nr_cpus[dom] {
                0 => 0.0,
                nr => dom_util_sum[dom] / nr as f64,
            };

            self.dom_utils[dom] = util;

            // This could be implemented better.
            let update_dom_bits = |target: &mut [u64; 8], val: bool| {
                for cpu in 0..self.top.nr_cpus() {
                    if let Some(cdom) = self.dom_group.cpu_dom_id(&cpu) {
                        if cdom == dom {
                            if val {
                                target[cpu / 64] |= 1u64 << (cpu % 64);
                            } else {
                                target[cpu / 64] &= !(1u64 << (cpu % 64));
                            }
                        }
                    }
                }
            };

            update_dom_bits(
                &mut ti.direct_greedy_cpumask,
                self.direct_greedy_under > 0.99999 || util < self.direct_greedy_under,
            );
            update_dom_bits(
                &mut ti.kick_greedy_cpumask,
                self.kick_greedy_under > 0.99999 || util < self.kick_greedy_under,
            );
        }

        ti.gen += 1;
        self.prev_cpu_stats = curr_cpu_stats;
        Ok(())
    }
}
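The core of calc_util() reduces to util = busy / (busy + idle + iowait), computed over saturating deltas and clamped to [0, 1]. A small worked example:

    // Hedged sketch of the utilization formula; the usec figures below
    // are made-up deltas between two /proc/stat samples.
    fn util(busy_usec: u64, idle_usec: u64, iowait_usec: u64) -> f64 {
        let total = busy_usec + idle_usec + iowait_usec;
        if total > 0 {
            (busy_usec as f64 / total as f64).clamp(0.0, 1.0)
        } else {
            1.0 // no observed time: treat the CPU as fully utilized
        }
    }

    fn main() {
        // 750ms busy, 200ms idle, 50ms iowait over a 1s window -> 0.75.
        assert!((util(750_000, 200_000, 50_000) - 0.75).abs() < 1e-9);
    }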