Merge pull request #178 from sched-ext/multi_numa_rusty

rusty: Implement NUMA-aware load balancing
David Vernet 2024-03-12 15:50:27 -05:00 committed by GitHub
commit 91cb5ce8ab
10 changed files with 1806 additions and 806 deletions


@ -207,7 +207,14 @@ impl Cpumask {
impl fmt::Display for Cpumask {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}:<{}>", self.nr_cpus, self.mask)
let slice = self.as_raw_slice();
let mut remaining_width = self.nr_cpus + 2;
write!(f, "{:#0width$b}", slice[0], width = remaining_width.min(66))?;
for submask in &slice[1..] {
remaining_width -= 64;
write!(f, "{:0width$b}", submask, width = remaining_width.min(64))?;
}
Ok(())
}
}
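To illustrate the new formatting, a brief hedged sketch (not part of the commit), assuming `mask` is a Cpumask on an 8-CPU host with CPUs 0-3 set: the first u64 word is printed as a zero-padded binary literal whose width is nr_cpus + 2 (the two extra characters cover the 0b prefix), capped at 66 characters, and every further word of the raw slice is appended as its own zero-padded chunk.

// Hypothetical sketch: 8-CPU host, `mask` covers CPUs 0-3. Hosts with more
// than 64 CPUs emit one extra zero-padded chunk per additional u64 word.
assert_eq!(format!("{}", mask), "0b00001111");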


@ -181,7 +181,7 @@ impl Node {
#[derive(Debug)]
pub struct Topology {
nodes: Vec<Node>,
cores: BTreeMap<usize, Core>,
cores: Vec<Core>,
cpus: BTreeMap<usize, Cpu>,
nr_cpus: usize,
span: Cpumask,
@ -199,14 +199,12 @@ impl Topology {
// system. We clone the objects that are located further down in the
// hierarchy rather than dealing with references, as the entire
// Topology is read-only anyway.
let mut cores = BTreeMap::new();
let mut cores = Vec::new();
let mut cpus = BTreeMap::new();
for node in nodes.iter() {
for (_, llc) in node.llcs.iter() {
for (core_id, core) in llc.cores.iter() {
if let Some(_) = cores.insert(*core_id, core.clone()) {
bail!("Found duplicate core ID {}", core_id);
}
for llc in node.llcs.values() {
for core in llc.cores.values() {
cores.push(core.clone());
for (cpu_id, cpu) in core.cpus.iter() {
if let Some(_) = cpus.insert(*cpu_id, cpu.clone()) {
bail!("Found duplicate CPU ID {}", cpu_id);
@ -219,13 +217,13 @@ impl Topology {
Ok(Topology { nodes, nr_cpus, cores, cpus, span })
}
/// Get a slice of the NUMA nodes on the host
/// Get a slice of the NUMA nodes on the host.
pub fn nodes(&self) -> &[Node] {
&self.nodes
}
/// Get a hashmap of <core ID, Core> for all Cores on the host.
pub fn cores(&self) -> &BTreeMap<usize, Core> {
/// Get a slice of all Cores on the host.
pub fn cores(&self) -> &[Core] {
&self.cores
}


@ -296,7 +296,7 @@ impl<'a> Scheduler<'a> {
let mut idle_cpu_count = 0;
// Count the number of cores where all the CPUs are idle.
for (_, core) in self.topo.cores().iter() {
for core in self.topo.cores().iter() {
let mut all_idle = true;
for (cpu_id, _) in core.cpus().iter() {
if self.bpf.get_cpu_pid(*cpu_id as i32) != 0 {
@ -669,14 +669,14 @@ impl<'a> Scheduler<'a> {
Err(_) => -1,
};
info!("Running tasks:");
for (core_id, core) in self.topo.cores().iter() {
for core in self.topo.cores().iter() {
for (cpu_id, _) in core.cpus().iter() {
let pid = if *cpu_id as i32 == sched_cpu {
"[self]".to_string()
} else {
self.bpf.get_cpu_pid(*cpu_id as i32).to_string()
};
info!(" core {:2} cpu {:2} pid={}", core_id, cpu_id, pid);
info!(" core {:2} cpu {:2} pid={}", core.id(), cpu_id, pid);
}
}


@ -17,6 +17,7 @@ log = "0.4.17"
ordered-float = "3.4.0"
scx_utils = { path = "../../../rust/scx_utils", version = "0.6" }
simplelog = "0.12.0"
sorted-vec = "0.8.3"
static_assertions = "1.1.0"
[build-dependencies]


@ -24,6 +24,7 @@ typedef unsigned long long u64;
enum consts {
MAX_CPUS = 512,
MAX_DOMS = 64, /* limited to avoid complex bitmask ops */
MAX_NUMA_NODES = MAX_DOMS, /* Assume at least 1 domain per NUMA node */
CACHELINE_SIZE = 64,
LB_DEFAULT_WEIGHT = 100,
@ -54,7 +55,8 @@ enum stat_idx {
RUSTY_STAT_DIRECT_GREEDY,
RUSTY_STAT_DIRECT_GREEDY_FAR,
RUSTY_STAT_DSQ_DISPATCH,
RUSTY_STAT_GREEDY,
RUSTY_STAT_GREEDY_LOCAL,
RUSTY_STAT_GREEDY_XNUMA,
/* Extra stats that don't contribute to total */
RUSTY_STAT_REPATRIATE,
@ -72,6 +74,7 @@ struct task_ctx {
u64 dom_mask;
struct bpf_cpumask __kptr *cpumask;
struct bpf_cpumask __kptr *tmp_cpumask;
u32 dom_id;
u32 weight;
bool runnable;
@ -99,9 +102,15 @@ struct dom_ctx {
u64 vtime_now;
struct bpf_cpumask __kptr *cpumask;
struct bpf_cpumask __kptr *direct_greedy_cpumask;
struct bpf_cpumask __kptr *node_cpumask;
u32 node_id;
u64 dbg_dcycle_printed_at;
struct bucket_ctx buckets[LB_LOAD_BUCKETS];
};
struct node_ctx {
struct bpf_cpumask __kptr *cpumask;
};
#endif /* __INTF_H */


@ -59,15 +59,20 @@ struct user_exit_info uei;
* Domains and cpus
*/
const volatile u32 nr_doms = 32; /* !0 for veristat, set during init */
const volatile u32 nr_nodes = 32; /* !0 for veristat, set during init */
const volatile u32 nr_cpus = 64; /* !0 for veristat, set during init */
const volatile u32 cpu_dom_id_map[MAX_CPUS];
const volatile u32 dom_numa_id_map[MAX_DOMS];
const volatile u64 dom_cpumasks[MAX_DOMS][MAX_CPUS / 64];
const volatile u64 numa_cpumasks[MAX_NUMA_NODES][MAX_CPUS / 64];
const volatile u32 load_half_life = 1000000000 /* 1s */;
const volatile bool kthreads_local;
const volatile bool fifo_sched;
const volatile bool switch_partial;
const volatile bool direct_greedy_numa;
const volatile u32 greedy_threshold;
const volatile u32 greedy_threshold_x_numa;
const volatile u32 debug;
/* base slice duration */
@ -78,13 +83,27 @@ const volatile u64 slice_ns = SCX_SLICE_DFL;
*/
struct pcpu_ctx {
u32 dom_rr_cur; /* used when scanning other doms */
u32 dom_id;
u32 nr_node_doms;
u32 node_doms[MAX_DOMS];
/* libbpf-rs does not respect the alignment, so pad out the struct explicitly */
u8 _padding[CACHELINE_SIZE - sizeof(u32)];
u8 _padding[CACHELINE_SIZE - ((3 + MAX_DOMS) * sizeof(u32) % CACHELINE_SIZE)];
} __attribute__((aligned(CACHELINE_SIZE)));
struct pcpu_ctx pcpu_ctx[MAX_CPUS];
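For the constants in this commit (MAX_DOMS = 64, CACHELINE_SIZE = 64), the non-padding members occupy (3 + 64) * 4 = 268 bytes, and 268 % 64 = 12, so the explicit padding adds 64 - 12 = 52 bytes and rounds each per-CPU context up to 320 bytes, an exact multiple of the cache line size.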
/*
* Numa node context
*/
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, struct node_ctx);
__uint(max_entries, MAX_NUMA_NODES);
__uint(map_flags, 0);
} node_data SEC(".maps");
/*
* Domain context
*/
@ -467,7 +486,7 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,
{
const struct cpumask *idle_smtmask = scx_bpf_get_idle_smtmask();
struct task_ctx *taskc;
struct bpf_cpumask *p_cpumask;
struct bpf_cpumask *p_cpumask, *tmp_cpumask = NULL;
bool prev_domestic, has_idle_cores;
s32 cpu;
@ -599,19 +618,51 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,
/*
* Domestic domain is fully booked. If there are CPUs which are idle and
* under-utilized, ignore domain boundaries and push the task there. Try
* to find an idle core first.
* under-utilized, ignore domain boundaries (while still respecting NUMA
* boundaries) and push the task there. Try to find an idle core first.
*/
if (taskc->all_cpus && direct_greedy_cpumask &&
!bpf_cpumask_empty((const struct cpumask *)direct_greedy_cpumask)) {
u32 dom_id = cpu_to_dom_id(prev_cpu);
struct dom_ctx *domc;
struct bpf_cpumask *tmp_direct_greedy, *node_mask;
if (!(domc = bpf_map_lookup_elem(&dom_data, &dom_id))) {
scx_bpf_error("Failed to lookup dom[%u]", dom_id);
goto enoent;
}
tmp_direct_greedy = direct_greedy_cpumask;
if (!tmp_direct_greedy) {
scx_bpf_error("Failed to lookup direct_greedy mask");
goto enoent;
}
/*
* By default, only look for an idle core in the current NUMA
* node when looking for direct greedy CPUs outside of the
* current domain. Stealing work temporarily is fine when
* you're going across domain boundaries, but it may be less
* desirable when crossing NUMA boundaries as the task's
* working set may end up spanning multiple NUMA nodes.
*/
if (!direct_greedy_numa) {
node_mask = domc->node_cpumask;
if (!node_mask) {
scx_bpf_error("Failed to lookup node mask");
goto enoent;
}
tmp_cpumask = bpf_kptr_xchg(&taskc->tmp_cpumask, NULL);
if (!tmp_cpumask) {
scx_bpf_error("Failed to lookup tmp cpumask");
goto enoent;
}
bpf_cpumask_and(tmp_cpumask,
(const struct cpumask *)node_mask,
(const struct cpumask *)tmp_direct_greedy);
tmp_direct_greedy = tmp_cpumask;
}
/* Try to find an idle core in the previous and then any domain */
if (has_idle_cores) {
if (domc->direct_greedy_cpumask) {
@ -626,7 +677,7 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,
if (direct_greedy_cpumask) {
cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)
direct_greedy_cpumask,
tmp_direct_greedy,
SCX_PICK_IDLE_CORE);
if (cpu >= 0) {
stat_add(RUSTY_STAT_DIRECT_GREEDY_FAR, 1);
@ -649,7 +700,7 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,
if (direct_greedy_cpumask) {
cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)
direct_greedy_cpumask, 0);
tmp_direct_greedy, 0);
if (cpu >= 0) {
stat_add(RUSTY_STAT_DIRECT_GREEDY_FAR, 1);
goto direct;
@ -668,10 +719,20 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,
else
cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0);
if (tmp_cpumask) {
tmp_cpumask = bpf_kptr_xchg(&taskc->tmp_cpumask, tmp_cpumask);
if (tmp_cpumask)
bpf_cpumask_release(tmp_cpumask);
}
scx_bpf_put_idle_cpumask(idle_smtmask);
return cpu;
direct:
if (tmp_cpumask) {
tmp_cpumask = bpf_kptr_xchg(&taskc->tmp_cpumask, tmp_cpumask);
if (tmp_cpumask)
bpf_cpumask_release(tmp_cpumask);
}
taskc->dispatch_local = true;
scx_bpf_put_idle_cpumask(idle_smtmask);
return cpu;
@ -797,24 +858,43 @@ static bool cpumask_intersects_domain(const struct cpumask *cpumask, u32 dom_id)
static u32 dom_rr_next(s32 cpu)
{
struct pcpu_ctx *pcpuc;
u32 dom_id;
u32 idx, *dom_id;
pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]);
if (!pcpuc)
if (!pcpuc || !pcpuc->nr_node_doms)
return 0;
dom_id = (pcpuc->dom_rr_cur + 1) % nr_doms;
idx = (pcpuc->dom_rr_cur + 1) % pcpuc->nr_node_doms;
dom_id = MEMBER_VPTR(pcpuc->node_doms, [idx]);
if (!dom_id) {
scx_bpf_error("Failed to lookup dom for %d", cpu);
return 0;
}
if (dom_id == cpu_to_dom_id(cpu))
dom_id = (dom_id + 1) % nr_doms;
if (*dom_id == cpu_to_dom_id(cpu))
scx_bpf_error("%d found current dom in node_doms array", cpu);
pcpuc->dom_rr_cur = dom_id;
return dom_id;
pcpuc->dom_rr_cur++;
return *dom_id;
}
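As a worked example of the new round-robin: a CPU whose node has three other domains recorded in node_doms, and whose dom_rr_cur is currently 7, computes idx = (7 + 1) % 3 = 2, returns node_doms[2], and bumps dom_rr_cur to 8 so the following call starts again from node_doms[0].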
u32 dom_node_id(u32 dom_id)
{
u32 *nid_ptr;
nid_ptr = MEMBER_VPTR(dom_numa_id_map, [dom_id]);
if (!nid_ptr) {
scx_bpf_error("Couldn't look up node ID for %s", dom_id);
return 0;
}
return *nid_ptr;
}
void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev)
{
u32 dom = cpu_to_dom_id(cpu);
struct pcpu_ctx *pcpuc;
u32 node_doms, my_node, i;
if (scx_bpf_consume(dom)) {
stat_add(RUSTY_STAT_DSQ_DISPATCH, 1);
@ -824,13 +904,35 @@ void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev)
if (!greedy_threshold)
return;
bpf_repeat(nr_doms - 1) {
u32 dom_id = dom_rr_next(cpu);
pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]);
if (!pcpuc) {
scx_bpf_error("Failed to get PCPU context");
return;
}
node_doms = pcpuc->nr_node_doms;
if (scx_bpf_dsq_nr_queued(dom_id) >= greedy_threshold &&
scx_bpf_consume(dom_id)) {
stat_add(RUSTY_STAT_GREEDY, 1);
break;
/* try to steal a task from domains on the current NUMA node */
bpf_for(i, 0, node_doms) {
dom = (pcpuc->dom_rr_cur + 1 + i) % node_doms;
if (scx_bpf_consume(dom)) {
stat_add(RUSTY_STAT_GREEDY_LOCAL, 1);
return;
}
}
if (!greedy_threshold_x_numa || nr_nodes == 1)
return;
/* try to steal a task from domains on other NUMA nodes */
my_node = dom_node_id(pcpuc->dom_id);
bpf_repeat(nr_doms - 1) {
dom = (pcpuc->dom_rr_cur + 1) % nr_doms;
pcpuc->dom_rr_cur++;
if (dom_node_id(dom) != my_node &&
scx_bpf_dsq_nr_queued(dom) >= greedy_threshold_x_numa &&
scx_bpf_consume(dom)) {
stat_add(RUSTY_STAT_GREEDY_XNUMA, 1);
return;
}
}
}
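In the reworked dispatch path above, same-node stealing is attempted whenever greedy_threshold is non-zero, while cross-node stealing is additionally gated by greedy_threshold_x_numa: for example, with greedy_threshold_x_numa = 4, a CPU that finds nothing to consume on its own node only pulls from a remote-node domain once that domain's DSQ has at least four queued tasks.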
@ -1053,6 +1155,18 @@ s32 BPF_STRUCT_OPS(rusty_init_task, struct task_struct *p,
return -EINVAL;
}
cpumask = bpf_cpumask_create();
if (!cpumask) {
scx_bpf_error("Failed to create BPF cpumask for task");
return -ENOMEM;
}
cpumask = bpf_kptr_xchg(&map_value->tmp_cpumask, cpumask);
if (cpumask) {
scx_bpf_error("%s[%d] tmp_cpumask already present", p->comm, p->pid);
bpf_cpumask_release(cpumask);
return -EEXIST;
}
task_pick_and_set_domain(map_value, p, p->cpus_ptr, true);
return 0;
@ -1077,11 +1191,53 @@ void BPF_STRUCT_OPS(rusty_exit_task, struct task_struct *p,
}
}
static s32 create_node(u32 node_id)
{
u32 cpu;
struct bpf_cpumask *cpumask;
struct node_ctx *nodec;
nodec = bpf_map_lookup_elem(&node_data, &node_id);
if (!nodec) {
/* Should never happen, it's created statically at load time. */
scx_bpf_error("No node%u", node_id);
return -ENOENT;
}
cpumask = bpf_cpumask_create();
if (!cpumask)
return -ENOMEM;
for (cpu = 0; cpu < MAX_CPUS; cpu++) {
const volatile u64 *nmask;
nmask = MEMBER_VPTR(numa_cpumasks, [node_id][cpu / 64]);
if (!nmask) {
scx_bpf_error("array index error");
bpf_cpumask_release(cpumask);
return -ENOENT;
}
if (*nmask & (1LLU << (cpu % 64)))
bpf_cpumask_set_cpu(cpu, cpumask);
}
cpumask = bpf_kptr_xchg(&nodec->cpumask, cpumask);
if (cpumask) {
scx_bpf_error("Node %u cpumask already present", node_id);
bpf_cpumask_release(cpumask);
return -EEXIST;
}
return 0;
}
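The bit arithmetic in create_node() maps each CPU to one u64 word of numa_cpumasks: CPU 70, for example, is tested against bit 70 % 64 = 6 of numa_cpumasks[node_id][70 / 64], i.e. word 1.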
static s32 create_dom(u32 dom_id)
{
struct dom_ctx *domc;
struct bpf_cpumask *cpumask;
u32 cpu;
struct node_ctx *nodec;
struct bpf_cpumask *cpumask, *node_mask;
u32 cpu, node_id;
s32 ret;
if (dom_id >= MAX_DOMS) {
@ -1141,7 +1297,6 @@ static s32 create_dom(u32 dom_id)
dom_id);
return -ENOMEM;
}
cpumask = bpf_kptr_xchg(&domc->direct_greedy_cpumask, cpumask);
if (cpumask) {
scx_bpf_error("Domain %u direct_greedy_cpumask already present",
@ -1150,6 +1305,99 @@ static s32 create_dom(u32 dom_id)
return -EEXIST;
}
node_id = dom_node_id(dom_id);
nodec = bpf_map_lookup_elem(&node_data, &node_id);
if (!nodec) {
/* Should never happen, it's created statically at load time. */
scx_bpf_error("No node%u", node_id);
return -ENOENT;
}
bpf_rcu_read_lock();
node_mask = nodec->cpumask;
if (!node_mask) {
bpf_rcu_read_unlock();
scx_bpf_error("NUMA %d mask not found for domain %u",
node_id, dom_id);
return -ENOENT;
}
cpumask = bpf_cpumask_create();
if (!cpumask) {
bpf_rcu_read_unlock();
scx_bpf_error("Failed to create BPF cpumask for domain %u",
dom_id);
return -ENOMEM;
}
bpf_cpumask_copy(cpumask, (const struct cpumask *)node_mask);
bpf_rcu_read_unlock();
cpumask = bpf_kptr_xchg(&domc->node_cpumask, cpumask);
if (cpumask) {
scx_bpf_error("Domain %u node_cpumask already present",
dom_id);
bpf_cpumask_release(cpumask);
return -EEXIST;
}
return 0;
}
static s32 initialize_cpu(s32 cpu)
{
struct bpf_cpumask *cpumask;
struct dom_ctx *domc;
int i, j = 0;
struct pcpu_ctx *pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]);
u32 *dom_nodes;
if (!pcpuc) {
scx_bpf_error("Failed to lookup pcpu ctx %d", cpu);
return -ENOENT;
}
pcpuc->dom_rr_cur = cpu;
bpf_for(i, 0, nr_doms) {
domc = bpf_map_lookup_elem(&dom_data, &i);
if (!domc) {
scx_bpf_error("Failed to lookup dom_ctx");
return -ENOENT;
}
bpf_rcu_read_lock();
cpumask = domc->node_cpumask;
if (!cpumask) {
bpf_rcu_read_unlock();
scx_bpf_error("Failed to lookup dom node cpumask");
return -ENOENT;
}
if (bpf_cpumask_test_cpu(cpu, (const struct cpumask *)cpumask)) {
cpumask = domc->cpumask;
if (!cpumask) {
bpf_rcu_read_unlock();
scx_bpf_error("Failed to lookup dom cpumask");
return -ENOENT;
}
/*
* Only record the remote domains in this array, as
* we'll only ever consume from them on the greedy
* threshold path.
*/
if (!bpf_cpumask_test_cpu(cpu,
(const struct cpumask *)cpumask)) {
dom_nodes = MEMBER_VPTR(pcpuc->node_doms, [j]);
if (!dom_nodes) {
bpf_rcu_read_unlock();
scx_bpf_error("Failed to lookup doms ptr");
return -EINVAL;
}
*dom_nodes = i;
j++;
} else {
pcpuc->dom_id = i;
}
}
bpf_rcu_read_unlock();
}
pcpuc->nr_node_doms = j;
return 0;
}
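As a hypothetical example of what initialize_cpu() produces: on a two-node host with domains 0 and 1 on node 0 and domains 2 and 3 on node 1, a CPU belonging to domain 0 ends up with dom_id = 0, node_doms = [1], and nr_node_doms = 1; domains 2 and 3 are skipped entirely because the CPU is not in their node cpumask.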
@ -1182,14 +1430,22 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init)
if (!switch_partial)
scx_bpf_switch_all();
bpf_for(i, 0, nr_nodes) {
ret = create_node(i);
if (ret)
return ret;
}
bpf_for(i, 0, nr_doms) {
ret = create_dom(i);
if (ret)
return ret;
}
bpf_for(i, 0, nr_cpus)
pcpu_ctx[i].dom_rr_cur = i;
bpf_for(i, 0, nr_cpus) {
ret = initialize_cpu(i);
if (ret)
return ret;
}
return 0;
}


@ -0,0 +1,112 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
use std::collections::BTreeMap;
use std::sync::Arc;
use anyhow::Result;
use scx_utils::Cpumask;
use scx_utils::Topology;
#[derive(Clone, Debug)]
pub struct Domain {
id: usize,
mask: Cpumask,
}
impl Domain {
/// Get the Domain's ID.
pub fn id(&self) -> usize {
self.id
}
/// Get a copy of the domain's cpumask.
pub fn mask(&self) -> Cpumask {
self.mask.clone()
}
/// Get a raw slice of the domain's cpumask as a set of one or more u64
/// variables whose bits represent CPUs in the mask.
pub fn mask_slice(&self) -> &[u64] {
self.mask.as_raw_slice()
}
/// The number of CPUs in the domain.
pub fn weight(&self) -> usize {
self.mask.len()
}
}
#[derive(Debug)]
pub struct DomainGroup {
doms: BTreeMap<usize, Domain>,
cpu_dom_map: BTreeMap<usize, usize>,
dom_numa_map: BTreeMap<usize, usize>,
num_numa_nodes: usize,
}
impl DomainGroup {
pub fn new(top: Arc<Topology>, cpumasks: &[String]) -> Result<Self> {
let mut dom_numa_map = BTreeMap::new();
let (doms, num_numa_nodes) = if !cpumasks.is_empty() {
let mut doms: BTreeMap<usize, Domain> = BTreeMap::new();
let mut id = 0;
for mask_str in cpumasks.iter() {
let mask = Cpumask::from_str(&mask_str)?;
doms.insert(id, Domain { id, mask, });
dom_numa_map.insert(id, 0);
id += 1;
}
(doms, 1)
} else {
let mut doms: BTreeMap<usize, Domain> = BTreeMap::new();
for (node_id, node) in top.nodes().iter().enumerate() {
for (id, llc) in node.llcs().iter() {
let mask = llc.span();
doms.insert(*id, Domain { id: id.clone(), mask, });
dom_numa_map.insert(*id, node_id.clone());
}
}
(doms, top.nodes().len())
};
let mut cpu_dom_map = BTreeMap::new();
for (id, dom) in doms.iter() {
for cpu in dom.mask.clone().into_iter() {
cpu_dom_map.insert(cpu, *id);
}
}
Ok(Self { doms, cpu_dom_map, dom_numa_map, num_numa_nodes, })
}
pub fn numa_doms(&self, numa_id: &usize) -> Vec<Domain> {
let mut numa_doms = Vec::new();
for (d_id, n_id) in self.dom_numa_map.iter() {
if n_id == numa_id {
let dom = self.doms.get(d_id).unwrap();
numa_doms.push(dom.clone());
}
}
numa_doms
}
pub fn nr_doms(&self) -> usize {
self.doms.len()
}
pub fn nr_nodes(&self) -> usize {
self.num_numa_nodes
}
pub fn cpu_dom_id(&self, cpu: &usize) -> Option<usize> {
self.cpu_dom_map.get(cpu).copied()
}
pub fn dom_numa_id(&self, dom_id: &usize) -> Option<usize> {
self.dom_numa_map.get(dom_id).copied()
}
}
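A hedged usage sketch (not part of the commit) of how a scheduler might build and query a DomainGroup when no explicit cpumasks are passed on the command line; Topology::new() is assumed as the scx_utils entry point for host discovery, and dump_domains is an illustrative helper name:

use std::sync::Arc;
use anyhow::Result;
use scx_utils::Topology;

fn dump_domains() -> Result<()> {
    // Build one domain per LLC, grouped under the NUMA node that owns it.
    let top = Arc::new(Topology::new()?);
    let doms = DomainGroup::new(top, &[])?;
    for node in 0..doms.nr_nodes() {
        for dom in doms.numa_doms(&node) {
            println!("node {} dom {} spans {} CPUs", node, dom.id(), dom.weight());
        }
    }
    Ok(())
}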

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -0,0 +1,183 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
use std::collections::BTreeMap;
use std::sync::Arc;
use crate::sub_or_zero;
use crate::DomainGroup;
use crate::BpfSkel;
use ::fb_procfs as procfs;
use anyhow::anyhow;
use anyhow::bail;
use anyhow::Result;
use scx_utils::Topology;
fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result<f64> {
match (curr, prev) {
(
procfs::CpuStat {
user_usec: Some(curr_user),
nice_usec: Some(curr_nice),
system_usec: Some(curr_system),
idle_usec: Some(curr_idle),
iowait_usec: Some(curr_iowait),
irq_usec: Some(curr_irq),
softirq_usec: Some(curr_softirq),
stolen_usec: Some(curr_stolen),
..
},
procfs::CpuStat {
user_usec: Some(prev_user),
nice_usec: Some(prev_nice),
system_usec: Some(prev_system),
idle_usec: Some(prev_idle),
iowait_usec: Some(prev_iowait),
irq_usec: Some(prev_irq),
softirq_usec: Some(prev_softirq),
stolen_usec: Some(prev_stolen),
..
},
) => {
let idle_usec = sub_or_zero(curr_idle, prev_idle);
let iowait_usec = sub_or_zero(curr_iowait, prev_iowait);
let user_usec = sub_or_zero(curr_user, prev_user);
let system_usec = sub_or_zero(curr_system, prev_system);
let nice_usec = sub_or_zero(curr_nice, prev_nice);
let irq_usec = sub_or_zero(curr_irq, prev_irq);
let softirq_usec = sub_or_zero(curr_softirq, prev_softirq);
let stolen_usec = sub_or_zero(curr_stolen, prev_stolen);
let busy_usec =
user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec;
let total_usec = idle_usec + busy_usec + iowait_usec;
if total_usec > 0 {
Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0))
} else {
Ok(1.0)
}
}
_ => {
bail!("Missing stats in cpustat");
}
}
}
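For example, if between two samples a CPU accumulated 750,000 µs of busy time (user + system + nice + irq + softirq + stolen), 200,000 µs of idle time, and 50,000 µs of iowait, calc_util() reports 750,000 / 1,000,000 = 0.75; a CPU with no accounted time at all is treated as fully utilized (1.0).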
pub struct Tuner {
top: Arc<Topology>,
dom_group: Arc<DomainGroup>,
direct_greedy_under: f64,
kick_greedy_under: f64,
proc_reader: procfs::ProcReader,
prev_cpu_stats: BTreeMap<u32, procfs::CpuStat>,
pub fully_utilized: bool,
dom_utils: Vec<f64>,
}
impl Tuner {
pub fn new(top: Arc<Topology>,
dom_group: Arc<DomainGroup>,
direct_greedy_under: f64,
kick_greedy_under: f64) -> Result<Self> {
let proc_reader = procfs::ProcReader::new();
let prev_cpu_stats = proc_reader
.read_stat()?
.cpus_map
.ok_or_else(|| anyhow!("Expected cpus_map to exist"))?;
Ok(Self {
direct_greedy_under: direct_greedy_under / 100.0,
kick_greedy_under: kick_greedy_under / 100.0,
proc_reader,
prev_cpu_stats,
dom_utils: vec![0.0; dom_group.nr_doms()],
fully_utilized: false,
top,
dom_group,
})
}
pub fn dom_util(&self, dom: usize) -> f64 {
self.dom_utils[dom]
}
/// Apply a step in the Tuner by:
///
/// 1. Recording CPU stats from procfs
/// 2. Calculating current per-domain and host-wide utilization
/// 3. Updating direct_greedy_under and kick_greedy_under cpumasks according
/// to the observed utilization
pub fn step(&mut self, skel: &mut BpfSkel) -> Result<()> {
let curr_cpu_stats = self
.proc_reader
.read_stat()?
.cpus_map
.ok_or_else(|| anyhow!("Expected cpus_map to exist"))?;
let mut dom_nr_cpus = vec![0; self.dom_group.nr_doms()];
let mut dom_util_sum = vec![0.0; self.dom_group.nr_doms()];
let mut avg_util = 0.0f64;
for cpu in 0..self.top.nr_cpus() {
let cpu32 = cpu as u32;
// None domain indicates the CPU was offline during
// initialization and None CpuStat indicates the CPU has gone
// down since then. Ignore both.
if let (Some(dom), Some(curr), Some(prev)) = (
self.dom_group.cpu_dom_id(&cpu),
curr_cpu_stats.get(&cpu32),
self.prev_cpu_stats.get(&cpu32),
) {
let util = calc_util(curr, prev)?;
dom_nr_cpus[dom] += 1;
dom_util_sum[dom] += util;
avg_util += util;
}
}
avg_util /= self.top.nr_cpus() as f64;
self.fully_utilized = avg_util >= 0.99999;
let ti = &mut skel.bss_mut().tune_input;
for dom in 0..self.dom_group.nr_doms() {
// Calculate the domain avg util. If there are no active CPUs,
// it doesn't really matter. Go with 0.0 as that's less likely
// to confuse users.
let util = match dom_nr_cpus[dom] {
0 => 0.0,
nr => dom_util_sum[dom] / nr as f64,
};
self.dom_utils[dom] = util;
// This could be implemented better.
let update_dom_bits = |target: &mut [u64; 8], val: bool| {
for cpu in 0..self.top.nr_cpus() {
if let Some(cdom) = self.dom_group.cpu_dom_id(&cpu) {
if cdom == dom {
if val {
target[cpu / 64] |= 1u64 << (cpu % 64);
} else {
target[cpu / 64] &= !(1u64 << (cpu % 64));
}
}
}
}
};
update_dom_bits(
&mut ti.direct_greedy_cpumask,
self.direct_greedy_under > 0.99999 || util < self.direct_greedy_under,
);
update_dom_bits(
&mut ti.kick_greedy_cpumask,
self.kick_greedy_under > 0.99999 || util < self.kick_greedy_under,
);
}
ti.gen += 1;
self.prev_cpu_stats = curr_cpu_stats;
Ok(())
}
}
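A hedged sketch of how the scheduler's main loop might drive the Tuner; the 90 %/100 % thresholds, the 100 ms interval, and the tuning_loop, skel, and dom_group names are illustrative assumptions rather than scx_rusty's actual defaults:

use std::sync::Arc;
use std::time::Duration;
use anyhow::Result;
use scx_utils::Topology;

fn tuning_loop(top: Arc<Topology>, dom_group: Arc<DomainGroup>, skel: &mut BpfSkel) -> Result<()> {
    // Retune the direct/kick greedy cpumasks from fresh /proc/stat samples.
    let mut tuner = Tuner::new(top, dom_group.clone(), 90.0, 100.0)?;
    loop {
        std::thread::sleep(Duration::from_millis(100));
        tuner.step(skel)?;
        for dom in 0..dom_group.nr_doms() {
            println!("dom {} util {:.2}", dom, tuner.dom_util(dom));
        }
    }
}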