mirror of
https://github.com/sched-ext/scx.git
synced 2024-11-24 11:50:23 +00:00
scx_utils: Add GPU topology
Add GPU awareness to the topology crate. Signed-off-by: Daniel Hodges <hodges.daniel.scott@gmail.com>
This commit is contained in:
parent
d708939e5a
commit
12f8cb74b5
@ -19,6 +19,7 @@ lazy_static = "1.4"
|
||||
libbpf-cargo = "0.24.1"
|
||||
libbpf-rs = "0.24.1"
|
||||
log = "0.4.17"
|
||||
nvml-wrapper = "0.10.0"
|
||||
paste = "1.0"
|
||||
regex = "1.10"
|
||||
scx_stats = { path = "../scx_stats", version = "1.0.3" }
|
||||
|
@ -72,6 +72,9 @@ use crate::Cpumask;
|
||||
use anyhow::bail;
|
||||
use anyhow::Result;
|
||||
use glob::glob;
|
||||
use nvml_wrapper::bitmasks::InitFlags;
|
||||
use nvml_wrapper::enum_wrappers::device::Clock;
|
||||
use nvml_wrapper::Nvml;
|
||||
use sscanf::sscanf;
|
||||
use std::collections::BTreeMap;
|
||||
use std::path::Path;
|
||||
@ -217,10 +220,26 @@ impl Cache {
|
||||
}
|
||||
}
|
||||
|
||||
/// Identifier for a GPU, tagged by the vendor API that enumerated it.
///
/// Used as the key of the per-node and host-wide GPU maps.
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialOrd, PartialEq)]
|
||||
pub enum GpuIndex {
|
||||
/// An NVIDIA GPU; `nvml_id` is the index NVML reports for the device.
Nvidia { nvml_id: u32 },
|
||||
}
|
||||
|
||||
/// Static description of a single GPU discovered on the host.
#[derive(Debug, Clone)]
|
||||
pub struct Gpu {
|
||||
/// Vendor-tagged identifier of this GPU.
pub index: GpuIndex,
|
||||
/// NUMA node the GPU is attached to.
pub node_id: usize,
|
||||
/// Maximum graphics clock of the device.
// NOTE(review): units are not stated in this file (presumably MHz) — confirm against the vendor API.
pub max_graphics_clock: usize,
|
||||
// AMD uses CU for this value
|
||||
/// Maximum SM (streaming multiprocessor) clock of the device.
// NOTE(review): units are not stated in this file (presumably MHz) — confirm against the vendor API.
pub max_sm_clock: usize,
|
||||
/// Total device memory.
// NOTE(review): presumably bytes — confirm against the vendor API.
pub memory: u64,
|
||||
}
|
||||
|
||||
/// A NUMA node of the host topology and the resources attached to it.
#[derive(Debug, Clone)]
|
||||
pub struct Node {
|
||||
/// ID of this NUMA node.
id: usize,
|
||||
/// Caches belonging to this node, keyed by ID.
// NOTE(review): presumably the last-level caches keyed by LLC ID — confirm.
llcs: BTreeMap<usize, Cache>,
|
||||
/// GPUs attached to this node, keyed by GPU index.
gpus: BTreeMap<GpuIndex, Gpu>,
|
||||
/// Mask of all CPUs in this node.
span: Cpumask,
|
||||
}
|
||||
|
||||
@ -246,6 +265,11 @@ impl Node {
|
||||
cpus
|
||||
}
|
||||
|
||||
/// Get the map of all GPUs for this NUMA node, keyed by GPU index.
|
||||
pub fn gpus(&self) -> &BTreeMap<GpuIndex, Gpu> {
|
||||
&self.gpus
|
||||
}
|
||||
|
||||
/// Get a Cpumask of all CPUs in this NUMA node
|
||||
pub fn span(&self) -> &Cpumask {
|
||||
&self.span
|
||||
@ -314,6 +338,17 @@ impl Topology {
|
||||
&self.cores
|
||||
}
|
||||
|
||||
/// Get a vec of all GPUs on the hosts.
|
||||
pub fn gpus(&self) -> BTreeMap<GpuIndex, &Gpu> {
|
||||
let mut gpus = BTreeMap::new();
|
||||
for node in &self.nodes {
|
||||
for (idx, gpu) in &node.gpus {
|
||||
gpus.insert(idx.clone(), gpu);
|
||||
}
|
||||
}
|
||||
gpus
|
||||
}
|
||||
|
||||
/// Get a map of <CPU ID, Cpu> for all Cpus on the host.
|
||||
pub fn cpus(&self) -> &BTreeMap<usize, Cpu> {
|
||||
&self.cpus
|
||||
@ -549,12 +584,78 @@ fn avg_cpu_freq() -> Option<(usize, usize)> {
|
||||
Some((avg_base_freq / nr_cpus, top_max_freq))
|
||||
}
|
||||
|
||||
fn create_gpus() -> BTreeMap<usize, Vec<Gpu>> {
|
||||
let mut gpus: BTreeMap<usize, Vec<Gpu>> = BTreeMap::new();
|
||||
|
||||
// Don't fail if the system has no NVIDIA GPUs.
|
||||
let Ok(nvml) = Nvml::init_with_flags(InitFlags::NO_GPUS) else {
|
||||
return BTreeMap::new();
|
||||
};
|
||||
match nvml.device_count() {
|
||||
Ok(nvidia_gpu_count) => {
|
||||
for i in 0..nvidia_gpu_count {
|
||||
let Ok(nvidia_gpu) = nvml.device_by_index(i) else {
|
||||
continue;
|
||||
};
|
||||
let graphics_boost_clock = nvidia_gpu.max_customer_boost_clock(Clock::Graphics).unwrap_or(0);
|
||||
let sm_boost_clock = nvidia_gpu.max_customer_boost_clock(Clock::SM).unwrap_or(0);
|
||||
let Ok(memory_info) = nvidia_gpu.memory_info() else {
|
||||
continue;
|
||||
};
|
||||
let Ok(pci_info) = nvidia_gpu.pci_info() else {
|
||||
continue;
|
||||
};
|
||||
let Ok(index) = nvidia_gpu.index() else {
|
||||
continue;
|
||||
};
|
||||
|
||||
// The NVML library doesn't return a PCIe bus ID compatible with sysfs. It includes
|
||||
// uppercase bus ID values and an extra four leading 0s.
|
||||
let bus_id = pci_info.bus_id.to_lowercase();
|
||||
let fixed_bus_id = bus_id.strip_prefix("0000").unwrap_or("");
|
||||
let numa_path = format!("/sys/bus/pci/devices/{}/numa_node", fixed_bus_id);
|
||||
let numa_node = read_file_usize(&Path::new(&numa_path)).unwrap_or(0);
|
||||
|
||||
let gpu = Gpu{
|
||||
index: GpuIndex::Nvidia{nvml_id: index},
|
||||
node_id: numa_node as usize,
|
||||
max_graphics_clock: graphics_boost_clock as usize,
|
||||
max_sm_clock: sm_boost_clock as usize,
|
||||
memory: memory_info.total,
|
||||
};
|
||||
if !gpus.contains_key(&numa_node) {
|
||||
gpus.insert(numa_node, vec![gpu]);
|
||||
continue;
|
||||
}
|
||||
if let Some(gpus) = gpus.get_mut(&numa_node) {
|
||||
gpus.push(gpu);
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
};
|
||||
|
||||
gpus
|
||||
}
|
||||
|
||||
fn create_default_node(online_mask: &Cpumask) -> Result<Vec<Node>> {
|
||||
let mut nodes: Vec<Node> = Vec::with_capacity(1);
|
||||
let system_gpus = create_gpus();
|
||||
let mut node_gpus = BTreeMap::new();
|
||||
match system_gpus.get(&0) {
|
||||
Some(gpus) => {
|
||||
for gpu in gpus {
|
||||
node_gpus.insert(gpu.index, gpu.clone());
|
||||
}
|
||||
}
|
||||
_ => {},
|
||||
};
|
||||
|
||||
let mut node = Node {
|
||||
id: 0,
|
||||
llcs: BTreeMap::new(),
|
||||
span: Cpumask::new()?,
|
||||
gpus: node_gpus,
|
||||
};
|
||||
|
||||
if !Path::new("/sys/devices/system/cpu").exists() {
|
||||
@ -575,6 +676,8 @@ fn create_default_node(online_mask: &Cpumask) -> Result<Vec<Node>> {
|
||||
fn create_numa_nodes(online_mask: &Cpumask) -> Result<Vec<Node>> {
|
||||
let mut nodes: Vec<Node> = Vec::new();
|
||||
|
||||
let system_gpus = create_gpus();
|
||||
|
||||
let numa_paths = glob("/sys/devices/system/node/node*")?;
|
||||
for numa_path in numa_paths.filter_map(Result::ok) {
|
||||
let numa_str = numa_path.to_str().unwrap().trim();
|
||||
@ -585,10 +688,21 @@ fn create_numa_nodes(online_mask: &Cpumask) -> Result<Vec<Node>> {
|
||||
}
|
||||
};
|
||||
|
||||
let mut node_gpus = BTreeMap::new();
|
||||
match system_gpus.get(&node_id) {
|
||||
Some(gpus) => {
|
||||
for gpu in gpus {
|
||||
node_gpus.insert(gpu.index, gpu.clone());
|
||||
}
|
||||
}
|
||||
_ => {},
|
||||
};
|
||||
|
||||
let mut node = Node {
|
||||
id: node_id,
|
||||
llcs: BTreeMap::new(),
|
||||
span: Cpumask::new()?,
|
||||
gpus: node_gpus,
|
||||
};
|
||||
|
||||
let cpu_pattern = numa_path.join("cpu[0-9]*");
|
||||
|
83
scheds/rust/Cargo.lock
generated
83
scheds/rust/Cargo.lock
generated
@ -404,6 +404,41 @@ dependencies = [
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling"
|
||||
version = "0.20.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989"
|
||||
dependencies = [
|
||||
"darling_core",
|
||||
"darling_macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling_core"
|
||||
version = "0.20.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5"
|
||||
dependencies = [
|
||||
"fnv",
|
||||
"ident_case",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"strsim 0.11.1",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling_macro"
|
||||
version = "0.20.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806"
|
||||
dependencies = [
|
||||
"darling_core",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "deranged"
|
||||
version = "0.3.11"
|
||||
@ -474,6 +509,12 @@ dependencies = [
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
||||
|
||||
[[package]]
|
||||
name = "funty"
|
||||
version = "2.0.0"
|
||||
@ -545,6 +586,12 @@ dependencies = [
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ident_case"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "2.4.0"
|
||||
@ -824,6 +871,29 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nvml-wrapper"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c9bff0aa1d48904a1385ea2a8b97576fbdcbc9a3cfccd0d31fe978e1c4038c5"
|
||||
dependencies = [
|
||||
"bitflags 2.6.0",
|
||||
"libloading",
|
||||
"nvml-wrapper-sys",
|
||||
"static_assertions",
|
||||
"thiserror",
|
||||
"wrapcenum-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nvml-wrapper-sys"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "698d45156f28781a4e79652b6ebe2eaa0589057d588d3aec1333f6466f13fcb5"
|
||||
dependencies = [
|
||||
"libloading",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.19.0"
|
||||
@ -1228,6 +1298,7 @@ dependencies = [
|
||||
"log",
|
||||
"metrics",
|
||||
"metrics-util",
|
||||
"nvml-wrapper",
|
||||
"paste",
|
||||
"regex",
|
||||
"scx_stats",
|
||||
@ -1838,6 +1909,18 @@ version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
||||
|
||||
[[package]]
|
||||
name = "wrapcenum-derive"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a76ff259533532054cfbaefb115c613203c73707017459206380f03b3b3f266e"
|
||||
dependencies = [
|
||||
"darling",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wyz"
|
||||
version = "0.5.1"
|
||||
|
Loading…
Reference in New Issue
Block a user