Merge pull request #575 from hodgesds/gpu-topo

scx_utils: Add GPU topology
This commit is contained in:
Daniel Hodges 2024-08-28 09:59:44 -04:00 committed by GitHub
commit 5391816853
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 198 additions and 0 deletions

View File

@ -19,6 +19,7 @@ lazy_static = "1.4"
libbpf-cargo = "0.24.1"
libbpf-rs = "0.24.1"
log = "0.4.17"
nvml-wrapper = "0.10.0"
paste = "1.0"
regex = "1.10"
scx_stats = { path = "../scx_stats", version = "1.0.3" }

View File

@ -72,6 +72,9 @@ use crate::Cpumask;
use anyhow::bail;
use anyhow::Result;
use glob::glob;
use nvml_wrapper::bitmasks::InitFlags;
use nvml_wrapper::enum_wrappers::device::Clock;
use nvml_wrapper::Nvml;
use sscanf::sscanf;
use std::collections::BTreeMap;
use std::path::Path;
@ -217,10 +220,26 @@ impl Cache {
}
}
/// Vendor-specific identifier of a GPU.
///
/// Used as the key of the per-node and host-wide GPU maps.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum GpuIndex {
    /// An NVIDIA device.
    Nvidia {
        /// Index that NVML reports for the device.
        nvml_id: u32,
    },
}
/// Description of a single GPU discovered on the host.
#[derive(Clone, Debug)]
pub struct Gpu {
    /// Vendor-specific identifier of the device.
    pub index: GpuIndex,
    /// ID of the NUMA node the device is attached to.
    pub node_id: usize,
    /// Maximum graphics clock; 0 when the value could not be queried.
    /// NOTE(review): presumably MHz as reported by NVML -- confirm.
    pub max_graphics_clock: usize,
    /// Maximum SM clock; 0 when the value could not be queried.
    /// AMD uses CU for this value.
    pub max_sm_clock: usize,
    /// Total device memory as reported by the driver.
    /// NOTE(review): presumably bytes -- confirm.
    pub memory: u64,
}
/// A NUMA node together with the caches, GPUs and CPUs that belong to it.
#[derive(Clone, Debug)]
pub struct Node {
    /// ID of this NUMA node.
    id: usize,
    /// Last-level caches in this node (presumably keyed by LLC ID -- confirm).
    llcs: BTreeMap<usize, Cache>,
    /// GPUs attached to this node, keyed by GPU index.
    gpus: BTreeMap<GpuIndex, Gpu>,
    /// Cpumask of all CPUs in this NUMA node.
    span: Cpumask,
}
@ -246,6 +265,11 @@ impl Node {
cpus
}
/// Get the map of all GPUs for this NUMA node, keyed by GPU index.
pub fn gpus(&self) -> &BTreeMap<GpuIndex, Gpu> {
&self.gpus
}
/// Get a Cpumask of all CPUs in this NUMA node
pub fn span(&self) -> &Cpumask {
&self.span
@ -314,6 +338,17 @@ impl Topology {
&self.cores
}
/// Get a map of all GPUs on the host, keyed by GPU index.
///
/// The map is rebuilt on every call by merging the per-node GPU maps; its values
/// borrow the `Gpu` entries owned by the individual nodes.
pub fn gpus(&self) -> BTreeMap<GpuIndex, &Gpu> {
    self.nodes
        .iter()
        .flat_map(|node| node.gpus.iter())
        // `GpuIndex` is `Copy`, so dereference instead of cloning the key.
        .map(|(idx, gpu)| (*idx, gpu))
        .collect()
}
/// Get a map of <CPU ID, Cpu> for all Cpus on the host.
pub fn cpus(&self) -> &BTreeMap<usize, Cpu> {
&self.cpus
@ -549,12 +584,78 @@ fn avg_cpu_freq() -> Option<(usize, usize)> {
Some((avg_base_freq / nr_cpus, top_max_freq))
}
/// Discover the NVIDIA GPUs on the host through NVML and group them by NUMA node.
///
/// Returns a map from NUMA node ID to the GPUs attached to that node. The map is
/// empty when NVML cannot be initialized or the device count cannot be queried. A
/// device is skipped when its handle, memory info, PCI info or NVML index cannot be
/// queried.
fn create_gpus() -> BTreeMap<usize, Vec<Gpu>> {
    let mut gpus: BTreeMap<usize, Vec<Gpu>> = BTreeMap::new();

    // Don't fail if the system has no NVIDIA GPUs.
    let Ok(nvml) = Nvml::init_with_flags(InitFlags::NO_GPUS) else {
        return gpus;
    };
    let Ok(nvidia_gpu_count) = nvml.device_count() else {
        return gpus;
    };

    for i in 0..nvidia_gpu_count {
        let Ok(nvidia_gpu) = nvml.device_by_index(i) else {
            continue;
        };
        // Boost clocks are best effort: a failed query is recorded as 0.
        let graphics_boost_clock = nvidia_gpu
            .max_customer_boost_clock(Clock::Graphics)
            .unwrap_or(0);
        let sm_boost_clock = nvidia_gpu
            .max_customer_boost_clock(Clock::SM)
            .unwrap_or(0);
        let Ok(memory_info) = nvidia_gpu.memory_info() else {
            continue;
        };
        let Ok(pci_info) = nvidia_gpu.pci_info() else {
            continue;
        };
        let Ok(index) = nvidia_gpu.index() else {
            continue;
        };

        // The NVML library doesn't return a PCIe bus ID compatible with sysfs. It
        // includes uppercase bus ID values and an extra four leading 0s. If the prefix
        // is not present, use the lowercased ID as is rather than an empty string,
        // which would produce a path that can never exist.
        let bus_id = pci_info.bus_id.to_lowercase();
        let fixed_bus_id = bus_id.strip_prefix("0000").unwrap_or(bus_id.as_str());
        let numa_path = format!("/sys/bus/pci/devices/{}/numa_node", fixed_bus_id);
        // Attribute the GPU to node 0 when the NUMA node cannot be read.
        let numa_node = read_file_usize(Path::new(&numa_path)).unwrap_or(0);

        let gpu = Gpu {
            index: GpuIndex::Nvidia { nvml_id: index },
            node_id: numa_node,
            max_graphics_clock: graphics_boost_clock as usize,
            max_sm_clock: sm_boost_clock as usize,
            memory: memory_info.total,
        };
        // Single lookup: create the node's list on first use, then append.
        gpus.entry(numa_node).or_default().push(gpu);
    }

    gpus
}
fn create_default_node(online_mask: &Cpumask) -> Result<Vec<Node>> {
let mut nodes: Vec<Node> = Vec::with_capacity(1);
let system_gpus = create_gpus();
let mut node_gpus = BTreeMap::new();
match system_gpus.get(&0) {
Some(gpus) => {
for gpu in gpus {
node_gpus.insert(gpu.index, gpu.clone());
}
}
_ => {},
};
let mut node = Node {
id: 0,
llcs: BTreeMap::new(),
span: Cpumask::new()?,
gpus: node_gpus,
};
if !Path::new("/sys/devices/system/cpu").exists() {
@ -575,6 +676,8 @@ fn create_default_node(online_mask: &Cpumask) -> Result<Vec<Node>> {
fn create_numa_nodes(online_mask: &Cpumask) -> Result<Vec<Node>> {
let mut nodes: Vec<Node> = Vec::new();
let system_gpus = create_gpus();
let numa_paths = glob("/sys/devices/system/node/node*")?;
for numa_path in numa_paths.filter_map(Result::ok) {
let numa_str = numa_path.to_str().unwrap().trim();
@ -585,10 +688,21 @@ fn create_numa_nodes(online_mask: &Cpumask) -> Result<Vec<Node>> {
}
};
let mut node_gpus = BTreeMap::new();
match system_gpus.get(&node_id) {
Some(gpus) => {
for gpu in gpus {
node_gpus.insert(gpu.index, gpu.clone());
}
}
_ => {},
};
let mut node = Node {
id: node_id,
llcs: BTreeMap::new(),
span: Cpumask::new()?,
gpus: node_gpus,
};
let cpu_pattern = numa_path.join("cpu[0-9]*");

83
scheds/rust/Cargo.lock generated
View File

@ -404,6 +404,41 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "darling"
version = "0.20.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989"
dependencies = [
"darling_core",
"darling_macro",
]
[[package]]
name = "darling_core"
version = "0.20.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5"
dependencies = [
"fnv",
"ident_case",
"proc-macro2",
"quote",
"strsim 0.11.1",
"syn",
]
[[package]]
name = "darling_macro"
version = "0.20.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806"
dependencies = [
"darling_core",
"quote",
"syn",
]
[[package]]
name = "deranged"
version = "0.3.11"
@ -474,6 +509,12 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "fnv"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "funty"
version = "2.0.0"
@ -545,6 +586,12 @@ dependencies = [
"cc",
]
[[package]]
name = "ident_case"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
[[package]]
name = "indexmap"
version = "2.4.0"
@ -824,6 +871,29 @@ dependencies = [
"libc",
]
[[package]]
name = "nvml-wrapper"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c9bff0aa1d48904a1385ea2a8b97576fbdcbc9a3cfccd0d31fe978e1c4038c5"
dependencies = [
"bitflags 2.6.0",
"libloading",
"nvml-wrapper-sys",
"static_assertions",
"thiserror",
"wrapcenum-derive",
]
[[package]]
name = "nvml-wrapper-sys"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "698d45156f28781a4e79652b6ebe2eaa0589057d588d3aec1333f6466f13fcb5"
dependencies = [
"libloading",
]
[[package]]
name = "once_cell"
version = "1.19.0"
@ -1228,6 +1298,7 @@ dependencies = [
"log",
"metrics",
"metrics-util",
"nvml-wrapper",
"paste",
"regex",
"scx_stats",
@ -1838,6 +1909,18 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "wrapcenum-derive"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a76ff259533532054cfbaefb115c613203c73707017459206380f03b3b3f266e"
dependencies = [
"darling",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "wyz"
version = "0.5.1"