diff --git a/rust/scx_utils/src/cpumask.rs b/rust/scx_utils/src/cpumask.rs
index 876e1de..0bac94c 100644
--- a/rust/scx_utils/src/cpumask.rs
+++ b/rust/scx_utils/src/cpumask.rs
@@ -68,7 +68,7 @@ use std::ops::BitOrAssign;
 use std::ops::BitXor;
 use std::ops::BitXorAssign;
 
-#[derive(Debug, Eq, Clone, Ord, PartialEq, PartialOrd)]
+#[derive(Debug, Eq, Clone, Hash, Ord, PartialEq, PartialOrd)]
 pub struct Cpumask {
     mask: BitVec<u64, Lsb0>,
 }
@@ -146,6 +146,11 @@ impl Cpumask {
         }
     }
 
+    /// Build a `Cpumask` that takes ownership of an existing bit vector.
+    pub fn from_bitvec(bitvec: BitVec<u64, Lsb0>) -> Self {
+        Self { mask: bitvec }
+    }
+
     /// Return a slice of u64's whose bits reflect the Cpumask.
     pub fn as_raw_slice(&self) -> &[u64] {
         self.mask.as_raw_slice()
diff --git a/rust/scx_utils/src/lib.rs b/rust/scx_utils/src/lib.rs
index 735d7d9..720477b 100644
--- a/rust/scx_utils/src/lib.rs
+++ b/rust/scx_utils/src/lib.rs
@@ -88,5 +88,9 @@ pub use misc::monitor_stats;
 pub use misc::normalize_load_metric;
 pub use misc::set_rlimit_infinity;
 
+mod netdev;
+pub use netdev::read_netdevs;
+pub use netdev::NetDev;
+
 pub mod enums;
 pub use enums::scx_enums;
diff --git a/rust/scx_utils/src/netdev.rs b/rust/scx_utils/src/netdev.rs
new file mode 100644
index 0000000..b6e6e2c
--- /dev/null
+++ b/rust/scx_utils/src/netdev.rs
@@ -0,0 +1,97 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2.
+
+use std::collections::BTreeMap;
+use std::fs;
+use std::path::Path;
+
+use crate::misc::read_file_usize;
+use crate::Cpumask;
+use anyhow::Result;
+
+/// A network interface together with the CPU affinity of its MSI IRQs.
+#[derive(Debug, Clone, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub struct NetDev {
+    /// Interface name, i.e. the entry name under `/sys/class/net`.
+    pub iface: String,
+    /// Value read from the device's sysfs `node` file (0 if it cannot be read).
+    pub node: usize,
+    /// Per-IRQ mask parsed from `/proc/irq/<irq>/smp_affinity`.
+    pub irqs: BTreeMap<usize, Cpumask>,
+    /// Per-IRQ mask parsed from `/proc/irq/<irq>/affinity_hint`.
+    pub irq_hints: BTreeMap<usize, Cpumask>,
+}
+
+impl NetDev {
+    /// Write every mask in `irqs` to the matching `/proc/irq/<irq>/smp_affinity`.
+    ///
+    /// Stops at, and returns, the first write error.
+    pub fn apply_cpumasks(&self) -> Result<()> {
+        for (irq, cpumask) in self.irqs.iter() {
+            let irq_path = format!("/proc/irq/{}/smp_affinity", irq);
+            fs::write(irq_path, format!("{:#x}", cpumask))?;
+        }
+        Ok(())
+    }
+}
+
+/// Discover network devices that expose MSI IRQs.
+///
+/// Walks `/sys/class/net`, skipping interfaces without a `device/msi_irqs`
+/// directory, and records the current affinity and affinity hint of each
+/// listed IRQ. The result is keyed by interface name.
+pub fn read_netdevs() -> Result<BTreeMap<String, NetDev>> {
+    let mut netdevs: BTreeMap<String, NetDev> = BTreeMap::new();
+
+    for entry in fs::read_dir("/sys/class/net")? {
+        let entry = entry?;
+        let iface = entry.file_name().to_string_lossy().into_owned();
+        let raw_path = format!("/sys/class/net/{}/device/msi_irqs", iface);
+        let msi_irqs_path = Path::new(&raw_path);
+        if !msi_irqs_path.exists() {
+            continue;
+        }
+
+        let node_path_raw = format!("/sys/class/net/{}/device/node", iface);
+        let node_path = Path::new(&node_path_raw);
+        let node = read_file_usize(node_path).unwrap_or(0);
+        let mut irqs = BTreeMap::new();
+        let mut irq_hints = BTreeMap::new();
+
+        for entry in fs::read_dir(msi_irqs_path)? {
+            // Propagate directory-read errors instead of panicking.
+            let entry = entry?;
+            let irq = entry.file_name().to_string_lossy().into_owned();
+            if let Ok(irq) = irq.parse::<usize>() {
+                let affinity_raw_path = format!("/proc/irq/{}/smp_affinity", irq);
+                let smp_affinity_path = Path::new(&affinity_raw_path);
+                let smp_affinity = fs::read_to_string(smp_affinity_path)?
+                    .replace(",", "")
+                    .replace("\n", "");
+                let cpumask = Cpumask::from_str(&smp_affinity)?;
+                irqs.insert(irq, cpumask);
+
+                // affinity hints
+                let affinity_hint_raw_path = format!("/proc/irq/{}/affinity_hint", irq);
+                let affinity_hint_path = Path::new(&affinity_hint_raw_path);
+                let affinity_hint = fs::read_to_string(affinity_hint_path)?
+                    .replace(",", "")
+                    .replace("\n", "");
+                let hint_cpumask = Cpumask::from_str(&affinity_hint)?;
+                irq_hints.insert(irq, hint_cpumask);
+            }
+        }
+        netdevs.insert(
+            iface.clone(),
+            NetDev {
+                iface,
+                node,
+                irqs,
+                irq_hints,
+            },
+        );
+    }
+    Ok(netdevs)
+}
diff --git a/scheds/rust/scx_layered/src/lib.rs b/scheds/rust/scx_layered/src/lib.rs
index eb6a8fe..08aa1e8 100644
--- a/scheds/rust/scx_layered/src/lib.rs
+++ b/scheds/rust/scx_layered/src/lib.rs
@@ -222,6 +222,16 @@ impl CpuPool {
         Ok(Some(&self.core_cpus[core]))
     }
 
+    /// Return the union of the CPU masks of every core set in `available_cores`.
+    pub fn available_cpus(&self) -> BitVec<u64, Lsb0> {
+        let mut cpus = bitvec![u64, Lsb0; 0; self.nr_cpus];
+        for core in self.available_cores.iter_ones() {
+            // Borrow the per-core mask; OR-ing it in does not need an owned copy.
+            cpus |= self.core_cpus[core].as_bitslice();
+        }
+        cpus
+    }
+
     pub fn available_cpus_in_mask(&self, allowed_cpus: &BitVec) -> BitVec {
         let mut cpus = bitvec![0; self.nr_cpus];
         for core in self.available_cores.iter_ones() {
diff --git a/scheds/rust/scx_layered/src/main.rs b/scheds/rust/scx_layered/src/main.rs
index 99a3143..faa9562 100644
--- a/scheds/rust/scx_layered/src/main.rs
+++ b/scheds/rust/scx_layered/src/main.rs
@@ -44,6 +44,7 @@ use scx_utils::compat;
 use scx_utils::import_enums;
 use scx_utils::init_libbpf_logging;
 use scx_utils::ravg::ravg_read;
+use scx_utils::read_netdevs;
 use scx_utils::scx_enums;
 use scx_utils::scx_ops_attach;
 use scx_utils::scx_ops_load;
@@ -53,6 +54,7 @@ use scx_utils::uei_report;
 use scx_utils::Cache;
 use scx_utils::CoreType;
 use scx_utils::LoadAggregator;
+use scx_utils::NetDev;
 use scx_utils::Topology;
 use scx_utils::UserExitInfo;
 use stats::LayerStats;
@@ -471,6 +473,10 @@ struct Opts {
     #[clap(long, default_value = "false")]
     disable_antistall: bool,
 
+    /// Enable netdev IRQ balancing
+    #[clap(long, default_value = "false")]
+    netdev_irq_balance: bool,
+
     /// Maximum task runnable_at delay (in seconds) before antistall turns on
     #[clap(long, default_value = "3")]
     antistall_sec: u64,
@@ -1215,6 +1221,7 @@ struct Scheduler<'a> {
     nr_layer_cpus_ranges: Vec<(usize, usize)>,
     processing_dur: Duration,
 
+    netdevs: BTreeMap<String, NetDev>,
     stats_server: StatsServer,
 }
 
@@ -1399,6 +1406,12 @@ impl<'a> Scheduler<'a> {
         let topo = Topology::new()?;
         let cpu_pool = CpuPool::new(&topo)?;
 
+        let netdevs = if opts.netdev_irq_balance {
+            read_netdevs()?
+        } else {
+            BTreeMap::new()
+        };
+
         let disable_topology = if let Some(val) = opts.disable_topology {
             val
         } else {
@@ -1523,6 +1536,7 @@ impl<'a> Scheduler<'a> {
             proc_reader,
             skel,
 
+            netdevs,
             stats_server,
         };
 
@@ -1542,6 +1556,31 @@ impl<'a> Scheduler<'a> {
         bpf_layer.refresh_cpus = 1;
     }
 
+    /// Re-point every tracked netdev IRQ at the CPUs reported by
+    /// `cpu_pool.available_cpus()` and write the new masks out.
+    ///
+    /// Does nothing when no CPU bit is set. Returns the first error hit while
+    /// building or applying a mask.
+    fn update_netdev_cpumasks(&mut self) -> Result<()> {
+        let available_cpus = self.cpu_pool.available_cpus();
+        // `is_empty()` only tests for zero length; `not_any()` tests for no set bit.
+        if available_cpus.not_any() {
+            return Ok(());
+        }
+
+        for (iface, netdev) in self.netdevs.iter_mut() {
+            for (irq, irqmask) in netdev.irqs.iter_mut() {
+                irqmask.clear();
+                for cpu in available_cpus.iter_ones() {
+                    irqmask.set_cpu(cpu)?;
+                }
+                trace!("{} updating irq {} cpumask {:?}", iface, irq, irqmask);
+            }
+            netdev.apply_cpumasks()?;
+        }
+
+        Ok(())
+    }
+
     fn set_bpf_layer_preemption(layer: &mut Layer, bpf_layer: &mut types::layer, preempt: bool) {
         layer.preempt = preempt;
         bpf_layer.preempt.write(preempt);
@@ -1656,6 +1695,10 @@ impl<'a> Scheduler<'a> {
             }
         }
 
+        // IRQ rebalancing is best-effort: log the failure and keep going.
+        if let Err(e) = self.update_netdev_cpumasks() {
+            trace!("failed to update netdev irq cpumasks: {:#}", e);
+        }
         Ok(())
     }
 