mirror of
https://github.com/sched-ext/scx.git
synced 2024-11-24 20:00:22 +00:00
scx_rustland: introduce SMT support
Introduce basic support for CPU topology awareness. With this change, the scheduler will prioritize dispatching tasks to idle CPUs with fewer busy SMT siblings, then, it will proceed to CPUs with more busy SMT siblings, in ascending order. To implement this, introduce a new CoreMapping abstraction, that provides a mapping of the available core IDs in the system along with their corresponding lists of CPU IDs. This, coupled with the get_cpu_pid() method from the BpfScheduler abstraction, allows the user-space scheduler to enforce the policy outlined above and improve performance on SMT systems. Keep in mind that this improvement is relevant only when the amount of tasks running in the system is less than the amount of CPUs. As soon as the amount of running tasks increases, they will be distributed across all available CPUs and cores, thereby negating the advantages of SMT isolation. Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
This commit is contained in:
parent
09e7905ee0
commit
791bdbec97
@ -7,8 +7,6 @@ use crate::bpf_intf;
|
||||
use crate::bpf_skel::*;
|
||||
|
||||
use std::ffi::CStr;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufRead};
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::Result;
|
||||
@ -214,7 +212,7 @@ pub struct BpfScheduler<'a> {
|
||||
}
|
||||
|
||||
impl<'a> BpfScheduler<'a> {
|
||||
pub fn init(slice_us: u64, partial: bool, debug: bool) -> Result<Self> {
|
||||
pub fn init(slice_us: u64, nr_cpus_online: i32, partial: bool, debug: bool) -> Result<Self> {
|
||||
// Open the BPF prog first for verification.
|
||||
let skel_builder = BpfSkelBuilder::default();
|
||||
let mut skel = skel_builder.open().context("Failed to open BPF program")?;
|
||||
@ -225,9 +223,8 @@ impl<'a> BpfScheduler<'a> {
|
||||
|
||||
// Initialize online CPUs counter.
|
||||
//
|
||||
// We should probably refresh this counter during the normal execution to support cpu
|
||||
// NOTE: we should probably refresh this counter during the normal execution to support cpu
|
||||
// hotplugging, but for now let's keep it simple and set this only at initialization).
|
||||
let nr_cpus_online = Self::count_cpus()?;
|
||||
skel.rodata_mut().num_possible_cpus = nr_cpus_online;
|
||||
|
||||
// Set scheduler options (defined in the BPF part).
|
||||
@ -256,30 +253,6 @@ impl<'a> BpfScheduler<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
// Return the amount of available CPUs in the system (according to /proc/stat).
//
// /proc/stat starts with an aggregate "cpu" line followed by one "cpu<N>" line per CPU,
// so the counter starts at -1 to skip the aggregate entry. The per-CPU lines are
// contiguous, so we can stop at the first line that is not prefixed with "cpu".
fn count_cpus() -> io::Result<i32> {
    let reader = io::BufReader::new(File::open("/proc/stat")?);

    // Start at -1 to discount the aggregate "cpu" line.
    let mut nr_cpus = -1;
    for line in reader.lines() {
        // Guard clause: the per-CPU block is contiguous, stop at the first non-"cpu" line.
        if !line?.starts_with("cpu") {
            break;
        }
        nr_cpus += 1;
    }

    Ok(nr_cpus)
}
|
||||
|
||||
// Override the default scheduler time slice (in us).
|
||||
#[allow(dead_code)]
|
||||
pub fn get_nr_cpus(&self) -> i32 {
|
||||
self.skel.rodata().num_possible_cpus
|
||||
}
|
||||
|
||||
// Override the default scheduler time slice (in us).
|
||||
#[allow(dead_code)]
|
||||
pub fn set_effective_slice_us(&mut self, slice_us: u64) {
|
||||
|
@ -9,6 +9,9 @@ pub mod bpf_intf;
|
||||
mod bpf;
|
||||
use bpf::*;
|
||||
|
||||
mod topology;
|
||||
use topology::*;
|
||||
|
||||
use std::thread;
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
@ -63,9 +66,9 @@ const SCHEDULER_NAME: &'static str = "RustLand";
|
||||
///
|
||||
/// === Troubleshooting ===
|
||||
///
|
||||
/// - Disable HyperThreading / SMT if you notice poor performance: this scheduler lacks support for
|
||||
/// any type of core scheduling and lacks NUMA awareness, assuming uniform performance and
|
||||
/// migration costs across all CPUs.
|
||||
/// - Disable HyperThreading / SMT if you notice poor performance (add "nosmt" to the kernel boot
|
||||
/// parameters): this scheduler is not NUMA-aware and it implements a simple policy of handling
|
||||
/// SMT cores.
|
||||
///
|
||||
/// - Adjust the time slice boost parameter (option `-b`) to enhance the responsiveness of
|
||||
/// low-latency applications (i.e., online gaming, live streaming, video conferencing etc.).
|
||||
@ -200,6 +203,7 @@ impl TaskTree {
|
||||
// Main scheduler object
|
||||
struct Scheduler<'a> {
|
||||
bpf: BpfScheduler<'a>, // BPF connector
|
||||
cores: CoreMapping, // Map of core ids -> CPU ids
|
||||
task_pool: TaskTree, // tasks ordered by vruntime
|
||||
task_map: TaskInfoMap, // map pids to the corresponding task information
|
||||
min_vruntime: u64, // Keep track of the minimum vruntime across all tasks
|
||||
@ -210,9 +214,8 @@ struct Scheduler<'a> {
|
||||
|
||||
impl<'a> Scheduler<'a> {
|
||||
fn init(opts: &Opts) -> Result<Self> {
|
||||
// Low-level BPF connector.
|
||||
let bpf = BpfScheduler::init(opts.slice_us, opts.partial, opts.debug)?;
|
||||
info!("{} scheduler attached", SCHEDULER_NAME);
|
||||
// Initialize core mapping topology.
|
||||
let cores = CoreMapping::new();
|
||||
|
||||
// Save the default time slice (in ns) in the scheduler class.
|
||||
let slice_ns = opts.slice_us * MSEC_PER_SEC;
|
||||
@ -232,9 +235,19 @@ impl<'a> Scheduler<'a> {
|
||||
// Initialize initial page fault counter.
|
||||
let init_page_faults: u64 = 0;
|
||||
|
||||
// Low-level BPF connector.
|
||||
let bpf = BpfScheduler::init(
|
||||
opts.slice_us,
|
||||
cores.nr_cpus_online,
|
||||
opts.partial,
|
||||
opts.debug,
|
||||
)?;
|
||||
info!("{} scheduler attached", SCHEDULER_NAME);
|
||||
|
||||
// Return scheduler object.
|
||||
Ok(Self {
|
||||
bpf,
|
||||
cores,
|
||||
task_pool,
|
||||
task_map,
|
||||
min_vruntime,
|
||||
@ -244,17 +257,46 @@ impl<'a> Scheduler<'a> {
|
||||
})
|
||||
}
|
||||
|
||||
// Return the array of idle CPU ids.
|
||||
// Returns the list of idle CPUs, sorted by the amount of free sibling CPUs in the same core.
|
||||
//
|
||||
// In this way we can schedule first on the CPUs that are "more free" than others, maximizing
|
||||
// performance in presence of SMT cores.
|
||||
fn get_idle_cpus(&self) -> Vec<i32> {
|
||||
let mut idle_cpus = Vec::new();
|
||||
let cores = &self.cores.map;
|
||||
let num_cpus = self.cores.nr_cpus_online;
|
||||
|
||||
for cpu in 0..self.bpf.get_nr_cpus() {
|
||||
let pid = self.bpf.get_cpu_pid(cpu);
|
||||
if pid == 0 {
|
||||
idle_cpus.push(cpu);
|
||||
}
|
||||
}
|
||||
// Cache the results of self.bpf.get_cpu_pid() for all CPUs.
|
||||
let cpu_pid_map: Vec<u32> = (0..num_cpus)
|
||||
.map(|cpu_id| self.bpf.get_cpu_pid(cpu_id))
|
||||
.collect();
|
||||
|
||||
// Create a Vec of tuples containing core id and the number of busy CPU ids for each core.
|
||||
let mut core_status: Vec<(i32, usize)> = cores
|
||||
.iter()
|
||||
.map(|(&core_id, core_cpus)| {
|
||||
let busy_cpus = core_cpus
|
||||
.iter()
|
||||
.filter(|&&cpu_id| cpu_pid_map[cpu_id as usize] > 0)
|
||||
.count();
|
||||
(core_id, busy_cpus)
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Sort the Vec based on the number of busy CPUs in ascending order (free cores first).
|
||||
core_status.sort_by(|a, b| a.1.cmp(&b.1));
|
||||
|
||||
// Generate the list of idle CPU ids, ordered by the number of busy siblings (ascending).
|
||||
let idle_cpus: Vec<i32> = core_status
|
||||
.iter()
|
||||
.flat_map(|&(core_id, _)| {
|
||||
cores[&core_id]
|
||||
.iter()
|
||||
.filter(|&&cpu_id| cpu_pid_map[cpu_id as usize] == 0)
|
||||
.cloned()
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Return the list of idle CPUs.
|
||||
idle_cpus
|
||||
}
|
||||
|
||||
@ -600,19 +642,25 @@ impl<'a> Scheduler<'a> {
|
||||
// Show current used time slice.
|
||||
info!("time slice = {} us", self.bpf.get_effective_slice_us());
|
||||
|
||||
// Show tasks that are currently running.
|
||||
// Show tasks that are currently running on each core and CPU.
|
||||
let sched_cpu = match Self::get_current_cpu() {
|
||||
Ok(cpu_info) => cpu_info,
|
||||
Err(_) => -1,
|
||||
};
|
||||
info!("Running tasks:");
|
||||
for cpu in 0..self.bpf.get_nr_cpus() {
|
||||
let pid = if cpu == sched_cpu {
|
||||
"[self]".to_string()
|
||||
} else {
|
||||
self.bpf.get_cpu_pid(cpu).to_string()
|
||||
};
|
||||
info!(" cpu={} pid={}", cpu, pid);
|
||||
let mut core_ids: Vec<i32> = self.cores.map.keys().copied().collect();
|
||||
core_ids.sort();
|
||||
for core_id in core_ids {
|
||||
let mut cpu_ids: Vec<i32> = self.cores.map.get(&core_id).unwrap().clone();
|
||||
cpu_ids.sort();
|
||||
for cpu_id in cpu_ids {
|
||||
let pid = if cpu_id == sched_cpu {
|
||||
"[self]".to_string()
|
||||
} else {
|
||||
self.bpf.get_cpu_pid(cpu_id).to_string()
|
||||
};
|
||||
info!(" core {:2} cpu {:2} pid={}", core_id, cpu_id, pid);
|
||||
}
|
||||
}
|
||||
|
||||
log::logger().flush();
|
||||
|
84
scheds/rust/scx_rustland/src/topology.rs
Normal file
84
scheds/rust/scx_rustland/src/topology.rs
Normal file
@ -0,0 +1,84 @@
|
||||
// Copyright (c) Andrea Righi <andrea.righi@canonical.com>
|
||||
|
||||
// This software may be used and distributed according to the terms of the
|
||||
// GNU General Public License version 2.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
|
||||
/// scx_rustland: CPU topology.
|
||||
///
|
||||
/// CoreMapping provides a map of the CPU topology, with the list of CPU ids in the system grouped
|
||||
/// by their corresponding core id.
|
||||
///
|
||||
/// The CPU / core mapping is stored in a HashMap using the core ID as the key. An example content
|
||||
/// of the HashMap can be the following:
|
||||
///
|
||||
/// core 0: [4, 0]
|
||||
/// core 1: [5, 1]
|
||||
/// core 2: [6, 2]
|
||||
/// core 3: [7, 3]
|
||||
///
|
||||
/// This information can be used by the scheduler to apply a more efficient scheduling policy, for
|
||||
/// example dispatching tasks on the CPUs that have all the siblings idle first, and later move to
|
||||
/// the CPUs with busy siblings.
|
||||
|
||||
const CPU_PATH: &str = "/sys/devices/system/cpu";
|
||||
|
||||
pub struct CoreMapping {
    // Map of core IDs -> list of CPU ids sharing that physical core (SMT siblings).
    //
    // Iteration order over a HashMap is unspecified; callers that need a stable order
    // must sort the keys themselves.
    //
    // NOTE: we must periodically refresh this map if we want to support CPU hotplugging, for now
    // let's assume it's static.
    pub map: HashMap<i32, Vec<i32>>,

    // Number of available CPUs in the system (total count of CPU ids stored in `map`).
    //
    // NOTE: we must periodically refresh this value if we want to support CPU hotplugging, for now
    // let's assume it's static.
    pub nr_cpus_online: i32,
}
|
||||
|
||||
impl CoreMapping {
|
||||
pub fn new() -> Self {
|
||||
let mut core_mapping = CoreMapping {
|
||||
map: HashMap::new(),
|
||||
nr_cpus_online: 0,
|
||||
};
|
||||
core_mapping.init_mapping();
|
||||
|
||||
core_mapping
|
||||
}
|
||||
|
||||
// Evaluate the amount of available CPUs in the system.
|
||||
// Initialize the core ids -> CPU ids HashMap, parsing all the information from
|
||||
// /sys/devices/system/cpu/cpu<id>/topology/core_id.
|
||||
fn init_mapping(&mut self) {
|
||||
let cpu_entries: Vec<_> = fs::read_dir(CPU_PATH)
|
||||
.expect(format!("Failed to read: {}", CPU_PATH).as_str())
|
||||
.filter_map(|entry| entry.ok())
|
||||
.collect();
|
||||
|
||||
// Generate core map.
|
||||
for entry in cpu_entries {
|
||||
let entry_path = entry.path();
|
||||
let cpu_name = entry.file_name();
|
||||
let cpu_id_str = cpu_name.to_string_lossy().to_string();
|
||||
if cpu_id_str.starts_with("cpu") {
|
||||
if let Some(cpu_id) = cpu_id_str.chars().skip(3).collect::<String>().parse().ok() {
|
||||
let core_id_path = entry_path.join("topology/core_id");
|
||||
if let Some(core_id) = fs::read_to_string(&core_id_path)
|
||||
.ok()
|
||||
.and_then(|content| content.trim().parse().ok())
|
||||
{
|
||||
// Add CPU id to the core map.
|
||||
self.map.entry(core_id).or_insert(Vec::new()).push(cpu_id);
|
||||
|
||||
// Update total CPU ids counter.
|
||||
self.nr_cpus_online += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user