Merge pull request #90 from sched-ext/scx-rustland-smt

scx_rustland: introduce SMT support
David Vernet 2024-01-16 10:30:41 -06:00 committed by GitHub
commit b8687a051e
3 changed files with 156 additions and 51 deletions

View File

@@ -7,8 +7,6 @@ use crate::bpf_intf;
use crate::bpf_skel::*;
use std::ffi::CStr;
use std::fs::File;
use std::io::{self, BufRead};
use anyhow::Context;
use anyhow::Result;
@@ -214,7 +212,7 @@ pub struct BpfScheduler<'a> {
}
impl<'a> BpfScheduler<'a> {
pub fn init(slice_us: u64, partial: bool, debug: bool) -> Result<Self> {
pub fn init(slice_us: u64, nr_cpus_online: i32, partial: bool, debug: bool) -> Result<Self> {
// Open the BPF prog first for verification.
let skel_builder = BpfSkelBuilder::default();
let mut skel = skel_builder.open().context("Failed to open BPF program")?;
@@ -225,9 +223,8 @@ impl<'a> BpfScheduler<'a> {
// Initialize online CPUs counter.
//
// We should probably refresh this counter during the normal execution to support cpu
// NOTE: we should probably refresh this counter during the normal execution to support cpu
// hotplugging, but for now let's keep it simple and set this only at initialization.
let nr_cpus_online = Self::count_cpus()?;
skel.rodata_mut().num_possible_cpus = nr_cpus_online;
// Set scheduler options (defined in the BPF part).
@@ -256,30 +253,6 @@ impl<'a> BpfScheduler<'a> {
}
}
// Return the number of available CPUs in the system (according to /proc/stat).
fn count_cpus() -> io::Result<i32> {
let file = File::open("/proc/stat")?;
let reader = io::BufReader::new(file);
let mut cpu_count = -1;
for line in reader.lines() {
let line = line?;
if line.starts_with("cpu") {
cpu_count += 1;
} else {
break;
}
}
Ok(cpu_count)
}
// Return the number of CPUs available in the system (as detected at initialization).
#[allow(dead_code)]
pub fn get_nr_cpus(&self) -> i32 {
self.skel.rodata().num_possible_cpus
}
// Override the default scheduler time slice (in us).
#[allow(dead_code)]
pub fn set_effective_slice_us(&mut self, slice_us: u64) {

View File

@@ -9,6 +9,9 @@ pub mod bpf_intf;
mod bpf;
use bpf::*;
mod topology;
use topology::*;
use std::thread;
use std::collections::BTreeSet;
@@ -63,9 +66,9 @@ const SCHEDULER_NAME: &'static str = "RustLand";
///
/// === Troubleshooting ===
///
/// - Disable HyperThreading / SMT if you notice poor performance: this scheduler lacks support for
/// any type of core scheduling and lacks NUMA awareness, assuming uniform performance and
/// migration costs across all CPUs.
/// - Disable HyperThreading / SMT if you notice poor performance (add "nosmt" to the kernel boot
/// parameters): this scheduler is not NUMA-aware and it only implements a simple policy for
/// handling SMT cores.
///
/// - Adjust the time slice boost parameter (option `-b`) to enhance the responsiveness of
/// low-latency applications (i.e., online gaming, live streaming, video conferencing etc.).
@@ -200,6 +203,7 @@ impl TaskTree {
// Main scheduler object
struct Scheduler<'a> {
bpf: BpfScheduler<'a>, // BPF connector
cores: CoreMapping, // Map of core ids -> CPU ids
task_pool: TaskTree, // tasks ordered by vruntime
task_map: TaskInfoMap, // map pids to the corresponding task information
min_vruntime: u64, // Keep track of the minimum vruntime across all tasks
@@ -210,9 +214,8 @@ struct Scheduler<'a> {
impl<'a> Scheduler<'a> {
fn init(opts: &Opts) -> Result<Self> {
// Low-level BPF connector.
let bpf = BpfScheduler::init(opts.slice_us, opts.partial, opts.debug)?;
info!("{} scheduler attached", SCHEDULER_NAME);
// Initialize core mapping topology.
let cores = CoreMapping::new();
// Save the default time slice (in ns) in the scheduler class.
let slice_ns = opts.slice_us * MSEC_PER_SEC;
@@ -232,9 +235,19 @@ impl<'a> Scheduler<'a> {
// Initialize initial page fault counter.
let init_page_faults: u64 = 0;
// Low-level BPF connector.
let bpf = BpfScheduler::init(
opts.slice_us,
cores.nr_cpus_online,
opts.partial,
opts.debug,
)?;
info!("{} scheduler attached", SCHEDULER_NAME);
// Return scheduler object.
Ok(Self {
bpf,
cores,
task_pool,
task_map,
min_vruntime,
@@ -244,17 +257,46 @@ impl<'a> Scheduler<'a> {
})
}
// Return the array of idle CPU ids.
// Returns the list of idle CPUs, sorted by the number of free sibling CPUs in the same core.
//
// In this way we can schedule tasks first on the CPUs that are "more free" than others, maximizing
// performance in the presence of SMT cores.
fn get_idle_cpus(&self) -> Vec<i32> {
let mut idle_cpus = Vec::new();
let cores = &self.cores.map;
let num_cpus = self.cores.nr_cpus_online;
for cpu in 0..self.bpf.get_nr_cpus() {
let pid = self.bpf.get_cpu_pid(cpu);
if pid == 0 {
idle_cpus.push(cpu);
}
}
// Cache the results of self.bpf.get_cpu_pid() for all CPUs.
let cpu_pid_map: Vec<u32> = (0..num_cpus)
.map(|cpu_id| self.bpf.get_cpu_pid(cpu_id))
.collect();
// Create a Vec of tuples containing the core id and the number of busy CPUs for each core.
let mut core_status: Vec<(i32, usize)> = cores
.iter()
.map(|(&core_id, core_cpus)| {
let busy_cpus = core_cpus
.iter()
.filter(|&&cpu_id| cpu_pid_map[cpu_id as usize] > 0)
.count();
(core_id, busy_cpus)
})
.collect();
// Sort the Vec based on the number of busy CPUs in ascending order (free cores first).
core_status.sort_by(|a, b| a.1.cmp(&b.1));
// Generate the list of idle CPU ids, ordered by the number of busy siblings (ascending).
let idle_cpus: Vec<i32> = core_status
.iter()
.flat_map(|&(core_id, _)| {
cores[&core_id]
.iter()
.filter(|&&cpu_id| cpu_pid_map[cpu_id as usize] == 0)
.cloned()
})
.collect();
// Return the list of idle CPUs.
idle_cpus
}
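
As an aside, the sibling-aware ordering above can be illustrated with a minimal standalone sketch. The toy topology and the order_idle_cpus() helper are hypothetical and not part of this patch; they only mirror the ranking logic of get_idle_cpus(): fully idle cores are drained first, so new tasks land on CPUs whose hardware siblings are not already busy.

use std::collections::HashMap;

fn order_idle_cpus(cores: &HashMap<i32, Vec<i32>>, busy: &[i32]) -> Vec<i32> {
    // Rank cores by how many of their CPUs are busy (ascending).
    let mut core_status: Vec<(i32, usize)> = cores
        .iter()
        .map(|(&core_id, cpus)| (core_id, cpus.iter().filter(|c| busy.contains(c)).count()))
        .collect();
    core_status.sort_by(|a, b| a.1.cmp(&b.1));

    // Emit the idle CPUs of each core, cores with fewer busy siblings first.
    core_status
        .iter()
        .flat_map(|&(core_id, _)| cores[&core_id].iter().copied().filter(|c| !busy.contains(c)))
        .collect()
}

fn main() {
    // Toy topology: core 0 = CPUs [0, 4], core 1 = CPUs [1, 5]; CPU 4 is busy.
    let cores = HashMap::from([(0, vec![0, 4]), (1, vec![1, 5])]);
    // Core 1 has no busy siblings, so its CPUs are ranked first: [1, 5, 0].
    assert_eq!(order_idle_cpus(&cores, &[4]), vec![1, 5, 0]);
}
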
@@ -600,19 +642,25 @@ impl<'a> Scheduler<'a> {
// Show current used time slice.
info!("time slice = {} us", self.bpf.get_effective_slice_us());
// Show tasks that are currently running.
// Show tasks that are currently running on each core and CPU.
let sched_cpu = match Self::get_current_cpu() {
Ok(cpu_info) => cpu_info,
Err(_) => -1,
};
info!("Running tasks:");
for cpu in 0..self.bpf.get_nr_cpus() {
let pid = if cpu == sched_cpu {
"[self]".to_string()
} else {
self.bpf.get_cpu_pid(cpu).to_string()
};
info!(" cpu={} pid={}", cpu, pid);
let mut core_ids: Vec<i32> = self.cores.map.keys().copied().collect();
core_ids.sort();
for core_id in core_ids {
let mut cpu_ids: Vec<i32> = self.cores.map.get(&core_id).unwrap().clone();
cpu_ids.sort();
for cpu_id in cpu_ids {
let pid = if cpu_id == sched_cpu {
"[self]".to_string()
} else {
self.bpf.get_cpu_pid(cpu_id).to_string()
};
info!(" core {:2} cpu {:2} pid={}", core_id, cpu_id, pid);
}
}
log::logger().flush();

View File

@@ -0,0 +1,84 @@
// Copyright (c) Andrea Righi <andrea.righi@canonical.com>
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
use std::collections::HashMap;
use std::fs;
/// scx_rustland: CPU topology.
///
/// CoreMapping provides a map of the CPU topology, with the list of CPU ids in the system grouped
/// by their corresponding core id.
///
/// The CPU / core mapping is stored in a HashMap using the core id as the key. An example of the
/// HashMap content is the following:
///
/// core 0: [4, 0]
/// core 1: [5, 1]
/// core 2: [6, 2]
/// core 3: [7, 3]
///
/// This information can be used by the scheduler to apply a more efficient scheduling policy, for
/// example dispatching tasks first on the CPUs that have all their siblings idle, and only later
/// moving to the CPUs with busy siblings.
const CPU_PATH: &str = "/sys/devices/system/cpu";
pub struct CoreMapping {
// Map of core IDs -> list of CPU ids.
//
// NOTE: we must periodically refresh this map if we want to support CPU hotplugging, for now
// let's assume it's static.
pub map: HashMap<i32, Vec<i32>>,
// Number of available CPUs in the system.
//
// NOTE: we must periodically refresh this value if we want to support CPU hotplugging, for now
// let's assume it's static.
pub nr_cpus_online: i32,
}
impl CoreMapping {
pub fn new() -> Self {
let mut core_mapping = CoreMapping {
map: HashMap::new(),
nr_cpus_online: 0,
};
core_mapping.init_mapping();
core_mapping
}
// Evaluate the number of available CPUs in the system and initialize the core ids -> CPU ids
// HashMap, parsing all the information from /sys/devices/system/cpu/cpu<id>/topology/core_id.
fn init_mapping(&mut self) {
let cpu_entries: Vec<_> = fs::read_dir(CPU_PATH)
.expect(format!("Failed to read: {}", CPU_PATH).as_str())
.filter_map(|entry| entry.ok())
.collect();
// Generate core map.
for entry in cpu_entries {
let entry_path = entry.path();
let cpu_name = entry.file_name();
let cpu_id_str = cpu_name.to_string_lossy().to_string();
if cpu_id_str.starts_with("cpu") {
if let Some(cpu_id) = cpu_id_str.chars().skip(3).collect::<String>().parse().ok() {
let core_id_path = entry_path.join("topology/core_id");
if let Some(core_id) = fs::read_to_string(&core_id_path)
.ok()
.and_then(|content| content.trim().parse().ok())
{
// Add CPU id to the core map.
self.map.entry(core_id).or_insert(Vec::new()).push(cpu_id);
// Update total CPU ids counter.
self.nr_cpus_online += 1;
}
}
}
}
}
}
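
For reference, a minimal usage sketch of the new CoreMapping type, relying only on the public items introduced above (CoreMapping::new(), map, nr_cpus_online). The standalone main() is illustrative and assumes the type is in scope (e.g. via use topology::*;, as done in the scheduler's main module):

fn main() {
    // Build the topology map from /sys/devices/system/cpu (panics if sysfs cannot be read).
    let cores = CoreMapping::new();
    println!("online CPUs: {}", cores.nr_cpus_online);

    // Print the CPU ids grouped by core id, in ascending core id order.
    let mut core_ids: Vec<i32> = cores.map.keys().copied().collect();
    core_ids.sort();
    for core_id in core_ids {
        println!("core {}: {:?}", core_id, cores.map[&core_id]);
    }
}
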