Merge pull request #90 from sched-ext/scx-rustland-smt

scx_rustland: introduce SMT support

commit b8687a051e
scheds/rust/scx_rustland/src/bpf.rs

@@ -7,8 +7,6 @@ use crate::bpf_intf;
 use crate::bpf_skel::*;
 
 use std::ffi::CStr;
-use std::fs::File;
-use std::io::{self, BufRead};
 
 use anyhow::Context;
 use anyhow::Result;
@@ -214,7 +212,7 @@ pub struct BpfScheduler<'a> {
 }
 
 impl<'a> BpfScheduler<'a> {
-    pub fn init(slice_us: u64, partial: bool, debug: bool) -> Result<Self> {
+    pub fn init(slice_us: u64, nr_cpus_online: i32, partial: bool, debug: bool) -> Result<Self> {
         // Open the BPF prog first for verification.
         let skel_builder = BpfSkelBuilder::default();
         let mut skel = skel_builder.open().context("Failed to open BPF program")?;
@@ -225,9 +223,8 @@ impl<'a> BpfScheduler<'a> {
 
         // Initialize online CPUs counter.
         //
-        // We should probably refresh this counter during the normal execution to support cpu
+        // NOTE: we should probably refresh this counter during the normal execution to support cpu
         // hotplugging, but for now let's keep it simple and set this only at initialization.
-        let nr_cpus_online = Self::count_cpus()?;
         skel.rodata_mut().num_possible_cpus = nr_cpus_online;
 
         // Set scheduler options (defined in the BPF part).
@@ -256,30 +253,6 @@ impl<'a> BpfScheduler<'a> {
         }
     }
 
-    // Return the amount of available CPUs in the system (according to /proc/stat).
-    fn count_cpus() -> io::Result<i32> {
-        let file = File::open("/proc/stat")?;
-        let reader = io::BufReader::new(file);
-        let mut cpu_count = -1;
-
-        for line in reader.lines() {
-            let line = line?;
-            if line.starts_with("cpu") {
-                cpu_count += 1;
-            } else {
-                break;
-            }
-        }
-
-        Ok(cpu_count)
-    }
-
-    // Return the number of available CPUs in the system.
-    #[allow(dead_code)]
-    pub fn get_nr_cpus(&self) -> i32 {
-        self.skel.rodata().num_possible_cpus
-    }
-
     // Override the default scheduler time slice (in us).
     #[allow(dead_code)]
     pub fn set_effective_slice_us(&mut self, slice_us: u64) {
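The removed count_cpus() helper relied on the layout of /proc/stat: the aggregate "cpu" line comes first, followed by one "cpuN" line per online CPU, which is why the counter starts at -1. A minimal self-contained sketch of the same idea (not part of the commit; the helper name is illustrative), testable against sample input:

    // Count the "cpu*" lines the way the removed count_cpus() did, but over
    // any iterator of lines so it can be exercised without reading /proc/stat.
    fn count_cpu_lines<I: Iterator<Item = String>>(lines: I) -> i32 {
        let mut cpu_count = -1; // skip the aggregate "cpu" line
        for line in lines {
            if line.starts_with("cpu") {
                cpu_count += 1;
            } else {
                break; // the "cpuN" lines come first in /proc/stat
            }
        }
        cpu_count
    }

    fn main() {
        let sample = ["cpu  100 0 50", "cpu0 50 0 25", "cpu1 50 0 25", "intr 123"];
        assert_eq!(count_cpu_lines(sample.iter().map(|s| s.to_string())), 2);
    }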
scheds/rust/scx_rustland/src/main.rs

@@ -9,6 +9,9 @@ pub mod bpf_intf;
 mod bpf;
 use bpf::*;
 
+mod topology;
+use topology::*;
+
 use std::thread;
 
 use std::collections::BTreeSet;
@@ -63,9 +66,9 @@ const SCHEDULER_NAME: &'static str = "RustLand";
 ///
 /// === Troubleshooting ===
 ///
-/// - Disable HyperThreading / SMT if you notice poor performance: this scheduler lacks support for
-///   any type of core scheduling and lacks NUMA awareness, assuming uniform performance and
-///   migration costs across all CPUs.
+/// - Disable HyperThreading / SMT if you notice poor performance (add "nosmt" to the kernel boot
+///   parameters): this scheduler is not NUMA-aware and it implements only a simple policy for
+///   handling SMT cores.
 ///
 /// - Adjust the time slice boost parameter (option `-b`) to enhance the responsiveness of
 ///   low-latency applications (e.g., online gaming, live streaming, video conferencing etc.).
@@ -200,6 +203,7 @@ impl TaskTree {
 // Main scheduler object
 struct Scheduler<'a> {
     bpf: BpfScheduler<'a>,    // BPF connector
+    cores: CoreMapping,       // Map of core ids -> CPU ids
     task_pool: TaskTree,      // tasks ordered by vruntime
     task_map: TaskInfoMap,    // map pids to the corresponding task information
     min_vruntime: u64,        // Keep track of the minimum vruntime across all tasks
@@ -210,9 +214,8 @@ struct Scheduler<'a> {
 
 impl<'a> Scheduler<'a> {
     fn init(opts: &Opts) -> Result<Self> {
-        // Low-level BPF connector.
-        let bpf = BpfScheduler::init(opts.slice_us, opts.partial, opts.debug)?;
-        info!("{} scheduler attached", SCHEDULER_NAME);
+        // Initialize core mapping topology.
+        let cores = CoreMapping::new();
 
         // Save the default time slice (in ns) in the scheduler class.
         let slice_ns = opts.slice_us * MSEC_PER_SEC;
@@ -232,9 +235,19 @@ impl<'a> Scheduler<'a> {
         // Initialize initial page fault counter.
         let init_page_faults: u64 = 0;
 
+        // Low-level BPF connector.
+        let bpf = BpfScheduler::init(
+            opts.slice_us,
+            cores.nr_cpus_online,
+            opts.partial,
+            opts.debug,
+        )?;
+        info!("{} scheduler attached", SCHEDULER_NAME);
+
         // Return scheduler object.
         Ok(Self {
             bpf,
+            cores,
             task_pool,
             task_map,
             min_vruntime,
@@ -244,17 +257,46 @@ impl<'a> Scheduler<'a> {
         })
     }
 
-    // Return the array of idle CPU ids.
+    // Returns the list of idle CPUs, sorted by the amount of free sibling CPUs in the same core.
+    //
+    // In this way we can schedule first on the CPUs that are "more free" than others, maximizing
+    // performance in presence of SMT cores.
     fn get_idle_cpus(&self) -> Vec<i32> {
-        let mut idle_cpus = Vec::new();
+        let cores = &self.cores.map;
+        let num_cpus = self.cores.nr_cpus_online;
 
-        for cpu in 0..self.bpf.get_nr_cpus() {
-            let pid = self.bpf.get_cpu_pid(cpu);
-            if pid == 0 {
-                idle_cpus.push(cpu);
-            }
-        }
+        // Cache the results of self.bpf.get_cpu_pid() for all CPUs.
+        let cpu_pid_map: Vec<u32> = (0..num_cpus)
+            .map(|cpu_id| self.bpf.get_cpu_pid(cpu_id))
+            .collect();
+
+        // Create a Vec of tuples containing core id and the number of busy CPU ids for each core.
+        let mut core_status: Vec<(i32, usize)> = cores
+            .iter()
+            .map(|(&core_id, core_cpus)| {
+                let busy_cpus = core_cpus
+                    .iter()
+                    .filter(|&&cpu_id| cpu_pid_map[cpu_id as usize] > 0)
+                    .count();
+                (core_id, busy_cpus)
+            })
+            .collect();
+
+        // Sort the Vec based on the number of busy CPUs in ascending order (free cores first).
+        core_status.sort_by(|a, b| a.1.cmp(&b.1));
+
+        // Generate the list of idle CPU ids, ordered by the number of busy siblings (ascending).
+        let idle_cpus: Vec<i32> = core_status
+            .iter()
+            .flat_map(|&(core_id, _)| {
+                cores[&core_id]
+                    .iter()
+                    .filter(|&&cpu_id| cpu_pid_map[cpu_id as usize] == 0)
+                    .cloned()
+            })
+            .collect();
 
+        // Return the list of idle CPUs.
         idle_cpus
     }
 
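To make the new ordering concrete: a self-contained sketch (not part of the commit) that reproduces the core_status / idle_cpus logic for a hypothetical 4-core / 8-CPU SMT topology, with a fabricated CPU -> pid snapshot standing in for self.bpf.get_cpu_pid():

    use std::collections::HashMap;

    fn main() {
        // Hypothetical topology: core id -> sibling CPU ids.
        let mut cores: HashMap<i32, Vec<i32>> = HashMap::new();
        cores.insert(0, vec![4, 0]);
        cores.insert(1, vec![5, 1]);
        cores.insert(2, vec![6, 2]);
        cores.insert(3, vec![7, 3]);

        // Fabricated pid per CPU id: CPUs 0 and 4 busy (core 0 saturated),
        // CPU 1 busy (core 1 half busy), cores 2 and 3 fully idle.
        let cpu_pid_map: Vec<u32> = vec![42, 43, 0, 0, 44, 0, 0, 0];

        // Count busy siblings per core, then sort cores by that count.
        let mut core_status: Vec<(i32, usize)> = cores
            .iter()
            .map(|(&core_id, core_cpus)| {
                let busy_cpus = core_cpus
                    .iter()
                    .filter(|&&cpu_id| cpu_pid_map[cpu_id as usize] > 0)
                    .count();
                (core_id, busy_cpus)
            })
            .collect();
        core_status.sort_by(|a, b| a.1.cmp(&b.1));

        // Collect the idle CPUs, free cores first.
        let idle_cpus: Vec<i32> = core_status
            .iter()
            .flat_map(|&(core_id, _)| {
                cores[&core_id]
                    .iter()
                    .filter(|&&cpu_id| cpu_pid_map[cpu_id as usize] == 0)
                    .cloned()
            })
            .collect();

        // Fully idle cores 2 and 3 come first, then core 1's free sibling
        // (CPU 5); saturated core 0 contributes nothing. Ties between equally
        // busy cores follow HashMap iteration order, which is unspecified.
        println!("{:?}", idle_cpus); // e.g. [6, 2, 7, 3, 5]
    }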
@@ -600,19 +642,25 @@ impl<'a> Scheduler<'a> {
         // Show current used time slice.
         info!("time slice = {} us", self.bpf.get_effective_slice_us());
 
-        // Show tasks that are currently running.
+        // Show tasks that are currently running on each core and CPU.
         let sched_cpu = match Self::get_current_cpu() {
             Ok(cpu_info) => cpu_info,
             Err(_) => -1,
         };
         info!("Running tasks:");
-        for cpu in 0..self.bpf.get_nr_cpus() {
-            let pid = if cpu == sched_cpu {
-                "[self]".to_string()
-            } else {
-                self.bpf.get_cpu_pid(cpu).to_string()
-            };
-            info!("  cpu={} pid={}", cpu, pid);
+        let mut core_ids: Vec<i32> = self.cores.map.keys().copied().collect();
+        core_ids.sort();
+        for core_id in core_ids {
+            let mut cpu_ids: Vec<i32> = self.cores.map.get(&core_id).unwrap().clone();
+            cpu_ids.sort();
+            for cpu_id in cpu_ids {
+                let pid = if cpu_id == sched_cpu {
+                    "[self]".to_string()
+                } else {
+                    self.bpf.get_cpu_pid(cpu_id).to_string()
+                };
+                info!("  core {:2} cpu {:2} pid={}", core_id, cpu_id, pid);
+            }
         }
 
         log::logger().flush();
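With the nested loop above, the periodic debug dump now groups CPUs under their core. On the hypothetical 4-core / 8-CPU topology used earlier, the output would look roughly like this (pids invented for illustration):

    Running tasks:
      core  0 cpu  0 pid=1234
      core  0 cpu  4 pid=[self]
      core  1 cpu  1 pid=5678
      core  1 cpu  5 pid=0
      core  2 cpu  2 pid=0
      ...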
84  scheds/rust/scx_rustland/src/topology.rs  Normal file
@@ -0,0 +1,84 @@
+// Copyright (c) Andrea Righi <andrea.righi@canonical.com>
+
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2.
+
+use std::collections::HashMap;
+use std::fs;
+
+/// scx_rustland: CPU topology.
+///
+/// CoreMapping provides a map of the CPU topology, with the list of CPU ids in the system grouped
+/// by their corresponding core id.
+///
+/// The CPU / core mapping is stored in a HashMap using the core ID as the key. An example content
+/// of the HashMap can be the following:
+///
+///   core 0: [4, 0]
+///   core 1: [5, 1]
+///   core 2: [6, 2]
+///   core 3: [7, 3]
+///
+/// This information can be used by the scheduler to apply a more efficient scheduling policy, for
+/// example dispatching tasks on the CPUs that have all the siblings idle first, and later move to
+/// the CPUs with busy siblings.
+
+const CPU_PATH: &str = "/sys/devices/system/cpu";
+
+pub struct CoreMapping {
+    // Map of core IDs -> list of CPU ids.
+    //
+    // NOTE: we must periodically refresh this map if we want to support CPU hotplugging; for now
+    // let's assume it's static.
+    pub map: HashMap<i32, Vec<i32>>,
+
+    // Number of available CPUs in the system.
+    //
+    // NOTE: we must periodically refresh this value if we want to support CPU hotplugging; for now
+    // let's assume it's static.
+    pub nr_cpus_online: i32,
+}
+
+impl CoreMapping {
+    pub fn new() -> Self {
+        let mut core_mapping = CoreMapping {
+            map: HashMap::new(),
+            nr_cpus_online: 0,
+        };
+        core_mapping.init_mapping();
+
+        core_mapping
+    }
+
+    // Evaluate the amount of available CPUs in the system.
+    // Initialize the core ids -> CPU ids HashMap, parsing all the information from
+    // /sys/devices/system/cpu/cpu<id>/topology/core_id.
+    fn init_mapping(&mut self) {
+        let cpu_entries: Vec<_> = fs::read_dir(CPU_PATH)
+            .expect(format!("Failed to read: {}", CPU_PATH).as_str())
+            .filter_map(|entry| entry.ok())
+            .collect();
+
+        // Generate core map.
+        for entry in cpu_entries {
+            let entry_path = entry.path();
+            let cpu_name = entry.file_name();
+            let cpu_id_str = cpu_name.to_string_lossy().to_string();
+            if cpu_id_str.starts_with("cpu") {
+                if let Some(cpu_id) = cpu_id_str.chars().skip(3).collect::<String>().parse().ok() {
+                    let core_id_path = entry_path.join("topology/core_id");
+                    if let Some(core_id) = fs::read_to_string(&core_id_path)
+                        .ok()
+                        .and_then(|content| content.trim().parse().ok())
+                    {
+                        // Add CPU id to the core map.
+                        self.map.entry(core_id).or_insert(Vec::new()).push(cpu_id);
+
+                        // Update total CPU ids counter.
+                        self.nr_cpus_online += 1;
+                    }
+                }
+            }
+        }
+    }
+}
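A minimal usage sketch of the new module (not part of the commit), assuming topology.rs sits next to main.rs; the actual grouping depends on the machine's sysfs topology:

    mod topology;
    use topology::CoreMapping;

    fn main() {
        // Build the core id -> CPU ids map by parsing sysfs.
        let cores = CoreMapping::new();
        println!("{} CPUs online", cores.nr_cpus_online);

        // Print cores in a stable order (HashMap iteration order is not).
        let mut core_ids: Vec<i32> = cores.map.keys().copied().collect();
        core_ids.sort();
        for core_id in core_ids {
            // On an SMT system each core lists two or more sibling CPU ids.
            println!("core {}: {:?}", core_id, cores.map[&core_id]);
        }
    }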