Merge pull request #90 from sched-ext/scx-rustland-smt

scx_rustland: introduce SMT support
David Vernet 2024-01-16 10:30:41 -06:00 committed by GitHub
commit b8687a051e
3 changed files with 156 additions and 51 deletions

View File

@@ -7,8 +7,6 @@ use crate::bpf_intf;
use crate::bpf_skel::*;
use std::ffi::CStr;
use std::fs::File;
use std::io::{self, BufRead};
use anyhow::Context;
use anyhow::Result;
@@ -214,7 +212,7 @@ pub struct BpfScheduler<'a> {
}
impl<'a> BpfScheduler<'a> {
pub fn init(slice_us: u64, partial: bool, debug: bool) -> Result<Self> {
pub fn init(slice_us: u64, nr_cpus_online: i32, partial: bool, debug: bool) -> Result<Self> {
// Open the BPF prog first for verification.
let skel_builder = BpfSkelBuilder::default();
let mut skel = skel_builder.open().context("Failed to open BPF program")?;
@@ -225,9 +223,8 @@ impl<'a> BpfScheduler<'a> {
// Initialize online CPUs counter.
//
// We should probably refresh this counter during the normal execution to support cpu
// NOTE: we should probably refresh this counter during the normal execution to support cpu
// hotplugging, but for now let's keep it simple and set this only at initialization.
let nr_cpus_online = Self::count_cpus()?;
skel.rodata_mut().num_possible_cpus = nr_cpus_online;
// Set scheduler options (defined in the BPF part).
@@ -256,30 +253,6 @@ impl<'a> BpfScheduler<'a> {
}
}
// Return the number of available CPUs in the system (according to /proc/stat).
fn count_cpus() -> io::Result<i32> {
let file = File::open("/proc/stat")?;
let reader = io::BufReader::new(file);
let mut cpu_count = -1;
for line in reader.lines() {
let line = line?;
if line.starts_with("cpu") {
cpu_count += 1;
} else {
break;
}
}
Ok(cpu_count)
}
// Return the number of CPUs available in the system (as detected at initialization).
#[allow(dead_code)]
pub fn get_nr_cpus(&self) -> i32 {
self.skel.rodata().num_possible_cpus
}
// Override the default scheduler time slice (in us).
#[allow(dead_code)]
pub fn set_effective_slice_us(&mut self, slice_us: u64) {

View File

@@ -9,6 +9,9 @@ pub mod bpf_intf;
mod bpf;
use bpf::*;
mod topology;
use topology::*;
use std::thread;
use std::collections::BTreeSet;
@@ -63,9 +66,9 @@ const SCHEDULER_NAME: &'static str = "RustLand";
///
/// === Troubleshooting ===
///
/// - Disable HyperThreading / SMT if you notice poor performance: this scheduler lacks support for
/// any type of core scheduling and lacks NUMA awareness, assuming uniform performance and
/// migration costs across all CPUs.
/// - Disable HyperThreading / SMT if you notice poor performance (add "nosmt" to the kernel boot
/// parameters): this scheduler is not NUMA-aware and it only implements a simple policy for
/// handling SMT cores.
///
/// - Adjust the time slice boost parameter (option `-b`) to enhance the responsiveness of
/// low-latency applications (i.e., online gaming, live streaming, video conferencing etc.).
@@ -200,6 +203,7 @@ impl TaskTree {
// Main scheduler object
struct Scheduler<'a> {
bpf: BpfScheduler<'a>, // BPF connector
cores: CoreMapping, // Map of core ids -> CPU ids
task_pool: TaskTree, // tasks ordered by vruntime
task_map: TaskInfoMap, // map pids to the corresponding task information
min_vruntime: u64, // Keep track of the minimum vruntime across all tasks
@@ -210,9 +214,8 @@ struct Scheduler<'a> {
impl<'a> Scheduler<'a> {
fn init(opts: &Opts) -> Result<Self> {
// Low-level BPF connector.
let bpf = BpfScheduler::init(opts.slice_us, opts.partial, opts.debug)?;
info!("{} scheduler attached", SCHEDULER_NAME);
// Initialize core mapping topology.
let cores = CoreMapping::new();
// Save the default time slice (in ns) in the scheduler class.
let slice_ns = opts.slice_us * MSEC_PER_SEC;
@@ -232,9 +235,19 @@ impl<'a> Scheduler<'a> {
// Initialize initial page fault counter.
let init_page_faults: u64 = 0;
// Low-level BPF connector.
let bpf = BpfScheduler::init(
opts.slice_us,
cores.nr_cpus_online,
opts.partial,
opts.debug,
)?;
info!("{} scheduler attached", SCHEDULER_NAME);
// Return scheduler object.
Ok(Self {
bpf,
cores,
task_pool,
task_map,
min_vruntime,
@@ -244,17 +257,46 @@ impl<'a> Scheduler<'a> {
})
}
// Return the array of idle CPU ids.
// Returns the list of idle CPUs, sorted by the number of free sibling CPUs in the same core.
//
// In this way we can schedule tasks first on the CPUs that are "more free" than others, maximizing
// performance in the presence of SMT cores.
fn get_idle_cpus(&self) -> Vec<i32> {
let mut idle_cpus = Vec::new();
let cores = &self.cores.map;
let num_cpus = self.cores.nr_cpus_online;
for cpu in 0..self.bpf.get_nr_cpus() {
let pid = self.bpf.get_cpu_pid(cpu);
if pid == 0 {
idle_cpus.push(cpu);
}
}
// Cache the results of self.bpf.get_cpu_pid() for all CPUs.
let cpu_pid_map: Vec<u32> = (0..num_cpus)
.map(|cpu_id| self.bpf.get_cpu_pid(cpu_id))
.collect();
// Create a Vec of tuples containing the core id and the number of busy CPUs for each core.
let mut core_status: Vec<(i32, usize)> = cores
.iter()
.map(|(&core_id, core_cpus)| {
let busy_cpus = core_cpus
.iter()
.filter(|&&cpu_id| cpu_pid_map[cpu_id as usize] > 0)
.count();
(core_id, busy_cpus)
})
.collect();
// Sort the Vec based on the number of busy CPUs in ascending order (free cores first).
core_status.sort_by(|a, b| a.1.cmp(&b.1));
// Generate the list of idle CPU ids, ordered by the number of busy siblings (ascending).
let idle_cpus: Vec<i32> = core_status
.iter()
.flat_map(|&(core_id, _)| {
cores[&core_id]
.iter()
.filter(|&&cpu_id| cpu_pid_map[cpu_id as usize] == 0)
.cloned()
})
.collect();
// Return the list of idle CPUs.
idle_cpus
}
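
As an aside, the sibling-aware ordering above can be illustrated with a minimal standalone sketch. The toy topology and the order_idle_cpus() helper are hypothetical and not part of this patch; they only mirror the ranking logic of get_idle_cpus(): fully idle cores are drained first, so new tasks land on CPUs whose hardware siblings are not already busy.

use std::collections::HashMap;

fn order_idle_cpus(cores: &HashMap<i32, Vec<i32>>, busy: &[i32]) -> Vec<i32> {
    // Rank cores by how many of their CPUs are busy (ascending).
    let mut core_status: Vec<(i32, usize)> = cores
        .iter()
        .map(|(&core_id, cpus)| (core_id, cpus.iter().filter(|c| busy.contains(c)).count()))
        .collect();
    core_status.sort_by(|a, b| a.1.cmp(&b.1));

    // Emit the idle CPUs of each core, cores with fewer busy siblings first.
    core_status
        .iter()
        .flat_map(|&(core_id, _)| cores[&core_id].iter().copied().filter(|c| !busy.contains(c)))
        .collect()
}

fn main() {
    // Toy topology: core 0 = CPUs [0, 4], core 1 = CPUs [1, 5]; CPU 4 is busy.
    let cores = HashMap::from([(0, vec![0, 4]), (1, vec![1, 5])]);
    // Core 1 has no busy siblings, so its CPUs are ranked first: [1, 5, 0].
    assert_eq!(order_idle_cpus(&cores, &[4]), vec![1, 5, 0]);
}
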
@@ -600,19 +642,25 @@ impl<'a> Scheduler<'a> {
// Show current used time slice.
info!("time slice = {} us", self.bpf.get_effective_slice_us());
// Show tasks that are currently running.
// Show tasks that are currently running on each core and CPU.
let sched_cpu = match Self::get_current_cpu() {
Ok(cpu_info) => cpu_info,
Err(_) => -1,
};
info!("Running tasks:");
for cpu in 0..self.bpf.get_nr_cpus() {
let pid = if cpu == sched_cpu {
"[self]".to_string()
} else {
self.bpf.get_cpu_pid(cpu).to_string()
};
info!(" cpu={} pid={}", cpu, pid);
let mut core_ids: Vec<i32> = self.cores.map.keys().copied().collect();
core_ids.sort();
for core_id in core_ids {
let mut cpu_ids: Vec<i32> = self.cores.map.get(&core_id).unwrap().clone();
cpu_ids.sort();
for cpu_id in cpu_ids {
let pid = if cpu_id == sched_cpu {
"[self]".to_string()
} else {
self.bpf.get_cpu_pid(cpu_id).to_string()
};
info!(" core {:2} cpu {:2} pid={}", core_id, cpu_id, pid);
}
}
log::logger().flush();

View File

@@ -0,0 +1,84 @@
// Copyright (c) Andrea Righi <andrea.righi@canonical.com>
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
use std::collections::HashMap;
use std::fs;
/// scx_rustland: CPU topology.
///
/// CoreMapping provides a map of the CPU topology, with the list of CPU ids in the system grouped
/// by their corresponding core id.
///
/// The CPU / core mapping is stored in a HashMap using the core id as the key. An example of the
/// HashMap content is the following:
///
/// core 0: [4, 0]
/// core 1: [5, 1]
/// core 2: [6, 2]
/// core 3: [7, 3]
///
/// This information can be used by the scheduler to apply a more efficient scheduling policy, for
/// example dispatching tasks first on the CPUs that have all their siblings idle, and only later
/// moving to the CPUs with busy siblings.
const CPU_PATH: &str = "/sys/devices/system/cpu";
pub struct CoreMapping {
// Map of core IDs -> list of CPU ids.
//
// NOTE: we must periodically refresh this map if we want to support CPU hotplugging, for now
// let's assume it's static.
pub map: HashMap<i32, Vec<i32>>,
// Number of available CPUs in the system.
//
// NOTE: we must periodically refresh this value if we want to support CPU hotplugging, for now
// let's assume it's static.
pub nr_cpus_online: i32,
}
impl CoreMapping {
pub fn new() -> Self {
let mut core_mapping = CoreMapping {
map: HashMap::new(),
nr_cpus_online: 0,
};
core_mapping.init_mapping();
core_mapping
}
// Evaluate the number of available CPUs in the system and initialize the core ids -> CPU ids
// HashMap, parsing all the information from /sys/devices/system/cpu/cpu<id>/topology/core_id.
fn init_mapping(&mut self) {
let cpu_entries: Vec<_> = fs::read_dir(CPU_PATH)
.expect(format!("Failed to read: {}", CPU_PATH).as_str())
.filter_map(|entry| entry.ok())
.collect();
// Generate core map.
for entry in cpu_entries {
let entry_path = entry.path();
let cpu_name = entry.file_name();
let cpu_id_str = cpu_name.to_string_lossy().to_string();
if cpu_id_str.starts_with("cpu") {
if let Some(cpu_id) = cpu_id_str.chars().skip(3).collect::<String>().parse().ok() {
let core_id_path = entry_path.join("topology/core_id");
if let Some(core_id) = fs::read_to_string(&core_id_path)
.ok()
.and_then(|content| content.trim().parse().ok())
{
// Add CPU id to the core map.
self.map.entry(core_id).or_insert(Vec::new()).push(cpu_id);
// Update total CPU ids counter.
self.nr_cpus_online += 1;
}
}
}
}
}
}
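
For reference, a minimal usage sketch of the new CoreMapping type, relying only on the public items introduced above (CoreMapping::new(), map, nr_cpus_online). The standalone main() is illustrative and assumes the type is in scope (e.g. via use topology::*;, as done in the scheduler's main module):

fn main() {
    // Build the topology map from /sys/devices/system/cpu (panics if sysfs cannot be read).
    let cores = CoreMapping::new();
    println!("online CPUs: {}", cores.nr_cpus_online);

    // Print the CPU ids grouped by core id, in ascending core id order.
    let mut core_ids: Vec<i32> = cores.map.keys().copied().collect();
    core_ids.sort();
    for core_id in core_ids {
        println!("core {}: {:?}", core_id, cores.map[&core_id]);
    }
}
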