Merge pull request #161 from sched-ext/scx-user

Introduce scx_rustland_core: a generic layer to implement user-space schedulers in Rust
David Vernet 2024-02-28 10:57:19 -06:00 committed by GitHub
commit 7278d88632
39 changed files with 570 additions and 196 deletions

View File

@ -1 +1,2 @@
subdir('scx_utils')
subdir('scx_rustland_core')

1
rust/scx_rustland_core/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
Cargo.lock

View File

@ -0,0 +1,26 @@
[package]
name = "scx_rustland_core"
version = "0.1.0"
edition = "2021"
authors = ["Andrea Righi <andrea.righi@canonical.com>"]
license = "GPL-2.0-only"
repository = "https://github.com/sched-ext/scx"
description = "Framework to implement sched_ext schedulers running in user space"
include = [
"src/bpf/intf.h",
"src/bpf/main.bpf.c",
"src/bpf.rs",
]
[dependencies]
anyhow = "1.0"
libbpf-rs = "0.22.0"
libc = "0.2.137"
buddy-alloc = "0.5.1"
scx_utils = { path = "../scx_utils", version = "0.6" }
[build-dependencies]
tar = "0.4"
walkdir = "2.4"
scx_utils = { path = "../scx_utils", version = "0.6" }

View File

@ -0,0 +1 @@
../../LICENSE

View File

@ -0,0 +1,78 @@
# Framework to implement sched_ext schedulers running in user-space
[sched_ext](https://github.com/sched-ext/scx) is a Linux kernel feature
which enables implementing kernel thread schedulers in BPF and dynamically
loading them.
This crate provides a generic layer that can be used to implement sched-ext
schedulers that run in user-space.
It provides a generic BPF abstraction that is completely agnostic of the
particular scheduling policy implemented in user-space.
Developers can use this abstraction to implement schedulers in pure Rust,
without having to deal with any kernel or BPF internals.
## API
The main BPF interface is provided by the `BpfScheduler` struct. When this
object is initialized it will take care of registering and initializing the BPF
component.
The scheduler can then use the `BpfScheduler` instance to receive tasks (in the
form of `QueuedTask` objects) and dispatch tasks (in the form of `DispatchedTask`
objects), using the methods `dequeue_task()` and `dispatch_task()` respectively.
Example usage (FIFO scheduler):
```
struct Scheduler<'a> {
    bpf: BpfScheduler<'a>,
}

impl<'a> Scheduler<'a> {
    fn init() -> Result<Self> {
        let topo = Topology::new().expect("Failed to build host topology");
        let bpf = BpfScheduler::init(5000, topo.nr_cpus() as i32, false, false, false)?;
        Ok(Self { bpf })
    }

    fn schedule(&mut self) {
        loop {
            match self.bpf.dequeue_task() {
                Ok(Some(task)) => {
                    // task.cpu < 0 is used to notify an exiting task: in this
                    // case we can simply ignore it.
                    if task.cpu >= 0 {
                        let _ = self.bpf.dispatch_task(&DispatchedTask {
                            pid: task.pid,
                            cpu: task.cpu,
                            cpumask_cnt: task.cpumask_cnt,
                            payload: 0,
                        });
                    }
                }
                Ok(None) => {
                    // Notify the BPF component that all tasks have been dispatched.
                    self.bpf.update_tasks(Some(0), Some(0));
                    break;
                }
                Err(_) => {
                    break;
                }
            }
        }
    }
}
```
Moreover, a CPU ownership map (that keeps track of which PID runs on which CPU)
can be accessed using the method `get_cpu_pid()`. This also makes it possible to
keep track of which CPUs are idle and which are busy, along with the PIDs
associated to them.
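For example, here is a minimal sketch of walking the CPU ownership map; it assumes
`get_cpu_pid()` takes a CPU id and returns the PID currently running on it, with `0`
conventionally meaning the CPU is idle, and that `nr_cpus` comes from the `Topology`
used in `init()` (check `bpf.rs` for the exact signature):
```
// Hypothetical sketch: scan all CPUs and report which are idle or busy.
// Assumes get_cpu_pid(cpu) returns 0 when no task owns the CPU.
for cpu in 0..nr_cpus {
    let pid = self.bpf.get_cpu_pid(cpu);
    if pid == 0 {
        println!("CPU {} is idle", cpu);
    } else {
        println!("CPU {} is busy running PID {}", cpu, pid);
    }
}
```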
BPF counters and statistics can be accessed using the `nr_*_mut()` methods; in
particular, `nr_queued_mut()` and `nr_scheduled_mut()` can be updated to notify
the BPF component whether the user-space scheduler still has some pending work
to do.
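For example (a sketch reusing the `BpfScheduler` instance from the example above):
```
// Read the current counters exposed by the BPF component.
let nr_queued = *self.bpf.nr_queued_mut();
let nr_scheduled = *self.bpf.nr_scheduled_mut();
println!("queued={} scheduled={}", nr_queued, nr_scheduled);

// ...or clear both counters (via update_tasks()) to tell the BPF component
// that the user-space scheduler has no pending work left.
self.bpf.update_tasks(Some(0), Some(0));
```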
Lastly, the methods `exited()` and `shutdown_and_report()` can be used,
respectively, to test whether the BPF component has exited, and to shut down the
scheduler and report its exit message.
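For instance, a minimal run loop built on top of these methods (a sketch that
reuses `schedule()` from the example above and assumes a `shutdown` flag set by a
Ctrl-C handler) could look like this:
```
fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<()> {
    while !shutdown.load(Ordering::Relaxed) && !self.bpf.exited() {
        self.schedule();
        std::thread::yield_now();
    }
    // Unregister the BPF component and print its exit message (if any).
    self.bpf.shutdown_and_report()
}
```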

View File

@ -0,0 +1 @@
#include "bpf_h/vmlinux/vmlinux.h"

View File

@ -0,0 +1 @@
../../scheds/include

View File

@ -0,0 +1,10 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
use scx_utils::Builder;
fn main() {
Builder::new().build()
}

View File

@ -0,0 +1,7 @@
custom_target('scx_rustland_core',
output: '@PLAINNAME@.__PHONY__',
input: 'Cargo.toml',
command: [cargo, 'build', '--manifest-path=@INPUT@', '--target-dir=@OUTDIR@',
cargo_build_args],
env: cargo_env,
build_by_default: true)

View File

@ -15,7 +15,7 @@ const HEAP_SIZE: usize = 64 * 1024 * 1024; // 64M
const LEAF_SIZE: usize = 64;
#[repr(align(4096))]
pub struct AlignedHeap<const N: usize>([u8; N]);
struct AlignedHeap<const N: usize>([u8; N]);
// Statically pre-allocated memory arena.
static mut FAST_HEAP: AlignedHeap<FAST_HEAP_SIZE> = AlignedHeap([0u8; FAST_HEAP_SIZE]);
@ -26,25 +26,25 @@ static mut HEAP: AlignedHeap<HEAP_SIZE> = AlignedHeap([0u8; HEAP_SIZE]);
// To prevent potential deadlock conditions under heavy loads, any scheduler that delegates
// scheduling decisions to user-space should avoid triggering page faults.
//
// To address this issue, replace the global allocator with a custom one (RustLandAllocator),
// To address this issue, replace the global allocator with a custom one (UserAllocator),
// designed to operate on a pre-allocated buffer. This, coupled with the memory locking achieved
// through mlockall(), prevents page faults from occurring during the execution of the user-space
// scheduler.
#[cfg_attr(not(test), global_allocator)]
pub static ALLOCATOR: RustLandAllocator = unsafe {
pub static ALLOCATOR: UserAllocator = unsafe {
let fast_param = FastAllocParam::new(FAST_HEAP.0.as_ptr(), FAST_HEAP_SIZE);
let buddy_param = BuddyAllocParam::new(HEAP.0.as_ptr(), HEAP_SIZE, LEAF_SIZE);
RustLandAllocator {
UserAllocator {
arena: NonThreadsafeAlloc::new(fast_param, buddy_param),
}
};
// Main allocator class.
pub struct RustLandAllocator {
pub arena: NonThreadsafeAlloc,
pub struct UserAllocator {
arena: NonThreadsafeAlloc,
}
impl RustLandAllocator {
impl UserAllocator {
pub fn lock_memory(&self) {
unsafe {
VM.save();
@ -75,7 +75,7 @@ impl RustLandAllocator {
}
// Override global allocator methods.
unsafe impl GlobalAlloc for RustLandAllocator {
unsafe impl GlobalAlloc for UserAllocator {
unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
self.arena.alloc(layout)
}
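As a rough sketch, this is how a user-space scheduler built on scx_rustland_core
takes advantage of the custom allocator (scx_rustland_core itself calls
`lock_memory()` while initializing `BpfScheduler`, as shown further below):
```
use scx_rustland_core::ALLOCATOR;

// Lock all current and future memory pages (mlockall() under the hood), so
// the user-space scheduler never triggers page faults while it is making
// scheduling decisions.
ALLOCATOR.lock_memory();
```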

View File

@ -0,0 +1,6 @@
#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
#![allow(dead_code)]
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));

View File

@ -15,13 +15,12 @@ use libbpf_rs::skel::SkelBuilder as _;
use libc::{sched_param, sched_setscheduler};
mod alloc;
use alloc::*;
use scx_utils::init_libbpf_logging;
use scx_utils::uei_exited;
use scx_utils::uei_report;
use scx_rustland_core::ALLOCATOR;
// Defined in UAPI
const SCHED_EXT: i32 = 7;
@ -31,7 +30,7 @@ const SCHED_EXT: i32 = 7;
#[allow(dead_code)]
pub const NO_CPU: i32 = -1;
/// scx_rustland: provide high-level abstractions to interact with the BPF component.
/// High-level Rust abstraction to interact with a generic sched-ext BPF component.
///
/// Overview
/// ========
@ -40,7 +39,7 @@ pub const NO_CPU: i32 = -1;
/// initialized it will take care of registering and initializing the BPF component.
///
/// The scheduler then can use BpfScheduler() instance to receive tasks (in the form of QueuedTask
/// object) and dispatch tasks (in the form of DispatchedTask objects), using respectively the
/// objects) and dispatch tasks (in the form of DispatchedTask objects), using respectively the
/// methods dequeue_task() and dispatch_task().
///
/// The CPU ownership map can be accessed using the method get_cpu_pid(), this also allows to keep
@ -51,85 +50,8 @@ pub const NO_CPU: i32 = -1;
/// user-space scheduler has some pending work to do or not.
///
/// Finally the methods exited() and shutdown_and_report() can be used respectively to test
/// whether the BPF component exited, and to shutdown and report the exit message.
/// whether the BPF component exited, and to shutdown and report exit message.
///
/// Example
/// =======
///
/// Following you can find bare minimum template that can be used to implement a simple FIFO
/// scheduler using the BPF abstraction:
///
/// mod bpf_skel;
/// pub use bpf_skel::*;
/// mod bpf;
/// pub mod bpf_intf;
/// use bpf::*;
///
/// use std::thread;
///
/// use std::sync::atomic::AtomicBool;
/// use std::sync::atomic::Ordering;
/// use std::sync::Arc;
///
/// use anyhow::Result;
///
/// struct Scheduler<'a> {
/// bpf: BpfScheduler<'a>,
/// }
///
/// impl<'a> Scheduler<'a> {
/// fn init() -> Result<Self> {
/// let bpf = BpfScheduler::init(20000, false, false)?;
/// Ok(Self { bpf })
/// }
///
/// fn dispatch_tasks(&mut self) {
/// loop {
/// match self.bpf.dequeue_task() {
/// Ok(Some(task)) => {
/// if task.cpu >= 0 {
/// let _ = self.bpf.dispatch_task(
/// &DispatchedTask {
/// pid: task.pid,
/// cpu: task.cpu,
/// payload: 0,
/// }
/// );
/// }
/// }
/// Ok(None) => {
/// *self.bpf.nr_queued_mut() = 0;
/// *self.bpf.nr_scheduled_mut() = 0;
/// break;
/// }
/// Err(_) => {
/// break;
/// }
/// }
/// }
/// }
///
/// fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<()> {
/// while !shutdown.load(Ordering::Relaxed) && !self.bpf.exited() {
/// self.dispatch_tasks();
/// thread::yield_now();
/// }
///
/// Ok(())
/// }
/// }
///
/// fn main() -> Result<()> {
/// let mut sched = Scheduler::init()?;
/// let shutdown = Arc::new(AtomicBool::new(false));
/// let shutdown_clone = shutdown.clone();
/// ctrlc::set_handler(move || {
/// shutdown_clone.store(true, Ordering::Relaxed);
/// })?;
///
/// sched.run(shutdown)
/// }
///
// Task queued for scheduling from the BPF component (see bpf_intf::queued_task_ctx).
#[derive(Debug)]
@ -241,16 +163,31 @@ impl<'cb> BpfScheduler<'cb> {
ALLOCATOR.lock_memory();
// Copy one item from the ring buffer.
//
// # Safety
//
// Each invocation of the callback will trigger the copy of exactly one QueuedTask item to
// BUF. The caller must be synchronize to ensure that multiple invocations of the callback
// are not happening at the same time, but this is implicitly guaranteed by the fact that
// the caller is a single-thread process (for now).
//
// Use of a `str` whose contents are not valid UTF-8 is undefined behavior.
fn callback(data: &[u8]) -> i32 {
unsafe {
// SAFETY: copying from the BPF ring buffer to BUF is safe, since the size of BUF
// is exactly the size of QueuedTask and the callback operates in chunks of
// QueuedTask items. It also copies exactly one QueuedTask at a time, this is
// guaranteed by the error code returned by this callback (see below). From a
// thread-safety perspective this is also correct, assuming the caller is a
// single-thread process (as it is for now).
BUF.0.copy_from_slice(data);
}
// Return an unsupported error to stop early and consume only one item.
//
// NOTE: this is quite a hack. I wish libbpf would honor stopping after the first item
// is consumed, upon returnin a non-zero positive value here, but it doesn't seem to be
// the case:
// is consumed, upon returning a non-zero positive value here, but it doesn't seem to
// be the case:
//
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/bpf/ringbuf.c?h=v6.8-rc5#n260
//
@ -305,6 +242,23 @@ impl<'cb> BpfScheduler<'cb> {
}
}
// Update the amount of tasks that have been queued to the user-space scheduler and dispatched.
//
// This method is used to notify the BPF component if the user-space scheduler has still some
// pending actions to complete (based on the counter of queued and scheduled tasks).
//
// NOTE: do not set allow(dead_code) for this method: any scheduler must use it at
// some point, otherwise the BPF component will keep waking up the user-space scheduler in a
// busy loop, causing unnecessarily high CPU consumption.
pub fn update_tasks(&mut self, nr_queued: Option<u64>, nr_scheduled: Option<u64>) {
if let Some(queued) = nr_queued {
self.skel.bss_mut().nr_queued = queued;
}
if let Some(scheduled) = nr_scheduled {
self.skel.bss_mut().nr_scheduled = scheduled;
}
}
// Override the default scheduler time slice (in us).
#[allow(dead_code)]
pub fn set_effective_slice_us(&mut self, slice_us: u64) {
@ -324,11 +278,13 @@ impl<'cb> BpfScheduler<'cb> {
}
// Counter of queued tasks.
#[allow(dead_code)]
pub fn nr_queued_mut(&mut self) -> &mut u64 {
&mut self.skel.bss_mut().nr_queued
}
// Counter of scheduled tasks.
#[allow(dead_code)]
pub fn nr_scheduled_mut(&mut self) -> &mut u64 {
&mut self.skel.bss_mut().nr_scheduled
}

View File

@ -1,19 +1,13 @@
/* Copyright (c) Andrea Righi <andrea.righi@canonical.com> */
/*
* scx_rustland: simple user-space scheduler written in Rust
* scx_rustland_core: BPF backend for schedulers running in user-space.
*
* The main goal of this scheduler is be an "easy to read" template that can be
* used to quickly test more complex scheduling policies. For this reason this
* scheduler is mostly focused on simplicity and code readability.
* This BPF backend implements the low level sched-ext functionalities for a
* user-space counterpart that implements the actual scheduling policy.
*
* The scheduler is made of a BPF component (dispatcher) that implements the
* low level sched-ext functionalities and a user-space counterpart
* (scheduler), written in Rust, that implements the actual scheduling policy.
*
* The BPF dispatcher collects total cputime and weight from the tasks that
* need to run, then it sends all details to the user-space scheduler that
* decides the best order of execution of the tasks (based on the collected
* metrics).
* The BPF part collects total cputime and weight from the tasks that need to
* run, then it sends all details to the user-space scheduler that decides the
* best order of execution of the tasks (based on the collected metrics).
*
* The user-space scheduler then returns to the BPF component the list of tasks
* to be dispatched in the proper order.

View File

@ -0,0 +1,9 @@
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
#![allow(dead_code)]
include!(concat!(env!("OUT_DIR"), "/bpf_intf.rs"));

View File

@ -0,0 +1,4 @@
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
include!(concat!(env!("OUT_DIR"), "/bpf_skel.rs"));

View File

@ -0,0 +1,7 @@
mod bindings;
mod alloc;
pub use alloc::ALLOCATOR;
mod rustland_builder;
pub use rustland_builder::RustLandBuilder;

View File

@ -0,0 +1,50 @@
// Copyright (c) Andrea Righi <andrea.righi@canonical.com>
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
use anyhow::Result;
use std::fs::File;
use std::io::Write;
use std::path::Path;
use scx_utils::BpfBuilder;
pub struct RustLandBuilder {
inner_builder: BpfBuilder,
}
impl RustLandBuilder {
pub fn new() -> Result<Self> {
Ok(Self {
inner_builder: BpfBuilder::new()?,
})
}
fn create_file(&self, file_name: &str, content: &[u8]) {
let path = Path::new(file_name);
let mut file = File::create(&path).expect("Unable to create file");
file.write_all(content).expect("Unable to write to file");
}
pub fn build(&mut self) -> Result<()> {
// Embed the BPF source files.
let intf = include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/src/bpf/intf.h"));
let skel = include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/src/bpf/main.bpf.c"));
let bpf = include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/src/bpf.rs"));
// Generate BPF backend code (C).
self.create_file("intf.h", intf);
self.create_file("main.bpf.c", skel);
self.inner_builder.enable_intf("intf.h", "bpf_intf.rs");
self.inner_builder.enable_skel("main.bpf.c", "bpf");
// Generate user-space BPF connector code (Rust).
self.create_file("src/bpf.rs", bpf);
// Build the scheduler.
self.inner_builder.build()
}
}

View File

@ -18,10 +18,12 @@ hex = "0.4.3"
lazy_static = "1.4"
libbpf-cargo = "0.22"
libbpf-rs = "0.22.0"
buddy-alloc = "0.5.1"
log = "0.4.17"
regex = "1.10"
sscanf = "0.4"
tar = "0.4"
walkdir = "2.4"
version-compare = "0.1"
[build-dependencies]

View File

@ -3,51 +3,8 @@
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
use std::env;
use std::fs::File;
use std::path::PathBuf;
const BPF_H: &str = "bpf_h";
fn gen_bpf_h() {
let file =
File::create(PathBuf::from(env::var("OUT_DIR").unwrap()).join(format!("{}.tar", BPF_H)))
.unwrap();
let mut ar = tar::Builder::new(file);
ar.follow_symlinks(false);
ar.append_dir_all(".", BPF_H).unwrap();
ar.finish().unwrap();
for ent in walkdir::WalkDir::new(BPF_H) {
let ent = ent.unwrap();
if !ent.file_type().is_dir() {
println!("cargo:rerun-if-changed={}", ent.path().to_string_lossy());
}
}
}
fn gen_bindings() {
// FIXME - bindgen's API changed between 0.68 and 0.69 so that
// `bindgen::CargoCallbacks::new()` should be used instead of
// `bindgen::CargoCallbacks`. Unfortunately, as of Dec 2023, fedora is
// shipping 0.68. To accommodate fedora, allow both 0.68 and 0.69 of
// bindgen and suppress deprecation warning. Remove the following once
// fedora can be updated to bindgen >= 0.69.
#[allow(deprecated)]
let bindings = bindgen::Builder::default()
.header("bindings.h")
.allowlist_type("scx_exit_kind")
.parse_callbacks(Box::new(bindgen::CargoCallbacks))
.generate()
.expect("Unable to generate bindings");
bindings
.write_to_file(PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs"))
.expect("Couldn't write bindings");
}
include!("src/builder.rs");
fn main() {
gen_bpf_h();
gen_bindings();
Builder::new().build()
}

View File

@ -0,0 +1,61 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
use std::env;
use std::fs::File;
use std::path::PathBuf;
const BPF_H: &str = "bpf_h";
pub struct Builder;
impl Builder {
pub fn new() -> Self {
Builder
}
fn gen_bpf_h(&self) {
let out_dir = env::var("OUT_DIR").unwrap();
let file = File::create(PathBuf::from(&out_dir).join(format!("{}.tar", BPF_H))).unwrap();
let mut ar = tar::Builder::new(file);
ar.follow_symlinks(false);
ar.append_dir_all(".", BPF_H).unwrap();
ar.finish().unwrap();
for ent in walkdir::WalkDir::new(BPF_H) {
let ent = ent.unwrap();
if !ent.file_type().is_dir() {
println!("cargo:rerun-if-changed={}", ent.path().to_string_lossy());
}
}
}
fn gen_bindings(&self) {
let out_dir = env::var("OUT_DIR").unwrap();
// FIXME - bindgen's API changed between 0.68 and 0.69 so that
// `bindgen::CargoCallbacks::new()` should be used instead of
// `bindgen::CargoCallbacks`. Unfortunately, as of Dec 2023, fedora is
// shipping 0.68. To accommodate fedora, allow both 0.68 and 0.69 of
// bindgen and suppress deprecation warning. Remove the following once
// fedora can be updated to bindgen >= 0.69.
#[allow(deprecated)]
let bindings = bindgen::Builder::default()
.header("bindings.h")
.allowlist_type("scx_exit_kind")
.parse_callbacks(Box::new(bindgen::CargoCallbacks))
.generate()
.expect("Unable to generate bindings");
bindings
.write_to_file(PathBuf::from(&out_dir).join("bindings.rs"))
.expect("Couldn't write bindings");
}
pub fn build(self) {
self.gen_bpf_h();
self.gen_bindings();
}
}

View File

@ -35,6 +35,9 @@ mod bindings;
mod bpf_builder;
pub use bpf_builder::BpfBuilder;
mod builder;
pub use builder::Builder;
pub mod ravg;
mod libbpf_logger;

View File

@ -15,3 +15,4 @@ main.rs or \*.bpf.c files.
- [scx_layered](scx_layered/README.md)
- [scx_rusty](scx_rusty/README.md)
- [scx_rustland](scx_rustland/README.md)
- [scx_rlfifo](scx_rlfifo/README.md)

View File

@ -1,3 +1,4 @@
subdir('scx_layered')
subdir('scx_rusty')
subdir('scx_rustland')
subdir('scx_rlfifo')

6
scheds/rust/scx_rlfifo/.gitignore vendored Normal file
View File

@ -0,0 +1,6 @@
src/bpf/.output
intf.h
main.bpf.c
bpf.rs
Cargo.lock
target

View File

@ -0,0 +1,22 @@
[package]
name = "scx_rlfifo"
version = "0.0.1"
authors = ["Andrea Righi <andrea.righi@canonical.com>", "Canonical"]
edition = "2021"
description = "A simple FIFO scheduler in Rust that runs in user-space"
license = "GPL-2.0-only"
[dependencies]
anyhow = "1.0.65"
ctrlc = { version = "3.1", features = ["termination"] }
libbpf-rs = "0.22.0"
libc = "0.2.137"
scx_utils = { path = "../../../rust/scx_utils", version = "0.6" }
scx_rustland_core = { path = "../../../rust/scx_rustland_core", version = "0.1" }
[build-dependencies]
scx_utils = { path = "../../../rust/scx_utils", version = "0.6" }
scx_rustland_core = { path = "../../../rust/scx_rustland_core", version = "0.1" }
[features]
enable_backtrace = []

View File

@ -0,0 +1 @@
../../../LICENSE

View File

@ -0,0 +1,20 @@
# scx_rlfifo
This is a single user-defined scheduler used within [sched_ext](https://github.com/sched-ext/scx/tree/main), which is a Linux kernel feature that enables implementing kernel thread schedulers in BPF and dynamically loading them. [Read more about sched_ext](https://github.com/sched-ext/scx/tree/main).
## Overview
scx_rlfifo is a simple FIFO scheduler that runs in user-space, based on the
scx_rustland_core framework.
## Typical Use Case
This scheduler is provided as a simple template that can be used as a baseline
to test more complex scheduling policies.
## Production Ready?
Definitely not. Using this scheduler in a production environment is not
recommended, unless there are specific requirements that necessitate a basic
FIFO scheduling approach. Even then, it's still recommended to use the kernel's
SCHED_FIFO real-time class.

View File

@ -0,0 +1,9 @@
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
fn main() {
scx_rustland_core::RustLandBuilder::new()
.unwrap()
.build()
.unwrap();
}

View File

@ -0,0 +1,7 @@
custom_target('scx_rlfifo',
output: '@PLAINNAME@.__PHONY__',
input: 'Cargo.toml',
command: [cargo, 'build', '--manifest-path=@INPUT@', '--target-dir=@OUTDIR@',
cargo_build_args],
env: cargo_env,
build_by_default: true)

View File

@ -0,0 +1,8 @@
# Get help on options with `rustfmt --help=config`
# Please keep these in alphabetical order.
edition = "2021"
group_imports = "StdExternalCrate"
imports_granularity = "Item"
merge_derives = false
use_field_init_shorthand = true
version = "Two"

View File

@ -0,0 +1,9 @@
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
#![allow(dead_code)]
include!(concat!(env!("OUT_DIR"), "/bpf_intf.rs"));

View File

@ -0,0 +1,4 @@
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
include!(concat!(env!("OUT_DIR"), "/bpf_skel.rs"));

View File

@ -0,0 +1,117 @@
// Copyright (c) Andrea Righi <andrea.righi@canonical.com>
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
mod bpf_skel;
pub use bpf_skel::*;
pub mod bpf_intf;
mod bpf;
use bpf::*;
use scx_utils::Topology;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::time::{Duration, SystemTime};
use anyhow::Result;
struct Scheduler<'a> {
bpf: BpfScheduler<'a>,
}
impl<'a> Scheduler<'a> {
fn init() -> Result<Self> {
let topo = Topology::new().expect("Failed to build host topology");
let bpf = BpfScheduler::init(5000, topo.nr_cpus() as i32, false, false, false)?;
Ok(Self { bpf })
}
fn now() -> u64 {
SystemTime::now()
.duration_since(SystemTime::UNIX_EPOCH)
.unwrap()
.as_secs()
}
fn dispatch_tasks(&mut self) {
loop {
// Get queued tasks and dispatch them in order (FIFO).
match self.bpf.dequeue_task() {
Ok(Some(task)) => {
// task.cpu < 0 is used to notify an exiting task, in this
// case we can simply ignore the task.
if task.cpu >= 0 {
let _ = self.bpf.dispatch_task(&DispatchedTask {
pid: task.pid,
cpu: task.cpu,
cpumask_cnt: task.cpumask_cnt,
payload: 0,
});
}
// Give the task a chance to run and prevent overflowing the dispatch queue.
std::thread::yield_now();
}
Ok(None) => {
// Notify the BPF component that all tasks have been scheduled and dispatched.
self.bpf.update_tasks(Some(0), Some(0));
// All queued tasks have been dispatched, add a short sleep to reduce
// the scheduler's CPU consumption.
std::thread::sleep(Duration::from_millis(1));
break;
}
Err(_) => {
break;
}
}
}
}
fn print_stats(&mut self) {
let nr_user_dispatches = *self.bpf.nr_user_dispatches_mut();
let nr_kernel_dispatches = *self.bpf.nr_kernel_dispatches_mut();
let nr_cancel_dispatches = *self.bpf.nr_cancel_dispatches_mut();
let nr_bounce_dispatches = *self.bpf.nr_bounce_dispatches_mut();
let nr_failed_dispatches = *self.bpf.nr_failed_dispatches_mut();
let nr_sched_congested = *self.bpf.nr_sched_congested_mut();
println!(
"user={} kernel={} cancel={} bounce={} fail={} cong={}",
nr_user_dispatches, nr_kernel_dispatches,
nr_cancel_dispatches, nr_bounce_dispatches,
nr_failed_dispatches, nr_sched_congested,
);
}
fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<()> {
let mut prev_ts = Self::now();
while !shutdown.load(Ordering::Relaxed) && !self.bpf.exited() {
self.dispatch_tasks();
let curr_ts = Self::now();
if curr_ts > prev_ts {
self.print_stats();
prev_ts = curr_ts;
}
}
self.bpf.shutdown_and_report()
}
}
fn main() -> Result<()> {
let mut sched = Scheduler::init()?;
let shutdown = Arc::new(AtomicBool::new(false));
let shutdown_clone = shutdown.clone();
ctrlc::set_handler(move || {
shutdown_clone.store(true, Ordering::Relaxed);
})?;
sched.run(shutdown)
}

View File

@ -1,3 +1,6 @@
src/bpf/.output
intf.h
main.bpf.c
bpf.rs
Cargo.lock
target

View File

@ -8,21 +8,20 @@ license = "GPL-2.0-only"
[dependencies]
anyhow = "1.0.65"
bitvec = { version = "1.0", features = ["serde"] }
clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] }
ctrlc = { version = "3.1", features = ["termination"] }
fb_procfs = "0.7.0"
hex = "0.4.3"
libbpf-rs = "0.22.0"
libc = "0.2.137"
buddy-alloc = "0.5.1"
log = "0.4.17"
ordered-float = "3.4.0"
scx_utils = { path = "../../../rust/scx_utils", version = "0.6" }
scx_rustland_core = { path = "../../../rust/scx_rustland_core", version = "0.1" }
simplelog = "0.12.0"
[build-dependencies]
scx_utils = { path = "../../../rust/scx_utils", version = "0.6" }
scx_rustland_core = { path = "../../../rust/scx_rustland_core", version = "0.1" }
[features]
enable_backtrace = []

View File

@ -4,25 +4,24 @@ This is a single user-defined scheduler used within [sched_ext](https://github.c
## Overview
scx_rustland is made of a BPF component (dispatcher) that implements the low
level sched-ext functionalities and a user-space counterpart (scheduler),
scx_rustland is made of a BPF component (scx_rustland_core) that implements the
low level sched-ext functionalities and a user-space counterpart (scheduler),
written in Rust, that implements the actual scheduling policy.
The BPF dispatcher is completely agnostic of the particular scheduling policy
implemented in user-space. For this reason, developers who want to use this
scheduler to experiment with scheduling policies should be able to simply
modify the Rust component, without having to deal with any internal kernel /
BPF details.
## How To Install
Available as a [Rust crate](https://crates.io/crates/scx_rustland): `cargo add scx_rustland`
## Typical Use Case
scx_rustland is designed to be "easy to read" template that can be used by any
developer to quickly experiment more complex scheduling policies, that can be
fully implemented in Rust.
scx_rustland is designed to prioritize interactive workloads over background
CPU-intensive workloads. For this reason the typical use case of this scheduler
involves low-latency interactive applications, such as gaming, video
conferencing and live streaming.
scx_rustland is also designed to be an "easy to read" template that can be used
by any developer to quickly experiment with more complex scheduling policies,
fully implemented in Rust.
## Production Ready?
@ -32,20 +31,17 @@ with a certain cost.
However, a scheduler entirely implemented in user-space holds the potential for
seamless integration with sophisticated libraries, tracing tools, external
services (e.g., AI), etc. Hence, there might be situations where the benefits
outweigh the overhead, justifying the use of this scheduler in a production
environment.
services (e.g., AI), etc.
Hence, there might be situations where the benefits outweigh the overhead,
justifying the use of this scheduler in a production environment.
## Demo
[scx_rustland-terraria](https://github.com/sched-ext/scx/assets/1051723/42ec3bf2-9f1f-4403-80ab-bf5d66b7c2d5)
For this demo the scheduler includes an extra patch to impose a "time slice
penalty" on new short-lived tasks. While this approach might not be suitable
for general usage, it can yield significant advantages in this specific
scenario.
The key takeaway is to demonstrate the ease and safety of conducting
experiments like this, as we operate in user-space, and we can accomplish
everything simply by modifying the Rust code, that is completely abstracted
from the underlying BPF/kernel internal details.
The key takeaway of this demo is that, despite the overhead of running a
scheduler in user-space, we can still obtain interesting results and, in this
particular case, even outperform the default Linux scheduler (EEVDF) in terms of
application responsiveness (fps), while a CPU-intensive workload (a parallel
kernel build) runs in the background.

View File

@ -2,10 +2,8 @@
// GNU General Public License version 2.
fn main() {
scx_utils::BpfBuilder::new()
scx_rustland_core::RustLandBuilder::new()
.unwrap()
.enable_intf("src/bpf/intf.h", "bpf_intf.rs")
.enable_skel("src/bpf/main.bpf.c", "bpf")
.build()
.unwrap();
}

View File

@ -33,15 +33,18 @@ use log::warn;
const SCHEDULER_NAME: &'static str = "RustLand";
/// scx_rustland: simple user-space scheduler written in Rust
/// scx_rustland: user-space scheduler written in Rust
///
/// The main goal of this scheduler is be an "easy to read" template that can be used to quickly
/// test more complex scheduling policies. For this reason this scheduler is mostly focused on
/// simplicity and code readability.
/// scx_rustland is designed to prioritize interactive workloads over background CPU-intensive
/// workloads. For this reason the typical use case of this scheduler involves low-latency
/// interactive applications, such as gaming, video conferencing and live streaming.
///
/// The scheduler is made of a BPF component (dispatcher) that implements the low level sched-ext
/// functionalities and a user-space counterpart (scheduler), written in Rust, that implements the
/// actual scheduling policy.
/// scx_rustland is also designed to be an "easy to read" template that can be used by any
/// developer to quickly experiment with more complex scheduling policies fully implemented in Rust.
///
/// The scheduler is made of a BPF component (scx_rustland_core) that implements the low level
/// sched-ext functionalities and a user-space counterpart (scheduler), written in Rust, that
/// implements the actual scheduling policy.
///
/// The default scheduling policy implemented in the user-space scheduler is based on virtual
/// runtime (vruntime):
@ -65,10 +68,6 @@ const SCHEDULER_NAME: &'static str = "RustLand";
///
/// === Troubleshooting ===
///
/// - Disable HyperThreading / SMT if you notice poor performance (add "nosmt" to the kernel boot
/// parameters): this scheduler is not NUMA-aware and it implements a simple policy of handling
/// SMT cores.
///
/// - Adjust the time slice boost parameter (option `-b`) to enhance the responsiveness of
/// low-latency applications (i.e., online gaming, live streaming, video conferencing etc.).
///
@ -473,8 +472,7 @@ impl<'a> Scheduler<'a> {
// Reset nr_queued and update nr_scheduled, to notify the dispatcher that
// queued tasks are drained, but there is still some work left to do in the
// scheduler.
*self.bpf.nr_queued_mut() = 0;
*self.bpf.nr_scheduled_mut() = self.task_pool.tasks.len() as u64;
self.bpf.update_tasks(Some(0), Some(self.task_pool.tasks.len() as u64));
break;
}
Err(err) => {
@ -533,10 +531,10 @@ impl<'a> Scheduler<'a> {
None => break,
}
}
// Reset nr_scheduled to notify the dispatcher that all the tasks received by the scheduler
// has been dispatched, so there is no reason to re-activate the scheduler, unless more
// tasks are queued.
self.bpf.skel.bss_mut().nr_scheduled = self.task_pool.tasks.len() as u64;
// Update nr_scheduled to notify the dispatcher that all the tasks received by the
// scheduler have been dispatched, so there is no reason to re-activate the scheduler,
// unless more tasks are queued.
self.bpf.update_tasks(None, Some(self.task_pool.tasks.len() as u64));
}
// Main scheduling function (called in a loop to periodically drain tasks from the queued list