Merge pull request #192 from multics69/scx_lavd

scx_lavd: add LAVD (Latency-criticality Aware Virtual Deadline) scheduler
This commit is contained in:
David Vernet 2024-03-17 22:09:56 -07:00 committed by GitHub
commit 80986e4a23
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 2359 additions and 0 deletions

View File

@ -16,3 +16,4 @@ main.rs or \*.bpf.c files.
- [scx_rusty](scx_rusty/README.md)
- [scx_rustland](scx_rustland/README.md)
- [scx_rlfifo](scx_rlfifo/README.md)
- [scx_lavd](scx_lavd/README.md)

View File

@ -2,3 +2,4 @@ subdir('scx_layered')
subdir('scx_rusty')
subdir('scx_rustland')
subdir('scx_rlfifo')
subdir('scx_lavd')

View File

@ -0,0 +1,32 @@
[package]
name = "scx_lavd"
version = "0.1.0"
authors = ["Changwoo Min <changwoo@igalia.com>", "Igalia"]
edition = "2021"
description = "A Latency-criticality Aware Virtual Deadline (LAVD) scheduler based on sched_ext, which is a Linux kernel feature which enables implementing kernel thread schedulers in BPF and dynamically loading them. https://github.com/sched-ext/scx/tree/main"
license = "GPL-2.0-only"
[dependencies]
anyhow = "1.0.65"
bitvec = { version = "1.0", features = ["serde"] }
clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] }
ctrlc = { version = "3.1", features = ["termination"] }
fb_procfs = "0.7.0"
hex = "0.4.3"
libbpf-rs = "0.22.0"
libc = "0.2.137"
log = "0.4.17"
ordered-float = "3.4.0"
scx_utils = { path = "../../../rust/scx_utils", version = "0.6" }
simplelog = "0.12.0"
static_assertions = "1.1.0"
num_cpus = "1.16.0"
rlimit = "0.10.1"
plain = "0.2.3"
nix = "0.28.0"
[build-dependencies]
scx_utils = { path = "../../../rust/scx_utils", version = "0.6" }
[features]
enable_backtrace = []

View File

@ -0,0 +1 @@
../../../LICENSE

View File

@ -0,0 +1,31 @@
# scx_lavd
This is a single user-defined scheduler used within [sched_ext](https://github.com/sched-ext/scx/tree/main), which is a Linux kernel feature that enables implementing kernel thread schedulers in BPF and dynamically loading them. [Read more about sched_ext](https://github.com/sched-ext/scx/tree/main).
## Overview
scx_lavd is a BPF scheduler that implements an LAVD (Latency-criticality Aware
Virtual Deadline) scheduling algorithm. While LAVD is new and still evolving,
its core ideas are 1) measuring how much a task is latency critical and 2)
leveraging the task's latency-criticality information in making various
scheduling decisions (e.g., task's deadline, time slice, etc.). As the name
implies, LAVD is based on the foundation of deadline scheduling. This scheduler
consists of the BPF part and the rust part. The BPF part makes all the
scheduling decisions; the rust part loads the BPF code and conducts other
chores (e.g., printing sampled scheduling decisions).
## Typical Use Case
scx_lavd is initially motivated by gaming workloads. It aims to improve
interactivity and reduce stuttering while playing games on Linux. Hence, this
scheduler's typical use case involves highly interactive applications, such as
gaming, which require high throughput and low tail latencies.
## Production Ready?
This scheduler could be used in production environments that match what the
current code is optimized for. The current code does not particularly consider multiple NUMA/CCX
domains, so its scheduling decisions in such hardware would be suboptimal. This
scheduler currently will mainly perform well on single CCX / single-socket
hosts.

View File

@ -0,0 +1,13 @@
// Copyright (c) Changwoo Min <changwoo@igalia.com>
//
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
// Build script: at compile time, generate the Rust bindings for the shared
// BPF interface header ("bpf_intf.rs") and the BPF skeleton ("bpf") that the
// userspace loader includes.
fn main() {
    scx_utils::BpfBuilder::new()
        .unwrap()
        // Generate Rust bindings from the C interface header shared with BPF.
        .enable_intf("src/bpf/intf.h", "bpf_intf.rs")
        // Compile main.bpf.c and generate its libbpf skeleton.
        .enable_skel("src/bpf/main.bpf.c", "bpf")
        .build()
        .unwrap();
}

View File

@ -0,0 +1,7 @@
# Build scx_lavd by delegating to cargo. The __PHONY__ output name means
# meson never considers the target up to date on its own; cargo performs the
# real incremental-build checks on every invocation.
custom_target('scx_lavd',
              output: '@PLAINNAME@.__PHONY__',
              input: 'Cargo.toml',
              command: [cargo, 'build', '--manifest-path=@INPUT@', '--target-dir=@OUTDIR@',
                        cargo_build_args],
              env: cargo_env,
              build_by_default: true)

View File

@ -0,0 +1,8 @@
# Get help on options with `rustfmt --help=config`
# Please keep these in alphabetical order.
edition = "2021"
group_imports = "StdExternalCrate"
imports_granularity = "Item"
merge_derives = false
use_field_init_shorthand = true
version = "Two"

View File

@ -0,0 +1,182 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2023, 2024 Changwoo Min <changwoo@igalia.com>
*/
#ifndef __INTF_H
#define __INTF_H
#include <limits.h>
#ifndef __VMLINUX_H__
/*
 * Fallback fixed-width types and constants for when this header is compiled
 * without vmlinux.h (i.e., from the userspace/rust side via bindgen).
 */
typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned int u32;
typedef unsigned long u64;
typedef signed char s8;
typedef signed short s16;
typedef signed int s32;
typedef signed long s64;
typedef int pid_t;
enum {
	TASK_COMM_LEN = 16,
};
#define __kptr
#endif

#ifdef __VMLINUX_H__
/* Definitions needed only on the BPF side, where vmlinux.h is present. */
#define MAX_NICE 19
#define MIN_NICE -20
#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1)
#define MAX_RT_PRIO 100

/* Kernel task-iterator kfuncs, declared weak so loading degrades gracefully. */
struct bpf_iter_task;
extern int bpf_iter_task_new(struct bpf_iter_task *it,
		struct task_struct *task, unsigned int flags) __weak __ksym;
extern struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) __weak __ksym;
extern void bpf_iter_task_destroy(struct bpf_iter_task *it) __weak __ksym;
#endif /* __VMLINUX_H__ */
/*
 * common constants
 */
enum consts {
	/* CLOCK_BOOTTIME clock id; redefined because the uapi constant is not
	 * carried by vmlinux.h -- NOTE(review): confirm it matches the uapi. */
	CLOCK_BOOTTIME = 7,

	NSEC_PER_USEC = 1000L,
	NSEC_PER_MSEC = (1000L * NSEC_PER_USEC),
	LAVD_TIME_ONE_SEC = (1000L * NSEC_PER_MSEC),

	LAVD_MAX_CAS_RETRY = 8, /* retry bound for compare-and-swap loops */

	LAVD_SLICE_MIN_NS = (300 * NSEC_PER_USEC), /* 0.3 msec */
	LAVD_SLICE_MAX_NS = (3 * NSEC_PER_MSEC), /* 3 msec */
	LAVD_TARGETED_LATENCY_NS = (15 * NSEC_PER_MSEC),
	LAVD_SLICE_GREEDY_FT = 3,
	LAVD_LC_FREQ_MAX = 1000000,
	LAVD_LC_RUNTIME_MAX = (2 * NSEC_PER_MSEC),
	LAVD_LC_RUNTIME_SHIFT = 10,
	LAVD_BOOST_RANGE = 14, /* 35% of nice range */
	LAVD_BOOST_WAKEUP_LAT = 1,
	LAVD_SLICE_BOOST_MAX = 3,
	LAVD_GREEDY_RATIO_MAX = USHRT_MAX,
	LAVD_ELIGIBLE_TIME_LAT_FT = 2,
	LAVD_ELIGIBLE_TIME_MAX = (LAVD_TARGETED_LATENCY_NS >> 1),

	LAVD_CPU_UTIL_MAX = 1000, /* 100.0% */
	LAVD_CPU_UTIL_INTERVAL_NS = (100 * NSEC_PER_MSEC), /* 100 msec */

	/* sentinel CPU ids -- NOTE(review): exact semantics defined in
	 * main.bpf.c (not visible in this diff); verify there. */
	LAVD_CPU_ID_HERE = 0xFE,
	LAVD_CPU_ID_NONE = 0xFF,

	LAVD_GLOBAL_DSQ = 0, /* id of the global dispatch queue */
};
/*
 * System-wide CPU utilization
 */
struct sys_cpu_util {
	volatile u64 last_update_clk; /* when the averages below were last refreshed */
	volatile u64 util; /* average of the CPU utilization */
	volatile u64 load_ideal; /* average ideal load of runnable tasks */
	volatile u64 load_actual; /* average actual load of runnable tasks */
};

/*
 * Per-CPU context
 */
struct cpu_ctx {
	/*
	 * Information used to keep track of CPU utilization
	 */
	volatile u64 idle_total; /* total idle time so far */
	volatile u64 idle_start_clk; /* when the CPU becomes idle */

	/*
	 * Information used to keep track of load
	 */
	volatile u64 load_actual; /* actual load of runnable tasks */
	volatile u64 load_ideal; /* ideal load of runnable tasks */
};
/*
 * Per-task scheduling context
 */
enum task_stat {
	_LAVD_TASK_STAT_MIN = 0,
	LAVD_TASK_STAT_STOPPING = _LAVD_TASK_STAT_MIN, /* scheduled out */
	LAVD_TASK_STAT_ENQ, /* enqueued */
	LAVD_TASK_STAT_RUNNING, /* running on a CPU */
	_LAVD_TASK_STAT_MAX = LAVD_TASK_STAT_RUNNING,
};

/* Shared between BPF and userspace; layout must match the generated
 * bindings, so do not reorder or resize fields. */
struct task_ctx {
	/*
	 * Essential task running statistics for latency criticality calculation
	 */
	u64 last_start_clk; /* last time when scheduled in */
	u64 last_stop_clk; /* last time when scheduled out */
	u64 run_time_ns; /* average runtime per schedule */
	u64 run_freq; /* scheduling frequency in a second */
	u64 last_wait_clk; /* last time when a task waits for an event */
	u64 wait_freq; /* waiting frequency in a second */
	u64 wake_freq; /* waking-up frequency in a second */
	u64 last_wake_clk; /* last time when a task wakes up others */

	u64 load_actual; /* task load derived from run_time and run_freq */
	u64 vdeadline_delta_ns; /* NOTE(review): presumably offset to the
				 * virtual deadline -- confirm in main.bpf.c */
	u64 eligible_delta_ns;
	u64 slice_ns; /* assigned time slice */
	u64 greedy_ratio;
	u16 stat; /* NIL -> ENQ -> RUN -> STOP -> NIL ... */
	u16 slice_boost_prio;/* how many times a task fully consumed the slice */
	u16 lat_prio; /* latency priority */
	s16 lat_boost_prio; /* DEBUG */
};

/* Extra, userspace-facing task info attached to introspection messages. */
struct task_ctx_x {
	pid_t pid;
	char comm[TASK_COMM_LEN + 1]; /* NUL-terminated command name */
	u16 static_prio; /* nice priority */
	u16 cpu_id; /* where a task ran */
	u64 cpu_util; /* cpu utilization in [0..100] */
};
/*
 * introspection: the rust side writes a command (cmd/arg) into struct
 * introspec; the BPF side reports sampled task contexts back over a ring
 * buffer as msg_task_ctx records.
 */
enum {
	LAVD_CMD_NOP = 0x0, /* no introspection requested */
	LAVD_CMD_SCHED_N = 0x1, /* sample arg scheduling decisions */
	LAVD_CMD_PID = 0x2, /* trace the task whose PID is arg */
	LAVD_CMD_DUMP = 0x3, /* one-shot dump; userspace resets it to NOP */
};

enum {
	LAVD_MSG_TASKC = 0x1, /* ring-buffer record carries a msg_task_ctx */
};

struct introspec {
	volatile u64 arg; /* command argument (sample count or PID) */
	volatile u32 cmd; /* LAVD_CMD_* */
	u8 requested; /* set by userspace when a command is pending */
};

struct msg_hdr {
	u32 kind; /* LAVD_MSG_* */
};

struct msg_task_ctx {
	struct msg_hdr hdr;
	struct task_ctx taskc;
	struct task_ctx_x taskc_x;
};
#endif /* __INTF_H */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
//
// Copyright (c) 2024 Changwoo Min <changwoo@igalia.com>
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
// The generated bindings follow C naming conventions and contain items this
// crate never references directly, so silence the corresponding lints.
#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
#![allow(dead_code)]

// Rust bindings generated by build.rs from src/bpf/intf.h.
include!(concat!(env!("OUT_DIR"), "/bpf_intf.rs"));

View File

@ -0,0 +1,14 @@
// SPDX-License-Identifier: GPL-2.0
//
// Copyright (c) 2024 Changwoo Min <changwoo@igalia.com>
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
// We can't directly include the generated skeleton in main.rs as it may
// contain compiler attributes that can't be `include!()`ed via macro and we
// can't use the `#[path = "..."]` because `concat!(env!("OUT_DIR"),
// "/bpf.skel.rs")` does not work inside the path attribute yet (see
// https://github.com/rust-lang/rust/pull/83366).
// BPF skeleton generated by build.rs from src/bpf/main.bpf.c (see the note
// above for why include!() is used instead of a module path).
include!(concat!(env!("OUT_DIR"), "/bpf_skel.rs"));

View File

@ -0,0 +1,308 @@
// SPDX-License-Identifier: GPL-2.0
//
// Copyright (c) 2024 Changwoo Min <changwoo@igalia.com>
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
mod bpf_skel;
pub use bpf_skel::*;
pub mod bpf_intf;
pub use bpf_intf::*;
extern crate static_assertions;
extern crate plain;
extern crate libc;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::time::Duration;
use std::mem;
use libc::c_char;
use std::ffi::CStr;
use std::str;
use anyhow::Context;
use anyhow::Result;
use clap::Parser;
use libbpf_rs::skel::OpenSkel as _;
use libbpf_rs::skel::Skel as _;
use libbpf_rs::skel::SkelBuilder as _;
use log::info;
use scx_utils::uei_exited;
use scx_utils::uei_report;
use rlimit::{setrlimit, getrlimit, Resource};
use plain::Plain;
use nix::sys::signal;
// Global run flag; cleared by the SIGINT handler to ask the main loop to
// shut down cleanly.
static RUNNING: AtomicBool = AtomicBool::new(true);
/// scx_lavd: Latency-criticality Aware Virtual Deadline (LAVD) scheduler
///
/// The rust part is minimal. It processes command line options and logs out
/// scheduling statistics. The BPF part makes all the scheduling decisions.
/// See the more detailed overview of the LAVD design at main.bpf.c.
#[derive(Debug, Parser)]
struct Opts {
    // NOTE: the `///` doc comments below double as the clap-generated
    // `--help` text, so they are user-facing strings -- edit with care.

    /// The number of scheduling samples to be reported every second (default: 0)
    #[clap(short = 's', long, default_value = "0")]
    nr_sched_samples: u64,

    /// PID to be tracked all its scheduling activities if specified
    #[clap(short = 'p', long, default_value = "0")]
    pid_traced: u64,

    /// Enable verbose output including libbpf details. Specify multiple
    /// times to increase verbosity.
    #[clap(short = 'v', long, action = clap::ArgAction::Count)]
    verbose: u8,
}
// SAFETY: msg_task_ctx mirrors the C struct in intf.h, which contains only
// integers and a fixed byte array (plain old data), so any bit pattern read
// from the ring buffer is a valid value -- the contract `Plain` requires.
unsafe impl Plain for msg_task_ctx {}
impl msg_task_ctx {
    /// Reinterpret a raw ring-buffer record as a `msg_task_ctx` reference.
    ///
    /// Panics when the record is shorter than the struct or misaligned.
    fn from_bytes(buf: &[u8]) -> &msg_task_ctx {
        let msg = plain::from_bytes(buf);
        msg.expect("The buffer is either too short or not aligned!")
    }
}
impl introspec {
    /// An all-zero introspec, i.e., no command pending.
    fn new() -> Self {
        // SAFETY-adjacent note: introspec is a C-style struct of plain
        // integers, for which the all-zero bit pattern is a valid value.
        unsafe { mem::MaybeUninit::<introspec>::zeroed().assume_init() }
    }

    /// Translate the command-line options into an introspection command.
    /// Sample-count tracing takes precedence over PID tracing.
    fn init(opts: &Opts) -> Self {
        let mut intrspc = introspec::new();
        if opts.nr_sched_samples > 0 {
            intrspc.cmd = LAVD_CMD_SCHED_N;
            intrspc.arg = opts.nr_sched_samples;
        } else if opts.pid_traced > 0 {
            intrspc.cmd = LAVD_CMD_PID;
            intrspc.arg = opts.pid_traced;
        } else {
            intrspc.cmd = LAVD_CMD_NOP;
        }
        intrspc.requested = false as u8;
        intrspc
    }
}
// Userspace handle for the loaded LAVD scheduler.
struct Scheduler<'a> {
    skel: BpfSkel<'a>,                      // loaded BPF skeleton
    struct_ops: Option<libbpf_rs::Link>,    // keeps the struct_ops attached
    nr_cpus_onln: u64,                      // number of online CPUs at init
    rb_mgr: libbpf_rs::RingBuffer<'static>, // introspection message channel
    intrspc: introspec,                     // requested introspection command
}
impl<'a> Scheduler<'a> {
    /// Open, verify, load, and attach the BPF scheduler, and set up the
    /// ring buffer used to receive introspection messages from BPF.
    fn init(opts: &'a Opts) -> Result<Self> {
        // Increase MEMLOCK size since the BPF scheduler might use
        // more than the current limit. The soft limit is the one that is
        // actually enforced, so raise both soft and hard to infinity;
        // re-writing the old soft limit (as before) left the effective
        // cap unchanged.
        setrlimit(Resource::MEMLOCK, rlimit::INFINITY, rlimit::INFINITY).unwrap();

        // Open the BPF prog first for verification.
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(opts.verbose > 0);
        let mut skel = skel_builder.open()
            .context("Failed to open BPF program")?;

        // Initialize skel according to @opts.
        let nr_cpus_onln = num_cpus::get() as u64;
        skel.bss_mut().nr_cpus_onln = nr_cpus_onln;
        skel.rodata_mut().verbose = opts.verbose;
        let intrspc = introspec::init(opts);

        // Load and attach.
        let mut skel = skel.load().context("Failed to load BPF program")?;
        skel.attach().context("Failed to attach BPF program")?;
        let struct_ops = Some(
            skel.maps_mut()
                .lavd_ops()
                .attach_struct_ops()
                .context("Failed to attach scx_lavd struct ops")?,
        );

        // Build a ring buffer for instrumentation messages from BPF.
        let mut maps = skel.maps_mut();
        let rb_map = maps.introspec_msg();
        let mut builder = libbpf_rs::RingBufferBuilder::new();
        builder.add(rb_map, Scheduler::print_bpf_msg).unwrap();
        let rb_mgr = builder.build().unwrap();

        Ok(Self {
            skel,
            struct_ops,
            nr_cpus_onln,
            rb_mgr,
            intrspc,
        })
    }

    /// Monotonically increasing, 1-based sequence number for printed
    /// messages (used below to re-print the header every 32 rows).
    fn get_msg_seq_id() -> u64 {
        // An atomic counter instead of the previous `static mut`: mutating
        // a `static mut` without synchronization is racy, and references to
        // mutable statics are rejected by newer compilers (static_mut_refs).
        static MSEQ: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);
        MSEQ.fetch_add(1, Ordering::Relaxed) + 1
    }

    /// Ring buffer callback: pretty-print one task context sampled by the
    /// BPF scheduler. Returns 0 so polling continues.
    fn print_bpf_msg(data: &[u8]) -> i32 {
        let mt = msg_task_ctx::from_bytes(data);
        let tx = mt.taskc_x;
        let tc = mt.taskc;

        // No idea how to print other types than LAVD_MSG_TASKC
        if mt.hdr.kind != LAVD_MSG_TASKC {
            return 0;
        }

        // Print a message from the BPF scheduler
        let mseq = Scheduler::get_msg_seq_id();
        if mseq % 32 == 1 {
            info!("| {:9} | {:8} | {:17} \
                   | {:4} | {:9} | {:9} \
                   | {:10} | {:9} | {:8} \
                   | {:12} | {:7} | {:9} \
                   | {:9} | {:9} | {:9} \
                   | {:9} | {:8} |",
                  "mseq", "pid", "comm",
                  "cpu", "vddln_ns", "elglty_ns",
                  "slice_ns", "grdy_rt", "lat_prio",
                  "static_prio", "lat_bst", "slice_bst",
                  "run_freq", "run_tm_ns", "wait_freq",
                  "wake_freq", "cpu_util");
        }

        // comm is a NUL-terminated C string embedded in a fixed-size array;
        // convert it to &str for printing.
        let c_tx_cm: *const c_char = (&tx.comm as *const [i8; 17]) as *const i8;
        let c_tx_cm_str: &CStr = unsafe { CStr::from_ptr(c_tx_cm) };
        let tx_comm: &str = c_tx_cm_str.to_str().unwrap();

        info!("| {:9} | {:8} | {:17} \
               | {:4} | {:9} | {:9} \
               | {:10} | {:9} | {:8} \
               | {:12} | {:7} | {:9} \
               | {:9} | {:9} | {:9} \
               | {:9} | {:8} | ",
              mseq, tx.pid, tx_comm,
              tx.cpu_id, tc.vdeadline_delta_ns, tc.eligible_delta_ns,
              tc.slice_ns, tc.greedy_ratio, tc.lat_prio,
              tx.static_prio, tc.lat_boost_prio, tc.slice_boost_prio,
              tc.run_freq, tc.run_time_ns, tc.wait_freq,
              tc.wake_freq, tx.cpu_util);
        0
    }

    /// Push the pending introspection command down to the BPF side and
    /// return the reporting interval in milliseconds.
    fn prep_introspec(&mut self) -> u64 {
        let mut interval_ms = 1000;
        if self.intrspc.cmd == LAVD_CMD_SCHED_N &&
           self.intrspc.arg > self.nr_cpus_onln {
            // More samples, shorter sampling interval.
            let f = self.intrspc.arg / self.nr_cpus_onln * 2;
            interval_ms /= f;
        }
        self.intrspc.requested = true as u8;

        self.skel.bss_mut().intrspc.cmd = self.intrspc.cmd;
        self.skel.bss_mut().intrspc.arg = self.intrspc.arg;
        self.skel.bss_mut().intrspc.requested = self.intrspc.requested;

        interval_ms
    }

    /// Reset one-shot introspection commands after they have been served.
    fn cleanup_introspec(&mut self) {
        // If not yet requested, do nothing.
        if self.intrspc.requested == false as u8 {
            return;
        }

        // Once dumped, it is done: reset to NOP so the dump runs only once.
        if self.intrspc.cmd == LAVD_CMD_DUMP {
            self.intrspc.cmd = LAVD_CMD_NOP;
        }
    }

    /// Whether the main loop should keep running: no SIGINT received and
    /// the BPF side has not reported an exit.
    fn running(&mut self) -> bool {
        RUNNING.load(Ordering::Relaxed) && !uei_exited!(&self.skel.bss().uei)
    }

    /// Main loop: periodically request introspection data, poll the ring
    /// buffer for sampled task contexts, and print them until shutdown.
    fn run(&mut self) -> Result<()> {
        while self.running() {
            let interval_ms = self.prep_introspec();
            std::thread::sleep(Duration::from_millis(interval_ms));
            self.rb_mgr.poll(Duration::from_millis(100)).unwrap();
            self.cleanup_introspec();
        }
        // Drain any remaining messages and detach the scheduler before
        // reporting why the BPF side exited (if it did).
        self.rb_mgr.consume().unwrap();
        self.struct_ops.take();
        uei_report!(&self.skel.bss().uei)
    }
}
impl<'a> Drop for Scheduler<'a> {
    /// Detach the scheduler when the handle goes away.
    fn drop(&mut self) {
        // Taking the Option drops the contained Link (if any) at the end of
        // the statement, which detaches the BPF struct_ops.
        self.struct_ops.take();
    }
}
fn init_log(opts: & Opts) {
let llv = match opts.verbose {
0 => simplelog::LevelFilter::Info,
1 => simplelog::LevelFilter::Debug,
_ => simplelog::LevelFilter::Trace,
};
let mut lcfg = simplelog::ConfigBuilder::new();
lcfg.set_time_level(simplelog::LevelFilter::Error)
.set_location_level(simplelog::LevelFilter::Off)
.set_target_level(simplelog::LevelFilter::Off)
.set_thread_level(simplelog::LevelFilter::Off);
simplelog::TermLogger::init(
llv,
lcfg.build(),
simplelog::TerminalMode::Stderr,
simplelog::ColorChoice::Auto,
).unwrap();
}
/// SIGINT handler: request a clean shutdown of the main loop.
///
/// Only async-signal-safe operations may run in a signal handler; an atomic
/// store qualifies.
extern "C" fn handle_sigint(_: libc::c_int, _: *mut libc::siginfo_t, _: *mut libc::c_void) {
    RUNNING.store(false, Ordering::SeqCst);
}
/// Install the SIGINT (Ctrl-c) handler that clears RUNNING.
fn init_signal_handlers() {
    // Ctrl-c for termination
    // SAFETY: called once at startup; the handler (handle_sigint) only
    // performs an async-signal-safe atomic store, and the SigAction struct
    // is valid for the duration of the sigaction() call.
    unsafe {
        let sigint_action = signal::SigAction::new(
            signal::SigHandler::SigAction(handle_sigint),
            signal::SaFlags::empty(), signal::SigSet::empty());
        signal::sigaction(signal::SIGINT, &sigint_action).unwrap();
    }
}
/// Entry point: parse options, set up logging and signal handling, load the
/// BPF scheduler, and run the reporting loop until exit is requested.
fn main() -> Result<()> {
    let opts = Opts::parse();

    init_log(&opts);

    let mut sched = Scheduler::init(&opts)?;
    info!("scx_lavd scheduler is initialized");

    init_signal_handlers();

    info!("scx_lavd scheduler starts running.");
    sched.run()
}