scx_bpfland: introduce primary scheduling domain

Allow specifying a primary scheduling domain via the new command line
option `--primary-domain CPUMASK`, where CPUMASK is a hex bitmask of
arbitrary length representing the CPUs assigned to the domain.

If this option is not specified, the scheduler will use all the
available CPUs in the system as the primary domain (no behavior change).

Otherwise, if a primary scheduling domain is defined, the scheduler will
try to dispatch tasks only to the CPUs assigned to the primary domain,
until these CPUs are saturated, at which point tasks may overflow to
other available CPUs.
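
In other words, CPU selection prefers an idle CPU from the primary
domain and only overflows to the remaining CPUs when none is available.
A minimal model of this policy (illustrative Rust sketch, not the actual
BPF implementation; is_idle() is a hypothetical stand-in for the
kernel's idle-CPU tracking):

  // Prefer an idle CPU from the primary domain; if none is available,
  // overflow to any other idle CPU in the system.
  fn pick_cpu(primary: &[usize], others: &[usize],
              is_idle: impl Fn(usize) -> bool) -> Option<usize> {
      primary.iter().chain(others.iter()).copied().find(|&cpu| is_idle(cpu))
  }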

This feature can be used to prioritize certain cores over others, and
it can be particularly effective on systems with heterogeneous cores
(e.g., hybrid systems with P-cores and E-cores).

== Example (hybrid architecture) ==

Hardware:
 - Dell Precision 5480 with 13th Gen Intel(R) Core(TM) i7-13800H
   - 6 P-cores 0..5  with 2 CPUs each (CPUs  0..11)
   - 8 E-cores 6..13 with 1 CPU  each (CPUs 12..19)
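
In the masks used below, bit N of CPUMASK selects CPU N, so the P-cores
cover bits 0..11 (0x00fff) and the E-cores cover bits 12..19 (0xff000).
A quick way to double-check that arithmetic (standalone Rust sketch, not
part of this change):

  // Build a cpumask with one bit set for each CPU in the given range.
  fn cpu_range_mask(cpus: std::ops::Range<u32>) -> u64 {
      cpus.fold(0u64, |mask, cpu| mask | (1u64 << cpu))
  }

  fn main() {
      assert_eq!(cpu_range_mask(0..12), 0x00fff);  // P-cores: CPUs 0..11
      assert_eq!(cpu_range_mask(12..20), 0xff000); // E-cores: CPUs 12..19
  }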

== Test ==

WebGL application (https://webglsamples.org/aquarium/aquarium.html):
this generates a steady workload on the system without over-saturating
the CPUs.

Use different scheduler configurations:

 - EEVDF (default)
 - scx_bpfland using P-cores only (--primary-domain 0x00fff)
 - scx_bpfland using E-cores only (--primary-domain 0xff000)

Measure performance (fps) and power consumption (W).

== Result ==

+-----------------+-----+-----+------+-------+--------+
|                 | min | max | avg  |       |        |
|                 | fps | fps | fps  | stdev | power  |
+-----------------+-----+-----+------+-------+--------+
| EEVDF           | 28  | 34  | 31.0 |  1.73 |  3.5W  |
| bpfland-p-cores | 33  | 34  | 33.5 |  0.29 |  3.5W  |
| bpfland-e-cores | 25  | 26  | 25.5 |  0.29 |  2.2W  |
+-----------------+-----+-----+------+-------+--------+

Using a primary scheduling domain of only P-cores with scx_bpfland
delivers a more stable and predictable level of performance, with an
average of 33.5 fps and an error of ±0.5 fps.

In contrast, using EEVDF results in an average frame rate of 31.0 fps
with an error of ±3.0 fps, indicating slightly less consistency, because
tasks are evenly distributed across all the cores in the system (both
slow and fast cores).

On the other hand, using a scheduling domain made solely of E-cores with
scx_bpfland results in a lower average frame rate (25.5 fps), while
still maintaining stable performance (error of ±0.5 fps). Power
consumption is also reduced, averaging 2.2W, compared to 3.5W with
either of the other configurations.

== Conclusion ==

In summary, with this change users have the flexibility to prioritize
scheduling on performance cores for better performance and consistency,
or prioritize energy efficient cores for reduced power consumption, on
hybrid architectures.

Moreover, this feature can also be used to minimize the number of cores
used by the scheduler, until they reach full capacity. This capability
can be useful for reducing power consumption even in homogeneous systems
or for conducting scheduling experiments with smaller sets of cores,
provided the system is not overcommitted.

Signed-off-by: Andrea Righi <andrea.righi@linux.dev>
Author: Andrea Righi
Date:   2024-08-12 10:02:12 +02:00
Commit: f9a994412d (parent a6e977c70b)
3 changed files with 269 additions and 55 deletions


@@ -34,4 +34,8 @@ typedef signed long s64;
 typedef int pid_t;
 #endif /* __VMLINUX_H__ */
 
+struct cpu_arg {
+        s32 cpu_id;
+};
+
 #endif /* __INTF_H */


@@ -106,6 +106,12 @@ volatile u64 nr_running, nr_waiting, nr_interactive, nr_online_cpus;
  */
 UEI_DEFINE(uei);
 
+/*
+ * Mask of CPUs that the scheduler can use, until the system becomes saturated,
+ * at which point tasks may overflow to other available CPUs.
+ */
+private(BPFLAND) struct bpf_cpumask __kptr *allowed_cpumask;
+
 /*
  * Mask of offline CPUs, used to properly support CPU hotplugging.
  */
@@ -150,9 +156,9 @@ static u64 vtime_now;
  */
 struct task_ctx {
         /*
-         * Set to true if the task is classified as interactive.
+         * A temporary cpumask for calculating the allowed CPU mask.
          */
-        bool is_interactive;
+        struct bpf_cpumask __kptr *cpumask;
 
         /*
          * Voluntary context switches metrics.
@@ -160,6 +166,11 @@ struct task_ctx {
         u64 nvcsw;
         u64 nvcsw_ts;
         u64 avg_nvcsw;
+
+        /*
+         * Set to true if the task is classified as interactive.
+         */
+        bool is_interactive;
 };
 
 /* Map that contains task-local storage. */
@@ -196,6 +207,19 @@ struct task_ctx *lookup_task_ctx(const struct task_struct *p)
         return tctx;
 }
 
+/*
+ * Return true if the task is interactive, false otherwise.
+ */
+static bool is_task_interactive(struct task_struct *p)
+{
+        struct task_ctx *tctx;
+
+        tctx = try_lookup_task_ctx(p);
+        if (!tctx)
+                return false;
+        return tctx->is_interactive;
+}
+
 /*
  * Return true if the target task @p is a kernel thread.
  */
@@ -288,8 +312,6 @@ static inline bool vtime_before(u64 a, u64 b)
  */
 static inline u64 task_vtime(struct task_struct *p)
 {
-        u64 vtime = p->scx.dsq_vtime;
-
         /*
          * Limit the vruntime to (vtime_now - slice_ns_lag) to avoid
          * excessively penalizing tasks.
@@ -300,7 +322,7 @@ static inline u64 task_vtime(struct task_struct *p)
          *
          * Instead, a negative slice_ns_lag can result in more consistent
          * performance (less spikey), smoothing the reordering of the vruntime
          * scheduling and making the scheduler closer to a FIFO.
          */
         if (vtime_before(p->scx.dsq_vtime, vtime_now - slice_ns_lag))
                 p->scx.dsq_vtime = vtime_now - slice_ns_lag;
@@ -409,8 +431,14 @@ static int dispatch_direct_cpu(struct task_struct *p, s32 cpu, u64 enq_flags)
 static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
 {
         const struct cpumask *online_cpumask, *idle_smtmask, *idle_cpumask;
+        struct bpf_cpumask *p_mask, *allowed;
+        struct task_ctx *tctx;
         s32 cpu;
 
+        tctx = try_lookup_task_ctx(p);
+        if (!tctx)
+                return prev_cpu;
+
         /*
          * For tasks that can run only on a single CPU, we can simply verify if
          * their only allowed CPU is idle.
@@ -424,6 +452,10 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
                 return -ENOENT;
         }
 
+        allowed = allowed_cpumask;
+        if (!allowed)
+                return -ENOENT;
+
         /*
          * Acquire the CPU masks to determine the online and idle CPUs in the
          * system.
@@ -432,6 +464,18 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
         idle_smtmask = scx_bpf_get_idle_smtmask();
         idle_cpumask = scx_bpf_get_idle_cpumask();
 
+        p_mask = tctx->cpumask;
+        if (!p_mask) {
+                cpu = prev_cpu;
+                goto out_put_cpumask;
+        }
+
+        /*
+         * Enable the task to run in the intersection of its permitted CPUs and
+         * the primary scheduling domain.
+         */
+        bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(allowed));
+
         /*
          * Find the best idle CPU, prioritizing full idle cores in SMT systems.
          */
@@ -440,7 +484,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
          * If the task can still run on the previously used CPU and
          * it's a full-idle core, keep using it.
          */
-        if (bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr) &&
+        if (bpf_cpumask_test_cpu(prev_cpu, cast_mask(p_mask)) &&
             bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) &&
             scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
                 cpu = prev_cpu;
@@ -450,7 +494,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
         /*
          * Otherwise, search for another usable full-idle core.
          */
-        cpu = bpf_cpumask_any_and_distribute(p->cpus_ptr, idle_smtmask);
+        cpu = bpf_cpumask_any_and_distribute(cast_mask(p_mask), idle_smtmask);
         if (bpf_cpumask_test_cpu(cpu, online_cpumask) &&
             scx_bpf_test_and_clear_cpu_idle(cpu))
                 goto out_put_cpumask;
@@ -460,7 +504,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
          * If a full-idle core can't be found (or if this is not an SMT system)
          * try to re-use the same CPU, even if it's not in a full-idle core.
          */
-        if (bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr) &&
+        if (bpf_cpumask_test_cpu(prev_cpu, cast_mask(p_mask)) &&
             scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
                 cpu = prev_cpu;
                 goto out_put_cpumask;
@@ -470,7 +514,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
          * If all the previous attempts have failed, try to use any idle CPU in
          * the system.
          */
-        cpu = bpf_cpumask_any_and_distribute(p->cpus_ptr, idle_cpumask);
+        cpu = bpf_cpumask_any_and_distribute(cast_mask(p_mask), idle_cpumask);
         if (bpf_cpumask_test_cpu(cpu, online_cpumask) &&
             scx_bpf_test_and_clear_cpu_idle(cpu))
                 goto out_put_cpumask;
@@ -497,6 +541,19 @@ static bool is_prio_congested(void)
         return scx_bpf_dsq_nr_queued(prio_dsq_id) > nr_online_cpus * 4;
 }
 
+s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
+{
+        s32 cpu;
+
+        cpu = pick_idle_cpu(p, prev_cpu, wake_flags);
+        if (cpu >= 0 && !dispatch_direct_cpu(p, cpu, 0)) {
+                __sync_fetch_and_add(&nr_direct_dispatches, 1);
+                return cpu;
+        }
+
+        return prev_cpu;
+}
+
 /*
  * Handle synchronous wake-up event for a task.
  */
@@ -513,32 +570,13 @@ static void handle_sync_wakeup(struct task_struct *p)
          * promote additional interactive tasks, instead we give priority to
          * the tasks that are already classified as interactive.
          */
-        tctx = lookup_task_ctx(p);
+        tctx = try_lookup_task_ctx(p);
         if (!tctx)
                 return;
         if (!tctx->is_interactive && !is_prio_congested())
                 tctx->is_interactive = true;
 }
 
-s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
-{
-        s32 cpu;
-
-        /*
-         * Try to prioritize newly awakened tasks.
-         */
-        if (wake_flags & SCX_WAKE_SYNC)
-                handle_sync_wakeup(p);
-
-        cpu = pick_idle_cpu(p, prev_cpu, wake_flags);
-        if (cpu >= 0 && !dispatch_direct_cpu(p, cpu, 0)) {
-                __sync_fetch_and_add(&nr_direct_dispatches, 1);
-                return cpu;
-        }
-
-        return prev_cpu;
-}
-
 /*
  * Dispatch all the other tasks that were not dispatched directly in
  * select_cpu().
@@ -547,7 +585,14 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
 {
         u64 vtime = task_vtime(p);
         u64 slice = task_slice(p);
-        struct task_ctx *tctx;
 
+        /*
+         * If the system is saturated and we couldn't dispatch directly in
+         * select_cpu(), try to prioritize newly awakened tasks by immediately
+         * promoting them as interactive.
+         */
+        if (enq_flags & SCX_ENQ_WAKEUP)
+                handle_sync_wakeup(p);
+
         /*
          * Always dispatch per-CPU kthreads directly on their target CPU if
@@ -561,10 +606,6 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
                 }
         }
 
-        tctx = lookup_task_ctx(p);
-        if (!tctx)
-                return;
-
         /*
          * Dispatch interactive tasks to the priority DSQ and regular tasks to
          * the shared DSQ.
@@ -574,7 +615,7 @@
          * that can consume them) we can just dispatch them to the shared DSQ
          * and simply rely on the vruntime logic.
          */
-        if (tctx->is_interactive) {
+        if (is_task_interactive(p)) {
                 scx_bpf_dispatch_vtime(p, prio_dsq_id, slice, vtime, enq_flags);
                 __sync_fetch_and_add(&nr_prio_dispatches, 1);
         } else {
@@ -726,12 +767,6 @@ void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
 
 void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
 {
-        struct task_ctx *tctx;
-
-        tctx = lookup_task_ctx(p);
-        if (!tctx)
-                return;
-
         /* Update global vruntime */
         if (vtime_before(vtime_now, p->scx.dsq_vtime))
                 vtime_now = p->scx.dsq_vtime;
@@ -744,7 +779,7 @@ void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
         p->scx.slice = slice_ns;
 
         /* Update CPU interactive state */
-        if (tctx->is_interactive)
+        if (is_task_interactive(p))
                 __sync_fetch_and_add(&nr_interactive, 1);
 
         __sync_fetch_and_add(&nr_running, 1);
@@ -885,11 +920,22 @@ void BPF_STRUCT_OPS(bpfland_cpu_offline, s32 cpu)
 s32 BPF_STRUCT_OPS(bpfland_init_task, struct task_struct *p,
                    struct scx_init_task_args *args)
 {
-        if (bpf_task_storage_get(&task_ctx_stor, p, 0,
-                                 BPF_LOCAL_STORAGE_GET_F_CREATE))
-                return 0;
-        else
+        struct task_ctx *tctx;
+        struct bpf_cpumask *cpumask;
+
+        tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
+                                    BPF_LOCAL_STORAGE_GET_F_CREATE);
+        if (!tctx)
                 return -ENOMEM;
+
+        cpumask = bpf_cpumask_create();
+        if (!cpumask)
+                return -ENOMEM;
+
+        cpumask = bpf_kptr_xchg(&tctx->cpumask, cpumask);
+        if (cpumask)
+                bpf_cpumask_release(cpumask);
+        return 0;
 }
 
 /*
@@ -914,6 +960,51 @@ s32 get_nr_online_cpus(void)
         return cpus;
 }
 
+static int init_allowed_cpus(void)
+{
+        struct bpf_cpumask *mask;
+        int err = 0;
+
+        /*
+         * Do nothing if the mask is already initialized.
+         */
+        mask = allowed_cpumask;
+        if (mask)
+                return 0;
+        /*
+         * Create the allowed CPU mask.
+         */
+        err = calloc_cpumask(&allowed_cpumask);
+        if (!err)
+                mask = allowed_cpumask;
+        if (!mask)
+                err = -ENOMEM;
+
+        return err;
+}
+
+SEC("syscall")
+int enable_cpu(struct cpu_arg *input)
+{
+        struct bpf_cpumask *mask;
+        int err = 0;
+
+        /* Make sure the allowed CPU mask is initialized */
+        err = init_allowed_cpus();
+        if (err)
+                return err;
+        /*
+         * Enable the target CPU in the primary scheduling domain.
+         */
+        bpf_rcu_read_lock();
+        mask = allowed_cpumask;
+        if (mask)
+                bpf_cpumask_set_cpu(input->cpu_id, mask);
+        bpf_rcu_read_unlock();
+
+        return err;
+}
+
 s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
 {
         struct bpf_cpumask *mask;
@@ -966,7 +1057,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
         if (err)
                 return err;
 
-        return err;
+        /* Initialize the primary scheduling domain */
+        return init_allowed_cpus();
 }
 
 void BPF_STRUCT_OPS(bpfland_exit, struct scx_exit_info *ei)


@@ -10,31 +10,33 @@ pub use bpf_skel::*;
 pub mod bpf_intf;
 pub use bpf_intf::*;
 
+use std::ffi::c_int;
 use std::fs::File;
 use std::io::Read;
 use std::mem::MaybeUninit;
+use std::str;
 use std::sync::atomic::AtomicBool;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
 use std::time::Duration;
-use std::str;
 
 use anyhow::bail;
 use anyhow::Context;
 use anyhow::Result;
 use clap::Parser;
 use log::info;
+use log::warn;
 use metrics::{gauge, Gauge};
 use metrics_exporter_prometheus::PrometheusBuilder;
 use rlimit::{getrlimit, setrlimit, Resource};
 
-use libbpf_rs::OpenObject;
 use libbpf_rs::skel::OpenSkel;
 use libbpf_rs::skel::Skel;
 use libbpf_rs::skel::SkelBuilder;
+use libbpf_rs::OpenObject;
+use libbpf_rs::ProgramInput;
 
 use scx_utils::build_id;
 use scx_utils::scx_ops_attach;
@@ -46,6 +48,69 @@ use scx_utils::UserExitInfo;
 
 const SCHEDULER_NAME: &'static str = "scx_bpfland";
 
+#[derive(Debug, Clone)]
+struct CpuMask {
+    mask: Vec<u64>,
+    num_bits: usize,
+}
+
+impl CpuMask {
+    pub fn from_mask(mask: Vec<u64>, num_bits: usize) -> Self {
+        Self { mask, num_bits }
+    }
+
+    pub fn is_cpu_set(&self, cpu: usize) -> bool {
+        if self.num_bits == 0 {
+            return true;
+        }
+        if cpu >= self.num_bits {
+            return false;
+        }
+        let idx = cpu / 64;
+        let bit = cpu % 64;
+        self.mask
+            .get(idx)
+            .map_or(false, |&val| val & (1 << bit) != 0)
+    }
+
+    pub fn from_str(hex_str: &str) -> Result<Self, std::num::ParseIntError> {
+        let hex_str = hex_str.trim_start_matches("0x");
+        let num_bits = hex_str.len() * 4;
+        let num_u64s = (num_bits + 63) / 64;
+        let padded_hex_str = format!("{:0>width$}", hex_str, width = num_u64s * 16);
+        let mask = (0..num_u64s)
+            .rev()
+            .map(|i| u64::from_str_radix(&padded_hex_str[i * 16..(i + 1) * 16], 16))
+            .collect::<Result<Vec<_>, _>>()?;
+
+        Ok(CpuMask::from_mask(mask, num_bits))
+    }
+
+    pub fn to_string(&self) -> String {
+        if self.num_bits == 0 {
+            return "all".to_string();
+        }
+        let mut hex_str = String::new();
+        for &chunk in self.mask.iter().rev() {
+            hex_str.push_str(&format!("{:016x}", chunk));
+        }
+        // Remove leading zeros, but keep at least one digit.
+        hex_str = hex_str.trim_start_matches('0').to_string();
+        if hex_str.is_empty() {
+            hex_str = "0".to_string();
+        }
+
+        format!("0x{}", hex_str)
+    }
+}
+
+// Custom parser function for cpumask using CpuMask's from_str method
+fn parse_cpumask(hex_str: &str) -> Result<CpuMask, std::num::ParseIntError> {
+    CpuMask::from_str(hex_str)
+}
+
 /// scx_bpfland: a vruntime-based sched_ext scheduler that prioritizes interactive workloads.
 ///
 /// This scheduler is derived from scx_rustland, but it is fully implemented in BFP with minimal
@@ -87,6 +152,14 @@ struct Opts {
     #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
     local_kthreads: bool,
 
+    /// Specifies the initial set of CPUs, represented as a bitmask in hex (e.g., 0xff), that the
+    /// scheduler will use to dispatch tasks, until the system becomes saturated, at which point
+    /// tasks may overflow to other available CPUs.
+    ///
+    /// (empty string = all CPUs are used for initial scheduling)
+    #[clap(short = 'm', long, default_value = "", value_parser = parse_cpumask)]
+    primary_domain: CpuMask,
+
     /// Maximum threshold of voluntary context switch per second, used to classify interactive
     /// tasks (0 = disable interactive tasks classification).
     #[clap(short = 'c', long, default_value = "10")]
@@ -205,8 +278,13 @@ impl<'a> Scheduler<'a> {
         skel.maps.rodata_data.starvation_thresh_ns = opts.starvation_thresh_us * 1000;
         skel.maps.rodata_data.nvcsw_max_thresh = opts.nvcsw_max_thresh;
 
-        // Attach the scheduler.
+        // Load the BPF program for validation.
         let mut skel = scx_ops_load!(skel, bpfland_ops, uei)?;
+
+        // Initialize primary domain CPUs.
+        Self::init_primary_domain(&mut skel, &opts.primary_domain)?;
+
+        // Attach the scheduler.
         let struct_ops = Some(scx_ops_attach!(skel, bpfland_ops)?);
 
         // Enable Prometheus metrics.
@@ -224,6 +302,45 @@
         })
     }
 
+    fn enable_cpu(skel: &mut BpfSkel<'_>, cpu: usize) -> Result<(), u32> {
+        let prog = &mut skel.progs.enable_cpu;
+        let mut args = cpu_arg {
+            cpu_id: cpu as c_int,
+        };
+        let input = ProgramInput {
+            context_in: Some(unsafe {
+                std::slice::from_raw_parts_mut(
+                    &mut args as *mut _ as *mut u8,
+                    std::mem::size_of_val(&args),
+                )
+            }),
+            ..Default::default()
+        };
+        let out = prog.test_run(input).unwrap();
+        if out.return_value != 0 {
+            return Err(out.return_value);
+        }
+
+        Ok(())
+    }
+
+    fn init_primary_domain(skel: &mut BpfSkel<'_>, primary_domain: &CpuMask) -> Result<()> {
+        info!("primary CPU domain = {}", primary_domain.to_string());
+
+        for cpu in 0..libbpf_rs::num_possible_cpus().unwrap() {
+            if primary_domain.is_cpu_set(cpu) {
+                if let Err(err) = Self::enable_cpu(skel, cpu) {
+                    warn!(
+                        "Failed to add CPU {} to primary domain: error {}",
+                        cpu, err
+                    );
+                }
+            }
+        }
+
+        Ok(())
+    }
+
     fn update_stats(&mut self) {
         let nr_cpus = self.skel.maps.bss_data.nr_online_cpus;
         let nr_running = self.skel.maps.bss_data.nr_running;
@@ -245,7 +362,8 @@
             .nr_waiting
             .set(nr_waiting as f64);
         self.metrics
-            .nvcsw_avg_thresh.set(nvcsw_avg_thresh as f64);
+            .nvcsw_avg_thresh
+            .set(nvcsw_avg_thresh as f64);
         self.metrics
             .nr_direct_dispatches
             .set(nr_direct_dispatches as f64);