scx_bpfland: get rid of preferred domain

Using the turbo-boosted CPUs as the preferred scheduling domain seems to
be beneficial only in a very few corner cases, for example on
battery-powered devices with an aggressive cpufreq governor that
constantly tries to scale down the frequency (and even in that case it
is probably better not to force tasks onto the fast CPUs, in order to
save power).

In practice the preferred domain seems to introduce more overhead than
benefit overall, so let's get rid of it.

This can be improved in the future by adding multiple user-configurable
scheduling domains.

Signed-off-by: Andrea Righi <andrea.righi@linux.dev>
Andrea Righi 2024-09-10 23:50:14 +02:00
parent 4fb2b09a6e
commit 079a53c689
2 changed files with 18 additions and 163 deletions


@@ -139,11 +139,6 @@ UEI_DEFINE(uei);
  */
 private(BPFLAND) struct bpf_cpumask __kptr *primary_cpumask;
-/*
- * Mask of preferred CPUs in the system.
- */
-private(BPFLAND) struct bpf_cpumask __kptr *preferred_cpumask;
 /*
  * Mask of offline CPUs, used to properly support CPU hotplugging.
  */
@@ -290,7 +285,7 @@ static bool is_task_interactive(struct task_struct *p)
  */
 static inline bool is_kthread(const struct task_struct *p)
 {
-        return !!(p->flags & PF_KTHREAD);
+        return p->flags & PF_KTHREAD;
 }
 /*
@@ -538,11 +533,10 @@ static void handle_sync_wakeup(struct task_struct *p)
  * to handle these mistakes in favor of a more efficient response and a reduced
  * scheduling overhead.
  */
-static s32 pick_idle_cpu(struct task_struct *p,
-                         s32 prev_cpu, u64 wake_flags, bool do_preferred)
+static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
 {
         const struct cpumask *online_cpumask, *idle_smtmask, *idle_cpumask;
-        struct bpf_cpumask *primary, *preferred, *l2_domain, *l3_domain;
+        struct bpf_cpumask *primary, *l2_domain, *l3_domain;
         struct bpf_cpumask *p_mask, *l2_mask, *l3_mask;
         struct task_ctx *tctx;
         struct cpu_ctx *cctx;
@@ -558,9 +552,6 @@ static s32 pick_idle_cpu(struct task_struct *p,
         primary = primary_cpumask;
         if (!primary)
                 return -ENOENT;
-        preferred = preferred_cpumask;
-        if (!preferred)
-                return -ENOENT;
         /*
          * If the task isn't allowed to use its previously used CPU it means
@@ -574,14 +565,9 @@ static s32 pick_idle_cpu(struct task_struct *p,
         /*
          * For tasks that can run only on a single CPU, we can simply verify if
          * their only allowed CPU is still idle.
-         *
-         * Moreover, if local_kthreads is enabled, always allow to dispatch
-         * per-CPU kthreads directly to their target CPU, independently on the
-         * idle state.
          */
         if (p->nr_cpus_allowed == 1) {
-                if ((is_kthread(p) && local_kthreads) ||
-                    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
+                if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
                         return prev_cpu;
                 return -ENOENT;
         }
@@ -628,18 +614,9 @@ static s32 pick_idle_cpu(struct task_struct *p,
         /*
          * Determine the task's scheduling domain.
-         *
-         * Try to dispatch on the preferred CPUs first. If we can't find any
          * idle CPU, re-try again with the primary scheduling domain.
          */
-        if (do_preferred &&
-            !bpf_cpumask_empty(cast_mask(preferred)) &&
-            !bpf_cpumask_equal(cast_mask(preferred), cast_mask(primary))) {
-                bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(preferred));
-        } else {
-                bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(primary));
-                do_preferred = false;
-        }
+        bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(primary));
         /*
          * Determine the L2 cache domain as the intersection of the task's
@@ -730,15 +707,6 @@ static s32 pick_idle_cpu(struct task_struct *p,
                 goto out_put_cpumask;
         }
-        /*
-         * When considering the preferred domain (first idle CPU
-         * selection pass) try to stay on the same LLC.
-         */
-        if (do_preferred) {
-                cpu = -ENOENT;
-                goto out_put_cpumask;
-        }
         /*
          * Search for any other full-idle core in the primary domain.
          */
@@ -780,15 +748,6 @@ static s32 pick_idle_cpu(struct task_struct *p,
                 goto out_put_cpumask;
         }
-        /*
-         * When considering the preferred domain (first idle CPU selection
-         * pass) try to stay on the same LLC.
-         */
-        if (do_preferred) {
-                cpu = -ENOENT;
-                goto out_put_cpumask;
-        }
         /*
          * Search for any idle CPU in the scheduling domain.
          */
@@ -815,7 +774,7 @@ s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p, s32 prev_cpu, u64
 {
         s32 cpu;
-        cpu = pick_idle_cpu(p, prev_cpu, wake_flags, true);
+        cpu = pick_idle_cpu(p, prev_cpu, wake_flags);
         if (cpu >= 0 && !dispatch_direct_cpu(p, cpu, 0)) {
                 __sync_fetch_and_add(&nr_direct_dispatches, 1);
                 return cpu;
@@ -832,16 +791,18 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
 {
         struct bpf_cpumask *primary;
         u64 deadline = task_deadline(p);
-        s32 cpu, prev_cpu = scx_bpf_task_cpu(p);
+        s32 cpu;
         /*
-         * If we couldn't find an idle CPU in ops.select_cpu(), give the task
-         * another chance here to keep using the same CPU / cache / domain.
+         * If local_kthreads is enabled, always dispatch per-CPU kthreads
+         * directly to their target CPU.
          */
-        cpu = pick_idle_cpu(p, prev_cpu, 0, false);
-        if (cpu >= 0 && !dispatch_direct_cpu(p, cpu, 0)) {
-                __sync_fetch_and_add(&nr_direct_dispatches, 1);
-                return;
+        if (local_kthreads && is_kthread(p) && p->nr_cpus_allowed == 1) {
+                cpu = scx_bpf_task_cpu(p);
+                if (!dispatch_direct_cpu(p, cpu, enq_flags)) {
+                        __sync_fetch_and_add(&nr_direct_dispatches, 1);
+                        return;
+                }
         }
         /*
@@ -868,14 +829,9 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
          * task, wake them up to see whether they'd be able to steal the just
          * queued task.
          */
-        primary = primary_cpumask;
-        if (!primary)
-                return;
-        if (bpf_cpumask_subset(cast_mask(primary), p->cpus_ptr)) {
-                cpu = scx_bpf_pick_idle_cpu(cast_mask(primary), 0);
-                if (cpu >= 0)
-                        scx_bpf_kick_cpu(cpu, 0);
-        }
+        cpu = scx_bpf_pick_idle_cpu(cast_mask(p->cpus_ptr), 0);
+        if (cpu >= 0)
+                scx_bpf_kick_cpu(cpu, 0);
 }
 /*
@@ -1366,34 +1322,6 @@ int enable_sibling_cpu(struct domain_arg *input)
         return err;
 }
-SEC("syscall")
-int enable_preferred_cpu(struct cpu_arg *input)
-{
-        struct bpf_cpumask *mask;
-        int err = 0;
-        /* Make sure the primary CPU mask is initialized */
-        err = init_cpumask(&preferred_cpumask);
-        if (err)
-                return err;
-        /*
-         * Enable the target CPU in the preferred scheduling domain.
-         */
-        bpf_rcu_read_lock();
-        mask = preferred_cpumask;
-        if (mask) {
-                s32 cpu = input->cpu_id;
-                if (cpu < 0)
-                        bpf_cpumask_clear(mask);
-                else
-                        bpf_cpumask_set_cpu(cpu, mask);
-        }
-        bpf_rcu_read_unlock();
-        return err;
-}
 SEC("syscall")
 int enable_primary_cpu(struct cpu_arg *input)
 {
@@ -1481,11 +1409,6 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
         if (err)
                 return err;
-        /* Initialize the preferred scheduling domain */
-        err = init_cpumask(&preferred_cpumask);
-        if (err)
-                return err;
         return 0;
 }


@@ -52,7 +52,6 @@ const SCHEDULER_NAME: &'static str = "scx_bpfland";
 #[derive(PartialEq)]
 enum Powermode {
-    Turbo,
     Performance,
     Powersave,
     Any,
@@ -66,8 +65,6 @@ fn get_primary_cpus(mode: Powermode) -> std::io::Result<Vec<usize>> {
         .into_iter()
         .flat_map(|core| core.cpus())
         .filter_map(|(cpu_id, cpu)| match (&mode, &cpu.core_type) {
-            // Turbo mode: only add turbo-boosted CPUs
-            (Powermode::Turbo, CoreType::Big { turbo: true }) |
             // Performance mode: add all the Big CPUs (either Turbo or non-Turbo)
             (Powermode::Performance, CoreType::Big { .. }) |
             // Powersave mode: add all the Little CPUs
@@ -158,15 +155,6 @@ struct Opts {
     #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
     local_kthreads: bool,
-    /// Specifies a group of preferred CPUs, represented as a bitmask in hex (e.g., 0xff), that the
-    /// scheduler will try to prioritize to dispatch tasks.
-    ///
-    /// Special values:
-    /// - "auto" = automaticlly detect the fastest CPUs based on the current scheduler and system
-    ///   energy profiles.
-    #[clap(short = 'M', long, default_value = "auto")]
-    preferred_domain: String,
     /// Specifies the initial set of CPUs, represented as a bitmask in hex (e.g., 0xff), that the
     /// scheduler will use to dispatch tasks, until the system becomes saturated, at which point
     /// tasks may overflow to other available CPUs.
@@ -288,9 +276,6 @@ impl<'a> Scheduler<'a> {
         // Initialize the primary scheduling domain and the preferred domain.
         let energy_profile = Self::read_energy_profile();
-        if let Err(err) = Self::init_preferred_domain(&mut skel, &opts.preferred_domain) {
-            warn!("failed to initialize preferred domain: error {}", err);
-        }
         if let Err(err) = Self::init_energy_domain(&mut skel, &opts.primary_domain, &energy_profile)
         {
             warn!("failed to initialize primary domain: error {}", err);
@@ -372,28 +357,6 @@ impl<'a> Scheduler<'a> {
         res.unwrap_or_else(|_: String| "none".to_string())
     }
-    fn enable_preferred_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
-        let prog = &mut skel.progs.enable_preferred_cpu;
-        let mut args = cpu_arg {
-            cpu_id: cpu as c_int,
-        };
-        let input = ProgramInput {
-            context_in: Some(unsafe {
-                std::slice::from_raw_parts_mut(
-                    &mut args as *mut _ as *mut u8,
-                    std::mem::size_of_val(&args),
-                )
-            }),
-            ..Default::default()
-        };
-        let out = prog.test_run(input).unwrap();
-        if out.return_value != 0 {
-            return Err(out.return_value);
-        }
-        Ok(())
-    }
     fn epp_to_cpumask(profile: Powermode) -> Result<Cpumask> {
         let mut cpus = get_primary_cpus(profile).unwrap_or(Vec::new());
         if cpus.is_empty() {
@@ -402,32 +365,6 @@ impl<'a> Scheduler<'a> {
         Cpumask::from_str(&cpus_to_cpumask(&cpus))
     }
-    fn init_preferred_domain(skel: &mut BpfSkel<'_>, preferred_domain: &String) -> Result<()> {
-        let domain = match preferred_domain.as_str() {
-            "auto" => Self::epp_to_cpumask(Powermode::Turbo)?,
-            &_ => Cpumask::from_str(&preferred_domain)?,
-        };
-        info!("preferred CPU domain = 0x{:x}", domain);
-        // Clear the preferred domain by passing a negative CPU id.
-        if let Err(err) = Self::enable_preferred_cpu(skel, -1) {
-            warn!("failed to reset preferred domain: error {}", err);
-        }
-        for cpu in 0..*NR_CPU_IDS {
-            if domain.test_cpu(cpu) {
-                if let Err(err) = Self::enable_preferred_cpu(skel, cpu as i32) {
-                    warn!(
-                        "failed to add CPU {} to preferred domain: error {}",
-                        cpu, err
-                    );
-                }
-            }
-        }
-        Ok(())
-    }
     fn init_energy_domain(
         skel: &mut BpfSkel<'_>,
         primary_domain: &String,
@@ -504,11 +441,6 @@ impl<'a> Scheduler<'a> {
         self.energy_profile = energy_profile.clone();
         if self.opts.primary_domain == "auto" {
-            if let Err(err) =
-                Self::init_preferred_domain(&mut self.skel, &self.opts.preferred_domain)
-            {
-                warn!("failed to refresh preferred domain: error {}", err);
-            }
             if let Err(err) = Self::init_energy_domain(
                 &mut self.skel,
                 &self.opts.primary_domain,