scx_bpfland: introduce Intel Turbo Boost awareness

Make `--primary-domain auto` aware of turbo boosted CPUs and prioritize
them over the primary scheduling domain when the energy model
`balance_power` is used (typically when running on battery power with
the "balanced" profile).

With this change the scheduling hierarchy becomes the following:

 1) CPUs in the turbo scheduling domain
 2) CPUs in the primary scheduling domain
 3) full-idle SMT CPUs
 4) CPUs in the same L2 cache
 5) CPUs in the same L3 cache
 6) CPUs in the task's allowed domain

And the idle selection logic is modified as follows:

 - In the turbo scheduling domain:
   - pick same full-idle SMT CPU
   - pick any other full-idle SMT CPU sharing the same L2 cache
   - pick any other full-idle SMT CPU sharing the same L3 cache
   - pick any other full-idle SMT CPU
   - pick same idle CPU
   - pick any other idle CPU sharing the same L2 cache
   - pick any other idle CPU sharing the same L3 cache
   - pick any other idle SMT CPU
 - In the primary scheduling domain:
   - pick same full-idle SMT CPU
   - pick any other full-idle SMT CPU sharing the same L2 cache
   - pick any other full-idle SMT CPU sharing the same L3 cache
   - pick any other full-idle SMT CPU
   - pick same idle CPU
   - pick any other idle CPU sharing the same L2 cache
   - pick any other idle CPU sharing the same L3 cache
   - pick any other idle SMT CPU
 - In the entire task domain:
   - pick any other idle CPU

Keep in mind that the turbo domain will be evaluated only when the
scheduler is started with `--primary-domain auto` and only when the
`balance_power` energy profile is used.

The turbo domain is always made using the subset of CPUs in the system
with the highest max frequency. If such a subset can't be determined (for
example if all the CPUs in the primary domain have the same
frequency), the turbo domain will be ignored.

Prioritizing turbo boosted CPUs can help to improve performance by
forcing the governor to scale up their frequency, without increasing
power consumption too much, because tasks will preferably be
confined to a smaller number of cores.

This change seems to improve performance, without increasing much
power consumption, on Intel laptops while using the `balance_power`
energy profile.

Signed-off-by: Andrea Righi <andrea.righi@linux.dev>
This commit is contained in:
Andrea Righi 2024-08-22 15:33:05 +02:00
parent d958dd4482
commit 50684e4569
2 changed files with 177 additions and 45 deletions

View File

@ -133,6 +133,11 @@ UEI_DEFINE(uei);
*/
private(BPFLAND) struct bpf_cpumask __kptr *primary_cpumask;
/*
* Mask of turbo boosted CPUs in the system.
*/
private(BPFLAND) struct bpf_cpumask __kptr *turbo_cpumask;
/*
* Mask of offline CPUs, used to properly support CPU hotplugging.
*/
@ -505,10 +510,11 @@ static int dispatch_direct_cpu(struct task_struct *p, s32 cpu, u64 enq_flags)
static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
const struct cpumask *online_cpumask, *idle_smtmask, *idle_cpumask;
struct bpf_cpumask *primary, *l2_domain, *l3_domain;
struct bpf_cpumask *primary, *turbo, *l2_domain, *l3_domain;
struct bpf_cpumask *p_mask, *l2_mask, *l3_mask;
struct task_ctx *tctx;
struct cpu_ctx *cctx;
bool do_turbo = true;
s32 cpu;
tctx = try_lookup_task_ctx(p);
@ -534,6 +540,9 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
primary = primary_cpumask;
if (!primary)
return -ENOENT;
turbo = turbo_cpumask;
if (!turbo)
return -ENOENT;
/*
* Acquire the CPU masks to determine the online and idle CPUs in the
@ -552,7 +561,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
l3_domain = cctx->l3_cpumask;
if (!l3_domain)
l3_domain = primary;
retry:
/*
* Task's scheduling domains.
*/
@ -576,10 +585,17 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
}
/*
* Determine the task's primary domain as the intersection of the
* task's allowed cpumask and the global primary scheduling domain.
* Determine the task's scheduling domain.
*
* Try to dispatch on the turbo boosted CPUs first. If we can't find
* any idle CPU, re-try again with the primary scheduling domain.
*/
bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(primary));
if (do_turbo && !bpf_cpumask_equal(cast_mask(turbo), cast_mask(primary))) {
bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(turbo));
} else {
bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(primary));
do_turbo = false;
}
/*
* Determine the L2 cache domain as the intersection of the task's
@ -682,13 +698,22 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
}
/*
* Search for any idle CPU in the primary domain.
* Search for any idle CPU in the scheduling domain.
*/
cpu = bpf_cpumask_any_and_distribute(cast_mask(p_mask), idle_cpumask);
if (bpf_cpumask_test_cpu(cpu, online_cpumask) &&
scx_bpf_test_and_clear_cpu_idle(cpu))
goto out_put_cpumask;
/*
* If we were looking for an idle CPU in the turbo domain and we
* couldn't find any, re-try again with the whole primary domain.
*/
if (do_turbo) {
do_turbo = false;
goto retry;
}
/*
* If all the previous attempts have failed, try to use any idle CPU in
* the system.
@ -1239,6 +1264,34 @@ int enable_sibling_cpu(struct domain_arg *input)
return err;
}
/*
 * Syscall program used to update the turbo boost scheduling domain
 * (turbo_cpumask), one CPU at a time.
 *
 * @input->cpu_id: CPU to add to the turbo domain; a negative value clears
 *                 the whole domain instead.
 *
 * Return 0 on success, or the error reported by init_cpumask() if the mask
 * could not be initialized. If the mask pointer reads back as NULL after
 * initialization, nothing is changed and 0 is still returned.
 */
SEC("syscall")
int enable_turbo_cpu(struct cpu_arg *input)
{
struct bpf_cpumask *mask;
int err = 0;
/* Make sure the turbo CPU mask is initialized */
err = init_cpumask(&turbo_cpumask);
if (err)
return err;
/*
 * Enable the target CPU in the turbo boost scheduling domain.
 *
 * The mask pointer is read and modified inside the RCU read-side
 * section delimited by bpf_rcu_read_lock()/bpf_rcu_read_unlock().
 */
bpf_rcu_read_lock();
mask = turbo_cpumask;
if (mask) {
s32 cpu = input->cpu_id;
/* A negative CPU id is the "reset" request: drop every CPU */
if (cpu < 0)
bpf_cpumask_clear(mask);
else
bpf_cpumask_set_cpu(cpu, mask);
}
bpf_rcu_read_unlock();
return err;
}
SEC("syscall")
int enable_primary_cpu(struct cpu_arg *input)
{
@ -1326,6 +1379,11 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
if (err)
return err;
/* Initialize the turbo boost scheduling domain */
err = init_cpumask(&turbo_cpumask);
if (err)
return err;
return 0;
}

View File

@ -51,15 +51,24 @@ use scx_utils::NR_CPU_IDS;
const SCHEDULER_NAME: &'static str = "scx_bpfland";
fn get_primary_cpus(powersave: bool) -> std::io::Result<Vec<usize>> {
/// CPU selection policy understood by `get_primary_cpus`.
///
/// The enum is a plain, field-less tag, so it derives `Copy`/`Clone` (cheap to pass by value),
/// `Eq` alongside `PartialEq` (comparison is total) and `Debug` (usable in logs and assertions).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Powermode {
    /// Select the CPUs with the smallest base frequency.
    Powersave,
    /// Select the CPUs whose base frequency is greater than the minimum.
    Performance,
    /// Select the CPUs with the highest maximum frequency.
    Turbo,
}
fn get_primary_cpus(mode: Powermode) -> std::io::Result<Vec<usize>> {
let topo = Topology::new().unwrap();
// Iterate over each CPU directory and collect CPU ID and its base operational frequency to
// distinguish between fast and slow cores.
let mut cpu_freqs = Vec::new();
let mut max_cpu_freqs = Vec::new();
for core in topo.cores().into_iter() {
for (cpu_id, cpu) in core.cpus() {
cpu_freqs.push((*cpu_id, cpu.base_freq()));
max_cpu_freqs.push((*cpu_id, cpu.max_freq()));
}
}
if cpu_freqs.is_empty() {
@ -69,26 +78,35 @@ fn get_primary_cpus(powersave: bool) -> std::io::Result<Vec<usize>> {
// Find the smallest maximum frequency.
let min_freq = cpu_freqs.iter().map(|&(_, freq)| freq).min().unwrap();
// Find the highest maximum frequency.
let max_freq = max_cpu_freqs.iter().map(|&(_, freq)| freq).max().unwrap();
// Check if all CPUs have the smallest frequency.
let all_have_min_freq = cpu_freqs.iter().all(|&(_, freq)| freq == min_freq);
let selected_cpu_ids: Vec<usize> = if all_have_min_freq {
// If all CPUs have the smallest frequency, return all CPU IDs.
cpu_freqs.into_iter().map(|(cpu_id, _)| cpu_id).collect()
} else if powersave {
// If powersave is true, return the CPUs with the smallest frequency.
let selected_cpu_ids: Vec<usize> = if mode == Powermode::Turbo {
// Turbo: return the CPUs with the highest max frequency.
max_cpu_freqs
.into_iter()
.filter(|&(_, freq)| freq == max_freq)
.map(|(cpu_id, _)| cpu_id)
.collect()
} else if all_have_min_freq || mode == Powermode::Powersave {
// Powersave: return the CPUs with the smallest base frequency.
cpu_freqs
.into_iter()
.filter(|&(_, freq)| freq == min_freq)
.map(|(cpu_id, _)| cpu_id)
.collect()
} else {
// If powersave is false, return the CPUs with the highest frequency.
} else if mode == Powermode::Performance {
// Performance: return the CPUs with a base frequency greater than the minimum.
cpu_freqs
.into_iter()
.filter(|&(_, freq)| freq != min_freq)
.filter(|&(_, freq)| freq > min_freq)
.map(|(cpu_id, _)| cpu_id)
.collect()
} else {
Vec::new()
};
Ok(selected_cpu_ids)
@ -126,11 +144,11 @@ fn cpus_to_cpumask(cpus: &Vec<usize>) -> String {
fn parse_cpumask(cpu_str: &str) -> Result<Cpumask, anyhow::Error> {
match cpu_str {
"powersave" => {
let cpus = get_primary_cpus(true).unwrap();
let cpus = get_primary_cpus(Powermode::Powersave).unwrap();
Cpumask::from_str(&cpus_to_cpumask(&cpus))
}
"performance" => {
let cpus = get_primary_cpus(false).unwrap();
let cpus = get_primary_cpus(Powermode::Performance).unwrap();
Cpumask::from_str(&cpus_to_cpumask(&cpus))
}
"auto" => {
@ -345,6 +363,9 @@ impl<'a> Scheduler<'a> {
// Initialize the primary scheduling domain (based on the --primary-domain option).
let energy_profile = Self::read_energy_profile();
if let Err(err) = Self::init_turbo_domain(&mut skel, &opts.primary_domain, &energy_profile) {
warn!("failed to initialize turbo domain: error {}", err);
}
if let Err(err) = Self::init_energy_domain(&mut skel, &opts.primary_domain, &energy_profile) {
warn!("failed to initialize primary domain: error {}", err);
}
@ -401,28 +422,6 @@ impl<'a> Scheduler<'a> {
Ok(())
}
fn init_primary_domain(
skel: &mut BpfSkel<'_>,
primary_domain: &Cpumask,
) -> Result<()> {
info!("primary CPU domain = 0x{:x}", primary_domain);
// Clear the primary domain by passing a negative CPU id.
if let Err(err) = Self::enable_primary_cpu(skel, -1) {
warn!("failed to reset primary domain: error {}", err);
}
// Update primary scheduling domain.
for cpu in 0..*NR_CPU_IDS {
if primary_domain.test_cpu(cpu) {
if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
warn!("failed to add CPU {} to primary domain: error {}", cpu, err);
}
}
}
Ok(())
}
fn read_energy_profile() -> String {
let res = File::open("/sys/devices/system/cpu/cpufreq/policy0/energy_performance_preference")
.and_then(|mut file| {
@ -434,11 +433,32 @@ impl<'a> Scheduler<'a> {
res.unwrap_or_else(|_| "none".to_string())
}
fn init_energy_domain(skel: &mut BpfSkel<'_>, primary_domain: &Cpumask, energy_profile: &String) -> Result<()> {
/// Run the `enable_turbo_cpu` BPF syscall program for a single CPU id.
///
/// `cpu` is forwarded as `cpu_arg::cpu_id`; the BPF side adds that CPU to the turbo domain, or
/// clears the whole domain when the id is negative.
///
/// Returns `Err` carrying the program's return value when that value is non-zero.
///
/// # Panics
///
/// Panics if the program run itself fails (`test_run` returns an error).
fn enable_turbo_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
    let mut arg = cpu_arg {
        cpu_id: cpu as c_int,
    };
    // SAFETY: `arg` is a local that stays alive for the whole `test_run` call below, and the
    // slice spans exactly `size_of_val(&arg)` bytes starting at its address.
    // NOTE(review): presumably `cpu_arg` is a plain C struct with no padding — confirm.
    let ctx = unsafe {
        std::slice::from_raw_parts_mut(&mut arg as *mut _ as *mut u8, std::mem::size_of_val(&arg))
    };
    let input = ProgramInput {
        context_in: Some(ctx),
        ..Default::default()
    };
    let prog = &mut skel.progs.enable_turbo_cpu;
    match prog.test_run(input).unwrap().return_value {
        0 => Ok(()),
        rc => Err(rc),
    }
}
fn init_turbo_domain(skel: &mut BpfSkel<'_>, primary_domain: &Cpumask, energy_profile: &String) -> Result<()> {
let domain = if primary_domain.is_empty() {
let cpus = match energy_profile.as_str() {
"power" => get_primary_cpus(true).unwrap_or(Vec::new()),
"performance" => get_primary_cpus(false).unwrap_or(Vec::new()),
"balance_power" => get_primary_cpus(Powermode::Turbo).unwrap_or(Vec::new()),
&_ => Vec::new(),
};
if cpus.is_empty() {
@ -451,17 +471,71 @@ impl<'a> Scheduler<'a> {
} else {
primary_domain.clone()
};
Self::init_primary_domain(skel, &domain)?;
info!("Turbo CPU domain = 0x{:x}", domain);
// Clear the turbo domain by passing a negative CPU id.
if let Err(err) = Self::enable_turbo_cpu(skel, -1) {
warn!("failed to reset primary domain: error {}", err);
}
for cpu in 0..*NR_CPU_IDS {
if domain.test_cpu(cpu) {
if let Err(err) = Self::enable_turbo_cpu(skel, cpu as i32) {
warn!("failed to add CPU {} to turbo domain: error {}", cpu, err);
}
}
}
Ok(())
}
fn refresh_energy_domain(&mut self) {
/// Program the primary scheduling domain into the BPF side.
///
/// When `primary_domain` is non-empty it is used verbatim. Otherwise the domain is derived from
/// `energy_profile`: `"power"` selects the powersave CPUs, while `"balance_power"`,
/// `"balance_performance"` and `"performance"` select the performance CPUs. Any other profile,
/// a failed lookup, or an empty selection falls back to a mask with every CPU set.
///
/// Failures to reset the domain or to add an individual CPU are logged and do not abort the
/// update.
///
/// # Errors
///
/// Returns an error only if the fallback or derived `Cpumask` cannot be built.
fn init_energy_domain(
    skel: &mut BpfSkel<'_>,
    primary_domain: &Cpumask,
    energy_profile: &String,
) -> Result<()> {
    let domain = if !primary_domain.is_empty() {
        primary_domain.clone()
    } else {
        // Map the energy profile to a CPU selection mode; unknown profiles select nothing.
        let mode = match energy_profile.as_str() {
            "power" => Some(Powermode::Powersave),
            "balance_power" | "balance_performance" | "performance" => Some(Powermode::Performance),
            _ => None,
        };
        let selected = mode
            .and_then(|mode| get_primary_cpus(mode).ok())
            .unwrap_or_default();
        if selected.is_empty() {
            let mut all_cpus = Cpumask::new()?;
            all_cpus.setall();
            all_cpus
        } else {
            Cpumask::from_str(&cpus_to_cpumask(&selected))?
        }
    };

    info!("primary CPU domain = 0x{:x}", domain);

    // A negative CPU id asks the BPF side to clear the primary domain.
    if let Err(err) = Self::enable_primary_cpu(skel, -1) {
        warn!("failed to reset primary domain: error {}", err);
    }

    // Re-populate the primary domain with every CPU set in the chosen mask.
    for cpu in (0..*NR_CPU_IDS).filter(|&cpu| domain.test_cpu(cpu)) {
        if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
            warn!("failed to add CPU {} to primary domain: error {}", cpu, err);
        }
    }

    Ok(())
}
fn refresh_sched_domain(&mut self) {
if self.opts.primary_domain.is_empty() {
let energy_profile = Self::read_energy_profile();
if energy_profile != self.energy_profile {
self.energy_profile = energy_profile.clone();
if let Err(err) = Self::init_turbo_domain(&mut self.skel, &self.opts.primary_domain, &energy_profile) {
warn!("failed to refresh turbo domain: error {}", err);
}
if let Err(err) = Self::init_energy_domain(&mut self.skel, &self.opts.primary_domain, &energy_profile) {
warn!("failed to refresh primary domain: error {}", err);
}
@ -634,7 +708,7 @@ impl<'a> Scheduler<'a> {
fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
while !shutdown.load(Ordering::Relaxed) && !self.exited() {
self.refresh_cache_domains();
self.refresh_energy_domain();
self.refresh_sched_domain();
self.update_stats();
std::thread::sleep(Duration::from_millis(1000));
}