Merge pull request #591 from multics69/lavd-turbo3

scx_lavd: introduce "autopilot" mode and misc. optimization & bug fix
This commit is contained in:
Changwoo Min 2024-08-31 02:14:35 +09:00 committed by GitHub
commit 4d8bf870a1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 194 additions and 54 deletions

View File

@ -89,6 +89,10 @@ enum consts {
LAVD_CC_CPU_PIN_INTERVAL_DIV = (LAVD_CC_CPU_PIN_INTERVAL /
LAVD_SYS_STAT_INTERVAL_NS),
LAVD_AP_LOW_UTIL = 50, /* powersave mode when cpu util <= 5% */
LAVD_AP_HIGH_UTIL = 300, /* balanced mode when 5% < cpu util <= 30%,
performance mode when cpu util > 30% */
LAVD_CPDOM_MAX_NR = 32, /* maximum number of compute domain */
LAVD_CPDOM_MAX_DIST = 4, /* maximum distance from one compute domain to another */
LAVD_CPDOM_STARV_NS = (5ULL * NSEC_PER_MSEC),
@ -299,6 +303,8 @@ enum {
LAVD_PM_PERFORMANCE = 0,
LAVD_PM_BALANCED = 1,
LAVD_PM_POWERSAVE = 2,
LAVD_PM_MAX = 3
};
struct power_arg {

View File

@ -229,10 +229,11 @@ static u64 cur_svc_time;
/*
* Options
*/
volatile bool no_core_compaction;
volatile bool no_freq_scaling;
volatile bool no_prefer_turbo_core;
volatile bool is_powersave_mode;
volatile bool no_core_compaction;
volatile bool no_freq_scaling;
volatile bool no_prefer_turbo_core;
volatile bool is_powersave_mode;
const volatile bool is_autopilot_on;
const volatile u32 is_smt_active;
const volatile u8 verbose;
@ -315,7 +316,7 @@ struct {
} introspec_msg SEC(".maps");
static u16 get_nice_prio(struct task_struct *p);
static void adjust_slice_boost(struct cpu_ctx *cpuc, struct task_ctx *taskc);
static int reinit_active_cpumask_for_performance(void);
static u64 sigmoid_u64(u64 v, u64 max)
{
@ -593,6 +594,7 @@ static void init_sys_stat_ctx(struct sys_stat_ctx *c)
c->stat_next = get_sys_stat_next();
c->now = bpf_ktime_get_ns();
c->duration = c->now - c->stat_cur->last_update_clk;
c->stat_next->last_update_clk = c->now;
}
static void collect_sys_stat(struct sys_stat_ctx *c)
@ -649,7 +651,7 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
bool ret = __sync_bool_compare_and_swap(
&cpuc->idle_start_clk, old_clk, c->now);
if (ret) {
c->idle_total += c->now - old_clk;
cpuc->idle_total += c->now - old_clk;
break;
}
}
@ -660,6 +662,7 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
u64 compute = 0;
if (c->duration > cpuc->idle_total)
compute = c->duration - cpuc->idle_total;
c->new_util = (compute * LAVD_CPU_UTIL_MAX) / c->duration;
cpuc->util = calc_avg(cpuc->util, c->new_util);
@ -691,9 +694,9 @@ static void calc_sys_stat(struct sys_stat_ctx *c)
c->duration_total = c->duration * nr_cpus_onln;
if (c->duration_total > c->idle_total)
c->compute_total = c->duration_total - c->idle_total;
c->new_util = (c->compute_total * LAVD_CPU_UTIL_MAX) /
c->duration_total;
else
c->compute_total = 0;
c->new_util = (c->compute_total * LAVD_CPU_UTIL_MAX)/c->duration_total;
if (c->sched_nr == 0) {
/*
@ -757,7 +760,6 @@ static void do_update_sys_stat(void)
/*
* Make the next version atomically visible.
*/
c.stat_next->last_update_clk = c.now;
flip_sys_stat();
}
@ -772,7 +774,7 @@ static u64 calc_nr_active_cpus(struct sys_stat *stat_cur)
nr_active /= (LAVD_CC_PER_CORE_MAX_CTUIL * 1000);
/*
* If a few CPUs are particularly busy, boost the overflow CPUs by 2x.
* If a few CPUs are particularly busy, boost the active CPUs more.
*/
nr_active += min(LAVD_CC_NR_OVRFLW, (stat_cur->nr_violation) / 1000);
nr_active = max(min(nr_active, nr_cpus_onln),
@ -900,10 +902,91 @@ unlock_out:
bpf_rcu_read_unlock();
}
/*
 * Switch the scheduler's global knobs to the requested power mode.
 *
 * @power_mode: one of LAVD_PM_PERFORMANCE / LAVD_PM_BALANCED /
 *              LAVD_PM_POWERSAVE.
 * @util: current CPU utilization; used only for debug logging.
 *
 * Returns 0 on success (including when the mode is already active),
 * -EINVAL for an unknown mode.
 */
int do_set_power_profile(s32 power_mode, int util)
{
	/*
	 * The last successfully applied mode. LAVD_PM_MAX means no mode has
	 * been applied yet, so the very first request always takes effect.
	 */
	static s32 cur_mode = LAVD_PM_MAX;

	/*
	 * Skip setting the mode if already in the same mode.
	 */
	if (cur_mode == power_mode)
		return 0;

	/*
	 * Change the power mode.
	 */
	switch (power_mode) {
	case LAVD_PM_PERFORMANCE:
		no_core_compaction = true;
		no_freq_scaling = true;
		no_prefer_turbo_core = false;
		is_powersave_mode = false;

		/*
		 * Since the core compaction becomes off, we need to
		 * reinitialize the active and overflow cpumask for performance
		 * mode.
		 */
		reinit_active_cpumask_for_performance();
		debugln("Set the scheduler's power profile to performance mode: %d", util);
		break;
	case LAVD_PM_BALANCED:
		no_core_compaction = false;
		no_freq_scaling = false;
		no_prefer_turbo_core = false;
		is_powersave_mode = false;
		debugln("Set the scheduler's power profile to balanced mode: %d", util);
		break;
	case LAVD_PM_POWERSAVE:
		no_core_compaction = false;
		no_freq_scaling = false;
		no_prefer_turbo_core = true;
		is_powersave_mode = true;
		debugln("Set the scheduler's power profile to power-save mode: %d", util);
		break;
	default:
		return -EINVAL;
	}

	/*
	 * Cache the mode only after it has been successfully applied.
	 * Caching before validation (as before) let an invalid power_mode
	 * poison the cache, so a repeated identical invalid request would
	 * wrongly return 0 instead of -EINVAL.
	 */
	cur_mode = power_mode;
	return 0;
}
static int do_autopilot(void)
{
struct sys_stat *stat_cur = get_sys_stat_cur();
/*
* If the CPU utiulization is very low (say <= 5%), it means high
* performance is not required. We run the scheduler in powersave mode
* to save energy consumption.
*/
if (stat_cur->util <= LAVD_AP_LOW_UTIL)
return do_set_power_profile(LAVD_PM_POWERSAVE, stat_cur->util);
/*
* If the CPU utiulization is moderate (say > 5%, <= 30%), we run the
* scheduler in balanced mode. Actually, balanced mode can save energy
* consumption only under moderate CPU load.
*/
if (stat_cur->util <= LAVD_AP_HIGH_UTIL)
return do_set_power_profile(LAVD_PM_BALANCED, stat_cur->util);
/*
* If the CPU utilization is high enough (say > 30%), we run the
* scheduler in performance mode. The system indeed needs perrformance
* also there is little energy benefit even under balanced mode anyway.
*/
return do_set_power_profile(LAVD_PM_PERFORMANCE, stat_cur->util);
}
/*
 * Periodic statistics update: refresh the system-wide stats, then let
 * the autopilot (if enabled) re-evaluate the power profile, and finally
 * run core compaction unless it has been turned off (performance mode
 * sets no_core_compaction = true).
 */
static void update_sys_stat(void)
{
	do_update_sys_stat();

	/* In autopilot mode, re-pick the power profile on every update. */
	if (is_autopilot_on)
		do_autopilot();

	if (!no_core_compaction)
		do_core_compaction();
}
@ -1356,11 +1439,23 @@ static s32 pick_idle_cpu(struct task_struct *p, struct task_ctx *taskc,
struct bpf_cpumask *active, *ovrflw, *big, *little, *cpdom_mask_prev;
s32 cpu_id;
bpf_rcu_read_lock();
/*
* If a task can run only on a single CPU (e.g., per-CPU kworker), we
* simply check if a task is still pinned on the prev_cpu and go.
*/
if (p->nr_cpus_allowed == 1 &&
bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr)) {
if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
*is_idle = true;
cpu_id = prev_cpu;
goto out;
}
/*
* Prepare cpumasks.
*/
bpf_rcu_read_lock();
cpuc = get_cpu_ctx();
cpuc_prev = get_cpu_ctx_id(prev_cpu);
if (!cpuc || !cpuc_prev || !taskc) {
@ -1515,6 +1610,7 @@ start_any_mask:
*/
unlock_out:
bpf_rcu_read_unlock();
out:
return cpu_id;
}
@ -2852,6 +2948,50 @@ static u16 get_cputurbo_cap(void)
return turbo_cap;
}
/*
 * Re-populate the active/overflow cpumasks for performance mode: with
 * core compaction turned off, every online CPU should be usable, so big
 * cores are added to the active mask and the rest to the overflow mask.
 *
 * Returns 0 on success, -ENOMEM if the global cpumasks are not ready,
 * or -ESRCH if a per-CPU context lookup fails.
 */
static int reinit_active_cpumask_for_performance(void)
{
	struct cpu_ctx *cpuc;
	struct bpf_cpumask *active, *ovrflw;
	int cpu, err = 0;

	/*
	 * NOTE(review): barrier() here presumably orders the caller's knob
	 * writes (e.g., no_core_compaction) before the mask updates — confirm.
	 */
	barrier();

	bpf_rcu_read_lock();

	/*
	 * Prepare cpumasks.
	 */
	active = active_cpumask;
	ovrflw = ovrflw_cpumask;
	if (!active || !ovrflw) {
		scx_bpf_error("Failed to prepare cpumasks.");
		err = -ENOMEM;
		goto unlock_out;
	}

	/*
	 * Once core compaction becomes off in performance mode,
	 * reinitialize active/overflow cpumasks to reflect the mode change.
	 */
	bpf_for(cpu, 0, nr_cpus_onln) {
		cpuc = get_cpu_ctx_id(cpu);
		if (!cpuc) {
			scx_bpf_error("Failed to lookup cpu_ctx: %d", cpu);
			err = -ESRCH;
			goto unlock_out;
		}

		/* Big cores become active; little cores go to overflow. */
		if (cpuc->big_core)
			bpf_cpumask_set_cpu(cpu, active);
		else
			bpf_cpumask_set_cpu(cpu, ovrflw);
	}

unlock_out:
	bpf_rcu_read_unlock();
	return err;
}
static s32 init_per_cpu_ctx(u64 now)
{
struct cpu_ctx *cpuc;
@ -3081,30 +3221,7 @@ void BPF_STRUCT_OPS(lavd_exit, struct scx_exit_info *ei)
SEC("syscall")
int set_power_profile(struct power_arg *input)
{
switch (input->power_mode) {
case LAVD_PM_PERFORMANCE:
no_core_compaction = true;
no_freq_scaling = true;
no_prefer_turbo_core = false;
is_powersave_mode = false;
break;
case LAVD_PM_BALANCED:
no_core_compaction = false;
no_freq_scaling = false;
no_prefer_turbo_core = false;
is_powersave_mode = false;
break;
case LAVD_PM_POWERSAVE:
no_core_compaction = false;
no_freq_scaling = false;
no_prefer_turbo_core = true;
is_powersave_mode = true;
break;
default:
return -EINVAL;
}
return 0;
return do_set_power_profile(input->power_mode, 0);
}
SCX_OPS_DEFINE(lavd_ops,

View File

@ -72,9 +72,14 @@ use rlimit::{getrlimit, setrlimit, Resource};
/// See the more detailed overview of the LAVD design at main.bpf.c.
#[derive(Debug, Parser)]
struct Opts {
/// Automatically decide the power mode based on the current energy profile.
#[clap(long = "auto", action = clap::ArgAction::SetTrue)]
auto: bool,
/// Automatically decide the scheduler's power mode based on system load.
/// This is a recommended mode if you don't understand the following options:
#[clap(long = "autopilot", action = clap::ArgAction::SetTrue)]
autopilot: bool,
/// Automatically decide the scheduler's power mode based on the system's energy profile.
#[clap(long = "autopower", action = clap::ArgAction::SetTrue)]
autopower: bool,
/// Run in performance mode to get maximum performance.
#[clap(long = "performance", action = clap::ArgAction::SetTrue)]
@ -551,6 +556,7 @@ impl<'a> Scheduler<'a> {
Ok(ret) => (ret == 1) as u32,
Err(_) => 0,
};
skel.maps.rodata_data.is_autopilot_on = opts.autopilot;
skel.maps.rodata_data.verbose = opts.verbose;
}
@ -692,32 +698,43 @@ impl<'a> Scheduler<'a> {
res.unwrap_or_else(|_| "none".to_string())
}
fn update_power_profile(&mut self) -> bool {
fn update_power_profile(&mut self, prev_profile: String) -> (bool, String) {
const LAVD_PM_PERFORMANCE: s32 = 0;
const LAVD_PM_BALANCED: s32 = 1;
const LAVD_PM_POWERSAVE: s32 = 2;
let profile = Self::read_energy_profile();
if profile == "performance" {
let _ = self.set_power_profile(LAVD_PM_PERFORMANCE);
} else if profile == "balance_performance" {
let _ = self.set_power_profile(LAVD_PM_BALANCED);
} else if profile == "power" {
let _ = self.set_power_profile(LAVD_PM_POWERSAVE);
} else {
return false;
if profile == prev_profile {
// If the profile is the same, skip updating the profile for BPF.
return (true, profile);
}
true
if profile == "performance" {
let _ = self.set_power_profile(LAVD_PM_PERFORMANCE);
info!("Set the scheduler's power profile to performance mode.");
} else if profile == "balance_performance" {
let _ = self.set_power_profile(LAVD_PM_BALANCED);
info!("Set the scheduler's power profile to balanced mode.");
} else if profile == "power" {
let _ = self.set_power_profile(LAVD_PM_POWERSAVE);
info!("Set the scheduler's power profile to power-save mode.");
} else {
// We don't know how to handle an unknown energy profile,
// so we just give up updating the profile from now on.
return (false, profile);
}
(true, profile)
}
fn run(&mut self, auto: bool, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
fn run(&mut self, autopower: bool, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
let (res_ch, req_ch) = self.stats_server.channels();
let mut auto = auto;
let mut autopower = autopower;
let mut profile = "unknown".to_string();
while !shutdown.load(Ordering::Relaxed) && !self.exited() {
if auto {
auto = self.update_power_profile();
if autopower {
(autopower, profile) = self.update_power_profile(profile);
}
match req_ch.recv_timeout(Duration::from_secs(1)) {
@ -799,7 +816,7 @@ fn main() -> Result<()> {
*build_id::SCX_FULL_VERSION
);
info!("scx_lavd scheduler starts running.");
if !sched.run(opts.auto, shutdown.clone())?.should_restart() {
if !sched.run(opts.autopower, shutdown.clone())?.should_restart() {
break;
}
}