Merge pull request #591 from multics69/lavd-turbo3

scx_lavd: introduce "autopilot" mode and misc. optimization & bug fix
This commit is contained in:
Changwoo Min 2024-08-31 02:14:35 +09:00 committed by GitHub
commit 4d8bf870a1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 194 additions and 54 deletions

View File

@ -89,6 +89,10 @@ enum consts {
LAVD_CC_CPU_PIN_INTERVAL_DIV = (LAVD_CC_CPU_PIN_INTERVAL /
LAVD_SYS_STAT_INTERVAL_NS),
LAVD_AP_LOW_UTIL = 50, /* powersave mode when cpu util <= 5% */
LAVD_AP_HIGH_UTIL = 300, /* balanced mode when 5% < cpu util <= 30%,
performance mode when cpu util > 30% */
LAVD_CPDOM_MAX_NR = 32, /* maximum number of compute domain */
LAVD_CPDOM_MAX_DIST = 4, /* maximum distance from one compute domain to another */
LAVD_CPDOM_STARV_NS = (5ULL * NSEC_PER_MSEC),
@ -299,6 +303,8 @@ enum {
LAVD_PM_PERFORMANCE = 0,
LAVD_PM_BALANCED = 1,
LAVD_PM_POWERSAVE = 2,
LAVD_PM_MAX = 3
};
struct power_arg {

View File

@ -229,10 +229,11 @@ static u64 cur_svc_time;
/*
* Options
*/
volatile bool no_core_compaction;
volatile bool no_freq_scaling;
volatile bool no_prefer_turbo_core;
volatile bool is_powersave_mode;
volatile bool no_core_compaction;
volatile bool no_freq_scaling;
volatile bool no_prefer_turbo_core;
volatile bool is_powersave_mode;
const volatile bool is_autopilot_on;
const volatile u32 is_smt_active;
const volatile u8 verbose;
@ -315,7 +316,7 @@ struct {
} introspec_msg SEC(".maps");
static u16 get_nice_prio(struct task_struct *p);
static void adjust_slice_boost(struct cpu_ctx *cpuc, struct task_ctx *taskc);
static int reinit_active_cpumask_for_performance(void);
static u64 sigmoid_u64(u64 v, u64 max)
{
@ -593,6 +594,7 @@ static void init_sys_stat_ctx(struct sys_stat_ctx *c)
c->stat_next = get_sys_stat_next();
c->now = bpf_ktime_get_ns();
c->duration = c->now - c->stat_cur->last_update_clk;
c->stat_next->last_update_clk = c->now;
}
static void collect_sys_stat(struct sys_stat_ctx *c)
@ -649,7 +651,7 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
bool ret = __sync_bool_compare_and_swap(
&cpuc->idle_start_clk, old_clk, c->now);
if (ret) {
c->idle_total += c->now - old_clk;
cpuc->idle_total += c->now - old_clk;
break;
}
}
@ -660,6 +662,7 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
u64 compute = 0;
if (c->duration > cpuc->idle_total)
compute = c->duration - cpuc->idle_total;
c->new_util = (compute * LAVD_CPU_UTIL_MAX) / c->duration;
cpuc->util = calc_avg(cpuc->util, c->new_util);
@ -691,9 +694,9 @@ static void calc_sys_stat(struct sys_stat_ctx *c)
c->duration_total = c->duration * nr_cpus_onln;
if (c->duration_total > c->idle_total)
c->compute_total = c->duration_total - c->idle_total;
c->new_util = (c->compute_total * LAVD_CPU_UTIL_MAX) /
c->duration_total;
else
c->compute_total = 0;
c->new_util = (c->compute_total * LAVD_CPU_UTIL_MAX)/c->duration_total;
if (c->sched_nr == 0) {
/*
@ -757,7 +760,6 @@ static void do_update_sys_stat(void)
/*
* Make the next version atomically visible.
*/
c.stat_next->last_update_clk = c.now;
flip_sys_stat();
}
@ -772,7 +774,7 @@ static u64 calc_nr_active_cpus(struct sys_stat *stat_cur)
nr_active /= (LAVD_CC_PER_CORE_MAX_CTUIL * 1000);
/*
* If a few CPUs are particularly busy, boost the overflow CPUs by 2x.
* If a few CPUs are particularly busy, boost the active CPUs more.
*/
nr_active += min(LAVD_CC_NR_OVRFLW, (stat_cur->nr_violation) / 1000);
nr_active = max(min(nr_active, nr_cpus_onln),
@ -900,10 +902,91 @@ unlock_out:
bpf_rcu_read_unlock();
}
/*
 * Switch the scheduler's global knobs to the requested power mode.
 *
 * @power_mode: one of LAVD_PM_PERFORMANCE / LAVD_PM_BALANCED /
 *              LAVD_PM_POWERSAVE.
 * @util: current CPU utilization; used only for debug logging.
 *
 * Returns 0 on success (including when the mode is already active),
 * -EINVAL for an unknown mode.
 */
int do_set_power_profile(s32 power_mode, int util)
{
	/*
	 * The last successfully applied mode. LAVD_PM_MAX means no mode has
	 * been applied yet, so the very first request always takes effect.
	 */
	static s32 cur_mode = LAVD_PM_MAX;

	/*
	 * Skip setting the mode if already in the same mode.
	 */
	if (cur_mode == power_mode)
		return 0;

	/*
	 * Change the power mode.
	 */
	switch (power_mode) {
	case LAVD_PM_PERFORMANCE:
		no_core_compaction = true;
		no_freq_scaling = true;
		no_prefer_turbo_core = false;
		is_powersave_mode = false;

		/*
		 * Since the core compaction becomes off, we need to
		 * reinitialize the active and overflow cpumask for performance
		 * mode.
		 */
		reinit_active_cpumask_for_performance();
		debugln("Set the scheduler's power profile to performance mode: %d", util);
		break;
	case LAVD_PM_BALANCED:
		no_core_compaction = false;
		no_freq_scaling = false;
		no_prefer_turbo_core = false;
		is_powersave_mode = false;
		debugln("Set the scheduler's power profile to balanced mode: %d", util);
		break;
	case LAVD_PM_POWERSAVE:
		no_core_compaction = false;
		no_freq_scaling = false;
		no_prefer_turbo_core = true;
		is_powersave_mode = true;
		debugln("Set the scheduler's power profile to power-save mode: %d", util);
		break;
	default:
		return -EINVAL;
	}

	/*
	 * Cache the mode only after it has been successfully applied.
	 * Caching before validation (as before) let an invalid power_mode
	 * poison the cache, so a repeated identical invalid request would
	 * wrongly return 0 instead of -EINVAL.
	 */
	cur_mode = power_mode;
	return 0;
}
static int do_autopilot(void)
{
struct sys_stat *stat_cur = get_sys_stat_cur();
/*
* If the CPU utiulization is very low (say <= 5%), it means high
* performance is not required. We run the scheduler in powersave mode
* to save energy consumption.
*/
if (stat_cur->util <= LAVD_AP_LOW_UTIL)
return do_set_power_profile(LAVD_PM_POWERSAVE, stat_cur->util);
/*
* If the CPU utiulization is moderate (say > 5%, <= 30%), we run the
* scheduler in balanced mode. Actually, balanced mode can save energy
* consumption only under moderate CPU load.
*/
if (stat_cur->util <= LAVD_AP_HIGH_UTIL)
return do_set_power_profile(LAVD_PM_BALANCED, stat_cur->util);
/*
* If the CPU utilization is high enough (say > 30%), we run the
* scheduler in performance mode. The system indeed needs perrformance
* also there is little energy benefit even under balanced mode anyway.
*/
return do_set_power_profile(LAVD_PM_PERFORMANCE, stat_cur->util);
}
/*
 * Periodic statistics update: refresh the system-wide stats, then let
 * the autopilot (if enabled) re-evaluate the power profile, and finally
 * run core compaction unless it has been turned off (performance mode
 * sets no_core_compaction = true).
 */
static void update_sys_stat(void)
{
	do_update_sys_stat();

	/* In autopilot mode, re-pick the power profile on every update. */
	if (is_autopilot_on)
		do_autopilot();

	if (!no_core_compaction)
		do_core_compaction();
}
@ -1356,11 +1439,23 @@ static s32 pick_idle_cpu(struct task_struct *p, struct task_ctx *taskc,
struct bpf_cpumask *active, *ovrflw, *big, *little, *cpdom_mask_prev;
s32 cpu_id;
bpf_rcu_read_lock();
/*
* If a task can run only on a single CPU (e.g., per-CPU kworker), we
* simply check if a task is still pinned on the prev_cpu and go.
*/
if (p->nr_cpus_allowed == 1 &&
bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr)) {
if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
*is_idle = true;
cpu_id = prev_cpu;
goto out;
}
/*
* Prepare cpumasks.
*/
bpf_rcu_read_lock();
cpuc = get_cpu_ctx();
cpuc_prev = get_cpu_ctx_id(prev_cpu);
if (!cpuc || !cpuc_prev || !taskc) {
@ -1515,6 +1610,7 @@ start_any_mask:
*/
unlock_out:
bpf_rcu_read_unlock();
out:
return cpu_id;
}
@ -2852,6 +2948,50 @@ static u16 get_cputurbo_cap(void)
return turbo_cap;
}
/*
 * Re-populate the active/overflow cpumasks for performance mode: with
 * core compaction turned off, every online CPU should be usable, so big
 * cores are added to the active mask and the rest to the overflow mask.
 *
 * Returns 0 on success, -ENOMEM if the global cpumasks are not ready,
 * or -ESRCH if a per-CPU context lookup fails.
 */
static int reinit_active_cpumask_for_performance(void)
{
	struct cpu_ctx *cpuc;
	struct bpf_cpumask *active, *ovrflw;
	int cpu, err = 0;

	/*
	 * NOTE(review): barrier() here presumably orders the caller's knob
	 * writes (e.g., no_core_compaction) before the mask updates — confirm.
	 */
	barrier();

	bpf_rcu_read_lock();

	/*
	 * Prepare cpumasks.
	 */
	active = active_cpumask;
	ovrflw = ovrflw_cpumask;
	if (!active || !ovrflw) {
		scx_bpf_error("Failed to prepare cpumasks.");
		err = -ENOMEM;
		goto unlock_out;
	}

	/*
	 * Once core compaction becomes off in performance mode,
	 * reinitialize active/overflow cpumasks to reflect the mode change.
	 */
	bpf_for(cpu, 0, nr_cpus_onln) {
		cpuc = get_cpu_ctx_id(cpu);
		if (!cpuc) {
			scx_bpf_error("Failed to lookup cpu_ctx: %d", cpu);
			err = -ESRCH;
			goto unlock_out;
		}

		/* Big cores become active; little cores go to overflow. */
		if (cpuc->big_core)
			bpf_cpumask_set_cpu(cpu, active);
		else
			bpf_cpumask_set_cpu(cpu, ovrflw);
	}

unlock_out:
	bpf_rcu_read_unlock();
	return err;
}
static s32 init_per_cpu_ctx(u64 now)
{
struct cpu_ctx *cpuc;
@ -3081,30 +3221,7 @@ void BPF_STRUCT_OPS(lavd_exit, struct scx_exit_info *ei)
SEC("syscall")
int set_power_profile(struct power_arg *input)
{
switch (input->power_mode) {
case LAVD_PM_PERFORMANCE:
no_core_compaction = true;
no_freq_scaling = true;
no_prefer_turbo_core = false;
is_powersave_mode = false;
break;
case LAVD_PM_BALANCED:
no_core_compaction = false;
no_freq_scaling = false;
no_prefer_turbo_core = false;
is_powersave_mode = false;
break;
case LAVD_PM_POWERSAVE:
no_core_compaction = false;
no_freq_scaling = false;
no_prefer_turbo_core = true;
is_powersave_mode = true;
break;
default:
return -EINVAL;
}
return 0;
return do_set_power_profile(input->power_mode, 0);
}
SCX_OPS_DEFINE(lavd_ops,

View File

@ -72,9 +72,14 @@ use rlimit::{getrlimit, setrlimit, Resource};
/// See the more detailed overview of the LAVD design at main.bpf.c.
#[derive(Debug, Parser)]
struct Opts {
/// Automatically decide the power mode based on the current energy profile.
#[clap(long = "auto", action = clap::ArgAction::SetTrue)]
auto: bool,
/// Automatically decide the scheduler's power mode based on system load.
/// This is a recommended mode if you don't understand the following options:
#[clap(long = "autopilot", action = clap::ArgAction::SetTrue)]
autopilot: bool,
/// Automatically decide the scheduler's power mode based on the system's energy profile.
#[clap(long = "autopower", action = clap::ArgAction::SetTrue)]
autopower: bool,
/// Run in performance mode to get maximum performance.
#[clap(long = "performance", action = clap::ArgAction::SetTrue)]
@ -551,6 +556,7 @@ impl<'a> Scheduler<'a> {
Ok(ret) => (ret == 1) as u32,
Err(_) => 0,
};
skel.maps.rodata_data.is_autopilot_on = opts.autopilot;
skel.maps.rodata_data.verbose = opts.verbose;
}
@ -692,32 +698,43 @@ impl<'a> Scheduler<'a> {
res.unwrap_or_else(|_| "none".to_string())
}
fn update_power_profile(&mut self) -> bool {
fn update_power_profile(&mut self, prev_profile: String) -> (bool, String) {
const LAVD_PM_PERFORMANCE: s32 = 0;
const LAVD_PM_BALANCED: s32 = 1;
const LAVD_PM_POWERSAVE: s32 = 2;
let profile = Self::read_energy_profile();
if profile == "performance" {
let _ = self.set_power_profile(LAVD_PM_PERFORMANCE);
} else if profile == "balance_performance" {
let _ = self.set_power_profile(LAVD_PM_BALANCED);
} else if profile == "power" {
let _ = self.set_power_profile(LAVD_PM_POWERSAVE);
} else {
return false;
if profile == prev_profile {
// If the profile is the same, skip updating the profile for BPF.
return (true, profile);
}
true
if profile == "performance" {
let _ = self.set_power_profile(LAVD_PM_PERFORMANCE);
info!("Set the scheduler's power profile to performance mode.");
} else if profile == "balance_performance" {
let _ = self.set_power_profile(LAVD_PM_BALANCED);
info!("Set the scheduler's power profile to balanced mode.");
} else if profile == "power" {
let _ = self.set_power_profile(LAVD_PM_POWERSAVE);
info!("Set the scheduler's power profile to power-save mode.");
} else {
// We don't know how to handle an unknown energy profile,
// so we just give up updating the profile from now on.
return (false, profile);
}
(true, profile)
}
fn run(&mut self, auto: bool, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
fn run(&mut self, autopower: bool, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
let (res_ch, req_ch) = self.stats_server.channels();
let mut auto = auto;
let mut autopower = autopower;
let mut profile = "unknown".to_string();
while !shutdown.load(Ordering::Relaxed) && !self.exited() {
if auto {
auto = self.update_power_profile();
if autopower {
(autopower, profile) = self.update_power_profile(profile);
}
match req_ch.recv_timeout(Duration::from_secs(1)) {
@ -799,7 +816,7 @@ fn main() -> Result<()> {
*build_id::SCX_FULL_VERSION
);
info!("scx_lavd scheduler starts running.");
if !sched.run(opts.auto, shutdown.clone())?.should_restart() {
if !sched.run(opts.autopower, shutdown.clone())?.should_restart() {
break;
}
}