Merge pull request #612 from multics69/lavd-monitor

scx_lavd: add --monitor flag and two micro-optimizations
2024-11-26 03:20:24 +00:00 · 2024-09-06 09:33:55 +09:00 · 2024-09-06 09:33:55 +09:00 · e3243c5d51
commit e3243c5d51
parent 0fa369b914 d9274bd8e6
6 changed files with 523 additions and 68 deletions
--- a/scheds/rust/Cargo.lock
+++ b/scheds/rust/Cargo.lock
@ -527,6 +527,15 @@ version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"

+[[package]]
+name = "gpoint"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1c00f1d62d57408109a871dd9e12b76645ec4284406d5ec838d277777ef1ef6c"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "hashbrown"
 version = "0.14.5"
@ -1152,6 +1161,7 @@ dependencies = [
 "crossbeam",
 "ctrlc",
 "fb_procfs",
+ "gpoint",
 "hex",
 "itertools 0.13.0",
 "libbpf-rs",
--- a/scheds/rust/scx_lavd/Cargo.toml
+++ b/scheds/rust/scx_lavd/Cargo.toml
@ -27,6 +27,7 @@ simplelog = "0.12"
 static_assertions = "1.1.0"
 rlimit = "0.10.1"
 plain = "0.2.3"
+gpoint = "0.2"

 [build-dependencies]
 scx_utils = { path = "../../../rust/scx_utils", version = "1.0.3" }
--- a/scheds/rust/scx_lavd/src/bpf/intf.h
+++ b/scheds/rust/scx_lavd/src/bpf/intf.h
@ -81,11 +81,12 @@ enum consts {
 	LAVD_PREEMPT_TICK_MARGIN	= (100ULL * NSEC_PER_USEC),

 	LAVD_SYS_STAT_INTERVAL_NS	= (50ULL * NSEC_PER_MSEC),
+	LAVD_SYS_STAT_DECAY_TIMES	= (2ULL * LAVD_TIME_ONE_SEC) / LAVD_SYS_STAT_INTERVAL_NS,
 	LAVD_CC_PER_CORE_MAX_CTUIL	= 500, /* maximum per-core CPU utilization */
 	LAVD_CC_PER_TURBO_CORE_MAX_CTUIL = 750, /* maximum per-core CPU utilization for a turbo core */
 	LAVD_CC_NR_ACTIVE_MIN		= 1, /* num of mininum active cores */
 	LAVD_CC_NR_OVRFLW		= 1, /* num of overflow cores */
-	LAVD_CC_CPU_PIN_INTERVAL	= (2ULL * LAVD_TIME_ONE_SEC),
+	LAVD_CC_CPU_PIN_INTERVAL	= (1ULL * LAVD_TIME_ONE_SEC),
 	LAVD_CC_CPU_PIN_INTERVAL_DIV	= (LAVD_CC_CPU_PIN_INTERVAL /
 					   LAVD_SYS_STAT_INTERVAL_NS),

@ -122,6 +123,16 @@ struct sys_stat {

 	volatile u32	nr_violation;	/* number of utilization violation */
 	volatile u32	nr_active;	/* number of active cores */
+
+	volatile u64	nr_sched;	/* total scheduling so far */
+	volatile u64	nr_migration;	/* number of task migration */
+	volatile u64	nr_preemption;	/* number of preemption */
+	volatile u64	nr_greedy;	/* number of greedy tasks scheduled */
+	volatile u64	nr_perf_cri;	/* number of performance-critical tasks scheduled */
+	volatile u64	nr_lat_cri;	/* number of latency-critical tasks scheduled */
+	volatile u64	nr_big;		/* scheduled on big core */
+	volatile u64	nr_pc_on_big;	/* performance-critical tasks scheduled on big core */
+	volatile u64	nr_lc_on_big;	/* latency-critical tasks scheduled on big core */
 };

 /*
@ -169,7 +180,7 @@ struct cpu_ctx {
 	 */
 	volatile u32	max_lat_cri;	/* maximum latency criticality */
 	volatile u32	sum_lat_cri;	/* sum of latency criticality */
-	volatile u32	sched_nr;	/* number of schedules */
+	volatile u32	nr_sched;	/* number of schedules */

 	/*
 	 * Information used to keep track of performance criticality
@ -205,6 +216,15 @@ struct cpu_ctx {
 	struct bpf_cpumask __kptr *tmp_o_mask;	/* temporary cpu mask */
 	struct bpf_cpumask __kptr *tmp_t_mask;	/* temporary cpu mask */
 	struct bpf_cpumask __kptr *tmp_t2_mask;	/* temporary cpu mask */
+
+	/*
+	 * Information for statistics.
+	 */
+	volatile u32	nr_migration;	/* number of migrations */
+	volatile u32	nr_preemption;	/* number of preemptions */
+	volatile u32	nr_greedy;	/* number of greedy tasks scheduled */
+	volatile u32	nr_perf_cri;
+	volatile u32	nr_lat_cri;
 } __attribute__((aligned(CACHELINE_SIZE)));

 /*
@ -242,12 +262,18 @@ struct task_ctx {
 	volatile s32 victim_cpu;
 	u16	slice_boost_prio;	/* how many times a task fully consumed the slice */
 	u8	wakeup_ft;		/* regular wakeup = 1, sync wakeup = 2 */
+
 	/*
 	 * Task's performance criticality
 	 */
 	u8	on_big;			/* executable on a big core */
 	u8	on_little;		/* executable on a little core */
 	u32	perf_cri;		/* performance criticality of a task */
+
+	/*
+	 * Information for statistics collection
+	 */
+	u32	cpu_id;			/* CPU ID scheduled on */
 };

 /*
--- a/scheds/rust/scx_lavd/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_lavd/src/bpf/main.bpf.c
@ -197,8 +197,8 @@ char _license[] SEC("license") = "GPL";
 volatile u64		nr_cpus_onln;
 static volatile u64	nr_cpus_big;

-static struct sys_stat	__sys_stats[2];
-static volatile int	__sys_stat_idx;
+struct sys_stat	__sys_stats[2];
+volatile int	__sys_stat_idx;

 private(LAVD) struct bpf_cpumask __kptr *turbo_cpumask; /* CPU mask for turbo CPUs */
 private(LAVD) struct bpf_cpumask __kptr *big_cpumask; /* CPU mask for big CPUs */
@ -240,6 +240,18 @@ const volatile bool	is_autopilot_on;
 const volatile u32 	is_smt_active;
 const volatile u8	verbose;

+/*
+ * Statistics
+ */
+volatile int		power_mode;
+volatile u64		last_power_mode_clk;
+volatile u64		performance_mode_ns;
+volatile u64		balanced_mode_ns;
+volatile u64		powersave_mode_ns;
+
+/*
+ * Exit information
+ */
 UEI_DEFINE(uei);

 #define debugln(fmt, ...)						\
@ -320,6 +332,7 @@ struct {

 static u16 get_nice_prio(struct task_struct *p);
 static int reinit_active_cpumask_for_performance(void);
+static void update_power_mode_time(void);

 static u64 sigmoid_u64(u64 v, u64 max)
 {
@ -582,7 +595,15 @@ struct sys_stat_ctx {
 	s32		max_lat_cri;
 	s32		avg_lat_cri;
 	u64		sum_lat_cri;
-	u32		sched_nr;
+	u32		nr_sched;
+	u32		nr_migration;
+	u32		nr_preemption;
+	u32		nr_greedy;
+	u32		nr_perf_cri;
+	u32		nr_lat_cri;
+	u32		nr_big;
+	u32		nr_pc_on_big;
+	u32		nr_lc_on_big;
 	u64		sum_perf_cri;
 	u32		avg_perf_cri;
 	u64		new_util;
@ -618,6 +639,30 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
 		c->load_actual += cpuc->load_actual;
 		c->load_run_time_ns += cpuc->load_run_time_ns;
 		c->tot_svc_time += cpuc->tot_svc_time;
+		cpuc->tot_svc_time = 0;
+
+		/*
+		 * Accumulate statistics.
+		 */
+		if (cpuc->big_core) {
+			c->nr_big += cpuc->nr_sched;
+			c->nr_pc_on_big += cpuc->nr_perf_cri;
+			c->nr_lc_on_big += cpuc->nr_lat_cri;
+		}
+		c->nr_perf_cri += cpuc->nr_perf_cri;
+		cpuc->nr_perf_cri = 0;
+
+		c->nr_lat_cri += cpuc->nr_lat_cri;
+		cpuc->nr_lat_cri = 0;
+
+		c->nr_migration += cpuc->nr_migration;
+		cpuc->nr_migration = 0;
+
+		c->nr_preemption += cpuc->nr_preemption;
+		cpuc->nr_preemption = 0;
+
+		c->nr_greedy += cpuc->nr_greedy;
+		cpuc->nr_greedy = 0;

 		/*
 		 * Accumulate task's latency criticlity information.
@ -629,8 +674,8 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
 		c->sum_lat_cri += cpuc->sum_lat_cri;
 		cpuc->sum_lat_cri = 0;

-		c->sched_nr += cpuc->sched_nr;
-		cpuc->sched_nr = 0;
+		c->nr_sched += cpuc->nr_sched;
+		cpuc->nr_sched = 0;

 		if (cpuc->max_lat_cri > c->max_lat_cri)
 			c->max_lat_cri = cpuc->max_lat_cri;
@ -701,7 +746,7 @@ static void calc_sys_stat(struct sys_stat_ctx *c)
 		c->compute_total = 0;
 	c->new_util = (c->compute_total * LAVD_CPU_UTIL_MAX)/c->duration_total;

-	if (c->sched_nr == 0) {
+	if (c->nr_sched == 0) {
 		/*
 		 * When a system is completely idle, it is indeed possible
 		 * nothing scheduled for an interval.
@ -711,13 +756,15 @@ static void calc_sys_stat(struct sys_stat_ctx *c)
 		c->avg_perf_cri = c->stat_cur->avg_perf_cri;
 	}
 	else {
-		c->avg_lat_cri = c->sum_lat_cri / c->sched_nr;
-		c->avg_perf_cri = c->sum_perf_cri / c->sched_nr;
+		c->avg_lat_cri = c->sum_lat_cri / c->nr_sched;
+		c->avg_perf_cri = c->sum_perf_cri / c->nr_sched;
 	}
 }

 static void update_sys_stat_next(struct sys_stat_ctx *c)
 {
+	static int cnt = 0;
+
 	/*
 	 * Update the CPU utilization to the next version.
 	 */
@ -741,11 +788,45 @@ static void update_sys_stat_next(struct sys_stat_ctx *c)
 	stat_next->nr_violation =
 		calc_avg32(stat_cur->nr_violation, c->nr_violation);

-	stat_next->avg_svc_time = (c->sched_nr == 0) ? 0 :
-				  c->tot_svc_time / c->sched_nr;
+	stat_next->avg_svc_time = (c->nr_sched == 0) ? 0 :
+				  c->tot_svc_time / c->nr_sched;

 	stat_next->nr_queued_task =
 		calc_avg(stat_cur->nr_queued_task, c->nr_queued_task);
+
+
+	/*
+	 * Halve the statistics periodically (every LAVD_SYS_STAT_DECAY_TIMES
+	 * updates) so the statistics hold only the information on the recent past.
+	 */
+	if (cnt++ == LAVD_SYS_STAT_DECAY_TIMES) {
+		cnt = 0;
+		stat_next->nr_sched >>= 1;
+		stat_next->nr_migration >>= 1;
+		stat_next->nr_preemption >>= 1;
+		stat_next->nr_greedy >>= 1;
+		stat_next->nr_perf_cri >>= 1;
+		stat_next->nr_lat_cri >>= 1;
+		stat_next->nr_big >>= 1;
+		stat_next->nr_pc_on_big >>= 1;
+		stat_next->nr_lc_on_big >>= 1;
+
+		__sync_fetch_and_sub(&performance_mode_ns, performance_mode_ns/2);
+		__sync_fetch_and_sub(&balanced_mode_ns, balanced_mode_ns/2);
+		__sync_fetch_and_sub(&powersave_mode_ns, powersave_mode_ns/2);
+	}
+
+	stat_next->nr_sched += c->nr_sched;
+	stat_next->nr_migration += c->nr_migration;
+	stat_next->nr_preemption += c->nr_preemption;
+	stat_next->nr_greedy += c->nr_greedy;
+	stat_next->nr_perf_cri += c->nr_perf_cri;
+	stat_next->nr_lat_cri += c->nr_lat_cri;
+	stat_next->nr_big += c->nr_big;
+	stat_next->nr_pc_on_big += c->nr_pc_on_big;
+	stat_next->nr_lc_on_big += c->nr_lc_on_big;
+
+	update_power_mode_time();
 }

 static void do_update_sys_stat(void)
@ -905,21 +986,49 @@ unlock_out:
 	bpf_rcu_read_unlock();
 }

-int do_set_power_profile(s32 power_mode, int util)
+static void update_power_mode_time(void)
 {
-	static s32 cur_mode = LAVD_PM_MAX;
+	u64 now = bpf_ktime_get_ns();
+	u64 delta;

+	if (last_power_mode_clk == 0)
+		last_power_mode_clk = now;
+
+	delta = now - last_power_mode_clk;
+	last_power_mode_clk = now;
+
+	switch (power_mode) {
+	case LAVD_PM_PERFORMANCE:
+		__sync_fetch_and_add(&performance_mode_ns, delta);
+		break;
+	case LAVD_PM_BALANCED:
+		__sync_fetch_and_add(&balanced_mode_ns, delta);
+		break;
+	case LAVD_PM_POWERSAVE:
+		__sync_fetch_and_add(&powersave_mode_ns, delta);
+		break;
+	}
+}
+
+
+static int do_set_power_profile(s32 pm, int util)
+{
 	/*
 	 * Skip setting the mode if alreay in the same mode.
 	 */
-	if (cur_mode == power_mode)
+	if (power_mode == pm)
 		return 0;
-	cur_mode = power_mode;
+
+	/*
+	 * Update power mode time
+	 */
+	update_power_mode_time();
+	power_mode = pm;

 	/*
 	 * Change the power mode.
 	 */
-	switch (power_mode) {
+	switch (pm) {
 	case LAVD_PM_PERFORMANCE:
 		no_core_compaction = true;
 		no_freq_scaling = true;
@ -1184,13 +1293,6 @@ static u64 calc_time_slice(struct task_struct *p, struct task_ctx *taskc,
 			  taskc->slice_boost_prio) / LAVD_SLICE_BOOST_MAX_STEP;
 	}

-	/*
-	 * Boost time slice based on CPU's capacity to assign a longer time
-	 * slice for a more performant CPU for making each CPU's job processing
-	 * throughput similar.
-	 */
-	slice = slice * cpuc->capacity / 1024;
-
 	/*
 	 * If a task has yet to be scheduled (i.e., a freshly forked task or a
 	 * task just under sched_ext), don't give a fair amount of time slice
@ -1274,6 +1376,7 @@ static void update_stat_for_running(struct task_struct *p,
 				    struct task_ctx *taskc,
 				    struct cpu_ctx *cpuc)
 {
+	struct sys_stat *stat_cur = get_sys_stat_cur();
 	u64 wait_period, interval;
 	u64 now = bpf_ktime_get_ns();
 	u64 wait_freq_ft, wake_freq_ft, perf_cri;
@ -1306,7 +1409,7 @@ static void update_stat_for_running(struct task_struct *p,
 	if (cpuc->max_lat_cri < taskc->lat_cri)
 		cpuc->max_lat_cri = taskc->lat_cri;
 	cpuc->sum_lat_cri += taskc->lat_cri;
-	cpuc->sched_nr++;
+	cpuc->nr_sched++;

 	/*
 	 * It is clear there is no need to consider the suspended duration
@ -1345,6 +1448,30 @@ static void update_stat_for_running(struct task_struct *p,
 	 * Update task state when starts running.
 	 */
 	taskc->last_running_clk = now;
+
+	/*
+	 * Update statistics information.
+	 */
+	if (taskc->cpu_id != cpuc->cpu_id) {
+		taskc->cpu_id = cpuc->cpu_id;
+		cpuc->nr_migration++;
+	}
+
+	if (taskc->victim_cpu >= 0)
+		cpuc->nr_preemption++;
+	
+	if (is_lat_cri(taskc, stat_cur)) {
+		cpuc->nr_lat_cri++;
+//		debugln("------------------------ lc = %llu", cpuc->nr__cri);
+	}
+
+	if (is_perf_cri(taskc, stat_cur)) {
+		cpuc->nr_perf_cri++;
+//		debugln("------------------------ pc = %llu", cpuc->nr_perf_cri);
+	}
+
+	if (is_greedy(taskc))
+		cpuc->nr_greedy++;
 }

 static u64 calc_svc_time(struct task_struct *p, struct task_ctx *taskc)
@ -1626,13 +1753,14 @@ start_omask:
 	/*
 	 * If the task cannot run on either active or overflow cores,
 	 * stay on the previous core (if it is okay) or one of its taskset.
+	 * Then, put the CPU to the overflow set.
 	 */
+start_any_mask:
 	if (bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr))
 		cpu_id = prev_cpu;
-	else {
-start_any_mask:
+	else
 		cpu_id = bpf_cpumask_any_distribute(p->cpus_ptr);
-	}
+	bpf_cpumask_set_cpu(cpu_id, ovrflw);

 	/*
 	 * Note that we don't need to kick the picked CPU here since the
--- a/scheds/rust/scx_lavd/src/main.rs
+++ b/scheds/rust/scx_lavd/src/main.rs
@ -12,6 +12,7 @@ pub mod bpf_intf;
 pub use bpf_intf::*;

 mod stats;
+use stats::SysStats;
 use stats::SchedSample;
 use stats::SchedSamples;
 use stats::StatsReq;
@ -122,6 +123,14 @@ struct Opts {
    #[clap(long = "no-freq-scaling", action = clap::ArgAction::SetTrue)]
    no_freq_scaling: bool,

+    /// Enable stats monitoring with the specified interval.
+    #[clap(long)]
+    stats: Option<f64>,
+
+    /// Run in stats monitoring mode with the specified interval. Scheduler is not launched.
+    #[clap(long)]
+    monitor: Option<f64>,
+
    /// Run in monitoring mode. Show the specified number of scheduling
    /// samples every second.
    #[clap(long)]
@ -135,6 +144,10 @@ struct Opts {
    /// Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,
+
+    /// Show descriptions for statistics.
+    #[clap(long)]
+    help_stats: bool,
 }

 impl Opts {
@ -448,8 +461,9 @@ struct Scheduler<'a> {
    rb_mgr: libbpf_rs::RingBuffer<'static>,
    intrspc: introspec,
    intrspc_rx: Receiver<SchedSample>,
-    sampler_tid: Option<ThreadId>,
+    monitor_tid: Option<ThreadId>,
    stats_server: StatsServer<StatsReq, StatsRes>,
+    mseq_id: u64,
 }

 impl<'a> Scheduler<'a> {
@ -494,8 +508,9 @@ impl<'a> Scheduler<'a> {
            rb_mgr,
            intrspc: introspec::new(),
            intrspc_rx,
-            sampler_tid: None,
+            monitor_tid: None,
            stats_server,
+            mseq_id: 0,
        })
    }

@ -626,19 +641,97 @@ impl<'a> Scheduler<'a> {
        self.skel.maps.bss_data.intrspc.cmd = LAVD_CMD_NOP;
    }

+    /// Returns `x` as a percentage of `y`.
+    ///
+    /// Yields `0.0` when `y` is zero, so counters that have not advanced
+    /// yet produce a finite value instead of NaN or infinity.
+    fn get_pc(x: u64, y: u64) -> f64 {
+        if y == 0 {
+            return 0.0;
+        }
+        100. * x as f64 / y as f64
+    }
+
+    /// Maps a raw power-mode value to its display name.
+    ///
+    /// `0`, `1`, and `2` map to `"performance"`, `"balanced"`, and
+    /// `"powersave"`; every other value maps to `"unknown"`.
+    fn get_power_mode(power_mode: s32) -> &'static str {
+        // NOTE(review): presumably these mirror the BPF-side LAVD_PM_* enum
+        // values, which are not visible here -- confirm against intf.h.
+        const PERFORMANCE: s32 = 0;
+        const BALANCED: s32 = 1;
+        const POWERSAVE: s32 = 2;
+
+        match power_mode {
+            PERFORMANCE => "performance",
+            BALANCED => "balanced",
+            POWERSAVE => "powersave",
+            _ => "unknown",
+        }
+    }
+
    fn stats_req_to_res(&mut self, req: &StatsReq) -> Result<StatsRes> {
        Ok(match req {
            StatsReq::NewSampler(tid) => {
                self.rb_mgr.consume().unwrap();
-                self.sampler_tid = Some(*tid);
+                self.monitor_tid = Some(*tid);
                StatsRes::Ack
            }
+            StatsReq::SysStatsReq {
+                tid,
+            } => {
+                if Some(*tid) != self.monitor_tid {
+                    return Ok(StatsRes::Bye);
+                }
+                self.mseq_id += 1;
+
+                let bss_data = &self.skel.maps.bss_data;
+                let st = bss_data.__sys_stats[0];
+
+                let mseq = self.mseq_id;
+                let avg_svc_time = st.avg_svc_time;
+                let nr_queued_task = st.nr_queued_task;
+                let nr_active = st.nr_active;
+                let nr_sched = st.nr_sched;
+                let pc_migration = Self::get_pc(st.nr_migration, nr_sched);
+                let pc_preemption = Self::get_pc(st.nr_preemption, nr_sched);
+                let pc_greedy = Self::get_pc(st.nr_greedy, nr_sched);
+                let pc_pc = Self::get_pc(st.nr_perf_cri, nr_sched);
+                let pc_lc = Self::get_pc(st.nr_lat_cri, nr_sched);
+                let nr_big = st.nr_big;
+                let pc_big = Self::get_pc(nr_big, nr_sched);
+                let pc_pc_on_big = Self::get_pc(st.nr_pc_on_big, nr_big);
+                let pc_lc_on_big = Self::get_pc(st.nr_lc_on_big, nr_big);
+                let power_mode = Self::get_power_mode(bss_data.power_mode);
+                let total_time = bss_data.performance_mode_ns +
+                                 bss_data.balanced_mode_ns +
+                                 bss_data.powersave_mode_ns;
+                let pc_performance = Self::get_pc(bss_data.performance_mode_ns, total_time);
+                let pc_balanced = Self::get_pc(bss_data.balanced_mode_ns, total_time);
+                let pc_powersave = Self::get_pc(bss_data.powersave_mode_ns, total_time);
+
+                StatsRes::SysStats(SysStats {
+                    mseq,
+                    avg_svc_time,
+                    nr_queued_task,
+                    nr_active,
+                    nr_sched,
+                    pc_migration,
+                    pc_preemption,
+                    pc_greedy,
+                    pc_pc,
+                    pc_lc,
+                    pc_big,
+                    pc_pc_on_big,
+                    pc_lc_on_big,
+                    power_mode: power_mode.to_string(),
+                    pc_performance,
+                    pc_balanced,
+                    pc_powersave,
+                })
+            }
            StatsReq::SchedSamplesNr {
                tid,
                nr_samples,
                interval_ms,
            } => {
-                if Some(*tid) != self.sampler_tid {
+                if Some(*tid) != self.monitor_tid {
                    return Ok(StatsRes::Bye);
                }

@ -791,6 +884,11 @@ fn main() -> Result<()> {
        return Ok(());
    }

+    if opts.help_stats {
+        stats::server_data(0).describe_meta(&mut std::io::stdout(), None)?;
+        return Ok(());
+    }
+
    init_log(&opts);
    debug!("{:#?}", opts);

@ -808,6 +906,17 @@ fn main() -> Result<()> {
        return Ok(());
    }

+    if let Some(intv) = opts.monitor.or(opts.stats) {
+        let shutdown_copy = shutdown.clone();
+        let jh = std::thread::spawn(move || {
+            stats::monitor(Duration::from_secs_f64(intv), shutdown_copy).unwrap()
+        });
+        if opts.monitor.is_some() {
+            let _ = jh.join();
+            return Ok(());
+        }
+    }
+
    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
--- a/scheds/rust/scx_lavd/src/stats.rs
+++ b/scheds/rust/scx_lavd/src/stats.rs
@ -11,30 +11,165 @@ use std::sync::atomic::Ordering;
 use std::sync::Arc;
 use std::thread::ThreadId;
 use std::time::Duration;
+use gpoint::GPoint;
+
+// System-wide statistics snapshot reported by the "top" stats target.
+// Fields prefixed with `pc_` carry percentages; the `desc` strings are the
+// user-visible descriptions of each field.
+#[derive(Clone, Debug, Default, Serialize, Deserialize, Stats)]
+#[stat(top)]
+pub struct SysStats {
+    #[stat(desc = "Sequence ID of this message")]
+    pub mseq: u64,
+
+    #[stat(desc = "Average runtime per schedule")]
+    pub avg_svc_time: u64,
+
+    #[stat(desc = "Number of runnable tasks in runqueues")]
+    pub nr_queued_task: u64,
+
+    #[stat(desc = "Number of active CPUs when core compaction is enabled")]
+    pub nr_active: u32,
+
+    #[stat(desc = "Number of context switches")]
+    pub nr_sched: u64,
+
+    #[stat(desc = "% of task migration")]
+    pub pc_migration: f64,
+
+    #[stat(desc = "% of task preemption")]
+    pub pc_preemption: f64,
+
+    #[stat(desc = "% of greedy tasks")]
+    pub pc_greedy: f64,
+
+    #[stat(desc = "% of performance-critical tasks")]
+    pub pc_pc: f64,
+
+    #[stat(desc = "% of latency-critical tasks")]
+    pub pc_lc: f64,
+
+    #[stat(desc = "% of tasks scheduled on big cores")]
+    pub pc_big: f64,
+
+    #[stat(desc = "% of performance-critical tasks scheduled on big cores")]
+    pub pc_pc_on_big: f64,
+
+    #[stat(desc = "% of latency-critical tasks scheduled on big cores")]
+    pub pc_lc_on_big: f64,
+
+    #[stat(desc = "Current power mode")]
+    pub power_mode: String,
+
+    #[stat(desc = "% of performance mode")]
+    pub pc_performance: f64,
+
+    #[stat(desc = "% of balanced mode")]
+    pub pc_balanced: f64,
+
+    #[stat(desc = "% of powersave mode")]
+    pub pc_powersave: f64,
+}
+
+impl SysStats {
+    /// Writes the table header row (column titles) to `w`.
+    ///
+    /// The row is wrapped in the ANSI escape sequences `\x1b[93m` and
+    /// `\x1b[0m`, and uses the same column widths as the data rows written
+    /// by `format`.
+    pub fn format_header<W: Write>(w: &mut W) -> Result<()> {
+        writeln!(
+            w,
+            "\x1b[93m| {:8} | {:9} | {:9} | {:9} | {:9} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |\x1b[0m",
+            "MSEQ",
+            "SVC_TIME",
+            "# Q TASK",
+            "# ACT CPU",
+            "# SCHED",
+            "MIGRATE%",
+            "PREEMPT%",
+            "GREEDY%",
+            "PERF-CR%",
+            "LAT-CR%",
+            "BIG%",
+            "PC/BIG%",
+            "LC/BIG%",
+            "POWER MODE",
+            "PERFORMANCE%",
+            "BALANCED%",
+            "POWERSAVE%",
+        )?;
+        Ok(())
+    }
+
+    /// Writes this snapshot to `w` as one table row.
+    ///
+    /// The header is written first whenever `mseq % 10 == 1`, i.e. before
+    /// rows 1, 11, 21, and so on.
+    fn format<W: Write>(&self, w: &mut W) -> Result<()> {
+        if self.mseq % 10 == 1 {
+            Self::format_header(w)?;
+        }
+
+        // Percentage fields are wrapped in `GPoint` for display.
+        // NOTE(review): presumably this gives compact "%g"-style float
+        // output -- confirm against the gpoint crate.
+        writeln!(
+            w,
+            "| {:8} | {:9} | {:9} | {:9} | {:9} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |",
+            self.mseq,
+            self.avg_svc_time,
+            self.nr_queued_task,
+            self.nr_active,
+            self.nr_sched,
+            GPoint(self.pc_migration),
+            GPoint(self.pc_preemption),
+            GPoint(self.pc_greedy),
+            GPoint(self.pc_pc),
+            GPoint(self.pc_lc),
+            GPoint(self.pc_big),
+            GPoint(self.pc_pc_on_big),
+            GPoint(self.pc_lc_on_big),
+            self.power_mode,
+            GPoint(self.pc_performance),
+            GPoint(self.pc_balanced),
+            GPoint(self.pc_powersave),
+        )?;
+        Ok(())
+    }
+
+}

 #[derive(Clone, Debug, Default, Serialize, Deserialize, Stats)]
 pub struct SchedSample {
+    #[stat(desc = "Sequence ID of this message")]
    pub mseq: u64,
+    #[stat(desc = "Process ID")]
    pub pid: i32,
+    #[stat(desc = "Task name")]
    pub comm: String,
+    #[stat(desc = "LR: 'L'atency-critical or 'R'egular, HI: performance-'H'ungry or performance-'I'nsensitive, BT: 'B'ig or li'T'tle, EG: 'E'ligigle or 'G'reedy, PN: 'P'reempting or 'N'ot")]
    pub stat: String,
+    #[stat(desc = "CPU id where this task is scheduled on")]
    pub cpu_id: u32,
+    #[stat(desc = "Victim CPU to be preempted out (-1 = no preemption)")]
    pub victim_cpu: i32,
+    #[stat(desc = "Assigned virtual deadline")]
    pub vdeadline_delta_ns: u64,
+    #[stat(desc = "Assigned time slice")]
    pub slice_ns: u64,
+    #[stat(desc = "How greedy this task is in using CPU time (1000 = fair)")]
    pub greedy_ratio: u32,
+    #[stat(desc = "Latency criticality of this task")]
    pub lat_cri: u32,
+    #[stat(desc = "Average latency criticality in a system")]
    pub avg_lat_cri: u32,
+    #[stat(desc = "Static priority (20 == nice 0)")]
    pub static_prio: u16,
+    #[stat(desc = "Slice boost factor (number of consecutive full slice exhaustions)")]
    pub slice_boost_prio: u16,
+    #[stat(desc = "How often this task is scheduled per second")]
    pub run_freq: u64,
+    #[stat(desc = "Average runtime per schedule")]
    pub run_time_ns: u64,
+    #[stat(desc = "How frequently this task waits for other tasks")]
    pub wait_freq: u64,
+    #[stat(desc = "How frequently this task wakes other tasks")]
    pub wake_freq: u64,
+    #[stat(desc = "Performance criticality of this task")]
    pub perf_cri: u32,
+    #[stat(desc = "Average performance criticality in a system")]
    pub avg_perf_cri: u32,
+    #[stat(desc = "Target performance level of this CPU")]
    pub cpuperf_cur: u32,
+    #[stat(desc = "CPU utilization of this particular CPU")]
    pub cpu_util: u64,
+    #[stat(desc = "Number of active CPUs when core compaction is enabled")]
    pub nr_active: u32,
 }

@ -42,42 +177,42 @@ impl SchedSample {
    pub fn format_header<W: Write>(w: &mut W) -> Result<()> {
        writeln!(
            w,
-            "| {:6} | {:7} | {:17} \
+            "\x1b[93m| {:6} | {:7} | {:17} \
                   | {:5} | {:4} | {:4} \
                   | {:14} | {:8} | {:7} \
                   | {:8} | {:7} | {:8} \
                   | {:7} | {:9} | {:9} \
                   | {:9} | {:9} | {:8} \
                   | {:8} | {:8} | {:8} \
-                   | {:6} |",
-            "mseq",
-            "pid",
-            "comm",
-            "stat",
-            "cpu",
-            "vtmc",
-            "vddln_ns",
-            "slc_ns",
-            "grdy_rt",
-            "lat_cri",
-            "avg_lc",
-            "st_prio",
-            "slc_bst",
-            "run_freq",
-            "run_tm_ns",
-            "wait_freq",
-            "wake_freq",
-            "perf_cri",
-            "avg_pc",
-            "cpufreq",
-            "cpu_util",
-            "nr_act",
+                   | {:6} |\x1b[0m",
+            "MSEQ",
+            "PID",
+            "COMM",
+            "STAT",
+            "CPU",
+            "VTMC",
+            "VDDLN_NS",
+            "SLC_NS",
+            "GRDY_RT",
+            "LAT_CRI",
+            "AVG_LC",
+            "ST_PRIO",
+            "SLC_BST",
+            "RUN_FREQ",
+            "RUN_TM_NS",
+            "WAIT_FREQ",
+            "WAKE_FREQ",
+            "PERF_CRI",
+            "AVG_PC",
+            "CPUFREQ",
+            "CPU_UTIL",
+            "NR_ACT",
        )?;
        Ok(())
    }

    pub fn format<W: Write>(&self, w: &mut W) -> Result<()> {
-        if self.mseq % 32 == 1 {
+        if self.mseq % 10 == 1 {
            Self::format_header(w)?;
        }

@ -126,6 +261,9 @@ pub struct SchedSamples {
 #[derive(Debug)]
 pub enum StatsReq {
    NewSampler(ThreadId),
+    SysStatsReq {
+        tid: ThreadId,
+    },
    SchedSamplesNr {
        tid: ThreadId,
        nr_samples: u64,
@ -134,7 +272,15 @@ pub enum StatsReq {
 }

 impl StatsReq {
-    fn from_args(
+    /// Builds a `SysStatsReq` carrying the requesting thread's id `tid`.
+    ///
+    /// Always succeeds; the `Result` wrapper matches the sibling
+    /// `from_args_samples` constructor.
+    fn from_args_stats(tid: ThreadId) -> Result<Self> {
+        let req = Self::SysStatsReq { tid };
+        Ok(req)
+    }
+
+    fn from_args_samples(
        tid: ThreadId,
        nr_cpus_onln: u64,
        args: &BTreeMap<String, String>,
@ -164,12 +310,36 @@ impl StatsReq {
 pub enum StatsRes {
    Ack,
    Bye,
+    SysStats(SysStats),
    SchedSamples(SchedSamples),
 }

 pub fn server_data(nr_cpus_onln: u64) -> StatsServerData<StatsReq, StatsRes> {
-    let samples_open: Box<dyn StatsOpener<StatsReq, StatsRes>> =
-        Box::new(move |(req_ch, res_ch)| {
+    let open: Box<dyn StatsOpener<StatsReq, StatsRes>> = Box::new(move |(req_ch, res_ch)| {
+            let tid = std::thread::current().id();
+            req_ch.send(StatsReq::NewSampler(tid))?;
+            match res_ch.recv()? {
+                StatsRes::Ack => {}
+                res => bail!("invalid response: {:?}", &res),
+            }
+
+            let read: Box<dyn StatsReader<StatsReq, StatsRes>> =
+                Box::new(move |_args, (req_ch, res_ch)| {
+                    let req = StatsReq::from_args_stats(tid)?;
+                    req_ch.send(req)?;
+
+                    let stats = match res_ch.recv()? {
+                        StatsRes::SysStats(v) => v,
+                        StatsRes::Bye => bail!("preempted by another sampler"),
+                        res => bail!("invalid response: {:?}", &res),
+                    };
+
+                    stats.to_json()
+                });
+            Ok(read)
+        });
+
+    let samples_open: Box<dyn StatsOpener<StatsReq, StatsRes>> = Box::new(move |(req_ch, res_ch)| {
            let tid = std::thread::current().id();
            req_ch.send(StatsReq::NewSampler(tid))?;
            match res_ch.recv()? {
@ -179,7 +349,7 @@ pub fn server_data(nr_cpus_onln: u64) -> StatsServerData<StatsReq, StatsRes> {

            let read: Box<dyn StatsReader<StatsReq, StatsRes>> =
                Box::new(move |args, (req_ch, res_ch)| {
-                    let req = StatsReq::from_args(tid, nr_cpus_onln, args)?;
+                    let req = StatsReq::from_args_samples(tid, nr_cpus_onln, args)?;
                    req_ch.send(req)?;

                    let samples = match res_ch.recv()? {
@ -194,6 +364,14 @@ pub fn server_data(nr_cpus_onln: u64) -> StatsServerData<StatsReq, StatsRes> {
        });

    StatsServerData::new()
+        .add_meta(SysStats::meta())
+        .add_ops(
+            "top",
+            StatsOps {
+                open: open,
+                close: None,
+            },
+        )
        .add_meta(SchedSample::meta())
        .add_ops(
            "sched_samples",
@ -205,13 +383,6 @@ pub fn server_data(nr_cpus_onln: u64) -> StatsServerData<StatsReq, StatsRes> {
 }

 pub fn monitor_sched_samples(nr_samples: u64, shutdown: Arc<AtomicBool>) -> Result<()> {
-    println!("## stats");
-    println!("  LR: 'L'atency-critical or 'R'egular");
-    println!("  HI: performance-'H'ungry or performance-'I'nsensitive");
-    println!("  BT: 'B'ig or li'T'tle");
-    println!("  EG: 'E'ligigle or 'G'reedy");
-    println!("  PN: 'P'reempting or 'N'ot");
-
    scx_utils::monitor_stats::<SchedSamples>(
        &vec![
            ("target".into(), "sched_samples".into()),
@ -228,3 +399,13 @@ pub fn monitor_sched_samples(nr_samples: u64, shutdown: Arc<AtomicBool>) -> Resu
        },
    )
 }
+
+/// Runs the system-statistics monitor loop.
+///
+/// Calls `scx_utils::monitor_stats` for `SysStats` with no extra request
+/// arguments, the interval `intv`, a closure that reads `shutdown`, and a
+/// callback that prints each received snapshot to stdout.
+///
+/// # Errors
+///
+/// Propagates the result of `scx_utils::monitor_stats` instead of
+/// discarding it, matching `monitor_sched_samples`, so the caller's error
+/// handling actually sees failures.
+pub fn monitor(intv: Duration, shutdown: Arc<AtomicBool>) -> Result<()> {
+    scx_utils::monitor_stats::<SysStats>(
+        &vec![],
+        intv,
+        || shutdown.load(Ordering::Relaxed),
+        |sysstats| sysstats.format(&mut std::io::stdout()),
+    )
+}