From 7f9548eb3421afcea194e9efc7806f820893b8ae Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Wed, 24 Jan 2024 11:43:37 -0800 Subject: [PATCH] scx_layered: Add support for OpenMetrics format Currently scx_layered outputs statistics periodically as info! logs. The format of this is largely unstructured and mostly suitable for running scx_layered interactively (e.g. observing its behavior on the command line or via logs after the fact). In order to run scx_layered at larger scale, it's desirable to have statistics output in some format that is amenable to being ingested into monitoring databases (e.g. Prometheus). This allows collection of stats across many machines. This commit adds a command line flag (-o) that outputs statistics to stdout in OpenMetrics format instead of the normal log mechanism. OpenMetrics has a public format specification (https://github.com/OpenObservability/OpenMetrics) and is in use by many projects. The library for producing OpenMetrics metrics is lightweight but does induce some changes. Primarily, metrics need to be pre-registered (see OpenMetricsStats::new()). 
Without -o, the output looks as before, for example: ``` 19:39:54 [INFO] CPUs: online/possible=52/52 nr_cores=26 19:39:54 [INFO] Layered Scheduler Attached 19:39:56 [INFO] tot= 9912 local=76.71 open_idle= 0.00 affn_viol= 2.63 tctx_err=0 proc=21ms 19:39:56 [INFO] busy= 1.3 util= 65.2 load= 263.4 fallback_cpu= 1 19:39:56 [INFO] batch : util/frac= 49.7/ 76.3 load/frac= 252.0: 95.7 tasks= 458 19:39:56 [INFO] tot= 2842 local=45.04 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00 19:39:56 [INFO] cpus= 2 [ 0, 2] 04000001 00000000 19:39:56 [INFO] immediate: util/frac= 0.0/ 0.0 load/frac= 0.0: 0.0 tasks= 0 19:39:56 [INFO] tot= 0 local= 0.00 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00 19:39:56 [INFO] cpus= 50 [ 0, 50] fbfffffe 000fffff 19:39:56 [INFO] normal : util/frac= 15.4/ 23.7 load/frac= 11.4: 4.3 tasks= 556 19:39:56 [INFO] tot= 7070 local=89.43 open_idle= 0.00 preempt= 0.00 affn_viol= 3.69 19:39:56 [INFO] cpus= 50 [ 0, 50] fbfffffe 000fffff 19:39:58 [INFO] tot= 7091 local=84.91 open_idle= 0.00 affn_viol= 2.64 tctx_err=0 proc=21ms 19:39:58 [INFO] busy= 0.6 util= 31.2 load= 107.1 fallback_cpu= 1 19:39:58 [INFO] batch : util/frac= 18.3/ 58.5 load/frac= 93.9: 87.7 tasks= 589 19:39:58 [INFO] tot= 2011 local=60.67 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00 19:39:58 [INFO] cpus= 2 [ 2, 2] 04000001 00000000 19:39:58 [INFO] immediate: util/frac= 0.0/ 0.0 load/frac= 0.0: 0.0 tasks= 0 19:39:58 [INFO] tot= 0 local= 0.00 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00 19:39:58 [INFO] cpus= 50 [ 50, 50] fbfffffe 000fffff 19:39:58 [INFO] normal : util/frac= 13.0/ 41.5 load/frac= 13.2: 12.3 tasks= 650 19:39:58 [INFO] tot= 5080 local=94.51 open_idle= 0.00 preempt= 0.00 affn_viol= 3.68 19:39:58 [INFO] cpus= 50 [ 50, 50] fbfffffe 000fffff ^C19:39:59 [INFO] EXIT: BPF scheduler unregistered ``` With -o passed, the output is in OpenMetrics format: ``` 19:40:08 [INFO] CPUs: online/possible=52/52 nr_cores=26 19:40:08 [INFO] Layered Scheduler Attached # HELP total Total scheduling events in 
the period. # TYPE total gauge total 8489 # HELP local % that got scheduled directly into an idle CPU. # TYPE local gauge local 86.45305689716104 # HELP open_idle % of open layer tasks scheduled into occupied idle CPUs. # TYPE open_idle gauge open_idle 0.0 # HELP affn_viol % which violated configured policies due to CPU affinity restrictions. # TYPE affn_viol gauge affn_viol 2.332430203793144 # HELP tctx_err Failures to free task contexts. # TYPE tctx_err gauge tctx_err 0 # HELP proc_ms CPU time this binary has consumed during the period. # TYPE proc_ms gauge proc_ms 20 # HELP busy CPU busy % (100% means all CPUs were fully occupied). # TYPE busy gauge busy 0.5294061026085283 # HELP util CPU utilization % (100% means one CPU was fully occupied). # TYPE util gauge util 27.37195512782239 # HELP load Sum of weight * duty_cycle for all tasks. # TYPE load gauge load 81.55024768702126 # HELP layer_util CPU utilization of the layer (100% means one CPU was fully occupied). # TYPE layer_util gauge layer_util{layer_name="immediate"} 0.0 layer_util{layer_name="normal"} 19.340849995024997 layer_util{layer_name="batch"} 8.031105132797393 # HELP layer_util_frac Fraction of total CPU utilization consumed by the layer. # TYPE layer_util_frac gauge layer_util_frac{layer_name="batch"} 29.34063385422595 layer_util_frac{layer_name="immediate"} 0.0 layer_util_frac{layer_name="normal"} 70.65936614577405 # HELP layer_load Sum of weight * duty_cycle for tasks in the layer. # TYPE layer_load gauge layer_load{layer_name="immediate"} 0.0 layer_load{layer_name="normal"} 11.14363313258934 layer_load{layer_name="batch"} 70.40661455443191 # HELP layer_load_frac Fraction of total load consumed by the layer. # TYPE layer_load_frac gauge layer_load_frac{layer_name="normal"} 13.664744680306903 layer_load_frac{layer_name="immediate"} 0.0 layer_load_frac{layer_name="batch"} 86.33525531969309 # HELP layer_tasks Number of tasks in the layer. 
# TYPE layer_tasks gauge layer_tasks{layer_name="immediate"} 0 layer_tasks{layer_name="normal"} 490 layer_tasks{layer_name="batch"} 343 # HELP layer_total Number of scheduling events in the layer. # TYPE layer_total gauge layer_total{layer_name="normal"} 6711 layer_total{layer_name="batch"} 1778 layer_total{layer_name="immediate"} 0 # HELP layer_local % of scheduling events directly into an idle CPU. # TYPE layer_local gauge layer_local{layer_name="batch"} 69.79752530933632 layer_local{layer_name="immediate"} 0.0 layer_local{layer_name="normal"} 90.86574281031143 # HELP layer_open_idle % of scheduling events into idle CPUs occupied by other layers. # TYPE layer_open_idle gauge layer_open_idle{layer_name="immediate"} 0.0 layer_open_idle{layer_name="batch"} 0.0 layer_open_idle{layer_name="normal"} 0.0 # HELP layer_preempt % of scheduling events that preempted other tasks. # # TYPE layer_preempt gauge layer_preempt{layer_name="normal"} 0.0 layer_preempt{layer_name="batch"} 0.0 layer_preempt{layer_name="immediate"} 0.0 # HELP layer_affn_viol % of scheduling events that violated configured policies due to CPU affinity restrictions. # TYPE layer_affn_viol gauge layer_affn_viol{layer_name="normal"} 2.950379973178364 layer_affn_viol{layer_name="batch"} 0.0 layer_affn_viol{layer_name="immediate"} 0.0 # HELP layer_cur_nr_cpus Current # of CPUs assigned to the layer. # TYPE layer_cur_nr_cpus gauge layer_cur_nr_cpus{layer_name="normal"} 50 layer_cur_nr_cpus{layer_name="batch"} 2 layer_cur_nr_cpus{layer_name="immediate"} 50 # HELP layer_min_nr_cpus Minimum # of CPUs assigned to the layer. # TYPE layer_min_nr_cpus gauge layer_min_nr_cpus{layer_name="normal"} 0 layer_min_nr_cpus{layer_name="batch"} 0 layer_min_nr_cpus{layer_name="immediate"} 0 # HELP layer_max_nr_cpus Maximum # of CPUs assigned to the layer. 
# TYPE layer_max_nr_cpus gauge layer_max_nr_cpus{layer_name="immediate"} 50 layer_max_nr_cpus{layer_name="normal"} 50 layer_max_nr_cpus{layer_name="batch"} 2 # EOF ^C19:40:11 [INFO] EXIT: BPF scheduler unregistered ``` Signed-off-by: Dan Schatzberg --- scheds/rust/scx_layered/Cargo.toml | 1 + scheds/rust/scx_layered/src/main.rs | 261 +++++++++++++++++++++++----- 2 files changed, 221 insertions(+), 41 deletions(-) diff --git a/scheds/rust/scx_layered/Cargo.toml b/scheds/rust/scx_layered/Cargo.toml index 007a017..ca6423b 100644 --- a/scheds/rust/scx_layered/Cargo.toml +++ b/scheds/rust/scx_layered/Cargo.toml @@ -16,6 +16,7 @@ lazy_static = "1.4" libbpf-rs = "0.22" libc = "0.2" log = "0.4" +prometheus-client = "0.22.0" scx_utils = { path = "../../../rust/scx_utils", version = "0.5" } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" diff --git a/scheds/rust/scx_layered/src/main.rs b/scheds/rust/scx_layered/src/main.rs index 5b53742..f2e536f 100644 --- a/scheds/rust/scx_layered/src/main.rs +++ b/scheds/rust/scx_layered/src/main.rs @@ -15,6 +15,8 @@ use std::io::Read; use std::io::Write; use std::ops::Sub; use std::sync::atomic::AtomicBool; +use std::sync::atomic::AtomicI64; +use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; use std::time::Duration; @@ -33,6 +35,10 @@ use libbpf_rs::skel::SkelBuilder as _; use log::debug; use log::info; use log::trace; +use prometheus_client::encoding::text::encode; +use prometheus_client::metrics::family::Family; +use prometheus_client::metrics::gauge::Gauge; +use prometheus_client::registry::Registry; use scx_utils::ravg::ravg_read; use serde::Deserialize; use serde::Serialize; @@ -252,6 +258,12 @@ struct Opts { #[clap(short = 'v', long, action = clap::ArgAction::Count)] verbose: u8, + /// Enable output of stats in OpenMetrics format instead of via log macros. + /// This option is useful if you want to collect stats in some monitoring + /// database like prometheseus. 
+ #[clap(short = 'o', long)] + open_metrics_format: bool, + /// Write example layer specifications into the file and exit. #[clap(short = 'e', long)] example: Option, @@ -1099,6 +1111,102 @@ impl Layer { } } +#[derive(Default)] +struct OpenMetricsStats { + registry: Registry, + total: Gauge, + local: Gauge, + open_idle: Gauge, + affn_viol: Gauge, + tctx_err: Gauge, + proc_ms: Gauge, + busy: Gauge, + util: Gauge, + load: Gauge, + l_util: Family, Gauge>, + l_util_frac: Family, Gauge>, + l_load: Family, Gauge>, + l_load_frac: Family, Gauge>, + l_tasks: Family, Gauge>, + l_total: Family, Gauge>, + l_local: Family, Gauge>, + l_open_idle: Family, Gauge>, + l_preempt: Family, Gauge>, + l_affn_viol: Family, Gauge>, + l_cur_nr_cpus: Family, Gauge>, + l_min_nr_cpus: Family, Gauge>, + l_max_nr_cpus: Family, Gauge>, +} + +impl OpenMetricsStats { + fn new() -> OpenMetricsStats { + let mut metrics = OpenMetricsStats { + registry: ::default(), + ..Default::default() + }; + // Helper macro to reduce on some of the boilerplate: + // $i: The identifier of the metric to register + // $help: The Help text associated with the metric + macro_rules! 
register { + ($i:ident, $help:expr) => { + metrics + .registry + .register(stringify!($i), $help, metrics.$i.clone()) + }; + } + register!(total, "Total scheduling events in the period"); + register!(local, "% that got scheduled directly into an idle CPU"); + register!( + open_idle, + "% of open layer tasks scheduled into occupied idle CPUs" + ); + register!( + affn_viol, + "% which violated configured policies due to CPU affinity restrictions" + ); + register!(tctx_err, "Failures to free task contexts"); + register!( + proc_ms, + "CPU time this binary has consumed during the period" + ); + register!(busy, "CPU busy % (100% means all CPUs were fully occupied)"); + register!( + util, + "CPU utilization % (100% means one CPU was fully occupied)" + ); + register!(load, "Sum of weight * duty_cycle for all tasks"); + register!( + l_util, + "CPU utilization of the layer (100% means one CPU was fully occupied)" + ); + register!( + l_util_frac, + "Fraction of total CPU utilization consumed by the layer" + ); + register!(l_load, "Sum of weight * duty_cycle for tasks in the layer"); + register!(l_load_frac, "Fraction of total load consumed by the layer"); + register!(l_tasks, "Number of tasks in the layer"); + register!(l_total, "Number of scheduling events in the layer"); + register!(l_local, "% of scheduling events directly into an idle CPU"); + register!( + l_open_idle, + "% of scheduling events into idle CPUs occupied by other layers" + ); + register!( + l_preempt, + "% of scheduling events that preempted other tasks" + ); + register!( + l_affn_viol, + "% of scheduling events that violated configured policies due to CPU affinity restrictions" + ); + register!(l_cur_nr_cpus, "Current # of CPUs assigned to the layer"); + register!(l_min_nr_cpus, "Minimum # of CPUs assigned to the layer"); + register!(l_max_nr_cpus, "Maximum # of CPUs assigned to the layer"); + metrics + } +} + struct Scheduler<'a> { skel: BpfSkel<'a>, struct_ops: Option, @@ -1118,6 +1226,9 @@ struct 
Scheduler<'a> { nr_layer_cpus_min_max: Vec<(usize, usize)>, processing_dur: Duration, prev_processing_dur: Duration, + + om_stats: OpenMetricsStats, + om_format: bool, } impl<'a> Scheduler<'a> { @@ -1224,6 +1335,9 @@ impl<'a> Scheduler<'a> { proc_reader, skel, + + om_stats: OpenMetricsStats::new(), + om_format: opts.open_metrics_format, }) } @@ -1333,24 +1447,44 @@ impl<'a> Scheduler<'a> { } }; - info!( - "tot={:7} local={:5.2} open_idle={:5.2} affn_viol={:5.2} tctx_err={} proc={:?}ms", - total, - lsum_pct(bpf_intf::layer_stat_idx_LSTAT_LOCAL), - lsum_pct(bpf_intf::layer_stat_idx_LSTAT_OPEN_IDLE), - lsum_pct(bpf_intf::layer_stat_idx_LSTAT_AFFN_VIOL), + self.om_stats.total.set(total as i64); + self.om_stats + .local + .set(lsum_pct(bpf_intf::layer_stat_idx_LSTAT_LOCAL)); + self.om_stats + .open_idle + .set(lsum_pct(bpf_intf::layer_stat_idx_LSTAT_OPEN_IDLE)); + self.om_stats + .affn_viol + .set(lsum_pct(bpf_intf::layer_stat_idx_LSTAT_AFFN_VIOL)); + self.om_stats.tctx_err.set( stats.prev_bpf_stats.gstats - [bpf_intf::global_stat_idx_GSTAT_TASK_CTX_FREE_FAILED as usize], - processing_dur.as_millis(), + [bpf_intf::global_stat_idx_GSTAT_TASK_CTX_FREE_FAILED as usize] as i64, ); + self.om_stats.proc_ms.set(processing_dur.as_millis() as i64); + self.om_stats.busy.set(stats.cpu_busy * 100.0); + self.om_stats.util.set(stats.total_util * 100.0); + self.om_stats.load.set(stats.total_load); - info!( - "busy={:5.1} util={:7.1} load={:9.1} fallback_cpu={:3}", - stats.cpu_busy * 100.0, - stats.total_util * 100.0, - stats.total_load, - self.cpu_pool.fallback_cpu, - ); + if !self.om_format { + info!( + "tot={:7} local={:5.2} open_idle={:5.2} affn_viol={:5.2} tctx_err={} proc={:?}ms", + self.om_stats.total.get(), + self.om_stats.local.get(), + self.om_stats.open_idle.get(), + self.om_stats.affn_viol.get(), + self.om_stats.tctx_err.get(), + self.om_stats.proc_ms.get(), + ); + + info!( + "busy={:5.1} util={:7.1} load={:9.1} fallback_cpu={:3}", + self.om_stats.busy.get(), + 
self.om_stats.util.get(), + self.om_stats.load.get(), + self.cpu_pool.fallback_cpu, + ); + } let header_width = self .layer_specs @@ -1376,38 +1510,83 @@ impl<'a> Scheduler<'a> { } }; - info!( - " {: {{ + let l = self + .om_stats + .$i + .get_or_create(&vec![("layer_name".to_owned(), spec.name.clone())]); + l.set($e); + l + }}; + } + let l_util = set!(l_util, stats.layer_utils[lidx] * 100.0); + let l_util_frac = set!( + l_util_frac, + calc_frac(stats.layer_utils[lidx], stats.total_util) ); - info!( - " {: