From 7f9548eb3421afcea194e9efc7806f820893b8ae Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Wed, 24 Jan 2024 11:43:37 -0800 Subject: [PATCH] scx_layered: Add support for OpenMetrics format Currently scx_layered outputs statistics periodically as info! logs. The format of this is largely unstructured and mostly suitable for running scx_layered interactively (e.g. observing its behavior on the command line or via logs after the fact). In order to run scx_layered at larger scale, it's desirable to have statistics output in some format that is amenable to being ingested into monitoring databases (e.g. Prometheus). This allows collection of stats across many machines. This commit adds a command line flag (-o) that outputs statistics to stdout in OpenMetrics format instead of the normal log mechanism. OpenMetrics has a public format specification (https://github.com/OpenObservability/OpenMetrics) and is in use by many projects. The library for producing OpenMetrics metrics is lightweight but does induce some changes. Primarily, metrics need to be pre-registered (see OpenMetricsStats::new()). 
Without -o, the output looks as before, for example: ``` 19:39:54 [INFO] CPUs: online/possible=52/52 nr_cores=26 19:39:54 [INFO] Layered Scheduler Attached 19:39:56 [INFO] tot= 9912 local=76.71 open_idle= 0.00 affn_viol= 2.63 tctx_err=0 proc=21ms 19:39:56 [INFO] busy= 1.3 util= 65.2 load= 263.4 fallback_cpu= 1 19:39:56 [INFO] batch : util/frac= 49.7/ 76.3 load/frac= 252.0: 95.7 tasks= 458 19:39:56 [INFO] tot= 2842 local=45.04 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00 19:39:56 [INFO] cpus= 2 [ 0, 2] 04000001 00000000 19:39:56 [INFO] immediate: util/frac= 0.0/ 0.0 load/frac= 0.0: 0.0 tasks= 0 19:39:56 [INFO] tot= 0 local= 0.00 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00 19:39:56 [INFO] cpus= 50 [ 0, 50] fbfffffe 000fffff 19:39:56 [INFO] normal : util/frac= 15.4/ 23.7 load/frac= 11.4: 4.3 tasks= 556 19:39:56 [INFO] tot= 7070 local=89.43 open_idle= 0.00 preempt= 0.00 affn_viol= 3.69 19:39:56 [INFO] cpus= 50 [ 0, 50] fbfffffe 000fffff 19:39:58 [INFO] tot= 7091 local=84.91 open_idle= 0.00 affn_viol= 2.64 tctx_err=0 proc=21ms 19:39:58 [INFO] busy= 0.6 util= 31.2 load= 107.1 fallback_cpu= 1 19:39:58 [INFO] batch : util/frac= 18.3/ 58.5 load/frac= 93.9: 87.7 tasks= 589 19:39:58 [INFO] tot= 2011 local=60.67 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00 19:39:58 [INFO] cpus= 2 [ 2, 2] 04000001 00000000 19:39:58 [INFO] immediate: util/frac= 0.0/ 0.0 load/frac= 0.0: 0.0 tasks= 0 19:39:58 [INFO] tot= 0 local= 0.00 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00 19:39:58 [INFO] cpus= 50 [ 50, 50] fbfffffe 000fffff 19:39:58 [INFO] normal : util/frac= 13.0/ 41.5 load/frac= 13.2: 12.3 tasks= 650 19:39:58 [INFO] tot= 5080 local=94.51 open_idle= 0.00 preempt= 0.00 affn_viol= 3.68 19:39:58 [INFO] cpus= 50 [ 50, 50] fbfffffe 000fffff ^C19:39:59 [INFO] EXIT: BPF scheduler unregistered ``` With -o passed, the output is in OpenMetrics format: ``` 19:40:08 [INFO] CPUs: online/possible=52/52 nr_cores=26 19:40:08 [INFO] Layered Scheduler Attached # HELP total Total scheduling events in 
the period. # TYPE total gauge total 8489 # HELP local % that got scheduled directly into an idle CPU. # TYPE local gauge local 86.45305689716104 # HELP open_idle % of open layer tasks scheduled into occupied idle CPUs. # TYPE open_idle gauge open_idle 0.0 # HELP affn_viol % which violated configured policies due to CPU affinity restrictions. # TYPE affn_viol gauge affn_viol 2.332430203793144 # HELP tctx_err Failures to free task contexts. # TYPE tctx_err gauge tctx_err 0 # HELP proc_ms CPU time this binary has consumed during the period. # TYPE proc_ms gauge proc_ms 20 # HELP busy CPU busy % (100% means all CPUs were fully occupied). # TYPE busy gauge busy 0.5294061026085283 # HELP util CPU utilization % (100% means one CPU was fully occupied). # TYPE util gauge util 27.37195512782239 # HELP load Sum of weight * duty_cycle for all tasks. # TYPE load gauge load 81.55024768702126 # HELP layer_util CPU utilization of the layer (100% means one CPU was fully occupied). # TYPE layer_util gauge layer_util{layer_name="immediate"} 0.0 layer_util{layer_name="normal"} 19.340849995024997 layer_util{layer_name="batch"} 8.031105132797393 # HELP layer_util_frac Fraction of total CPU utilization consumed by the layer. # TYPE layer_util_frac gauge layer_util_frac{layer_name="batch"} 29.34063385422595 layer_util_frac{layer_name="immediate"} 0.0 layer_util_frac{layer_name="normal"} 70.65936614577405 # HELP layer_load Sum of weight * duty_cycle for tasks in the layer. # TYPE layer_load gauge layer_load{layer_name="immediate"} 0.0 layer_load{layer_name="normal"} 11.14363313258934 layer_load{layer_name="batch"} 70.40661455443191 # HELP layer_load_frac Fraction of total load consumed by the layer. # TYPE layer_load_frac gauge layer_load_frac{layer_name="normal"} 13.664744680306903 layer_load_frac{layer_name="immediate"} 0.0 layer_load_frac{layer_name="batch"} 86.33525531969309 # HELP layer_tasks Number of tasks in the layer. 
# TYPE layer_tasks gauge layer_tasks{layer_name="immediate"} 0 layer_tasks{layer_name="normal"} 490 layer_tasks{layer_name="batch"} 343 # HELP layer_total Number of scheduling events in the layer. # TYPE layer_total gauge layer_total{layer_name="normal"} 6711 layer_total{layer_name="batch"} 1778 layer_total{layer_name="immediate"} 0 # HELP layer_local % of scheduling events directly into an idle CPU. # TYPE layer_local gauge layer_local{layer_name="batch"} 69.79752530933632 layer_local{layer_name="immediate"} 0.0 layer_local{layer_name="normal"} 90.86574281031143 # HELP layer_open_idle % of scheduling events into idle CPUs occupied by other layers. # TYPE layer_open_idle gauge layer_open_idle{layer_name="immediate"} 0.0 layer_open_idle{layer_name="batch"} 0.0 layer_open_idle{layer_name="normal"} 0.0 # HELP layer_preempt % of scheduling events that preempted other tasks. # # TYPE layer_preempt gauge layer_preempt{layer_name="normal"} 0.0 layer_preempt{layer_name="batch"} 0.0 layer_preempt{layer_name="immediate"} 0.0 # HELP layer_affn_viol % of scheduling events that violated configured policies due to CPU affinity restrictions. # TYPE layer_affn_viol gauge layer_affn_viol{layer_name="normal"} 2.950379973178364 layer_affn_viol{layer_name="batch"} 0.0 layer_affn_viol{layer_name="immediate"} 0.0 # HELP layer_cur_nr_cpus Current # of CPUs assigned to the layer. # TYPE layer_cur_nr_cpus gauge layer_cur_nr_cpus{layer_name="normal"} 50 layer_cur_nr_cpus{layer_name="batch"} 2 layer_cur_nr_cpus{layer_name="immediate"} 50 # HELP layer_min_nr_cpus Minimum # of CPUs assigned to the layer. # TYPE layer_min_nr_cpus gauge layer_min_nr_cpus{layer_name="normal"} 0 layer_min_nr_cpus{layer_name="batch"} 0 layer_min_nr_cpus{layer_name="immediate"} 0 # HELP layer_max_nr_cpus Maximum # of CPUs assigned to the layer. 
# TYPE layer_max_nr_cpus gauge layer_max_nr_cpus{layer_name="immediate"} 50 layer_max_nr_cpus{layer_name="normal"} 50 layer_max_nr_cpus{layer_name="batch"} 2 # EOF ^C19:40:11 [INFO] EXIT: BPF scheduler unregistered ``` Signed-off-by: Dan Schatzberg --- scheds/rust/scx_layered/Cargo.toml | 1 + scheds/rust/scx_layered/src/main.rs | 261 +++++++++++++++++++++++----- 2 files changed, 221 insertions(+), 41 deletions(-) diff --git a/scheds/rust/scx_layered/Cargo.toml b/scheds/rust/scx_layered/Cargo.toml index 007a017..ca6423b 100644 --- a/scheds/rust/scx_layered/Cargo.toml +++ b/scheds/rust/scx_layered/Cargo.toml @@ -16,6 +16,7 @@ lazy_static = "1.4" libbpf-rs = "0.22" libc = "0.2" log = "0.4" +prometheus-client = "0.22.0" scx_utils = { path = "../../../rust/scx_utils", version = "0.5" } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" diff --git a/scheds/rust/scx_layered/src/main.rs b/scheds/rust/scx_layered/src/main.rs index 5b53742..f2e536f 100644 --- a/scheds/rust/scx_layered/src/main.rs +++ b/scheds/rust/scx_layered/src/main.rs @@ -15,6 +15,8 @@ use std::io::Read; use std::io::Write; use std::ops::Sub; use std::sync::atomic::AtomicBool; +use std::sync::atomic::AtomicI64; +use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; use std::time::Duration; @@ -33,6 +35,10 @@ use libbpf_rs::skel::SkelBuilder as _; use log::debug; use log::info; use log::trace; +use prometheus_client::encoding::text::encode; +use prometheus_client::metrics::family::Family; +use prometheus_client::metrics::gauge::Gauge; +use prometheus_client::registry::Registry; use scx_utils::ravg::ravg_read; use serde::Deserialize; use serde::Serialize; @@ -252,6 +258,12 @@ struct Opts { #[clap(short = 'v', long, action = clap::ArgAction::Count)] verbose: u8, + /// Enable output of stats in OpenMetrics format instead of via log macros. + /// This option is useful if you want to collect stats in some monitoring + /// database like prometheseus. 
+ #[clap(short = 'o', long)] + open_metrics_format: bool, + /// Write example layer specifications into the file and exit. #[clap(short = 'e', long)] example: Option, @@ -1099,6 +1111,102 @@ impl Layer { } } +#[derive(Default)] +struct OpenMetricsStats { + registry: Registry, + total: Gauge, + local: Gauge, + open_idle: Gauge, + affn_viol: Gauge, + tctx_err: Gauge, + proc_ms: Gauge, + busy: Gauge, + util: Gauge, + load: Gauge, + l_util: Family, Gauge>, + l_util_frac: Family, Gauge>, + l_load: Family, Gauge>, + l_load_frac: Family, Gauge>, + l_tasks: Family, Gauge>, + l_total: Family, Gauge>, + l_local: Family, Gauge>, + l_open_idle: Family, Gauge>, + l_preempt: Family, Gauge>, + l_affn_viol: Family, Gauge>, + l_cur_nr_cpus: Family, Gauge>, + l_min_nr_cpus: Family, Gauge>, + l_max_nr_cpus: Family, Gauge>, +} + +impl OpenMetricsStats { + fn new() -> OpenMetricsStats { + let mut metrics = OpenMetricsStats { + registry: ::default(), + ..Default::default() + }; + // Helper macro to reduce on some of the boilerplate: + // $i: The identifier of the metric to register + // $help: The Help text associated with the metric + macro_rules! 
register { + ($i:ident, $help:expr) => { + metrics + .registry + .register(stringify!($i), $help, metrics.$i.clone()) + }; + } + register!(total, "Total scheduling events in the period"); + register!(local, "% that got scheduled directly into an idle CPU"); + register!( + open_idle, + "% of open layer tasks scheduled into occupied idle CPUs" + ); + register!( + affn_viol, + "% which violated configured policies due to CPU affinity restrictions" + ); + register!(tctx_err, "Failures to free task contexts"); + register!( + proc_ms, + "CPU time this binary has consumed during the period" + ); + register!(busy, "CPU busy % (100% means all CPUs were fully occupied)"); + register!( + util, + "CPU utilization % (100% means one CPU was fully occupied)" + ); + register!(load, "Sum of weight * duty_cycle for all tasks"); + register!( + l_util, + "CPU utilization of the layer (100% means one CPU was fully occupied)" + ); + register!( + l_util_frac, + "Fraction of total CPU utilization consumed by the layer" + ); + register!(l_load, "Sum of weight * duty_cycle for tasks in the layer"); + register!(l_load_frac, "Fraction of total load consumed by the layer"); + register!(l_tasks, "Number of tasks in the layer"); + register!(l_total, "Number of scheduling events in the layer"); + register!(l_local, "% of scheduling events directly into an idle CPU"); + register!( + l_open_idle, + "% of scheduling events into idle CPUs occupied by other layers" + ); + register!( + l_preempt, + "% of scheduling events that preempted other tasks" + ); + register!( + l_affn_viol, + "% of scheduling events that violated configured policies due to CPU affinity restrictions" + ); + register!(l_cur_nr_cpus, "Current # of CPUs assigned to the layer"); + register!(l_min_nr_cpus, "Minimum # of CPUs assigned to the layer"); + register!(l_max_nr_cpus, "Maximum # of CPUs assigned to the layer"); + metrics + } +} + struct Scheduler<'a> { skel: BpfSkel<'a>, struct_ops: Option, @@ -1118,6 +1226,9 @@ struct 
Scheduler<'a> { nr_layer_cpus_min_max: Vec<(usize, usize)>, processing_dur: Duration, prev_processing_dur: Duration, + + om_stats: OpenMetricsStats, + om_format: bool, } impl<'a> Scheduler<'a> { @@ -1224,6 +1335,9 @@ impl<'a> Scheduler<'a> { proc_reader, skel, + + om_stats: OpenMetricsStats::new(), + om_format: opts.open_metrics_format, }) } @@ -1333,24 +1447,44 @@ impl<'a> Scheduler<'a> { } }; - info!( - "tot={:7} local={:5.2} open_idle={:5.2} affn_viol={:5.2} tctx_err={} proc={:?}ms", - total, - lsum_pct(bpf_intf::layer_stat_idx_LSTAT_LOCAL), - lsum_pct(bpf_intf::layer_stat_idx_LSTAT_OPEN_IDLE), - lsum_pct(bpf_intf::layer_stat_idx_LSTAT_AFFN_VIOL), + self.om_stats.total.set(total as i64); + self.om_stats + .local + .set(lsum_pct(bpf_intf::layer_stat_idx_LSTAT_LOCAL)); + self.om_stats + .open_idle + .set(lsum_pct(bpf_intf::layer_stat_idx_LSTAT_OPEN_IDLE)); + self.om_stats + .affn_viol + .set(lsum_pct(bpf_intf::layer_stat_idx_LSTAT_AFFN_VIOL)); + self.om_stats.tctx_err.set( stats.prev_bpf_stats.gstats - [bpf_intf::global_stat_idx_GSTAT_TASK_CTX_FREE_FAILED as usize], - processing_dur.as_millis(), + [bpf_intf::global_stat_idx_GSTAT_TASK_CTX_FREE_FAILED as usize] as i64, ); + self.om_stats.proc_ms.set(processing_dur.as_millis() as i64); + self.om_stats.busy.set(stats.cpu_busy * 100.0); + self.om_stats.util.set(stats.total_util * 100.0); + self.om_stats.load.set(stats.total_load); - info!( - "busy={:5.1} util={:7.1} load={:9.1} fallback_cpu={:3}", - stats.cpu_busy * 100.0, - stats.total_util * 100.0, - stats.total_load, - self.cpu_pool.fallback_cpu, - ); + if !self.om_format { + info!( + "tot={:7} local={:5.2} open_idle={:5.2} affn_viol={:5.2} tctx_err={} proc={:?}ms", + self.om_stats.total.get(), + self.om_stats.local.get(), + self.om_stats.open_idle.get(), + self.om_stats.affn_viol.get(), + self.om_stats.tctx_err.get(), + self.om_stats.proc_ms.get(), + ); + + info!( + "busy={:5.1} util={:7.1} load={:9.1} fallback_cpu={:3}", + self.om_stats.busy.get(), + 
self.om_stats.util.get(), + self.om_stats.load.get(), + self.cpu_pool.fallback_cpu, + ); + } let header_width = self .layer_specs @@ -1376,38 +1510,83 @@ impl<'a> Scheduler<'a> { } }; - info!( - " {: {{ + let l = self + .om_stats + .$i + .get_or_create(&vec![("layer_name".to_owned(), spec.name.clone())]); + l.set($e); + l + }}; + } + let l_util = set!(l_util, stats.layer_utils[lidx] * 100.0); + let l_util_frac = set!( + l_util_frac, + calc_frac(stats.layer_utils[lidx], stats.total_util) ); - info!( - " {: