rusty: Add debug! logging to load_balance.rs

We removed the debug!() output that was previously present in main.rs. Let's
add more debug!() output that helps debug the current LB hierarchy.

Signed-off-by: David Vernet <void@manifault.com>
This commit is contained in:
David Vernet 2024-03-05 17:35:53 -06:00
parent 0d0b101398
commit 26a94b1b14
No known key found for this signature in database
GPG Key ID: 59E4B86965C4F364

View File

@ -143,6 +143,7 @@ use std::sync::Arc;
use anyhow::bail; use anyhow::bail;
use anyhow::Context; use anyhow::Context;
use anyhow::Result; use anyhow::Result;
use log::debug;
use log::warn; use log::warn;
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use scx_utils::ravg::ravg_read; use scx_utils::ravg::ravg_read;
@ -176,6 +177,16 @@ enum BalanceState {
NeedsPull, NeedsPull,
} }
// Formats each balance state as a fixed, upper-case label.
impl fmt::Display for BalanceState {
    /// Writes the label for `self` into `f`.
    ///
    /// The label is written verbatim; width/alignment flags on the
    /// formatter are not applied.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Resolve the label first so that there is exactly one write below.
        let label = match self {
            BalanceState::Balanced => "BALANCED",
            BalanceState::NeedsPush => "OVER-LOADED",
            BalanceState::NeedsPull => "UNDER-LOADED",
        };
        f.write_str(label)
    }
}
macro_rules! impl_ord_for_type { macro_rules! impl_ord_for_type {
($($t:ty),*) => { ($($t:ty),*) => {
$( $(
@ -240,7 +251,7 @@ impl LoadEntity {
xfer_ratio: f64, xfer_ratio: f64,
load_sum: f64, load_sum: f64,
load_avg: f64) -> Self { load_avg: f64) -> Self {
Self { let mut entity = Self {
cost_ratio, cost_ratio,
push_max_ratio, push_max_ratio,
xfer_ratio, xfer_ratio,
@ -248,7 +259,9 @@ impl LoadEntity {
load_avg, load_avg,
load_delta: 0.0f64, load_delta: 0.0f64,
bal_state: BalanceState::Balanced, bal_state: BalanceState::Balanced,
} };
entity.add_load(0.0f64);
entity
} }
pub fn load_sum(&self) -> f64 { pub fn load_sum(&self) -> f64 {
@ -367,6 +380,9 @@ impl Domain {
self.load.add_load(-load); self.load.add_load(-load);
other.load.add_load(load); other.load.add_load(load);
debug!(" DOM {} sending [pid: {:05}](load: {:.06}) --> DOM {} ",
self.id, pid, load, other.id);
} }
fn xfer_between(&self, other: &Domain) -> f64 { fn xfer_between(&self, other: &Domain) -> f64 {
@ -407,14 +423,6 @@ impl NumaNode {
} }
} }
/// Returns `true` when this node holds at least one entry in
/// `push_domains`, i.e. there is a domain it could push load from.
fn can_push(&self) -> bool {
    // `is_empty()` states the intent directly (Clippy `len_zero`).
    !self.push_domains.is_empty()
}
/// Returns `true` when this node holds at least one entry in
/// `pull_domains`, i.e. there is a domain it could pull load into.
fn can_pull(&self) -> bool {
    // `is_empty()` states the intent directly (Clippy `len_zero`).
    !self.pull_domains.is_empty()
}
fn allocate_domain(&mut self, id: usize, load: f64, dom_load_avg: f64) { fn allocate_domain(&mut self, id: usize, load: f64, dom_load_avg: f64) {
let domain = Domain::new(id, load, dom_load_avg); let domain = Domain::new(id, load, dom_load_avg);
@ -472,6 +480,7 @@ impl NumaNode {
load: dom.load.clone(), load: dom.load.clone(),
}); });
} }
n_stat.domains.sort_by(|x, y| x.id.partial_cmp(&y.id).unwrap());
n_stat n_stat
} }
@ -496,7 +505,7 @@ fn fmt_balance_stat(f: &mut fmt::Formatter<'_>,
let load_delta = load.delta(); let load_delta = load.delta();
let get_fmt = |num: f64| if num >= 0.0f64 { format!("{:+4.2}", num) } else { format!("{:4.2}", num) }; let get_fmt = |num: f64| if num >= 0.0f64 { format!("{:+4.2}", num) } else { format!("{:4.2}", num) };
write!(f, "{} load={:4.2} imbal={:4.2} load_delta={:4.2}", write!(f, "{} load={:4.2} imbal={} load_delta={}",
preamble, load_sum, get_fmt(imbal), get_fmt(load_delta)) preamble, load_sum, get_fmt(imbal), get_fmt(load_delta))
} }
@ -592,6 +601,8 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
numa_stats.push(node.numa_stat()); numa_stats.push(node.numa_stat());
} }
numa_stats.sort_by(|x, y| x.id.partial_cmp(&y.id).unwrap());
numa_stats numa_stats
} }
@ -889,7 +900,12 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
fn transfer_between_nodes(&mut self, fn transfer_between_nodes(&mut self,
push_node: &mut NumaNode, push_node: &mut NumaNode,
pull_node: &mut NumaNode) -> Result<f64> { pull_node: &mut NumaNode) -> Result<f64> {
if !push_node.can_push() || !pull_node.can_pull() { let n_push_doms = push_node.push_domains.len();
let n_pull_doms = pull_node.pull_domains.len();
debug!("Inter node {} -> {} started ({} push domains -> {} pull domains)",
push_node.id, pull_node.id, n_push_doms, n_pull_doms);
if n_push_doms == 0 || n_pull_doms == 0 {
return Ok(0.0f64); return Ok(0.0f64);
} }
@ -922,7 +938,13 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
} }
fn balance_between_nodes(&mut self) -> Result<()> { fn balance_between_nodes(&mut self) -> Result<()> {
if self.push_nodes.len() == 0 || self.pull_nodes.len() == 0 { let n_push_nodes = self.push_nodes.len();
let n_pull_nodes = self.pull_nodes.len();
debug!("Node <-> Node LB started ({} pushers -> {} pullers)",
n_push_nodes, n_pull_nodes);
if n_push_nodes == 0 || n_pull_nodes == 0 {
return Ok(()); return Ok(());
} }
@ -933,6 +955,10 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
let push_cutoff = push_node.load.push_cutoff(); let push_cutoff = push_node.load.push_cutoff();
let mut pushed = 0f64; let mut pushed = 0f64;
if push_node.load.imbal() < 0.0f64 {
bail!("Push node {} had imbal {}", push_node.id, push_node.load.imbal());
}
// Always try to send load to the nodes that need it most, in // Always try to send load to the nodes that need it most, in
// descending order. // descending order.
loop { loop {
@ -940,6 +966,9 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
let mut pull_nodes = std::mem::take(&mut self.pull_nodes).into_vec(); let mut pull_nodes = std::mem::take(&mut self.pull_nodes).into_vec();
for pull_node in pull_nodes.iter_mut() { for pull_node in pull_nodes.iter_mut() {
if pull_node.load.imbal() >= 0.0f64 {
bail!("Pull node {} had imbal {}", pull_node.id, pull_node.load.imbal());
}
let migrated = self.transfer_between_nodes(push_node, pull_node)?; let migrated = self.transfer_between_nodes(push_node, pull_node)?;
if migrated > 0.0f64 { if migrated > 0.0f64 {
// Break after a successful migration so that we can // Break after a successful migration so that we can
@ -948,6 +977,7 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
// pull from domains in descending-imbalance order. // pull from domains in descending-imbalance order.
pushed += migrated; pushed += migrated;
transfer_occurred = true; transfer_occurred = true;
debug!("NODE {} sending {:.06} --> NODE {}", push_node.id, migrated, pull_node.id);
break; break;
} }
} }
@ -957,6 +987,10 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
break; break;
} }
} }
if pushed > 0.0f64 {
debug!("NODE {} pushed {:.06} total load", push_node.id, pushed);
}
} }
std::mem::swap(&mut self.push_nodes, &mut SortedVec::from_unsorted(push_nodes)); std::mem::swap(&mut self.push_nodes, &mut SortedVec::from_unsorted(push_nodes));
@ -964,7 +998,13 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
} }
fn balance_within_node(&mut self, node: &mut NumaNode) -> Result<()> { fn balance_within_node(&mut self, node: &mut NumaNode) -> Result<()> {
if !node.can_push() || !node.can_pull() { let n_push_doms = node.push_domains.len();
let n_pull_doms = node.pull_domains.len();
debug!("Intra node {} LB started ({} push domains -> {} pull domains)",
node.id, n_push_doms, n_pull_doms);
if n_push_doms == 0 || n_pull_doms == 0 {
return Ok(()); return Ok(());
} }
@ -973,11 +1013,17 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
let push_cutoff = push_dom.load.push_cutoff(); let push_cutoff = push_dom.load.push_cutoff();
let push_imbal = push_dom.load.imbal(); let push_imbal = push_dom.load.imbal();
let mut load = 0.0f64; let mut load = 0.0f64;
if push_dom.load.imbal() < 0.0f64 {
bail!("Push dom {} had imbal {}", push_dom.id, push_dom.load.imbal());
}
loop { loop {
let mut did_transfer = false; let mut did_transfer = false;
let mut pull_doms = std::mem::take(&mut node.pull_domains).into_vec(); let mut pull_doms = std::mem::take(&mut node.pull_domains).into_vec();
for pull_dom in pull_doms.iter_mut().filter(|x| x.load.state() == BalanceState::NeedsPull) { for pull_dom in pull_doms.iter_mut().filter(|x| x.load.state() == BalanceState::NeedsPull) {
if pull_dom.load.imbal() >= 0.0f64 {
bail!("Pull dom {} had imbal {}", pull_dom.id, pull_dom.load.imbal());
}
let pull_imbal = pull_dom.load.imbal(); let pull_imbal = pull_dom.load.imbal();
let xfer = push_dom.xfer_between(&pull_dom); let xfer = push_dom.xfer_between(&pull_dom);
if let Some(transferred) = self.try_find_move_task((push_dom, push_imbal), if let Some(transferred) = self.try_find_move_task((push_dom, push_imbal),
@ -996,6 +1042,9 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
break; break;
} }
} }
if load > 0.0f64 {
debug!("DOM {} pushed {:.06} total load", push_dom.id, load);
}
} }
std::mem::swap(&mut node.push_domains, &mut SortedVec::from_unsorted(push_doms)); std::mem::swap(&mut node.push_domains, &mut SortedVec::from_unsorted(push_doms));
@ -1009,11 +1058,15 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
// higher cost function than balancing between domains inside of NUMA // higher cost function than balancing between domains inside of NUMA
// nodes, but the mechanics are the same. Adjustments made here are // nodes, but the mechanics are the same. Adjustments made here are
// reflected in intra-node balancing decisions made next. // reflected in intra-node balancing decisions made next.
self.balance_between_nodes()?; if self.dom_group.nr_nodes() > 1 {
self.balance_between_nodes()?;
}
// Now that the NUMA nodes have been balanced, do another balance round // Now that the NUMA nodes have been balanced, do another balance round
// amongst the domains in each node. // amongst the domains in each node.
debug!("Intra node LBs started");
// Assume all nodes are now balanced. // Assume all nodes are now balanced.
self.balanced_nodes.append(&mut std::mem::take(&mut self.push_nodes).into_vec()); self.balanced_nodes.append(&mut std::mem::take(&mut self.push_nodes).into_vec());
self.balanced_nodes.append(&mut std::mem::take(&mut self.pull_nodes).into_vec()); self.balanced_nodes.append(&mut std::mem::take(&mut self.pull_nodes).into_vec());