scx_layered: Add weighted layer DSQ iteration

Add a flag to control DSQ iteration across layers by layer weight. This
helps prevent starvation by iterating over layers with the lowest weight
first.

Signed-off-by: Daniel Hodges <hodges.daniel.scott@gmail.com>
Daniel Hodges 2024-10-03 11:22:05 -07:00
parent bd75ac8dbf
commit f3b3d4f19c
2 changed files with 50 additions and 11 deletions
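
For context on the ordering this flag introduces: the userspace side sorts the layer indices by ascending weight and hands that order to BPF (see the sort_by() added below). A minimal standalone Rust sketch of that sort, with hypothetical weights, not code from the patch itself:

    // Hypothetical standalone sketch: derive a lowest-weight-first iteration order.
    // `weights[i]` is the configured weight of layer i (example values only).
    fn weight_iteration_order(weights: &[usize]) -> Vec<usize> {
        let mut order: Vec<usize> = (0..weights.len()).collect();
        // Sort layer indices by ascending weight, like the sort_by() in the patch.
        order.sort_by(|&i, &j| weights[i].cmp(&weights[j]));
        order
    }

    fn main() {
        let weights = [100, 10, 50];
        // Prints [1, 2, 0]: the weight-10 layer is visited first, weight-100 last.
        println!("{:?}", weight_iteration_order(&weights));
    }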


@@ -32,8 +32,10 @@ const volatile u32 nr_llcs = 32; /* !0 for veristat, set during init */
 const volatile bool smt_enabled = true;
 const volatile bool disable_topology = false;
 const volatile bool xnuma_preemption = false;
+const volatile bool layer_weight_dsq_iter = false;
 const volatile s32 __sibling_cpu[MAX_CPUS];
 const volatile unsigned char all_cpus[MAX_CPUS_U8];
+const volatile u32 layer_iteration_order[MAX_LAYERS];
 private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask;
 struct layer layers[MAX_LAYERS];
@@ -79,6 +81,12 @@ static u32 cpu_ctx_layer_idx_inc(struct cpu_ctx *cctx)
 	return cctx->layer_idx;
 }
 
+// Returns the iterator index of a layer ordered by weight.
+static u32 iter_layer_weight_ctx(int idx)
+{
+	return *MEMBER_VPTR(layer_iteration_order, [idx]);
+}
+
 static __noinline u32 iter_layer_cpu_ctx(u32 layer_idx, int idx)
 {
 	u32 offset;
@@ -1160,14 +1168,16 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 	/* consume preempting layers first */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = iter_layer_cpu_ctx(cctx->layer_idx, idx);
 		if (disable_topology) {
-			if (MEMBER_VPTR(layers, [layer_idx].preempt) && scx_bpf_consume(layer_idx))
+			if (MEMBER_VPTR(layers, [idx].preempt) && scx_bpf_consume(idx))
 				return;
 		} else {
+			layer_idx = layer_weight_dsq_iter ? iter_layer_weight_ctx(idx) :
+				    iter_layer_cpu_ctx(cctx->layer_idx, idx);
 			bpf_for(llc_id, 0, nr_llcs) {
 				dsq_id = layer_dsq_id(layer_idx, llc_id);
-				if (MEMBER_VPTR(layers, [layer_idx].preempt) && scx_bpf_consume(dsq_id))
+				if (MEMBER_VPTR(layers, [layer_idx].preempt) &&
+				    scx_bpf_consume(dsq_id))
 					return;
 			}
 		}
 	}
@@ -1179,21 +1189,23 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 	/* consume !open layers second */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = iter_layer_cpu_ctx(cctx->layer_idx, idx);
 		if (disable_topology) {
-			struct layer *layer = &layers[layer_idx];
+			layer_idx = idx;
+			struct layer *layer = &layers[idx];
 			struct cpumask *layer_cpumask;
 
 			/* consume matching layers */
-			if (!(layer_cpumask = lookup_layer_cpumask(layer_idx)))
+			if (!(layer_cpumask = lookup_layer_cpumask(idx)))
 				return;
 
 			if (bpf_cpumask_test_cpu(cpu, layer_cpumask) ||
 			    (cpu == fallback_cpu && layer->nr_cpus == 0)) {
-				if (scx_bpf_consume(layer_idx))
+				if (scx_bpf_consume(idx))
 					return;
 			}
 		} else {
+			layer_idx = layer_weight_dsq_iter ? iter_layer_weight_ctx(idx) :
+				    iter_layer_cpu_ctx(cctx->layer_idx, idx);
 			bpf_for(llc_id, 0, nr_llcs) {
 				struct layer *layer = &layers[layer_idx];
 				struct cpumask *layer_cpumask;
@@ -1205,7 +1217,7 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 				if (bpf_cpumask_test_cpu(cpu, layer_cpumask) ||
 				    (cpu <= nr_possible_cpus && cpu == fallback_cpu &&
-				     layer->nr_cpus == 0)) {
+				     MEMBER_VPTR(layer, ->nr_cpus) == 0)) {
 					if (scx_bpf_consume(dsq_id))
 						return;
 				}
@@ -1215,12 +1227,13 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 	/* consume !preempting open layers */
 	bpf_for(idx, 0, nr_layers) {
-		layer_idx = iter_layer_cpu_ctx(cctx->layer_idx, idx);
 		if (disable_topology) {
-			if (!layers[layer_idx].preempt && layers[layer_idx].open &&
-			    scx_bpf_consume(layer_idx))
+			if (!layers[idx].preempt && layers[idx].open &&
+			    scx_bpf_consume(idx))
 				return;
 		} else {
+			layer_idx = layer_weight_dsq_iter ? iter_layer_weight_ctx(idx) :
+				    iter_layer_cpu_ctx(cctx->layer_idx, idx);
 			bpf_for(llc_id, 0, nr_llcs) {
 				dsq_id = layer_dsq_id(layer_idx, llc_id);
@@ -1885,6 +1898,14 @@ void BPF_STRUCT_OPS(layered_dump, struct scx_dump_ctx *dctx)
 		      dsq_first_runnable_for_ms(LO_FALLBACK_DSQ, now));
 }
 
+static void print_iter_order() {
+	int i;
+
+	bpf_for(i, 0, nr_layers) {
+		trace("ITER order i: %d %d\n", i, *MEMBER_VPTR(layer_iteration_order, [i]));
+	}
+}
+
 s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 {
 	struct bpf_cpumask *cpumask;
@@ -2059,6 +2080,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 		}
 	}
 
+	print_iter_order();
+
 	return 0;
 }


@@ -414,6 +414,12 @@ struct Opts {
     #[clap(long, default_value = "0.0")]
     layer_growth_weight_disable: f64,
 
+    /// When iterating over layer DSQs use the weight of the layer for iteration
+    /// order. The default iteration order is semi-random except when topology
+    /// awareness is disabled.
+    #[clap(long)]
+    layer_weight_dsq_iter: bool,
+
     /// Enable stats monitoring with the specified interval.
     #[clap(long)]
     stats: Option<f64>,
@@ -1555,6 +1561,9 @@ impl<'a, 'b> Scheduler<'a, 'b> {
         skel.maps.rodata_data.nr_layers = specs.len() as u32;
 
         let mut perf_set = false;
+        let mut layer_iteration_order = (0..specs.len()).collect::<Vec<_>>();
+        let mut layer_weights: Vec<usize> = vec![];
+
         for (spec_i, spec) in specs.iter().enumerate() {
             let layer = &mut skel.maps.bss_data.layers[spec_i];
@@ -1675,6 +1684,7 @@ impl<'a, 'b> Scheduler<'a, 'b> {
             } else {
                 DEFAULT_LAYER_WEIGHT
             };
+            layer_weights.push(layer.weight.try_into().unwrap());
             layer.perf = u32::try_from(*perf)?;
             layer.node_mask = nodemask_from_nodes(nodes) as u64;
             for topo_node in topo.nodes() {
@@ -1696,6 +1706,11 @@ impl<'a, 'b> Scheduler<'a, 'b> {
             perf_set |= layer.perf > 0;
         }
 
+        layer_iteration_order.sort_by(|i, j| layer_weights[*i].cmp(&layer_weights[*j]));
+        for (idx, layer_idx) in layer_iteration_order.iter().enumerate() {
+            skel.maps.rodata_data.layer_iteration_order[idx] = *layer_idx as u32;
+        }
+
         if perf_set && !compat::ksym_exists("scx_bpf_cpuperf_set")? {
             warn!("cpufreq support not available, ignoring perf configurations");
         }
@@ -1775,6 +1790,7 @@ impl<'a, 'b> Scheduler<'a, 'b> {
         skel.maps.rodata_data.smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores;
         skel.maps.rodata_data.disable_topology = opts.disable_topology;
         skel.maps.rodata_data.xnuma_preemption = opts.xnuma_preemption;
+        skel.maps.rodata_data.layer_weight_dsq_iter = opts.layer_weight_dsq_iter;
         for (cpu, sib) in cpu_pool.sibling_cpu.iter().enumerate() {
             skel.maps.rodata_data.__sibling_cpu[cpu] = *sib;
         }
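
Putting the two halves together: at each dispatch iteration slot the BPF side now picks either the weight-ordered index or the existing per-CPU order. The Rust sketch below is a rough rendition of that selection, under the assumption that the default path is a simple rotation starting from cctx->layer_idx (suggested by iter_layer_cpu_ctx()'s arguments); it is not a literal translation of the BPF code.

    // Hypothetical rendition of the per-slot layer selection added above.
    // `weight_order` plays the role of layer_iteration_order (lowest weight first);
    // the rotation fallback is an assumption about iter_layer_cpu_ctx().
    fn layer_for_slot(
        weight_order: &[u32],  // layer_iteration_order
        rotate_start: usize,   // per-CPU starting layer (cctx->layer_idx)
        nr_layers: usize,
        idx: usize,
        by_weight: bool,       // layer_weight_dsq_iter
    ) -> u32 {
        if by_weight {
            weight_order[idx]
        } else {
            ((rotate_start + idx) % nr_layers) as u32
        }
    }

    fn main() {
        let order = [1u32, 2, 0]; // weight-sorted order from the earlier example
        assert_eq!(layer_for_slot(&order, 2, 3, 0, true), 1);  // lowest weight first
        assert_eq!(layer_for_slot(&order, 2, 3, 0, false), 2); // starts at the per-CPU slot
    }

With the long flag clap derives from layer_weight_dsq_iter enabled, every CPU walks the layer DSQs in the same lowest-weight-first order instead of from its own starting point.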