scx_layered: Deprecate idle_smt layer config

24fba4ab8d ("scx_layered: Add idle smt layer configuration") added the
idle_smt layer config, but:

- It flipped the default from preferring idle cores to having no
  preference.

- It misdocumented what the flag meant. It doesn't pick only idle cores; it
  tries to pick an idle core first and, failing that, picks any idle CPU
  (see the sketch below).
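
The selection order the second point describes, sketched with the generic
sched_ext kfunc scx_bpf_pick_idle_cpu() rather than the actual scx_layered
code:

	static s32 pick_cpu_prefer_idle_core(const struct cpumask *cpus_allowed)
	{
		s32 cpu;

		/* first try a wholly idle core, ie. all SMT siblings idle */
		cpu = scx_bpf_pick_idle_cpu(cpus_allowed, SCX_PICK_IDLE_CORE);
		if (cpu >= 0)
			return cpu;

		/* failing that, settle for any idle CPU */
		return scx_bpf_pick_idle_cpu(cpus_allowed, 0);
	}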

A follow-up commit 637fc3f6e1 ("scx_layered: Use layer idle_smt option")
made things more confusing: if idle_smt is set, the idle core prioritizing
logic is disabled.

The first commit disables idle core prioritization by overriding
idle_smtmask to be idle_cpumask when idle_smt is *clear*, and the second
commit disables the same logic by skipping the code path when the flag is
*set*. i.e. both settings of the flag ended up doing exactly the same thing.
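
Put together, the two mechanisms looked roughly like the following (a
sketch, not the actual code; pref_idle_smt carries the layer's idle_smt
value):

	/* commit 1: with the flag *clear*, the override below turns the
	 * "wholly idle core" test into a plain "any idle CPU" test */
	if (!pref_idle_smt)
		idle_smtmask = idle_cpumask;

	/* commit 2: with the flag *set*, the preference path is skipped */
	if (smt_enabled && !pref_idle_smt) {
		/* ... prefer CPUs in idle_smtmask, ie. wholly idle cores ... */
	}

	/* either way, idle cores get no preference over other idle CPUs */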

Recently, 75dd81e3e6 ("scx_layered: Improve topology aware select_cpu()")
restored the function of the flag by dropping the cpumask override. However,
that left only the logic of the second commit, which implemented the reverse
behavior, so the actual behavior became the opposite of the documented one.

This flag is hopeless. History aside, the name itself is too confusing.
idle_smt - is it saying that the flag prefers an idle SMT *thread* or an
idle SMT *core*? The name is carried over from idle_cpumask/idle_smtmask,
but there the meaning of the former is clear, which also makes it difficult
to confuse what the latter means.
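
For reference, these are the sched_ext kfuncs those names come from (both
masks must be released with scx_bpf_put_idle_cpumask() when done):

	const struct cpumask *idle_cpumask = scx_bpf_get_idle_cpumask();  /* any idle CPU */
	const struct cpumask *idle_smtmask = scx_bpf_get_idle_smtmask();  /* wholly idle cores only */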

Preferring idle cores was one of the drivers of the performance gains
identified during earlier ads experiments. Let's just drop the flag to
restore the previous behavior and revisit if necessary.
commit b7b15ac4b1
parent 14af41d0dd
Author: Tejun Heo
Date:   2024-11-29 23:43:40 -10:00

4 changed files with 21 additions and 25 deletions


@@ -245,7 +245,6 @@ struct layer {
        bool preempt;
        bool preempt_first;
        bool exclusive;
-       bool idle_smt;
        int growth_algo;
        u32 owned_usage_target_ppk;


@@ -580,7 +580,7 @@ static void maybe_refresh_layered_cpus_node(struct task_struct *p, struct task_c
 }
 
 static s32 pick_idle_cpu_from(const struct cpumask *cand_cpumask, s32 prev_cpu,
-                              const struct cpumask *idle_smtmask, bool pref_idle_smt)
+                              const struct cpumask *idle_smtmask)
 {
        bool prev_in_cand = bpf_cpumask_test_cpu(prev_cpu, cand_cpumask);
        s32 cpu;
@@ -589,7 +589,7 @@ static s32 pick_idle_cpu_from(const struct cpumask *cand_cpumask, s32 prev_cpu,
         * If CPU has SMT, any wholly idle CPU is likely a better pick than
         * partially idle @prev_cpu.
         */
-       if (smt_enabled && !pref_idle_smt) {
+       if (smt_enabled) {
                if (prev_in_cand &&
                    bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) &&
                    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
@@ -654,8 +654,7 @@ s32 pick_idle_big_little(struct layer *layer, struct task_ctx *taskc,
                bpf_cpumask_and(tmp_cpumask, cast_mask(taskc->layered_mask),
                                cast_mask(big_cpumask));
                cpu = pick_idle_cpu_from(cast_mask(tmp_cpumask),
-                                        prev_cpu, idle_smtmask,
-                                        layer->idle_smt);
+                                        prev_cpu, idle_smtmask);
                goto out_put;
        }
        case GROWTH_ALGO_LITTLE_BIG: {
@@ -669,8 +668,7 @@ s32 pick_idle_big_little(struct layer *layer, struct task_ctx *taskc,
                bpf_cpumask_and(tmp_cpumask, cast_mask(taskc->layered_mask),
                                cast_mask(tmp_cpumask));
                cpu = pick_idle_cpu_from(cast_mask(tmp_cpumask),
-                                        prev_cpu, idle_smtmask,
-                                        layer->idle_smt);
+                                        prev_cpu, idle_smtmask);
                goto out_put;
        }
        default:
@@ -763,8 +761,7 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
                        cpu = -1;
                        goto out_put;
                }
-               if ((cpu = pick_idle_cpu_from(cpumask, prev_cpu, idle_smtmask,
-                                             layer->idle_smt)) >= 0)
+               if ((cpu = pick_idle_cpu_from(cpumask, prev_cpu, idle_smtmask)) >= 0)
                        goto out_put;
        }
@@ -778,21 +775,18 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
                        cpu = -1;
                        goto out_put;
                }
-               if ((cpu = pick_idle_cpu_from(cpumask, prev_cpu, idle_smtmask,
-                                             layer->idle_smt)) >= 0)
+               if ((cpu = pick_idle_cpu_from(cpumask, prev_cpu, idle_smtmask)) >= 0)
                        goto out_put;
        }
 
-       if ((cpu = pick_idle_cpu_from(layered_cpumask, prev_cpu,
-                                     idle_smtmask, layer->idle_smt)) >= 0)
+       if ((cpu = pick_idle_cpu_from(layered_cpumask, prev_cpu, idle_smtmask)) >= 0)
                goto out_put;
 
        /*
         * If the layer is an open one, we can try the whole machine.
         */
        if (layer->kind != LAYER_KIND_CONFINED &&
-           ((cpu = pick_idle_cpu_from(p->cpus_ptr, prev_cpu,
-                                      idle_smtmask, layer->idle_smt)) >= 0)) {
+           ((cpu = pick_idle_cpu_from(p->cpus_ptr, prev_cpu, idle_smtmask)) >= 0)) {
                lstat_inc(LSTAT_OPEN_IDLE, layer, cpuc);
                goto out_put;
        }


@@ -89,8 +89,8 @@ pub struct LayerCommon {
     pub exclusive: bool,
     #[serde(default)]
     pub weight: u32,
-    #[serde(default)]
-    pub idle_smt: bool,
+    #[serde(default, skip_serializing)]
+    pub idle_smt: Option<bool>,
     #[serde(default)]
     pub growth_algo: LayerGrowthAlgo,
     #[serde(default)]


@@ -105,7 +105,7 @@ lazy_static! {
             preempt: false,
             preempt_first: false,
             exclusive: false,
-            idle_smt: false,
+            idle_smt: None,
             slice_us: 20000,
             weight: DEFAULT_LAYER_WEIGHT,
             growth_algo: LayerGrowthAlgo::Sticky,
@@ -129,7 +129,7 @@ lazy_static! {
             preempt: true,
             preempt_first: false,
             exclusive: true,
-            idle_smt: false,
+            idle_smt: None,
             slice_us: 20000,
             weight: DEFAULT_LAYER_WEIGHT,
             growth_algo: LayerGrowthAlgo::Sticky,
@@ -155,7 +155,7 @@ lazy_static! {
             preempt: true,
             preempt_first: false,
             exclusive: false,
-            idle_smt: false,
+            idle_smt: None,
             slice_us: 800,
             weight: DEFAULT_LAYER_WEIGHT,
             growth_algo: LayerGrowthAlgo::Topo,
@@ -178,7 +178,7 @@ lazy_static! {
             preempt: false,
             preempt_first: false,
             exclusive: false,
-            idle_smt: false,
+            idle_smt: None,
             slice_us: 20000,
             weight: DEFAULT_LAYER_WEIGHT,
             growth_algo: LayerGrowthAlgo::Linear,
@@ -323,8 +323,7 @@ lazy_static! {
 /// utilization to determine the infeasible adjusted weight with higher
 /// weights having a larger adjustment in adjusted utilization.
 ///
-/// - idle_smt: When selecting an idle CPU for task task migration use
-///   only idle SMT CPUs. The default is to select any idle cpu.
+/// - idle_smt: *** DEPRECATED ***
 ///
 /// - growth_algo: When a layer is allocated new CPUs different algorithms can
 ///   be used to determine which CPU should be allocated next. The default
@@ -1160,7 +1159,6 @@ impl<'a> Scheduler<'a> {
             preempt,
             preempt_first,
             exclusive,
-            idle_smt,
             growth_algo,
             nodes,
             slice_us,
@@ -1187,7 +1185,6 @@ impl<'a> Scheduler<'a> {
             layer.preempt.write(*preempt);
             layer.preempt_first.write(*preempt_first);
             layer.exclusive.write(*exclusive);
-            layer.idle_smt.write(*idle_smt);
             layer.growth_algo = growth_algo.as_bpf_enum();
             layer.weight = if *weight <= MAX_LAYER_WEIGHT && *weight >= MIN_LAYER_WEIGHT {
                 *weight
@@ -2315,6 +2312,12 @@ fn main() -> Result<()> {
         );
     }
 
+    for spec in &layer_config.specs {
+        if spec.kind.common().idle_smt.is_some() {
+            warn!("Layer {} has deprecated flag \"idle_smt\"", &spec.name);
+        }
+    }
+
     debug!("specs={}", serde_json::to_string_pretty(&layer_config)?);
     verify_layer_specs(&layer_config.specs)?;