Merge pull request #934 from sched-ext/htejun/layered-updates

scx_layered: Cleanups around topology handling
Tejun Heo, 2024-11-18 23:12:48 +00:00, committed by GitHub
commit 88c7d47314
4 changed files with 89 additions and 136 deletions


@ -290,18 +290,7 @@ pub struct Topology {
}
impl Topology {
/// Build a complete host Topology
pub fn new() -> Result<Topology> {
let span = cpus_online()?;
// If the kernel is compiled with CONFIG_NUMA, then build a topology
// from the NUMA hierarchy in sysfs. Otherwise, just make a single
// default node of ID 0 which contains all cores.
let nodes = if Path::new("/sys/devices/system/node").exists() {
create_numa_nodes(&span)?
} else {
create_default_node(&span)?
};
fn instantiate(span: Cpumask, nodes: Vec<Node>) -> Result<Self> {
// For convenient and efficient lookup from the root topology object,
// create two BTreeMaps to the full set of Core and Cpu objects on the
// system. We clone the objects that are located further down in the
@ -332,6 +321,27 @@ impl Topology {
})
}
/// Build a complete host Topology
pub fn new() -> Result<Topology> {
let span = cpus_online()?;
// If the kernel is compiled with CONFIG_NUMA, then build a topology
// from the NUMA hierarchy in sysfs. Otherwise, just make a single
// default node of ID 0 which contains all cores.
let nodes = if Path::new("/sys/devices/system/node").exists() {
create_numa_nodes(&span)?
} else {
create_default_node(&span, false)?
};
Self::instantiate(span, nodes)
}
pub fn with_flattened_llc_node() -> Result<Topology> {
let span = cpus_online()?;
let nodes = create_default_node(&span, true)?;
Self::instantiate(span, nodes)
}
/// Get a slice of the NUMA nodes on the host.
pub fn nodes(&self) -> &[Node] {
&self.nodes
@ -518,6 +528,7 @@ fn create_insert_cpu(
node: &mut Node,
online_mask: &Cpumask,
avg_cpu_freq: Option<(usize, usize)>,
flatten_llc: bool,
) -> Result<()> {
// CPU is offline. The Topology hierarchy is read-only, and assumes
// that hotplug will cause the scheduler to restart. Thus, we can
@ -542,7 +553,7 @@ fn create_insert_cpu(
let l2_id = read_file_usize(&cache_path.join(format!("index{}", 2)).join("id")).unwrap_or(0);
let l3_id = read_file_usize(&cache_path.join(format!("index{}", 3)).join("id")).unwrap_or(0);
// Assume that LLC is always 3.
let llc_id = l3_id;
let llc_id = if flatten_llc { 0 } else { l3_id };
// Min and max frequencies. If the kernel is not compiled with
// CONFIG_CPU_FREQ, just assume 0 for both frequencies.
@ -647,7 +658,7 @@ fn avg_cpu_freq() -> Option<(usize, usize)> {
Some((avg_base_freq / nr_cpus, top_max_freq))
}
fn create_default_node(online_mask: &Cpumask) -> Result<Vec<Node>> {
fn create_default_node(online_mask: &Cpumask, flatten_llc: bool) -> Result<Vec<Node>> {
let mut nodes: Vec<Node> = Vec::with_capacity(1);
let mut node = Node {
@ -678,7 +689,7 @@ fn create_default_node(online_mask: &Cpumask) -> Result<Vec<Node>> {
let avg_cpu_freq = avg_cpu_freq();
let cpu_ids = read_cpu_ids()?;
for cpu_id in cpu_ids.iter() {
create_insert_cpu(*cpu_id, &mut node, &online_mask, avg_cpu_freq)?;
create_insert_cpu(*cpu_id, &mut node, &online_mask, avg_cpu_freq, flatten_llc)?;
}
nodes.push(node);
@ -734,7 +745,7 @@ fn create_numa_nodes(online_mask: &Cpumask) -> Result<Vec<Node>> {
}
};
create_insert_cpu(cpu_id, &mut node, &online_mask, avg_cpu_freq)?;
create_insert_cpu(cpu_id, &mut node, &online_mask, avg_cpu_freq, false)?;
}
nodes.push(node);

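The new with_flattened_llc_node() constructor builds the same single default node as the non-NUMA path, but with every CPU folded into LLC 0. Below is a minimal sketch of how a caller might choose between the two constructors; the build_topology helper is hypothetical and it assumes the crate re-exports Topology as scx_utils::Topology.

use anyhow::Result;
use scx_utils::Topology;

// Hypothetical helper: pick the topology the way scx_layered's userspace
// does further down in this diff.
fn build_topology(disable_topology: bool) -> Result<Topology> {
    if disable_topology {
        // Single default node with a single LLC; the BPF side then
        // sees nr_llcs == 1.
        Topology::with_flattened_llc_node()
    } else {
        // Full NUMA/LLC hierarchy read from sysfs.
        Topology::new()
    }
}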

@ -112,6 +112,7 @@ struct cpu_ctx {
u64 gstats[NR_GSTATS];
u64 lstats[MAX_LAYERS][NR_LSTATS];
u64 ran_current_for;
u64 hi_fallback_dsq_id;
u32 layer_idx;
u32 cache_idx;
u32 node_idx;


@ -115,15 +115,15 @@ u32 rotate_llc_id(u32 base_llc_id, u32 rotation)
// return the dsq id for the layer based on the LLC id.
static __noinline u64 layer_dsq_id(u32 layer_id, u32 llc_id)
{
return (layer_id * nr_llcs) + llc_id;
if (nr_llcs == 1)
return layer_id;
else
return (layer_id * nr_llcs) + llc_id;
}
// XXX - cpu_to_llc_id() must not be inlined to not blow past ins limit when
// topo is enabled but older kernels get confused by RCU state when subprogs are
// called from sleepable functions. Use __always_inline variant from
// layered_init() and __noinline from everywhere else. Remove this once we can
// ignore the older kernels.
static __always_inline u32 __cpu_to_llc_id(s32 cpu_id)
// XXX - older kernels get confused by RCU state when subprogs are called from
// sleepable functions. Use __always_inline.
static __always_inline u32 cpu_to_llc_id(s32 cpu_id)
{
const volatile u32 *llc_ptr;
@ -135,11 +135,6 @@ static __always_inline u32 __cpu_to_llc_id(s32 cpu_id)
return *llc_ptr;
}
static __noinline u32 cpu_to_llc_id(u32 cpu_id)
{
return __cpu_to_llc_id(cpu_id);
}
u32 llc_node_id(u32 llc_id)
{
const volatile u32 *llc_ptr;
@ -162,33 +157,6 @@ static inline bool is_fallback_dsq(u64 dsq_id)
return dsq_id > HI_FALLBACK_DSQ_BASE && dsq_id <= LO_FALLBACK_DSQ;
}
static u64 llc_hi_fallback_dsq_iter_offset(int llc_offset, int idx)
{
int offset = llc_offset + idx;
if (offset >= nr_llcs)
return llc_hi_fallback_dsq_id(offset - nr_llcs);
return llc_hi_fallback_dsq_id(idx + llc_offset);
}
static int llc_iter_cpu_offset(int idx, s32 cpu)
{
int offset;
if (cpu <= 0)
return idx;
offset = (cpu % nr_llcs) + idx;
return offset >= nr_llcs ? offset - nr_llcs : offset;
}
static u64 cpu_hi_fallback_dsq_id(s32 cpu)
{
return llc_hi_fallback_dsq_id(cpu_to_llc_id(cpu));
}
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, u32);
@ -1108,16 +1076,15 @@ preempt_fail:
void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
{
struct cpu_ctx *cctx;
struct cpu_ctx *cctx, *task_cctx;
struct task_ctx *tctx;
struct layer *layer;
s32 task_cpu = scx_bpf_task_cpu(p);
u64 vtime = p->scx.dsq_vtime;
bool try_preempt_first;
u32 idx;
if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) ||
!(layer = lookup_layer(tctx->layer)))
if (!(cctx = lookup_cpu_ctx(-1)) || !(task_cctx = lookup_cpu_ctx(task_cpu)) ||
!(tctx = lookup_task_ctx(p)) || !(layer = lookup_layer(tctx->layer)))
return;
try_preempt_first = cctx->try_preempt_first;
@ -1160,9 +1127,8 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
!bpf_cpumask_test_cpu(task_cpu, layer_cpumask))
lstat_inc(LSTAT_AFFN_VIOL, layer, cctx);
idx = cpu_hi_fallback_dsq_id(task_cpu);
tctx->last_dsq = idx;
scx_bpf_dispatch(p, idx, slice_ns, enq_flags);
tctx->last_dsq = task_cctx->hi_fallback_dsq_id;
scx_bpf_dispatch(p, tctx->last_dsq, slice_ns, enq_flags);
goto preempt;
}
@ -1186,22 +1152,13 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
* to the LLC local HI_FALLBACK_DSQ to avoid this starvation
* issue.
*/
idx = cpu_hi_fallback_dsq_id(task_cpu);
scx_bpf_dispatch(p, idx, slice_ns, enq_flags);
tctx->last_dsq = idx;
tctx->last_dsq = task_cctx->hi_fallback_dsq_id;
scx_bpf_dispatch(p, tctx->last_dsq, slice_ns, enq_flags);
goto preempt;
}
if (disable_topology) {
tctx->last_dsq = tctx->layer;
scx_bpf_dispatch_vtime(p, tctx->layer, slice_ns, vtime, enq_flags);
} else {
u32 llc_id = cpu_to_llc_id(tctx->last_cpu >= 0 ? tctx->last_cpu :
bpf_get_smp_processor_id());
idx = layer_dsq_id(layer->idx, llc_id);
tctx->last_dsq = idx;
scx_bpf_dispatch_vtime(p, idx, slice_ns, vtime, enq_flags);
}
tctx->last_dsq = layer_dsq_id(layer->idx, task_cctx->cache_idx);
scx_bpf_dispatch_vtime(p, tctx->last_dsq, slice_ns, vtime, enq_flags);
preempt:
try_preempt(task_cpu, p, tctx, try_preempt_first, enq_flags);
@ -1247,21 +1204,11 @@ static bool keep_running(struct cpu_ctx *cctx, struct task_struct *p)
* have tasks waiting, keep running it. If there are multiple
* competing preempting layers, this won't work well.
*/
if (disable_topology) {
if (!scx_bpf_dsq_nr_queued(layer->idx)) {
p->scx.slice = slice_ns;
lstat_inc(LSTAT_KEEP, layer, cctx);
return true;
}
} else {
u32 dsq_id = cpu_to_llc_id(tctx->last_cpu >= 0 ?
tctx->last_cpu :
bpf_get_smp_processor_id());
if (!scx_bpf_dsq_nr_queued(dsq_id)) {
p->scx.slice = slice_ns;
lstat_inc(LSTAT_KEEP, layer, cctx);
return true;
}
u32 dsq_id = layer_dsq_id(layer->idx, cctx->cache_idx);
if (!scx_bpf_dsq_nr_queued(dsq_id)) {
p->scx.slice = slice_ns;
lstat_inc(LSTAT_KEEP, layer, cctx);
return true;
}
} else {
const struct cpumask *idle_cpumask = scx_bpf_get_idle_cpumask();
@ -1465,8 +1412,7 @@ void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
return;
}
dsq_id = cpu_hi_fallback_dsq_id(cpu);
if (scx_bpf_consume(dsq_id))
if (scx_bpf_consume(cctx->hi_fallback_dsq_id))
return;
/* consume !open layers second */
@ -1725,8 +1671,6 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
return;
}
u32 my_llc_id = cpu_to_llc_id(cpu);
/*
* If one of the fallback DSQs has the most budget then consume from it
* to prevent starvation.
@ -1741,19 +1685,18 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
}
/* consume preempting layers first */
if (consume_preempting(costc, my_llc_id) == 0)
if (consume_preempting(costc, cctx->cache_idx) == 0)
return;
dsq_id = cpu_hi_fallback_dsq_id(cpu);
if (scx_bpf_consume(dsq_id))
if (scx_bpf_consume(cctx->hi_fallback_dsq_id))
return;
/* consume !open layers second */
if (consume_non_open(costc, cpu, my_llc_id) == 0)
if (consume_non_open(costc, cpu, cctx->cache_idx) == 0)
return;
/* consume !preempting open layers */
if (consume_open_no_preempt(costc, my_llc_id) == 0)
if (consume_open_no_preempt(costc, cctx->cache_idx) == 0)
return;
scx_bpf_consume(LO_FALLBACK_DSQ);
@ -2036,13 +1979,14 @@ static s32 create_cache(u32 cache_id)
return -ENOENT;
}
llc_id = __cpu_to_llc_id(cpu);
llc_id = cpu_to_llc_id(cpu);
if (llc_id != cache_id)
continue;
bpf_cpumask_set_cpu(cpu, cpumask);
cachec->nr_cpus++;
cctx->cache_idx = cache_id;
cctx->hi_fallback_dsq_id = llc_hi_fallback_dsq_id(cache_id);
}
dbg("CFG creating cache %d with %d cpus", cache_id, cachec->nr_cpus);
@ -2496,23 +2440,16 @@ void BPF_STRUCT_OPS(layered_dump, struct scx_dump_ctx *dctx)
continue;
}
if (disable_topology) {
scx_bpf_dump("LAYER[%d][%s] nr_cpus=%u nr_queued=%d -%llums cpus=",
i, layer->name, layer->nr_cpus,
scx_bpf_dsq_nr_queued(i),
dsq_first_runnable_for_ms(i, now));
} else {
bpf_for(j, 0, nr_llcs) {
if (!(layer->cache_mask & (1 << j)))
continue;
bpf_for(j, 0, nr_llcs) {
if (!(layer->cache_mask & (1 << j)))
continue;
idx = layer_dsq_id(layer->idx, j);
scx_bpf_dump("LAYER[%d][%s]DSQ[%d] nr_cpus=%u nr_queued=%d -%llums cpus=",
i, layer->name, idx, layer->nr_cpus,
scx_bpf_dsq_nr_queued(idx),
dsq_first_runnable_for_ms(idx, now));
scx_bpf_dump("\n");
}
idx = layer_dsq_id(layer->idx, j);
scx_bpf_dump("LAYER[%d][%s]DSQ[%d] nr_cpus=%u nr_queued=%d -%llums cpus=",
i, layer->name, idx, layer->nr_cpus,
scx_bpf_dsq_nr_queued(idx),
dsq_first_runnable_for_ms(idx, now));
scx_bpf_dump("\n");
}
dump_layer_cpumask(i);
scx_bpf_dump("\n");
@ -2638,8 +2575,8 @@ unlock:
*/
static bool antistall_scan(void)
{
s32 cpu;
u64 dsq_id;
s32 llc;
u64 layer_id;
u64 jiffies_now;
if (!enable_antistall)
@ -2647,14 +2584,12 @@ static bool antistall_scan(void)
jiffies_now = bpf_jiffies64();
bpf_for(dsq_id, 0, nr_layers) {
antistall_set(dsq_id, jiffies_now);
}
bpf_for(layer_id, 0, nr_layers)
bpf_for(llc, 0, nr_llcs)
antistall_set(layer_dsq_id(layer_id, llc), jiffies_now);
bpf_for(cpu, 0, nr_possible_cpus) {
dsq_id = cpu_hi_fallback_dsq_id(cpu);
antistall_set(dsq_id, jiffies_now);
}
bpf_for(llc, 0, nr_llcs)
antistall_set(llc_hi_fallback_dsq_id(llc), jiffies_now);
antistall_set(LO_FALLBACK_DSQ, jiffies_now);

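Throughout the BPF side, per-layer DSQs are numbered layer_id * nr_llcs + llc_id, and a flattened (single-LLC) topology degenerates to one DSQ per layer whose id equals layer_id, which is why the separate disable_topology branches above could be dropped. The following Rust rendering of that numbering is for illustration only; it mirrors the C helper layer_dsq_id() and is not part of the commit.

fn layer_dsq_id(layer_id: u64, llc_id: u64, nr_llcs: u64) -> u64 {
    if nr_llcs == 1 {
        // Flattened topology: one DSQ per layer, matching the ids the old
        // disable_topology path dispatched to.
        layer_id
    } else {
        // Topology-aware: one DSQ per (layer, LLC) pair.
        layer_id * nr_llcs + llc_id
    }
}

fn main() {
    // Layer 2 on LLC 1 of a 4-LLC machine maps to DSQ 9; the same layer on
    // a flattened single-LLC topology maps to DSQ 2.
    assert_eq!(layer_dsq_id(2, 1, 4), 9);
    assert_eq!(layer_dsq_id(2, 0, 1), 2);
}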

@ -1336,7 +1336,7 @@ impl<'a> Scheduler<'a> {
layer.perf = u32::try_from(*perf)?;
layer.node_mask = nodemask_from_nodes(nodes) as u64;
for topo_node in topo.nodes() {
if !nodes.contains(&topo_node.id()) {
if !nodes.is_empty() && !nodes.contains(&topo_node.id()) {
continue;
}
layer.cache_mask |= cachemask_from_llcs(&topo_node.llcs()) as u64;
@ -1396,24 +1396,30 @@ impl<'a> Scheduler<'a> {
open_object: &'a mut MaybeUninit<OpenObject>,
) -> Result<Self> {
let nr_layers = layer_specs.len();
let topo = Topology::new()?;
let cpu_pool = CpuPool::new(&topo)?;
let mut disable_topology = opts.disable_topology.unwrap_or(false);
let disable_topology = if let Some(val) = opts.disable_topology {
val
let topo = if disable_topology {
Topology::with_flattened_llc_node()?
} else {
let val = if topo.nodes().len() > 1 {
false
} else {
topo.nodes().iter().all(|n| n.llcs().len() <= 1)
Topology::new()?
};
if !disable_topology {
if topo.nodes().len() == 1 && topo.nodes()[0].llcs().len() == 1 {
disable_topology = true;
};
info!(
"Topology awareness not specified, selecting {} based on hardware",
if val { "disabled" } else { "enabled" }
if disable_topology {
"disabled"
} else {
"enabled"
}
);
val
};
let cpu_pool = CpuPool::new(&topo)?;
// If disabling topology awareness clear out any set NUMA/LLC configs and
// it will fallback to using all cores.
let layer_specs: Vec<_> = if disable_topology {