Merge pull request #934 from sched-ext/htejun/layered-updates
scx_layered: Cleanups around topology handling
commit 88c7d47314
@@ -290,18 +290,7 @@ pub struct Topology {
 }
 
 impl Topology {
-    /// Build a complete host Topology
-    pub fn new() -> Result<Topology> {
-        let span = cpus_online()?;
-        // If the kernel is compiled with CONFIG_NUMA, then build a topology
-        // from the NUMA hierarchy in sysfs. Otherwise, just make a single
-        // default node of ID 0 which contains all cores.
-        let nodes = if Path::new("/sys/devices/system/node").exists() {
-            create_numa_nodes(&span)?
-        } else {
-            create_default_node(&span)?
-        };
-
+    fn instantiate(span: Cpumask, nodes: Vec<Node>) -> Result<Self> {
         // For convenient and efficient lookup from the root topology object,
         // create two BTreeMaps to the full set of Core and Cpu objects on the
         // system. We clone the objects that are located further down in the
@@ -332,6 +321,27 @@ impl Topology {
         })
     }
 
+    /// Build a complete host Topology
+    pub fn new() -> Result<Topology> {
+        let span = cpus_online()?;
+        // If the kernel is compiled with CONFIG_NUMA, then build a topology
+        // from the NUMA hierarchy in sysfs. Otherwise, just make a single
+        // default node of ID 0 which contains all cores.
+        let nodes = if Path::new("/sys/devices/system/node").exists() {
+            create_numa_nodes(&span)?
+        } else {
+            create_default_node(&span, false)?
+        };
+
+        Self::instantiate(span, nodes)
+    }
+
+    pub fn with_flattened_llc_node() -> Result<Topology> {
+        let span = cpus_online()?;
+        let nodes = create_default_node(&span, true)?;
+        Self::instantiate(span, nodes)
+    }
+
     /// Get a slice of the NUMA nodes on the host.
     pub fn nodes(&self) -> &[Node] {
         &self.nodes
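
The split into instantiate() plus two constructors lets callers pick the LLC-flattened view explicitly instead of branching on a disable_topology flag inside the library. A minimal usage sketch, assuming the scx_utils crate's Topology API exactly as shown in this diff (the helper name build_topology is illustrative, not part of the crate):

    // Sketch only: relies on scx_utils::Topology as introduced above.
    use anyhow::Result;
    use scx_utils::Topology;

    fn build_topology(disable_topology: bool) -> Result<Topology> {
        if disable_topology {
            // Single default node with every CPU forced onto LLC 0.
            Topology::with_flattened_llc_node()
        } else {
            // Full NUMA/LLC hierarchy read from sysfs.
            Topology::new()
        }
    }
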
@@ -518,6 +528,7 @@ fn create_insert_cpu(
     node: &mut Node,
     online_mask: &Cpumask,
     avg_cpu_freq: Option<(usize, usize)>,
+    flatten_llc: bool,
 ) -> Result<()> {
     // CPU is offline. The Topology hierarchy is read-only, and assumes
     // that hotplug will cause the scheduler to restart. Thus, we can
@@ -542,7 +553,7 @@ fn create_insert_cpu(
     let l2_id = read_file_usize(&cache_path.join(format!("index{}", 2)).join("id")).unwrap_or(0);
     let l3_id = read_file_usize(&cache_path.join(format!("index{}", 3)).join("id")).unwrap_or(0);
     // Assume that LLC is always 3.
-    let llc_id = l3_id;
+    let llc_id = if flatten_llc { 0 } else { l3_id };
 
     // Min and max frequencies. If the kernel is not compiled with
     // CONFIG_CPU_FREQ, just assume 0 for both frequencies.
@@ -647,7 +658,7 @@ fn avg_cpu_freq() -> Option<(usize, usize)> {
     Some((avg_base_freq / nr_cpus, top_max_freq))
 }
 
-fn create_default_node(online_mask: &Cpumask) -> Result<Vec<Node>> {
+fn create_default_node(online_mask: &Cpumask, flatten_llc: bool) -> Result<Vec<Node>> {
     let mut nodes: Vec<Node> = Vec::with_capacity(1);
 
     let mut node = Node {
@@ -678,7 +689,7 @@ fn create_default_node(online_mask: &Cpumask) -> Result<Vec<Node>> {
     let avg_cpu_freq = avg_cpu_freq();
     let cpu_ids = read_cpu_ids()?;
     for cpu_id in cpu_ids.iter() {
-        create_insert_cpu(*cpu_id, &mut node, &online_mask, avg_cpu_freq)?;
+        create_insert_cpu(*cpu_id, &mut node, &online_mask, avg_cpu_freq, flatten_llc)?;
     }
 
     nodes.push(node);
@@ -734,7 +745,7 @@ fn create_numa_nodes(online_mask: &Cpumask) -> Result<Vec<Node>> {
             }
         };
 
-        create_insert_cpu(cpu_id, &mut node, &online_mask, avg_cpu_freq)?;
+        create_insert_cpu(cpu_id, &mut node, &online_mask, avg_cpu_freq, false)?;
     }
 
     nodes.push(node);
@@ -112,6 +112,7 @@ struct cpu_ctx {
	u64 gstats[NR_GSTATS];
	u64 lstats[MAX_LAYERS][NR_LSTATS];
	u64 ran_current_for;
+	u64 hi_fallback_dsq_id;
	u32 layer_idx;
	u32 cache_idx;
	u32 node_idx;
@@ -115,15 +115,15 @@ u32 rotate_llc_id(u32 base_llc_id, u32 rotation)
 // return the dsq id for the layer based on the LLC id.
 static __noinline u64 layer_dsq_id(u32 layer_id, u32 llc_id)
 {
-	return (layer_id * nr_llcs) + llc_id;
+	if (nr_llcs == 1)
+		return layer_id;
+	else
+		return (layer_id * nr_llcs) + llc_id;
 }
 
-// XXX - cpu_to_llc_id() must not be inlined to not blow past ins limit when
-// topo is enabled but older kernels get confused by RCU state when subprogs are
-// called from sleepable functions. Use __always_inline variant from
-// layered_init() and __noinline from everywhere else. Remove this once we can
-// ignore the older kernels.
-static __always_inline u32 __cpu_to_llc_id(s32 cpu_id)
+// XXX - older kernels get confused by RCU state when subprogs are called from
+// sleepable functions. Use __always_inline.
+static __always_inline u32 cpu_to_llc_id(s32 cpu_id)
 {
	const volatile u32 *llc_ptr;
 
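
The nr_llcs == 1 special case keeps the DSQ numbering of the old disable_topology path: with a single (possibly flattened) LLC, each layer's DSQ id is just the layer id, so no separate per-layer DSQ scheme is needed. A small model of the mapping, written as runnable Rust purely for illustration (not the BPF code itself):

    // Illustrative model of layer_dsq_id().
    fn layer_dsq_id(layer_id: u64, llc_id: u64, nr_llcs: u64) -> u64 {
        if nr_llcs == 1 {
            layer_id
        } else {
            layer_id * nr_llcs + llc_id
        }
    }

    fn main() {
        // Flattened or single-LLC host: layer 3 owns DSQ 3.
        assert_eq!(layer_dsq_id(3, 0, 1), 3);
        // Topology-aware host with 4 LLCs: layer 3, LLC 1 -> DSQ 13.
        assert_eq!(layer_dsq_id(3, 1, 4), 13);
    }
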
@@ -135,11 +135,6 @@ static __always_inline u32 __cpu_to_llc_id(s32 cpu_id)
	return *llc_ptr;
 }
 
-static __noinline u32 cpu_to_llc_id(u32 cpu_id)
-{
-	return __cpu_to_llc_id(cpu_id);
-}
-
 u32 llc_node_id(u32 llc_id)
 {
	const volatile u32 *llc_ptr;
@@ -162,33 +157,6 @@ static inline bool is_fallback_dsq(u64 dsq_id)
	return dsq_id > HI_FALLBACK_DSQ_BASE && dsq_id <= LO_FALLBACK_DSQ;
 }
 
-static u64 llc_hi_fallback_dsq_iter_offset(int llc_offset, int idx)
-{
-	int offset = llc_offset + idx;
-
-	if (offset >= nr_llcs)
-		return llc_hi_fallback_dsq_id(offset - nr_llcs);
-
-	return llc_hi_fallback_dsq_id(idx + llc_offset);
-}
-
-static int llc_iter_cpu_offset(int idx, s32 cpu)
-{
-	int offset;
-
-	if (cpu <= 0)
-		return idx;
-
-	offset = (cpu % nr_llcs) + idx;
-
-	return offset >= nr_llcs ? offset - nr_llcs : offset;
-}
-
-static u64 cpu_hi_fallback_dsq_id(s32 cpu)
-{
-	return llc_hi_fallback_dsq_id(cpu_to_llc_id(cpu));
-}
-
 struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
@@ -1108,16 +1076,15 @@ preempt_fail:
 
 void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 {
-	struct cpu_ctx *cctx;
+	struct cpu_ctx *cctx, *task_cctx;
	struct task_ctx *tctx;
	struct layer *layer;
	s32 task_cpu = scx_bpf_task_cpu(p);
	u64 vtime = p->scx.dsq_vtime;
	bool try_preempt_first;
-	u32 idx;
 
-	if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) ||
-	    !(layer = lookup_layer(tctx->layer)))
+	if (!(cctx = lookup_cpu_ctx(-1)) || !(task_cctx = lookup_cpu_ctx(task_cpu)) ||
+	    !(tctx = lookup_task_ctx(p)) || !(layer = lookup_layer(tctx->layer)))
		return;
 
	try_preempt_first = cctx->try_preempt_first;
@@ -1160,9 +1127,8 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
	    !bpf_cpumask_test_cpu(task_cpu, layer_cpumask))
		lstat_inc(LSTAT_AFFN_VIOL, layer, cctx);
 
-		idx = cpu_hi_fallback_dsq_id(task_cpu);
-		tctx->last_dsq = idx;
-		scx_bpf_dispatch(p, idx, slice_ns, enq_flags);
+		tctx->last_dsq = task_cctx->hi_fallback_dsq_id;
+		scx_bpf_dispatch(p, tctx->last_dsq, slice_ns, enq_flags);
		goto preempt;
	}
 
@@ -1186,22 +1152,13 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
		 * to the LLC local HI_FALLBACK_DSQ to avoid this starvation
		 * issue.
		 */
-		idx = cpu_hi_fallback_dsq_id(task_cpu);
-		scx_bpf_dispatch(p, idx, slice_ns, enq_flags);
-		tctx->last_dsq = idx;
+		tctx->last_dsq = task_cctx->hi_fallback_dsq_id;
+		scx_bpf_dispatch(p, tctx->last_dsq, slice_ns, enq_flags);
		goto preempt;
	}
 
-	if (disable_topology) {
-		tctx->last_dsq = tctx->layer;
-		scx_bpf_dispatch_vtime(p, tctx->layer, slice_ns, vtime, enq_flags);
-	} else {
-		u32 llc_id = cpu_to_llc_id(tctx->last_cpu >= 0 ? tctx->last_cpu :
-					   bpf_get_smp_processor_id());
-		idx = layer_dsq_id(layer->idx, llc_id);
-		tctx->last_dsq = idx;
-		scx_bpf_dispatch_vtime(p, idx, slice_ns, vtime, enq_flags);
-	}
+	tctx->last_dsq = layer_dsq_id(layer->idx, task_cctx->cache_idx);
+	scx_bpf_dispatch_vtime(p, tctx->last_dsq, slice_ns, vtime, enq_flags);
 
 preempt:
	try_preempt(task_cpu, p, tctx, try_preempt_first, enq_flags);
@@ -1247,21 +1204,11 @@ static bool keep_running(struct cpu_ctx *cctx, struct task_struct *p)
		 * have tasks waiting, keep running it. If there are multiple
		 * competing preempting layers, this won't work well.
		 */
-		if (disable_topology) {
-			if (!scx_bpf_dsq_nr_queued(layer->idx)) {
-				p->scx.slice = slice_ns;
-				lstat_inc(LSTAT_KEEP, layer, cctx);
-				return true;
-			}
-		} else {
-			u32 dsq_id = cpu_to_llc_id(tctx->last_cpu >= 0 ?
-						   tctx->last_cpu :
-						   bpf_get_smp_processor_id());
-			if (!scx_bpf_dsq_nr_queued(dsq_id)) {
-				p->scx.slice = slice_ns;
-				lstat_inc(LSTAT_KEEP, layer, cctx);
-				return true;
-			}
-		}
+		u32 dsq_id = layer_dsq_id(layer->idx, cctx->cache_idx);
+		if (!scx_bpf_dsq_nr_queued(dsq_id)) {
+			p->scx.slice = slice_ns;
+			lstat_inc(LSTAT_KEEP, layer, cctx);
+			return true;
+		}
	} else {
		const struct cpumask *idle_cpumask = scx_bpf_get_idle_cpumask();
@@ -1465,8 +1412,7 @@ void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
		return;
	}
 
-	dsq_id = cpu_hi_fallback_dsq_id(cpu);
-	if (scx_bpf_consume(dsq_id))
+	if (scx_bpf_consume(cctx->hi_fallback_dsq_id))
		return;
 
	/* consume !open layers second */
@@ -1725,8 +1671,6 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
		return;
	}
 
-	u32 my_llc_id = cpu_to_llc_id(cpu);
-
	/*
	 * If one of the fallback DSQs has the most budget then consume from it
	 * to prevent starvation.
@@ -1741,19 +1685,18 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
	}
 
	/* consume preempting layers first */
-	if (consume_preempting(costc, my_llc_id) == 0)
+	if (consume_preempting(costc, cctx->cache_idx) == 0)
		return;
 
-	dsq_id = cpu_hi_fallback_dsq_id(cpu);
-	if (scx_bpf_consume(dsq_id))
+	if (scx_bpf_consume(cctx->hi_fallback_dsq_id))
		return;
 
	/* consume !open layers second */
-	if (consume_non_open(costc, cpu, my_llc_id) == 0)
+	if (consume_non_open(costc, cpu, cctx->cache_idx) == 0)
		return;
 
	/* consume !preempting open layers */
-	if (consume_open_no_preempt(costc, my_llc_id) == 0)
+	if (consume_open_no_preempt(costc, cctx->cache_idx) == 0)
		return;
 
	scx_bpf_consume(LO_FALLBACK_DSQ);
@@ -2036,13 +1979,14 @@ static s32 create_cache(u32 cache_id)
			return -ENOENT;
		}
 
-		llc_id = __cpu_to_llc_id(cpu);
+		llc_id = cpu_to_llc_id(cpu);
		if (llc_id != cache_id)
			continue;
 
		bpf_cpumask_set_cpu(cpu, cpumask);
		cachec->nr_cpus++;
		cctx->cache_idx = cache_id;
+		cctx->hi_fallback_dsq_id = llc_hi_fallback_dsq_id(cache_id);
	}
 
	dbg("CFG creating cache %d with %d cpus", cache_id, cachec->nr_cpus);
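
Storing hi_fallback_dsq_id in cpu_ctx at cache-creation time is what lets the enqueue and dispatch paths above drop the removed cpu_hi_fallback_dsq_id() helper: the per-CPU id is computed once here and then read directly on hot paths. A toy Rust model of that precomputation (the id encoding below is an assumption for illustration, not the scheduler's actual constants):

    // Toy model: precompute each CPU's high-fallback DSQ id once, as
    // create_cache() now does via cctx->hi_fallback_dsq_id.
    fn llc_hi_fallback_dsq_id(llc_id: u64) -> u64 {
        // Placeholder encoding; the real base constant lives in the BPF code.
        const HI_FALLBACK_DSQ_BASE: u64 = 1 << 32;
        HI_FALLBACK_DSQ_BASE + llc_id + 1
    }

    fn main() {
        // Four CPUs: CPUs 0-1 on LLC 0, CPUs 2-3 on LLC 1.
        let cpu_to_llc = [0u64, 0, 1, 1];
        let hi_fallback: Vec<u64> =
            cpu_to_llc.iter().map(|&llc| llc_hi_fallback_dsq_id(llc)).collect();
        assert_eq!(hi_fallback[3], llc_hi_fallback_dsq_id(1));
        println!("{:?}", hi_fallback);
    }
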
@@ -2496,23 +2440,16 @@ void BPF_STRUCT_OPS(layered_dump, struct scx_dump_ctx *dctx)
			continue;
		}
 
-		if (disable_topology) {
-			scx_bpf_dump("LAYER[%d][%s] nr_cpus=%u nr_queued=%d -%llums cpus=",
-				     i, layer->name, layer->nr_cpus,
-				     scx_bpf_dsq_nr_queued(i),
-				     dsq_first_runnable_for_ms(i, now));
-		} else {
-			bpf_for(j, 0, nr_llcs) {
-				if (!(layer->cache_mask & (1 << j)))
-					continue;
+		bpf_for(j, 0, nr_llcs) {
+			if (!(layer->cache_mask & (1 << j)))
+				continue;
 
-				idx = layer_dsq_id(layer->idx, j);
-				scx_bpf_dump("LAYER[%d][%s]DSQ[%d] nr_cpus=%u nr_queued=%d -%llums cpus=",
-					     i, layer->name, idx, layer->nr_cpus,
-					     scx_bpf_dsq_nr_queued(idx),
-					     dsq_first_runnable_for_ms(idx, now));
-				scx_bpf_dump("\n");
-			}
-		}
+			idx = layer_dsq_id(layer->idx, j);
+			scx_bpf_dump("LAYER[%d][%s]DSQ[%d] nr_cpus=%u nr_queued=%d -%llums cpus=",
+				     i, layer->name, idx, layer->nr_cpus,
+				     scx_bpf_dsq_nr_queued(idx),
+				     dsq_first_runnable_for_ms(idx, now));
+			scx_bpf_dump("\n");
+		}
		dump_layer_cpumask(i);
		scx_bpf_dump("\n");
@@ -2638,8 +2575,8 @@ unlock:
  */
 static bool antistall_scan(void)
 {
-	s32 cpu;
-	u64 dsq_id;
+	s32 llc;
+	u64 layer_id;
	u64 jiffies_now;
 
	if (!enable_antistall)
@@ -2647,14 +2584,12 @@ static bool antistall_scan(void)
 
	jiffies_now = bpf_jiffies64();
 
-	bpf_for(dsq_id, 0, nr_layers) {
-		antistall_set(dsq_id, jiffies_now);
-	}
+	bpf_for(layer_id, 0, nr_layers)
+		bpf_for(llc, 0, nr_llcs)
+			antistall_set(layer_dsq_id(layer_id, llc), jiffies_now);
 
-	bpf_for(cpu, 0, nr_possible_cpus) {
-		dsq_id = cpu_hi_fallback_dsq_id(cpu);
-		antistall_set(dsq_id, jiffies_now);
-	}
+	bpf_for(llc, 0, nr_llcs)
+		antistall_set(llc_hi_fallback_dsq_id(llc), jiffies_now);
 
	antistall_set(LO_FALLBACK_DSQ, jiffies_now);
 
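
After this change antistall_scan() walks DSQs by construction rather than per CPU: every (layer, LLC) DSQ, then each LLC's high-fallback DSQ, then LO_FALLBACK_DSQ. A small Rust enumeration of the per-layer/LLC part of that set, for illustration only:

    // Toy enumeration (not the BPF code) of the (layer, LLC) DSQ ids scanned.
    fn layer_llc_dsqs(nr_layers: u64, nr_llcs: u64) -> Vec<u64> {
        (0..nr_layers)
            .flat_map(|layer| (0..nr_llcs).map(move |llc| layer * nr_llcs + llc))
            .collect()
    }

    fn main() {
        // 2 layers x 3 LLCs -> ids 0..=5 under the layer_dsq_id() layout above.
        assert_eq!(layer_llc_dsqs(2, 3), vec![0, 1, 2, 3, 4, 5]);
    }
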
@@ -1336,7 +1336,7 @@ impl<'a> Scheduler<'a> {
            layer.perf = u32::try_from(*perf)?;
            layer.node_mask = nodemask_from_nodes(nodes) as u64;
            for topo_node in topo.nodes() {
-                if !nodes.contains(&topo_node.id()) {
+                if !nodes.is_empty() && !nodes.contains(&topo_node.id()) {
                    continue;
                }
                layer.cache_mask |= cachemask_from_llcs(&topo_node.llcs()) as u64;
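
With the added !nodes.is_empty() guard, a layer spec that names no NUMA nodes now pulls LLCs from every node instead of none. A condensed restatement of the check, for illustration:

    // Minimal restatement of the guard above.
    fn node_included(nodes: &[usize], node_id: usize) -> bool {
        nodes.is_empty() || nodes.contains(&node_id)
    }

    fn main() {
        assert!(node_included(&[], 3));      // no nodes listed: include all
        assert!(node_included(&[0, 3], 3));  // explicitly listed
        assert!(!node_included(&[0, 1], 3)); // listed but not this one
    }
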
@@ -1396,24 +1396,30 @@ impl<'a> Scheduler<'a> {
        open_object: &'a mut MaybeUninit<OpenObject>,
    ) -> Result<Self> {
        let nr_layers = layer_specs.len();
-        let topo = Topology::new()?;
-        let cpu_pool = CpuPool::new(&topo)?;
+        let mut disable_topology = opts.disable_topology.unwrap_or(false);
 
-        let disable_topology = if let Some(val) = opts.disable_topology {
-            val
+        let topo = if disable_topology {
+            Topology::with_flattened_llc_node()?
        } else {
-            let val = if topo.nodes().len() > 1 {
-                false
-            } else {
-                topo.nodes().iter().all(|n| n.llcs().len() <= 1)
-            };
+            Topology::new()?
+        };
+
+        if !disable_topology {
+            if topo.nodes().len() == 1 && topo.nodes()[0].llcs().len() == 1 {
+                disable_topology = true;
+            };
            info!(
                "Topology awareness not specified, selecting {} based on hardware",
-                if val { "disabled" } else { "enabled" }
+                if disable_topology {
+                    "disabled"
+                } else {
+                    "enabled"
+                }
            );
-            val
        };
 
+        let cpu_pool = CpuPool::new(&topo)?;
+
        // If disabling topology awareness clear out any set NUMA/LLC configs and
        // it will fallback to using all cores.
        let layer_specs: Vec<_> = if disable_topology {
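
The flag handling above can be read as: take the user's --disable-topology value (default false), build the matching Topology, then force topology awareness off when the machine has a single node with a single LLC. A compressed sketch of that flag logic, assuming the Topology accessors shown earlier (nodes(), llcs()); not the scheduler's exact code:

    use scx_utils::Topology;

    fn effective_disable_topology(topo: &Topology, opt: Option<bool>) -> bool {
        let mut disable = opt.unwrap_or(false);
        if !disable && topo.nodes().len() == 1 && topo.nodes()[0].llcs().len() == 1 {
            // Nothing to be topology-aware about on a single-node, single-LLC host.
            disable = true;
        }
        disable
    }
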