mirror of https://github.com/sched-ext/scx.git (synced 2024-11-24 20:00:22 +00:00)

Merge pull request #666 from hodgesds/layered-local-llc

scx_layered: Add topology aware preemption

commit 1a2f82b91c
@@ -63,6 +63,8 @@ enum layer_stat_idx {
 	LSTAT_KEEP_FAIL_BUSY,
 	LSTAT_PREEMPT,
 	LSTAT_PREEMPT_FIRST,
+	LSTAT_PREEMPT_XLLC,
+	LSTAT_PREEMPT_XNUMA,
 	LSTAT_PREEMPT_IDLE,
 	LSTAT_PREEMPT_FAIL,
 	LSTAT_EXCL_COLLISION,
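The two stats added here, LSTAT_PREEMPT_XLLC and LSTAT_PREEMPT_XNUMA, count successful preemptions that had to go outside the current CPU's last-level cache and outside its NUMA node, respectively; they are bumped by the new try_preempt() below and surfaced as percentages in the Rust-side LayerStats.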
@@ -88,19 +90,21 @@ struct cpu_ctx {
 	u64 lstats[MAX_LAYERS][NR_LSTATS];
 	u64 ran_current_for;
 	u32 layer_idx;
+	u32 cache_idx;
 	u32 node_idx;
 };

 struct cache_ctx {
 	u32 id;
 	struct bpf_cpumask __kptr *cpumask;
+	u32 nr_cpus;
 };

 struct node_ctx {
 	u32 id;
 	struct bpf_cpumask __kptr *cpumask;
 	u32 nr_llcs;
 	u32 nr_cpus;
 	u64 llc_mask;
 };

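node_ctx carries per-node LLC bookkeeping alongside its cpumask: nr_llcs counts the LLCs on the node and llc_mask appears to encode them as a u64 bitmap, one bit per LLC id (which assumes LLC ids below 64). A minimal plain-C sketch of that bitmap encoding, using hypothetical LLC ids rather than the BPF code:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t llc_mask = 0;
	int node_llcs[] = { 0, 1 };	/* hypothetical: this node owns LLCs 0 and 1 */
	unsigned i;

	/* set one bit per LLC id owned by the node */
	for (i = 0; i < sizeof(node_llcs) / sizeof(node_llcs[0]); i++)
		llc_mask |= 1ULL << node_llcs[i];

	/* membership test: does LLC 1 belong to this node? */
	printf("llc 1 in node: %d\n", !!(llc_mask & (1ULL << 1)));
	return 0;
}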
@@ -24,6 +24,7 @@ const volatile u32 nr_nodes = 32;	/* !0 for veristat, set during init */
 const volatile u32 nr_llcs = 32;	/* !0 for veristat, set during init */
 const volatile bool smt_enabled = true;
 const volatile bool disable_topology = false;
+const volatile bool xnuma_preemption = false;
 const volatile s32 __sibling_cpu[MAX_CPUS];
 const volatile unsigned char all_cpus[MAX_CPUS_U8];

@@ -657,8 +658,9 @@ bool pick_idle_cpu_and_kick(struct task_struct *p, s32 task_cpu,
 }

 static __always_inline
-bool try_preempt(s32 cand, struct task_struct *p, struct cpu_ctx *cctx,
-		 struct task_ctx *tctx, struct layer *layer, bool preempt_first)
+bool try_preempt_cpu(s32 cand, struct task_struct *p, struct cpu_ctx *cctx,
+		     struct task_ctx *tctx, struct layer *layer,
+		     bool preempt_first)
 {
 	struct cpu_ctx *cand_cctx, *sib_cctx = NULL;
 	s32 sib;
@@ -704,6 +706,170 @@ bool try_preempt(s32 cand, struct task_struct *p, struct cpu_ctx *cctx,
 	return true;
 }

+static __always_inline
+void try_preempt(s32 task_cpu, struct task_struct *p, struct task_ctx *tctx,
+		 bool preempt_first, u64 enq_flags)
+{
+	struct bpf_cpumask *attempted, *topo_cpus;
+	struct cache_ctx *cachec;
+	struct cpu_ctx *cctx;
+	struct layer *layer;
+	struct node_ctx *nodec;
+	u32 idx;
+
+	if (!(layer = lookup_layer(tctx->layer)) || !(cctx = lookup_cpu_ctx(-1)))
+		return;
+
+	if (preempt_first) {
+		/*
+		 * @p prefers to preempt its previous CPU even when there are
+		 * other idle CPUs.
+		 */
+		if (try_preempt_cpu(task_cpu, p, cctx, tctx, layer, true))
+			return;
+		/* we skipped idle CPU picking in select_cpu. Do it here. */
+		if (pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx, layer))
+			return;
+	} else {
+		/*
+		 * If we aren't in the wakeup path, layered_select_cpu() hasn't
+		 * run and thus we haven't looked for and kicked an idle CPU.
+		 * Let's do it now.
+		 */
+		if (!(enq_flags & SCX_ENQ_WAKEUP) &&
+		    pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx, layer))
+			return;
+		if (!layer->preempt)
+			return;
+		if (try_preempt_cpu(task_cpu, p, cctx, tctx, layer, false))
+			return;
+	}
+
+	if (!disable_topology) {
+		if (!(cachec = lookup_cache_ctx(cctx->cache_idx)) ||
+		    !(nodec = lookup_node_ctx(cctx->node_idx)))
+			return;
+
+		attempted = bpf_cpumask_create();
+		if (!attempted)
+			goto preempt_fail;
+
+		topo_cpus = bpf_cpumask_create();
+		if (!topo_cpus) {
+			bpf_cpumask_release(attempted);
+			goto preempt_fail;
+		}
+
+		if (!cachec->cpumask) {
+			bpf_cpumask_release(attempted);
+			bpf_cpumask_release(topo_cpus);
+			goto preempt_fail;
+		}
+
+		bpf_cpumask_copy(topo_cpus, cast_mask(cachec->cpumask));
+
+		/*
+		 * First try preempting in the local LLC
+		 */
+		bpf_for(idx, 0, cachec->nr_cpus) {
+			s32 preempt_cpu = bpf_cpumask_any_distribute(cast_mask(topo_cpus));
+			trace("PREEMPT attempt on cpu %d from cpu %d",
+			      preempt_cpu, bpf_get_smp_processor_id());
+			if (preempt_cpu > cachec->nr_cpus)
+				break;
+
+			if (try_preempt_cpu(preempt_cpu, p, cctx, tctx, layer, false)) {
+				bpf_cpumask_release(attempted);
+				bpf_cpumask_release(topo_cpus);
+				return;
+			}
+			bpf_cpumask_clear_cpu(preempt_cpu, topo_cpus);
+			bpf_cpumask_set_cpu(preempt_cpu, attempted);
+		}
+
+		/*
+		 * Next try node local LLC
+		 */
+		if (!nodec->cpumask) {
+			bpf_cpumask_release(attempted);
+			bpf_cpumask_release(topo_cpus);
+			goto preempt_fail;
+		}
+
+		bpf_cpumask_copy(topo_cpus, cast_mask(nodec->cpumask));
+		bpf_cpumask_xor(topo_cpus, cast_mask(attempted), cast_mask(topo_cpus));
+
+		bpf_for(idx, 0, nodec->nr_cpus) {
+			s32 preempt_cpu = bpf_cpumask_any_distribute(cast_mask(topo_cpus));
+			if (try_preempt_cpu(preempt_cpu, p, cctx, tctx, layer, false)) {
+				bpf_cpumask_release(attempted);
+				bpf_cpumask_release(topo_cpus);
+				lstat_inc(LSTAT_PREEMPT_XLLC, layer, cctx);
+				return;
+			}
+			bpf_cpumask_clear_cpu(preempt_cpu, topo_cpus);
+			bpf_cpumask_set_cpu(preempt_cpu, attempted);
+			if (bpf_cpumask_empty(cast_mask(topo_cpus)))
+				break;
+		}
+
+		/*
+		 * Finally try across nodes
+		 */
+		if (xnuma_preemption) {
+			if (!all_cpumask) {
+				bpf_cpumask_release(attempted);
+				bpf_cpumask_release(topo_cpus);
+				goto preempt_fail;
+			}
+			bpf_cpumask_copy(topo_cpus, cast_mask(all_cpumask));
+			bpf_cpumask_xor(topo_cpus, cast_mask(attempted), cast_mask(topo_cpus));
+
+			bpf_for(idx, 0, nr_possible_cpus) {
+				s32 preempt_cpu = bpf_cpumask_any_distribute(cast_mask(topo_cpus));
+				if (try_preempt_cpu(preempt_cpu, p, cctx, tctx, layer, false)) {
+					bpf_cpumask_release(attempted);
+					bpf_cpumask_release(topo_cpus);
+					lstat_inc(LSTAT_PREEMPT_XNUMA, layer, cctx);
+					return;
+				}
+				bpf_cpumask_clear_cpu(preempt_cpu, topo_cpus);
+				bpf_cpumask_set_cpu(preempt_cpu, attempted);
+				if (bpf_cpumask_empty(cast_mask(topo_cpus)))
+					break;
+			}
+		}
+		bpf_cpumask_release(attempted);
+		bpf_cpumask_release(topo_cpus);
+	} else {
+
+		bpf_for(idx, 0, nr_possible_cpus) {
+			s32 cand = (preempt_cursor + idx) % nr_possible_cpus;
+
+			if (try_preempt_cpu(cand, p, cctx, tctx, layer, false)) {
+				/*
+				 * Round-robining doesn't have to be strict. Let's
+				 * not bother with atomic ops on $preempt_cursor.
+				 */
+				preempt_cursor = (cand + 1) % nr_possible_cpus;
+				struct cpu_ctx *new_cctx;
+				if ((new_cctx = lookup_cpu_ctx(cand))) {
+					if (new_cctx->node_idx == nodec->id)
+						lstat_inc(LSTAT_PREEMPT_XLLC, layer, cctx);
+					if (new_cctx->node_idx != nodec->id)
+						lstat_inc(LSTAT_PREEMPT_XNUMA, layer, cctx);
+				}
+				return;
+			}
+		}
+	}
+
+	lstat_inc(LSTAT_PREEMPT_FAIL, layer, cctx);
+
+preempt_fail:
+	lstat_inc(LSTAT_PREEMPT_FAIL, layer, cctx);
+}

 void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 {
 	struct cpu_ctx *cctx;
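To make the new control flow easier to follow, here is a minimal plain-C model of the escalation order try_preempt() walks when topology awareness is enabled: the task's previous CPU first, then CPUs sharing the local LLC, then the rest of the NUMA node (counted as LSTAT_PREEMPT_XLLC), and finally remote nodes only when xnuma_preemption is set (counted as LSTAT_PREEMPT_XNUMA). The CPU layout and helpers below are hypothetical stand-ins, not the BPF implementation:

/*
 * Illustrative plain-C model of the topology-aware preemption order.
 * CPU ids, LLC ids and node ids are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

struct cpu { int llc; int node; bool preemptible; };

/* stand-in for try_preempt_cpu(): succeeds only on preemptible CPUs */
static bool try_preempt_cpu(const struct cpu *cpus, int cand)
{
	return cpus[cand].preemptible;
}

/* returns the preempted CPU or -1; mirrors the LLC -> node -> cross-NUMA order */
static int try_preempt(const struct cpu *cpus, int task_cpu, bool xnuma_preemption)
{
	const struct cpu *self = &cpus[task_cpu];

	/* 1) previous CPU first */
	if (try_preempt_cpu(cpus, task_cpu))
		return task_cpu;

	/* 2) CPUs sharing the local LLC (no XLLC/XNUMA stat bump) */
	for (int c = 0; c < NR_CPUS; c++)
		if (cpus[c].llc == self->llc && try_preempt_cpu(cpus, c))
			return c;

	/* 3) other LLCs on the same NUMA node (would bump LSTAT_PREEMPT_XLLC) */
	for (int c = 0; c < NR_CPUS; c++)
		if (cpus[c].node == self->node && cpus[c].llc != self->llc &&
		    try_preempt_cpu(cpus, c))
			return c;

	/* 4) remote nodes, only if opted in (would bump LSTAT_PREEMPT_XNUMA) */
	if (xnuma_preemption)
		for (int c = 0; c < NR_CPUS; c++)
			if (cpus[c].node != self->node && try_preempt_cpu(cpus, c))
				return c;

	return -1;	/* LSTAT_PREEMPT_FAIL */
}

int main(void)
{
	struct cpu cpus[NR_CPUS] = {
		/* node 0, LLC 0 */ {0, 0, false}, {0, 0, false},
		/* node 0, LLC 1 */ {1, 0, false}, {1, 0, true},
		/* node 1, LLC 2 */ {2, 1, true},  {2, 1, false},
		/* node 1, LLC 3 */ {3, 1, false}, {3, 1, false},
	};

	printf("preempted cpu: %d\n", try_preempt(cpus, 0, true));	/* -> 3 (same node, other LLC) */
	return 0;
}

The real code additionally randomizes candidate selection with bpf_cpumask_any_distribute() and tracks already-tried CPUs in the attempted mask rather than iterating CPU ids linearly.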
@@ -760,7 +926,7 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)

 		idx = cpu_hi_fallback_dsq_id(task_cpu);
 		scx_bpf_dispatch(p, idx, slice_ns, enq_flags);
-		goto find_cpu;
+		goto preempt;
 	}

 	/*
@@ -785,7 +951,7 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 		 */
 		idx = cpu_hi_fallback_dsq_id(task_cpu);
 		scx_bpf_dispatch(p, idx, slice_ns, enq_flags);
-		goto find_cpu;
+		goto preempt;
 	}

 	if (disable_topology) {
@@ -797,46 +963,8 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 		scx_bpf_dispatch_vtime(p, idx, layer_slice_ns, vtime, enq_flags);
 	}

-find_cpu:
-	if (try_preempt_first) {
-		/*
-		 * @p prefers to preempt its previous CPU even when there are
-		 * other idle CPUs.
-		 */
-		if (try_preempt(task_cpu, p, cctx, tctx, layer, true))
-			return;
-		/* we skipped idle CPU picking in select_cpu. Do it here. */
-		if (pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx, layer))
-			return;
-	} else {
-		/*
-		 * If we aren't in the wakeup path, layered_select_cpu() hasn't
-		 * run and thus we haven't looked for and kicked an idle CPU.
-		 * Let's do it now.
-		 */
-		if (!(enq_flags & SCX_ENQ_WAKEUP) &&
-		    pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx, layer))
-			return;
-		if (!layer->preempt)
-			return;
-		if (try_preempt(task_cpu, p, cctx, tctx, layer, false))
-			return;
-	}
-
-	bpf_for(idx, 0, nr_possible_cpus) {
-		s32 cand = (preempt_cursor + idx) % nr_possible_cpus;
-
-		if (try_preempt(cand, p, cctx, tctx, layer, false)) {
-			/*
-			 * Round-robining doesn't have to be strict. Let's
-			 * not bother with atomic ops on $preempt_cursor.
-			 */
-			preempt_cursor = (cand + 1) % nr_possible_cpus;
-			return;
-		}
-	}
-
-	lstat_inc(LSTAT_PREEMPT_FAIL, layer, cctx);
+preempt:
+	try_preempt(task_cpu, p, tctx, try_preempt_first, enq_flags);
 }

 static bool keep_running(struct cpu_ctx *cctx, struct task_struct *p)
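With this change the enqueue path no longer open-codes preemption: the two fallback-DSQ branches above now jump to the new preempt: label, and everything that used to live under find_cpu: (preempt-first handling, the idle-CPU kick, and the round-robin scan) is handled by the try_preempt() helper introduced earlier.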
@@ -1275,6 +1403,8 @@ static s32 create_node(u32 node_id)
 			break;
 		}
 		cctx->node_idx = node_id;
+		nodec->nr_cpus++;
+		nodec->llc_mask &= (1LLU << node_id);
 	}
 }

@@ -1314,6 +1444,7 @@ static s32 create_cache(u32 cache_id)
 		if (llc_id != cache_id)
 			continue;

+		cachec->nr_cpus++;
 		bpf_cpumask_set_cpu(cpu, cpumask);
 		if (!(cctx = lookup_cpu_ctx(-1))) {
 			scx_bpf_error("cpu ctx error"); ret = -ENOENT; break;
@@ -413,6 +413,10 @@ struct Opts {
     #[clap(short = 't', long)]
     disable_topology: bool,

+    /// Enable cross NUMA preemption.
+    #[clap(long)]
+    xnuma_preemption: bool,
+
     /// Write example layer specifications into the file and exit.
     #[clap(short = 'e', long)]
     example: Option<String>,
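Given clap's usual kebab-case conversion of field names, this presumably surfaces as a --xnuma-preemption command-line flag; combined with the BPF-side default of xnuma_preemption = false, cross-NUMA preemption stays off unless explicitly requested.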
@@ -1870,6 +1874,7 @@ impl<'a, 'b> Scheduler<'a, 'b> {
         skel.maps.rodata_data.nr_possible_cpus = *NR_POSSIBLE_CPUS as u32;
         skel.maps.rodata_data.smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores;
         skel.maps.rodata_data.disable_topology = opts.disable_topology;
+        skel.maps.rodata_data.xnuma_preemption = opts.xnuma_preemption;
         for (cpu, sib) in cpu_pool.sibling_cpu.iter().enumerate() {
             skel.maps.rodata_data.__sibling_cpu[cpu] = *sib;
         }
@@ -75,6 +75,10 @@ pub struct LayerStats {
     pub open_idle: f64,
     #[stat(desc = "% preempted other tasks")]
     pub preempt: f64,
+    #[stat(desc = "% preempted XLLC tasks")]
+    pub preempt_xllc: f64,
+    #[stat(desc = "% preempted XNUMA tasks")]
+    pub preempt_xnuma: f64,
     #[stat(desc = "% first-preempted other tasks")]
     pub preempt_first: f64,
     #[stat(desc = "% idle-preempted other tasks")]
@@ -178,6 +182,8 @@ impl LayerStats {
             min_exec_us: (lstat(bpf_intf::layer_stat_idx_LSTAT_MIN_EXEC_NS) / 1000) as u64,
             open_idle: lstat_pct(bpf_intf::layer_stat_idx_LSTAT_OPEN_IDLE),
             preempt: lstat_pct(bpf_intf::layer_stat_idx_LSTAT_PREEMPT),
+            preempt_xllc: lstat_pct(bpf_intf::layer_stat_idx_LSTAT_PREEMPT_XLLC),
+            preempt_xnuma: lstat_pct(bpf_intf::layer_stat_idx_LSTAT_PREEMPT_XNUMA),
             preempt_first: lstat_pct(bpf_intf::layer_stat_idx_LSTAT_PREEMPT_FIRST),
             preempt_idle: lstat_pct(bpf_intf::layer_stat_idx_LSTAT_PREEMPT_IDLE),
             preempt_fail: lstat_pct(bpf_intf::layer_stat_idx_LSTAT_PREEMPT_FAIL),
@@ -253,10 +259,12 @@ impl LayerStats {

         writeln!(
             w,
-            " {:<width$} preempt/first/idle/fail={}/{}/{}/{} min_exec={}/{:7.2}ms",
+            " {:<width$} preempt/first/xllc/xnuma/idle/fail={}/{}/{}/{}/{}/{} min_exec={}/{:7.2}ms",
             "",
             fmt_pct(self.preempt),
             fmt_pct(self.preempt_first),
+            fmt_pct(self.preempt_xllc),
+            fmt_pct(self.preempt_xnuma),
             fmt_pct(self.preempt_idle),
             fmt_pct(self.preempt_fail),
             fmt_pct(self.min_exec),