diff --git a/scheds/rust/scx_layered/src/bpf/main.bpf.c b/scheds/rust/scx_layered/src/bpf/main.bpf.c
index a191c8a..9e4c7a6 100644
--- a/scheds/rust/scx_layered/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -658,8 +658,9 @@ bool pick_idle_cpu_and_kick(struct task_struct *p, s32 task_cpu,
 }
 
 static __always_inline
-bool try_preempt(s32 cand, struct task_struct *p, struct cpu_ctx *cctx,
-		 struct task_ctx *tctx, struct layer *layer, bool preempt_first)
+bool try_preempt_cpu(s32 cand, struct task_struct *p, struct cpu_ctx *cctx,
+		     struct task_ctx *tctx, struct layer *layer,
+		     bool preempt_first)
 {
 	struct cpu_ctx *cand_cctx, *sib_cctx = NULL;
 	s32 sib;
@@ -705,12 +706,174 @@ bool try_preempt(s32 cand, struct task_struct *p, struct cpu_ctx *cctx,
 	return true;
 }
 
+static __always_inline
+void try_preempt(s32 task_cpu, struct task_struct *p, struct task_ctx *tctx,
+		 bool preempt_first, u64 enq_flags)
+{
+	struct bpf_cpumask *attempted, *topo_cpus;
+	struct cache_ctx *cachec;
+	struct cpu_ctx *cctx;
+	struct layer *layer;
+	struct node_ctx *nodec;
+	u32 idx;
+
+	if (!(layer = lookup_layer(tctx->layer)) || !(cctx = lookup_cpu_ctx(-1)))
+		return;
+
+	if (preempt_first) {
+		/*
+		 * @p prefers to preempt its previous CPU even when there are
+		 * other idle CPUs.
+		 */
+		if (try_preempt_cpu(task_cpu, p, cctx, tctx, layer, true))
+			return;
+		/* we skipped idle CPU picking in select_cpu. Do it here. */
+		if (pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx, layer))
+			return;
+	} else {
+		/*
+		 * If we aren't in the wakeup path, layered_select_cpu() hasn't
+		 * run and thus we haven't looked for and kicked an idle CPU.
+		 * Let's do it now.
+		 */
+		if (!(enq_flags & SCX_ENQ_WAKEUP) &&
+		    pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx, layer))
+			return;
+		if (!layer->preempt)
+			return;
+		if (try_preempt_cpu(task_cpu, p, cctx, tctx, layer, false))
+			return;
+	}
+
+	if (!disable_topology) {
+		if (!(cachec = lookup_cache_ctx(cctx->cache_idx)) ||
+		    !(nodec = lookup_node_ctx(cctx->node_idx)))
+			return;
+
+		attempted = bpf_cpumask_create();
+		if (!attempted)
+			goto preempt_fail;
+
+		topo_cpus = bpf_cpumask_create();
+		if (!topo_cpus) {
+			bpf_cpumask_release(attempted);
+			goto preempt_fail;
+		}
+
+		if (!cachec->cpumask) {
+			bpf_cpumask_release(attempted);
+			bpf_cpumask_release(topo_cpus);
+			goto preempt_fail;
+		}
+
+		bpf_cpumask_copy(topo_cpus, cast_mask(cachec->cpumask));
+
+		/*
+		 * First try preempting in the local LLC
+		 */
+		bpf_for(idx, 0, cachec->nr_cpus) {
+			s32 preempt_cpu = bpf_cpumask_any_distribute(cast_mask(topo_cpus));
+			trace("PREEMPT attempt on cpu %d from cpu %d",
+			      preempt_cpu, bpf_get_smp_processor_id());
+			if (preempt_cpu > cachec->nr_cpus)
+				break;
+
+			if (try_preempt_cpu(preempt_cpu, p, cctx, tctx, layer, false)) {
+				bpf_cpumask_release(attempted);
+				bpf_cpumask_release(topo_cpus);
+				return;
+			}
+			bpf_cpumask_clear_cpu(preempt_cpu, topo_cpus);
+			bpf_cpumask_set_cpu(preempt_cpu, attempted);
+		}
+
+		/*
+		 * Next try node local LLC
+		 */
+		if (!nodec->cpumask) {
+			bpf_cpumask_release(attempted);
+			bpf_cpumask_release(topo_cpus);
+			goto preempt_fail;
+		}
+
+		bpf_cpumask_copy(topo_cpus, cast_mask(nodec->cpumask));
+		bpf_cpumask_xor(topo_cpus, cast_mask(attempted), cast_mask(topo_cpus));
+
+		bpf_for(idx, 0, nodec->nr_cpus) {
+			s32 preempt_cpu = bpf_cpumask_any_distribute(cast_mask(topo_cpus));
+			if (try_preempt_cpu(preempt_cpu, p, cctx, tctx, layer, false)) {
+				bpf_cpumask_release(attempted);
+				bpf_cpumask_release(topo_cpus);
+				lstat_inc(LSTAT_PREEMPT_XLLC, layer, cctx);
+				return;
+			}
+			bpf_cpumask_clear_cpu(preempt_cpu, topo_cpus);
+			bpf_cpumask_set_cpu(preempt_cpu, attempted);
+			if (bpf_cpumask_empty(cast_mask(topo_cpus)))
+				break;
+		}
+
+		/*
+		 * Finally try across nodes
+		 */
+		if (xnuma_preemption) {
+			if (!all_cpumask) {
+				bpf_cpumask_release(attempted);
+				bpf_cpumask_release(topo_cpus);
+				goto preempt_fail;
+			}
+			bpf_cpumask_copy(topo_cpus, cast_mask(all_cpumask));
+			bpf_cpumask_xor(topo_cpus, cast_mask(attempted), cast_mask(topo_cpus));
+
+			bpf_for(idx, 0, nr_possible_cpus) {
+				s32 preempt_cpu = bpf_cpumask_any_distribute(cast_mask(topo_cpus));
+				if (try_preempt_cpu(preempt_cpu, p, cctx, tctx, layer, false)) {
+					bpf_cpumask_release(attempted);
+					bpf_cpumask_release(topo_cpus);
+					lstat_inc(LSTAT_PREEMPT_XNUMA, layer, cctx);
+					return;
+				}
+				bpf_cpumask_clear_cpu(preempt_cpu, topo_cpus);
+				bpf_cpumask_set_cpu(preempt_cpu, attempted);
+				if (bpf_cpumask_empty(cast_mask(topo_cpus)))
+					break;
+			}
+		}
+		bpf_cpumask_release(attempted);
+		bpf_cpumask_release(topo_cpus);
+	} else {
+
+		bpf_for(idx, 0, nr_possible_cpus) {
+			s32 cand = (preempt_cursor + idx) % nr_possible_cpus;
+
+			if (try_preempt_cpu(cand, p, cctx, tctx, layer, false)) {
+				/*
+				 * Round-robining doesn't have to be strict. Let's
+				 * not bother with atomic ops on $preempt_cursor.
+				 */
+				preempt_cursor = (cand + 1) % nr_possible_cpus;
+				struct cpu_ctx *new_cctx;
+				if ((new_cctx = lookup_cpu_ctx(cand))) {
+					if (new_cctx->cache_idx != cctx->cache_idx)
+						lstat_inc(LSTAT_PREEMPT_XLLC, layer, cctx);
+					if (new_cctx->node_idx != cctx->node_idx)
+						lstat_inc(LSTAT_PREEMPT_XNUMA, layer, cctx);
+				}
+				return;
+			}
+		}
+	}
+
+	/* no eligible CPU was found; fall through and count the failure */
+
+preempt_fail:
+	lstat_inc(LSTAT_PREEMPT_FAIL, layer, cctx);
+}
+
 void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 {
 	struct cpu_ctx *cctx;
 	struct task_ctx *tctx;
-	struct cache_ctx *cachec;
-	struct node_ctx *nodec;
 	struct layer *layer;
 	s32 task_cpu = scx_bpf_task_cpu(p);
 	u64 vtime = p->scx.dsq_vtime;
@@ -763,7 +926,7 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 
 		idx = cpu_hi_fallback_dsq_id(task_cpu);
 		scx_bpf_dispatch(p, idx, slice_ns, enq_flags);
-		goto find_cpu;
+		goto preempt;
 	}
 
 	/*
@@ -788,7 +951,7 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 		 */
 		idx = cpu_hi_fallback_dsq_id(task_cpu);
 		scx_bpf_dispatch(p, idx, slice_ns, enq_flags);
-		goto find_cpu;
+		goto preempt;
 	}
 
 	if (disable_topology) {
@@ -800,151 +963,8 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 		scx_bpf_dispatch_vtime(p, idx, layer_slice_ns, vtime, enq_flags);
 	}
 
-find_cpu:
-	if (try_preempt_first) {
-		/*
-		 * @p prefers to preempt its previous CPU even when there are
-		 * other idle CPUs.
-		 */
-		if (try_preempt(task_cpu, p, cctx, tctx, layer, true))
-			return;
-		/* we skipped idle CPU picking in select_cpu. Do it here. */
-		if (pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx, layer))
-			return;
-	} else {
-		/*
-		 * If we aren't in the wakeup path, layered_select_cpu() hasn't
-		 * run and thus we haven't looked for and kicked an idle CPU.
-		 * Let's do it now.
-		 */
-		if (!(enq_flags & SCX_ENQ_WAKEUP) &&
-		    pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx, layer))
-			return;
-		if (!layer->preempt)
-			return;
-		if (try_preempt(task_cpu, p, cctx, tctx, layer, false))
-			return;
-	}
-
-	if (!disable_topology) {
-		if (!(cachec = lookup_cache_ctx(cctx->cache_idx)) ||
-		    !(nodec = lookup_node_ctx(cctx->node_idx)))
-			return;
-
-		struct bpf_cpumask *attempted = bpf_cpumask_create();
-		if (!attempted)
-			return;
-
-		struct bpf_cpumask *topo_cpus = bpf_cpumask_create();
-		if (!topo_cpus) {
-			bpf_cpumask_release(attempted);
-			return;
-		}
-
-		if (!cachec->cpumask) {
-			bpf_cpumask_release(attempted);
-			bpf_cpumask_release(topo_cpus);
-			return;
-		}
-
-		bpf_cpumask_copy(topo_cpus, cast_mask(cachec->cpumask));
-
-		/*
-		 * First try preempting in the local LLC
-		 */
-		bpf_for(idx, 0, cachec->nr_cpus) {
-			s32 preempt_cpu = bpf_cpumask_any_distribute(cast_mask(topo_cpus));
-			trace("PREEMPT attempt on cpu %d from cpu %d",
-			      preempt_cpu, bpf_get_smp_processor_id());
-			if (preempt_cpu > cachec->nr_cpus)
-				break;
-
-			if (try_preempt(preempt_cpu, p, cctx, tctx, layer, false)) {
-				bpf_cpumask_release(attempted);
-				bpf_cpumask_release(topo_cpus);
-				return;
-			}
-			bpf_cpumask_clear_cpu(preempt_cpu, topo_cpus);
-			bpf_cpumask_set_cpu(preempt_cpu, attempted);
-		}
-
-		/*
-		 * Next try node local LLC
-		 */
-		if (!nodec->cpumask) {
-			bpf_cpumask_release(attempted);
-			bpf_cpumask_release(topo_cpus);
-			return;
-		}
-		bpf_cpumask_copy(topo_cpus, cast_mask(nodec->cpumask));
-		bpf_cpumask_xor(topo_cpus, cast_mask(attempted), cast_mask(topo_cpus));
-
-		bpf_for(idx, 0, nodec->nr_cpus) {
-			s32 preempt_cpu = bpf_cpumask_any_distribute(cast_mask(topo_cpus));
-			if (try_preempt(preempt_cpu, p, cctx, tctx, layer, false)) {
-				bpf_cpumask_release(attempted);
-				bpf_cpumask_release(topo_cpus);
-				lstat_inc(LSTAT_PREEMPT_XLLC, layer, cctx);
-				return;
-			}
-			bpf_cpumask_clear_cpu(preempt_cpu, topo_cpus);
-			bpf_cpumask_set_cpu(preempt_cpu, attempted);
-			if (bpf_cpumask_empty(cast_mask(topo_cpus)))
-				break;
-		}
-
-		/*
-		 * Finally try across nodes
-		 */
-		if (xnuma_preemption) {
-			if (!all_cpumask) {
-				bpf_cpumask_release(attempted);
-				bpf_cpumask_release(topo_cpus);
-				return;
-			}
-			bpf_cpumask_copy(topo_cpus, cast_mask(all_cpumask));
-			bpf_cpumask_xor(topo_cpus, cast_mask(attempted), cast_mask(topo_cpus));
-
-			bpf_for(idx, 0, nr_possible_cpus) {
-				s32 preempt_cpu = bpf_cpumask_any_distribute(cast_mask(topo_cpus));
-				if (try_preempt(preempt_cpu, p, cctx, tctx, layer, false)) {
-					bpf_cpumask_release(attempted);
-					bpf_cpumask_release(topo_cpus);
-					lstat_inc(LSTAT_PREEMPT_XNUMA, layer, cctx);
-					return;
-				}
-				bpf_cpumask_clear_cpu(preempt_cpu, topo_cpus);
-				bpf_cpumask_set_cpu(preempt_cpu, attempted);
-				if (bpf_cpumask_empty(cast_mask(topo_cpus)))
-					break;
-			}
-		}
-		bpf_cpumask_release(attempted);
-		bpf_cpumask_release(topo_cpus);
-	} else {
-
-		bpf_for(idx, 0, nr_possible_cpus) {
-			s32 cand = (preempt_cursor + idx) % nr_possible_cpus;
-
-			if (try_preempt(cand, p, cctx, tctx, layer, false)) {
-				/*
-				 * Round-robining doesn't have to be strict. Let's
-				 * not bother with atomic ops on $preempt_cursor.
-				 */
-				preempt_cursor = (cand + 1) % nr_possible_cpus;
-				struct cpu_ctx *new_cctx;
-				if ((new_cctx = lookup_cpu_ctx(cand))) {
-					if (new_cctx->node_idx != nodec->id && new_cctx->node_idx == nodec->id)
-						lstat_inc(LSTAT_PREEMPT_XLLC, layer, cctx);
-					if (new_cctx->node_idx != nodec->id)
-						lstat_inc(LSTAT_PREEMPT_XLLC, layer, cctx);
-				}
-				return;
-			}
-		}
-	}
-
-	lstat_inc(LSTAT_PREEMPT_FAIL, layer, cctx);
+preempt:
+	try_preempt(task_cpu, p, tctx, try_preempt_first, enq_flags);
 }
 
 static bool keep_running(struct cpu_ctx *cctx, struct task_struct *p)