From eb99e45cede906ebaec3cf728f05af74cc4dab87 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 5 Nov 2024 09:03:51 +0100 Subject: [PATCH 01/11] scx_bpfland: consistent vruntime update Ensure that task vruntime is always updated in ops.running() to maintain consistency with other schedulers. Signed-off-by: Andrea Righi --- scheds/rust/scx_bpfland/src/bpf/main.bpf.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scheds/rust/scx_bpfland/src/bpf/main.bpf.c b/scheds/rust/scx_bpfland/src/bpf/main.bpf.c index 653ce41..d5a3039 100644 --- a/scheds/rust/scx_bpfland/src/bpf/main.bpf.c +++ b/scheds/rust/scx_bpfland/src/bpf/main.bpf.c @@ -908,6 +908,12 @@ void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p) */ if (tctx->is_interactive) __sync_fetch_and_add(&nr_interactive, 1); + + /* + * Update global vruntime. + */ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; } /* @@ -971,11 +977,6 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable) p->scx.dsq_vtime += slice; tctx->deadline = p->scx.dsq_vtime + task_compute_dl(p, tctx); - /* - * Update global vruntime. - */ - vtime_now += slice; - /* * Refresh voluntary context switch metrics. * From f0c8de347759b371af83bb8ef83920525c91b860 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 5 Nov 2024 09:04:10 +0100 Subject: [PATCH 02/11] scx_bpfland: do not exclude exiting tasks Add SCX_OPS_ENQ_EXITING to the scheduler flags, since we are not using bpf_task_from_pid() and the scheduler can handle exiting tasks. Signed-off-by: Andrea Righi --- scheds/rust/scx_bpfland/src/bpf/main.bpf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/scheds/rust/scx_bpfland/src/bpf/main.bpf.c b/scheds/rust/scx_bpfland/src/bpf/main.bpf.c index d5a3039..e0d1850 100644 --- a/scheds/rust/scx_bpfland/src/bpf/main.bpf.c +++ b/scheds/rust/scx_bpfland/src/bpf/main.bpf.c @@ -1227,5 +1227,6 @@ SCX_OPS_DEFINE(bpfland_ops, .init_task = (void *)bpfland_init_task, .init = (void *)bpfland_init, .exit = (void *)bpfland_exit, + .flags = SCX_OPS_ENQ_EXITING, .timeout_ms = 5000, .name = "bpfland"); From 8a655d94f5900ce10c9ae2c7cb35026fa600acde Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 2 Nov 2024 15:26:26 +0100 Subject: [PATCH 03/11] scx_bpfland: do not overly prioritize WAKE_SYNC tasks This can lead to stalls when a high number of interactive tasks are running in the system (i.e.., hackbench or similar stress tests). Signed-off-by: Andrea Righi --- scheds/rust/scx_bpfland/src/bpf/main.bpf.c | 32 ---------------------- 1 file changed, 32 deletions(-) diff --git a/scheds/rust/scx_bpfland/src/bpf/main.bpf.c b/scheds/rust/scx_bpfland/src/bpf/main.bpf.c index e0d1850..669d985 100644 --- a/scheds/rust/scx_bpfland/src/bpf/main.bpf.c +++ b/scheds/rust/scx_bpfland/src/bpf/main.bpf.c @@ -384,32 +384,6 @@ static inline void task_refill_slice(struct task_struct *p) p->scx.slice = CLAMP(slice_max / scale_factor, slice_min, slice_max); } -static bool is_prio_congested(void) -{ - return scx_bpf_dsq_nr_queued(PRIO_DSQ) > nr_online_cpus * 4; -} - -/* - * Handle synchronous wake-up event for a task. - */ -static void handle_sync_wakeup(struct task_struct *p) -{ - struct task_ctx *tctx; - - /* - * If we are waking up a task immediately promote it as interactive, so - * that it can be dispatched as soon as possible on the first CPU - * available. 
- * - * However, if the priority queue is congested, we don't want to - * promote additional interactive tasks, instead we give priority to - * the tasks that are already classified as interactive. - */ - tctx = try_lookup_task_ctx(p); - if (tctx && !is_prio_congested()) - tctx->is_interactive = true; -} - /* * Find an idle CPU in the system. * @@ -531,12 +505,6 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) struct bpf_cpumask *curr_l3_domain; bool share_llc, has_idle; - /* - * Prioritize newly awakened tasks by immediately promoting - * them as interactive. - */ - handle_sync_wakeup(p); - /* * Determine waker CPU scheduling domain. */ From 064d6fb5602ae27b0fe5f761dea30195816884e2 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 2 Nov 2024 15:27:24 +0100 Subject: [PATCH 04/11] scx_bpfland: consider all tasks as regular if priority DSQ is congested This allows to prevent excessive starvation of regular tasks in presence of high amount of interactive tasks (e.g., when running stress tests, such as hackbench). Signed-off-by: Andrea Righi --- scheds/rust/scx_bpfland/src/bpf/main.bpf.c | 29 ++++++++++++---------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/scheds/rust/scx_bpfland/src/bpf/main.bpf.c b/scheds/rust/scx_bpfland/src/bpf/main.bpf.c index 669d985..989da3a 100644 --- a/scheds/rust/scx_bpfland/src/bpf/main.bpf.c +++ b/scheds/rust/scx_bpfland/src/bpf/main.bpf.c @@ -296,7 +296,7 @@ static u64 task_dyn_prio(struct task_struct *p) /* * Return task's dynamic priority. */ -static u64 task_prio(struct task_struct *p) +static u64 task_prio(const struct task_struct *p) { return p->scx.weight * task_dyn_prio(p); } @@ -305,7 +305,7 @@ static u64 task_prio(struct task_struct *p) * Return the task's allowed lag: used to determine how early its vruntime can * be. */ -static u64 task_lag(struct task_struct *p) +static u64 task_lag(const struct task_struct *p) { return slice_lag * task_prio(p) / 100; } @@ -313,7 +313,7 @@ static u64 task_lag(struct task_struct *p) /* * Return a value inversely proportional to the task's weight. */ -static u64 scale_inverse_fair(struct task_struct *p, u64 value) +static u64 scale_inverse_fair(const struct task_struct *p, u64 value) { return value * 100 / task_prio(p); } @@ -322,7 +322,8 @@ static u64 scale_inverse_fair(struct task_struct *p, u64 value) * Compute the deadline component of a task (this value will be added to the * task's vruntime to determine the actual deadline). */ -static s64 task_compute_dl(struct task_struct *p ,struct task_ctx *tctx) +static s64 task_compute_dl(const struct task_struct *p, + const struct task_ctx *tctx) { /* * Return the deadline as a function of the average runtime and the @@ -334,14 +335,9 @@ static s64 task_compute_dl(struct task_struct *p ,struct task_ctx *tctx) /* * Return task's evaluated vruntime. */ -static inline u64 task_deadline(struct task_struct *p) +static u64 task_deadline(struct task_struct *p, struct task_ctx *tctx) { u64 min_vruntime = vtime_now - task_lag(p); - struct task_ctx *tctx; - - tctx = try_lookup_task_ctx(p); - if (!tctx) - return min_vruntime; /* * Limit the vruntime to to avoid excessively penalizing tasks. 
@@ -667,6 +663,14 @@ static void kick_task_cpu(struct task_struct *p) scx_bpf_kick_cpu(cpu, 0); } +static bool is_task_interactive(const struct task_struct *p, + const struct task_ctx *tctx) +{ + if (!tctx->is_interactive) + return false; + return scx_bpf_dsq_nr_queued(PRIO_DSQ) < 100; +} + /* * Dispatch all the other tasks that were not dispatched directly in * select_cpu(). @@ -700,7 +704,7 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags) * When lowlatency is enabled, the separate priority DSQ is disabled, * so in this case always dispatch to the shared DSQ. */ - if (!lowlatency && tctx->is_interactive) { + if (!lowlatency && is_task_interactive(p, tctx)) { dsq_id = PRIO_DSQ; __sync_fetch_and_add(&nr_prio_dispatches, 1); } else { @@ -708,7 +712,7 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags) __sync_fetch_and_add(&nr_shared_dispatches, 1); } scx_bpf_dispatch_vtime(p, dsq_id, SCX_SLICE_DFL, - task_deadline(p), enq_flags); + task_deadline(p, tctx), enq_flags); /* * If there is an idle CPU available for the task, wake it up so it can @@ -993,7 +997,6 @@ void BPF_STRUCT_OPS(bpfland_enable, struct task_struct *p) return; tctx->sum_exec_runtime = p->se.sum_exec_runtime; tctx->nvcsw_ts = now; - tctx->avg_runtime = slice_max; tctx->deadline = vtime_now; } From efc41dd93631887e7677483e9fbf0b0550cc0e1f Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sun, 3 Nov 2024 16:24:09 +0100 Subject: [PATCH 05/11] scx_bpfland: strict domain affinity Rather then always migrating tasks across LLC domains when no idle CPU is available in their current LLC domain, allow migration but attempt to bring tasks back to their original LLC domain whenever possible. To do so, define the task's scheduling domain upon task creation or when its affinity changes, and ensure the task remains within this domain throughout its lifetime. In the future we will add a proper load balancing logic, but for now this change seems to provide consistent performance improvement in certain server workloads. For example, simple CUDA benchmarks show a performance boost of about +10-20% with this change applied (on multi-LLC / NUMA machines). 
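For illustration only, the fallback policy described above can be modeled in plain userspace C as below (a minimal sketch assuming 64-bit cpumasks with one bit per CPU; pick_any_cpu() and fallback_cpu() are illustrative helpers, not the scheduler's actual BPF code): when no idle CPU is found, the task keeps its previous CPU only if that CPU still belongs to the task's L3 domain, otherwise any CPU in the L3 domain is picked to pull the task back.

  #include <stdint.h>
  #include <stdio.h>

  /* Lowest set bit as "any CPU" in the mask; -1 if the mask is empty. */
  static int pick_any_cpu(uint64_t mask)
  {
          return mask ? __builtin_ctzll(mask) : -1;
  }

  /*
   * Fallback when no idle CPU was found: keep prev_cpu only while it is
   * still inside the task's L3 domain, otherwise pull the task back
   * into that domain.
   */
  static int fallback_cpu(uint64_t l3_mask, int prev_cpu)
  {
          if (l3_mask & (1ULL << prev_cpu))
                  return prev_cpu;
          return pick_any_cpu(l3_mask);
  }

  int main(void)
  {
          uint64_t l3_mask = 0xfULL;                 /* task's L3 domain: CPUs 0-3 */

          printf("%d\n", fallback_cpu(l3_mask, 2));  /* 2: still L3-affine */
          printf("%d\n", fallback_cpu(l3_mask, 6));  /* 0: pulled back into its L3 */
          return 0;
  }
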
Signed-off-by: Andrea Righi --- scheds/rust/scx_bpfland/src/bpf/main.bpf.c | 236 +++++++++++++-------- 1 file changed, 147 insertions(+), 89 deletions(-) diff --git a/scheds/rust/scx_bpfland/src/bpf/main.bpf.c b/scheds/rust/scx_bpfland/src/bpf/main.bpf.c index 989da3a..38d77e2 100644 --- a/scheds/rust/scx_bpfland/src/bpf/main.bpf.c +++ b/scheds/rust/scx_bpfland/src/bpf/main.bpf.c @@ -380,6 +380,73 @@ static inline void task_refill_slice(struct task_struct *p) p->scx.slice = CLAMP(slice_max / scale_factor, slice_min, slice_max); } +static void task_set_domain(struct task_struct *p, s32 cpu, + const struct cpumask *cpumask) +{ + struct bpf_cpumask *primary, *l2_domain, *l3_domain; + struct bpf_cpumask *p_mask, *l2_mask, *l3_mask; + struct task_ctx *tctx; + struct cpu_ctx *cctx; + + tctx = try_lookup_task_ctx(p); + if (!tctx) + return; + + cctx = try_lookup_cpu_ctx(cpu); + if (!cctx) + return; + + primary = primary_cpumask; + if (!primary) + return; + + l2_domain = cctx->l2_cpumask; + if (!l2_domain) + l2_domain = primary; + + l3_domain = cctx->l3_cpumask; + if (!l3_domain) + l3_domain = primary; + + p_mask = tctx->cpumask; + if (!p_mask) { + scx_bpf_error("cpumask not initialized"); + return; + } + + l2_mask = tctx->l2_cpumask; + if (!l2_mask) { + scx_bpf_error("l2 cpumask not initialized"); + return; + } + + l3_mask = tctx->l3_cpumask; + if (!l3_mask) { + scx_bpf_error("l3 cpumask not initialized"); + return; + } + + /* + * Determine the task's scheduling domain. + * idle CPU, re-try again with the primary scheduling domain. + */ + bpf_cpumask_and(p_mask, cpumask, cast_mask(primary)); + + /* + * Determine the L2 cache domain as the intersection of the task's + * primary cpumask and the L2 cache domain mask of the previously used + * CPU. + */ + bpf_cpumask_and(l2_mask, cast_mask(p_mask), cast_mask(l2_domain)); + + /* + * Determine the L3 cache domain as the intersection of the task's + * primary cpumask and the L3 cache domain mask of the previously used + * CPU. + */ + bpf_cpumask_and(l3_mask, cast_mask(p_mask), cast_mask(l3_domain)); +} + /* * Find an idle CPU in the system. * @@ -389,22 +456,16 @@ static inline void task_refill_slice(struct task_struct *p) * to handle these mistakes in favor of a more efficient response and a reduced * scheduling overhead. */ -static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) +static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) { - const struct cpumask *online_cpumask, *idle_smtmask, *idle_cpumask; + const struct cpumask *idle_smtmask, *idle_cpumask; struct bpf_cpumask *primary, *l2_domain, *l3_domain; struct bpf_cpumask *p_mask, *l2_mask, *l3_mask; struct task_ctx *tctx; - struct cpu_ctx *cctx; + bool is_prev_llc_affine = false; s32 cpu; - /* - * If the task isn't allowed to use its previously used CPU it means - * that it's changing affinity. In this case try to pick any random - * idle CPU in its new allowed CPU domain. 
- */ - if (!bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr)) - return scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + *is_idle = false; /* * For tasks that can run only on a single CPU, we can simply verify if @@ -412,41 +473,24 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) */ if (p->nr_cpus_allowed == 1 || p->migration_disabled) { if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) - return prev_cpu; - return -EBUSY; + *is_idle = true; + return prev_cpu; } tctx = try_lookup_task_ctx(p); if (!tctx) return -ENOENT; - cctx = try_lookup_cpu_ctx(prev_cpu); - if (!cctx) - return -EINVAL; - primary = primary_cpumask; if (!primary) return -EINVAL; /* - * Acquire the CPU masks to determine the online and idle CPUs in the - * system. + * Acquire the CPU masks to determine the idle CPUs in the system. */ - online_cpumask = scx_bpf_get_online_cpumask(); idle_smtmask = scx_bpf_get_idle_smtmask(); idle_cpumask = scx_bpf_get_idle_cpumask(); - /* - * Scheduling domains of the previously used CPU. - */ - l2_domain = cctx->l2_cpumask; - if (!l2_domain) - l2_domain = primary; - - l3_domain = cctx->l3_cpumask; - if (!l3_domain) - l3_domain = primary; - /* * Task's scheduling domains. */ @@ -472,24 +516,10 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) } /* - * Determine the task's scheduling domain. - * idle CPU, re-try again with the primary scheduling domain. + * Check if the previously used CPU is still in the L3 task domain. If + * not, we may want to move the task back to its original L3 domain. */ - bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(primary)); - - /* - * Determine the L2 cache domain as the intersection of the task's - * primary cpumask and the L2 cache domain mask of the previously used - * CPU. - */ - bpf_cpumask_and(l2_mask, cast_mask(p_mask), cast_mask(l2_domain)); - - /* - * Determine the L3 cache domain as the intersection of the task's - * primary cpumask and the L3 cache domain mask of the previously used - * CPU. - */ - bpf_cpumask_and(l3_mask, cast_mask(p_mask), cast_mask(l3_domain)); + is_prev_llc_affine = bpf_cpumask_test_cpu(prev_cpu, cast_mask(l3_mask)); /* * If the current task is waking up another task and releasing the CPU @@ -499,6 +529,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) if (wake_flags & SCX_WAKE_SYNC) { struct task_struct *current = (void *)bpf_get_current_task_btf(); struct bpf_cpumask *curr_l3_domain; + struct cpu_ctx *cctx; bool share_llc, has_idle; /* @@ -520,8 +551,10 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) * using the same CPU if possible. 
*/ share_llc = bpf_cpumask_test_cpu(prev_cpu, cast_mask(curr_l3_domain)); - if (share_llc && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + if (share_llc && + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { cpu = prev_cpu; + *is_idle = true; goto out_put_cpumask; } @@ -532,10 +565,12 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) */ has_idle = bpf_cpumask_intersects(cast_mask(curr_l3_domain), idle_cpumask); if (has_idle && - bpf_cpumask_test_cpu(cpu, p->cpus_ptr) && + bpf_cpumask_test_cpu(cpu, cast_mask(p_mask)) && !(current->flags & PF_EXITING) && - scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu) == 0) + scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu) == 0) { + *is_idle = true; goto out_put_cpumask; + } } /* @@ -546,10 +581,11 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) * If the task can still run on the previously used CPU and * it's a full-idle core, keep using it. */ - if (bpf_cpumask_test_cpu(prev_cpu, cast_mask(p_mask)) && + if (is_prev_llc_affine && bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { cpu = prev_cpu; + *is_idle = true; goto out_put_cpumask; } @@ -557,36 +593,40 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) * Search for any full-idle CPU in the primary domain that * shares the same L2 cache. */ - cpu = bpf_cpumask_any_and_distribute(cast_mask(l2_mask), idle_smtmask); - if (bpf_cpumask_test_cpu(cpu, online_cpumask) && - scx_bpf_test_and_clear_cpu_idle(cpu)) + cpu = scx_bpf_pick_idle_cpu(cast_mask(l2_mask), SCX_PICK_IDLE_CORE); + if (cpu >= 0) { + *is_idle = true; goto out_put_cpumask; + } /* * Search for any full-idle CPU in the primary domain that * shares the same L3 cache. */ - cpu = bpf_cpumask_any_and_distribute(cast_mask(l3_mask), idle_smtmask); - if (bpf_cpumask_test_cpu(cpu, online_cpumask) && - scx_bpf_test_and_clear_cpu_idle(cpu)) + cpu = scx_bpf_pick_idle_cpu(cast_mask(l3_mask), SCX_PICK_IDLE_CORE); + if (cpu >= 0) { + *is_idle = true; goto out_put_cpumask; + } /* * Search for any other full-idle core in the primary domain. */ - cpu = bpf_cpumask_any_and_distribute(cast_mask(p_mask), idle_smtmask); - if (bpf_cpumask_test_cpu(cpu, online_cpumask) && - scx_bpf_test_and_clear_cpu_idle(cpu)) + cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), SCX_PICK_IDLE_CORE); + if (cpu >= 0) { + *is_idle = true; goto out_put_cpumask; + } } /* * If a full-idle core can't be found (or if this is not an SMT system) * try to re-use the same CPU, even if it's not in a full-idle core. */ - if (bpf_cpumask_test_cpu(prev_cpu, cast_mask(p_mask)) && + if (is_prev_llc_affine && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { cpu = prev_cpu; + *is_idle = true; goto out_put_cpumask; } @@ -594,38 +634,50 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) * Search for any idle CPU in the primary domain that shares the same * L2 cache. */ - cpu = bpf_cpumask_any_and_distribute(cast_mask(l2_mask), idle_cpumask); - if (bpf_cpumask_test_cpu(cpu, online_cpumask) && - scx_bpf_test_and_clear_cpu_idle(cpu)) + cpu = scx_bpf_pick_idle_cpu(cast_mask(l2_mask), 0); + if (cpu >= 0) { + *is_idle = true; goto out_put_cpumask; + } /* * Search for any idle CPU in the primary domain that shares the same * L3 cache. 
*/ - cpu = bpf_cpumask_any_and_distribute(cast_mask(l3_mask), idle_cpumask); - if (bpf_cpumask_test_cpu(cpu, online_cpumask) && - scx_bpf_test_and_clear_cpu_idle(cpu)) + cpu = scx_bpf_pick_idle_cpu(cast_mask(l3_mask), 0); + if (cpu >= 0) { + *is_idle = true; goto out_put_cpumask; + } /* * Search for any idle CPU in the scheduling domain. */ - cpu = bpf_cpumask_any_and_distribute(cast_mask(p_mask), idle_cpumask); - if (bpf_cpumask_test_cpu(cpu, online_cpumask) && - scx_bpf_test_and_clear_cpu_idle(cpu)) + cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0); + if (cpu >= 0) { + *is_idle = true; goto out_put_cpumask; + } /* - * We couldn't find any idle CPU, so simply dispatch the task to the - * first CPU that will become available. + * We couldn't find any idle CPU, return the previous CPU if it is in + * the task's L3 domain, otherwise pick any other CPU in the L3 domain. */ - cpu = -ENOENT; + if (is_prev_llc_affine) + cpu = prev_cpu; + else + cpu = scx_bpf_pick_any_cpu(cast_mask(l3_mask), 0); out_put_cpumask: scx_bpf_put_cpumask(idle_cpumask); scx_bpf_put_cpumask(idle_smtmask); - scx_bpf_put_cpumask(online_cpumask); + + /* + * If we couldn't find any CPU, or in case of error, return the + * previously used CPU. + */ + if (cpu < 0) + cpu = prev_cpu; return cpu; } @@ -639,16 +691,16 @@ out_put_cpumask: s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { + bool is_idle = false; s32 cpu; - cpu = pick_idle_cpu(p, prev_cpu, wake_flags); - if (cpu >= 0) { + cpu = pick_idle_cpu(p, prev_cpu, wake_flags, &is_idle); + if (is_idle) { scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); __sync_fetch_and_add(&nr_direct_dispatches, 1); - return cpu; } - return prev_cpu; + return cpu; } /* @@ -657,9 +709,10 @@ s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p, static void kick_task_cpu(struct task_struct *p) { s32 cpu = scx_bpf_task_cpu(p); + bool is_idle = false; - cpu = pick_idle_cpu(p, cpu, 0); - if (cpu >= 0) + cpu = pick_idle_cpu(p, cpu, 0, &is_idle); + if (is_idle) scx_bpf_kick_cpu(cpu, 0); } @@ -1000,9 +1053,18 @@ void BPF_STRUCT_OPS(bpfland_enable, struct task_struct *p) tctx->deadline = vtime_now; } +void BPF_STRUCT_OPS(bpfland_set_cpumask, struct task_struct *p, + const struct cpumask *cpumask) +{ + s32 cpu = bpf_get_smp_processor_id(); + + task_set_domain(p, cpu, cpumask); +} + s32 BPF_STRUCT_OPS(bpfland_init_task, struct task_struct *p, struct scx_init_task_args *args) { + s32 cpu = bpf_get_smp_processor_id(); struct task_ctx *tctx; struct bpf_cpumask *cpumask; @@ -1038,6 +1100,8 @@ s32 BPF_STRUCT_OPS(bpfland_init_task, struct task_struct *p, if (cpumask) bpf_cpumask_release(cpumask); + task_set_domain(p, cpu, p->cpus_ptr); + return 0; } @@ -1047,17 +1111,10 @@ s32 BPF_STRUCT_OPS(bpfland_init_task, struct task_struct *p, s32 get_nr_online_cpus(void) { const struct cpumask *online_cpumask; - u64 nr_cpu_ids = scx_bpf_nr_cpu_ids(); - int i, cpus = 0; + int cpus; online_cpumask = scx_bpf_get_online_cpumask(); - - bpf_for(i, 0, nr_cpu_ids) { - if (!bpf_cpumask_test_cpu(i, online_cpumask)) - continue; - cpus++; - } - + cpus = bpf_cpumask_weight(online_cpumask); scx_bpf_put_cpumask(online_cpumask); return cpus; @@ -1195,6 +1252,7 @@ SCX_OPS_DEFINE(bpfland_ops, .running = (void *)bpfland_running, .stopping = (void *)bpfland_stopping, .enable = (void *)bpfland_enable, + .set_cpumask = (void *)bpfland_set_cpumask, .init_task = (void *)bpfland_init_task, .init = (void *)bpfland_init, .exit = (void *)bpfland_exit, From 
cfe23aa21bd350a0e8f0beff959f1fc35e16ac68 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Wed, 6 Nov 2024 13:59:24 +0900 Subject: [PATCH 06/11] scx_lavd: avoid self-IPI at preemption Revert the change of sending self-IPI at preemption when a victim CPU is the current CPU. The cost of self-IPI is prohibitively expensive in some workloads (e.g., perf bench). Instead, resetting task' time slice to zero. Signed-off-by: Changwoo Min --- scheds/rust/scx_lavd/src/bpf/preempt.bpf.c | 26 +++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/scheds/rust/scx_lavd/src/bpf/preempt.bpf.c b/scheds/rust/scx_lavd/src/bpf/preempt.bpf.c index 17c6653..00bd19c 100644 --- a/scheds/rust/scx_lavd/src/bpf/preempt.bpf.c +++ b/scheds/rust/scx_lavd/src/bpf/preempt.bpf.c @@ -215,9 +215,6 @@ null_out: static bool try_kick_cpu(struct cpu_ctx *victim_cpuc, u64 victim_last_kick_clk) { /* - * Kick a victim CPU if it is not victimized yet by another - * concurrent kick task. - * * Kicking the victim CPU does _not_ guarantee that task @p will run on * that CPU. Enqueuing @p to the global queue is one operation, and * kicking the victim is another asynchronous operation. However, it is @@ -226,9 +223,32 @@ static bool try_kick_cpu(struct cpu_ctx *victim_cpuc, u64 victim_last_kick_clk) */ bool ret; + /* + * If the current CPU is a victim, we just reset the current task's + * time slice as an optimization. Othewise, kick the remote CPU for + * preemption. + * + * Resetting task's time slice to zero does not trigger an immediate + * preemption. However, the cost of self-IPI is prohibitively expensive + * for some scenarios. The actual preemption will happen at the next + * ops.tick(). + */ + if (bpf_get_smp_processor_id() == victim_cpuc->cpu_id) { + struct task_struct *tsk = bpf_get_current_task_btf(); + tsk->scx.slice = 0; + return true; + } + + /* + * Kick a victim CPU if it is not victimized yet by another + * concurrent kick task. + */ ret = __sync_bool_compare_and_swap(&victim_cpuc->last_kick_clk, victim_last_kick_clk, bpf_ktime_get_ns()); + /* + * Kick the remote CPU for preemption. + */ if (ret) scx_bpf_kick_cpu(victim_cpuc->cpu_id, SCX_KICK_PREEMPT); From d0eeebf98ad31a279a54407b1a5f9cf23f0bfff1 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Wed, 6 Nov 2024 17:40:16 +0900 Subject: [PATCH 07/11] scx_lavd: deprioritize a long runtime by prioritizing frequencies further Since tasks' average runtimes show skewed distribution, directly using the runtime in the deadline calculation causes several performance regressions. Instead, let's use the constant factor and further prioritize frequency factors to deprioritize the long runtime tasks. 
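As a rough illustration of the effect (a standalone sketch: SLICE_MAX_NS, the example runtimes and the lat_cri value are assumed numbers, and the greedy factor is omitted), the old formula fed the task's average runtime directly into the deadline, so two tasks with the same latency criticality but skewed runtimes got very different deadlines, while the constant factor makes the deadline depend only on the latency criticality:

  #include <stdint.h>
  #include <stdio.h>

  #define SLICE_MAX_NS   (3ULL * 1000 * 1000)   /* assumed 3 ms maximum slice */

  /* Before: the deadline scales with the task's (skewed) average runtime. */
  static uint64_t deadline_old(uint64_t run_time_ns, uint64_t lat_cri)
  {
          return run_time_ns / lat_cri;
  }

  /* After: constant numerator, runtime no longer enters the deadline directly. */
  static uint64_t deadline_new(uint64_t lat_cri)
  {
          return SLICE_MAX_NS / lat_cri;
  }

  int main(void)
  {
          uint64_t lat_cri = 100;   /* same latency criticality for both tasks */

          printf("old: %llu vs %llu ns\n",
                 (unsigned long long)deadline_old(200 * 1000, lat_cri),
                 (unsigned long long)deadline_old(20 * 1000 * 1000, lat_cri));
          printf("new: %llu vs %llu ns\n",
                 (unsigned long long)deadline_new(lat_cri),
                 (unsigned long long)deadline_new(lat_cri));
          return 0;
  }

With the constant factor, the deprioritization of long-runtime tasks comes indirectly through the boosted frequency factors in the latency criticality rather than from the runtime itself.
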
Signed-off-by: Changwoo Min --- scheds/rust/scx_lavd/src/bpf/lavd.bpf.h | 1 + scheds/rust/scx_lavd/src/bpf/main.bpf.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/scheds/rust/scx_lavd/src/bpf/lavd.bpf.h b/scheds/rust/scx_lavd/src/bpf/lavd.bpf.h index 5423a1e..a576edd 100644 --- a/scheds/rust/scx_lavd/src/bpf/lavd.bpf.h +++ b/scheds/rust/scx_lavd/src/bpf/lavd.bpf.h @@ -29,6 +29,7 @@ enum consts_internal { LAVD_LC_RUNTIME_MAX = LAVD_TIME_ONE_SEC, LAVD_LC_WEIGHT_BOOST = 128, /* 2^7 */ LAVD_LC_GREEDY_PENALTY = 20, /* 20% */ + LAVD_LC_FREQ_OVER_RUNTIME = 100, /* 100x */ LAVD_SLICE_BOOST_MAX_FT = 3, /* maximum additional 3x of slice */ LAVD_SLICE_BOOST_MAX_STEP = 6, /* 6 slice exhausitions in a row */ diff --git a/scheds/rust/scx_lavd/src/bpf/main.bpf.c b/scheds/rust/scx_lavd/src/bpf/main.bpf.c index 8570a05..8ce64d6 100644 --- a/scheds/rust/scx_lavd/src/bpf/main.bpf.c +++ b/scheds/rust/scx_lavd/src/bpf/main.bpf.c @@ -259,7 +259,7 @@ static u64 calc_runtime_factor(u64 runtime, u64 weight_ft) static u64 calc_freq_factor(u64 freq, u64 weight_ft) { u64 ft = sigmoid_u64(freq, LAVD_LC_FREQ_MAX); - return (ft * weight_ft) + 1; + return (ft * weight_ft * LAVD_LC_FREQ_OVER_RUNTIME) + 1; } static u64 calc_weight_factor(struct task_struct *p, struct task_ctx *taskc, @@ -367,7 +367,7 @@ static void calc_virtual_deadline_delta(struct task_struct *p, greedy_ratio = calc_greedy_ratio(taskc); greedy_ft = calc_greedy_factor(greedy_ratio); - deadline = (taskc->run_time_ns / lat_cri) * greedy_ft; + deadline = (LAVD_SLICE_MAX_NS / lat_cri) * greedy_ft; taskc->vdeadline_delta_ns = deadline; } From 78101e46883674ed294a31f5af586e9e592f7673 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 5 Nov 2024 10:15:41 +0100 Subject: [PATCH 08/11] scx_bpfland: drop lowlatency mode and the priority DSQ Schedule all tasks using a single global DSQ. This gives a better control to prevent potential starvation conditions. With this change, scx_bpfland adopts a logic similar to scx_rusty and scx_lavd, prioritizing tasks based on the frequency of their wait and wake-up events, rather than relying exclusively on the average amount of voluntary context switches. Tasks are still classified as interactive / non-interactive based on the amount of voluntary context switches, but this is only affecting the cpufreq logic. Signed-off-by: Andrea Righi --- scheds/rust/scx_bpfland/src/bpf/intf.h | 1 + scheds/rust/scx_bpfland/src/bpf/main.bpf.c | 479 +++++++++++---------- scheds/rust/scx_bpfland/src/main.rs | 19 - scheds/rust/scx_bpfland/src/stats.rs | 12 +- 4 files changed, 252 insertions(+), 259 deletions(-) diff --git a/scheds/rust/scx_bpfland/src/bpf/intf.h b/scheds/rust/scx_bpfland/src/bpf/intf.h index 75954c0..1da1382 100644 --- a/scheds/rust/scx_bpfland/src/bpf/intf.h +++ b/scheds/rust/scx_bpfland/src/bpf/intf.h @@ -13,6 +13,7 @@ #define MAX(x, y) ((x) > (y) ? (x) : (y)) #define MIN(x, y) ((x) < (y) ? (x) : (y)) #define CLAMP(val, lo, hi) MIN(MAX(val, lo), hi) +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) enum consts { NSEC_PER_USEC = 1000ULL, diff --git a/scheds/rust/scx_bpfland/src/bpf/main.bpf.c b/scheds/rust/scx_bpfland/src/bpf/main.bpf.c index 38d77e2..303ec6f 100644 --- a/scheds/rust/scx_bpfland/src/bpf/main.bpf.c +++ b/scheds/rust/scx_bpfland/src/bpf/main.bpf.c @@ -17,14 +17,19 @@ char _license[] SEC("license") = "GPL"; const volatile bool debug; /* - * Priority DSQ used to dispatch interactive tasks. + * Maximum task weight. 
*/ -#define PRIO_DSQ 0 +#define MAX_TASK_WEIGHT 10000 + +/* + * Maximum frequency of task wakeup events / sec. + */ +#define MAX_WAKEUP_FREQ 1024 /* * DSQ used to dispatch regular tasks. */ -#define SHARED_DSQ 1 +#define SHARED_DSQ 0 /* * Default task time slice. @@ -55,17 +60,6 @@ const volatile s64 slice_lag = 20ULL * NSEC_PER_MSEC; */ const volatile bool local_kthreads; -/* - * With lowlatency enabled, instead of classifying tasks as interactive or - * non-interactive, they all get a dynamic priority, which is adjusted in - * function of their average rate of voluntary context switches. - * - * This option guarantess less spikey behavior and it can be particularly - * useful in soft real-time scenarios, such as audio processing, multimedia, - * etc. - */ -const volatile bool lowlatency; - /* * Maximum threshold of voluntary context switches. */ @@ -77,31 +71,15 @@ const volatile u64 nvcsw_max_thresh = 10ULL; */ volatile s64 cpufreq_perf_lvl; -/* - * Time threshold to prevent task starvation. - * - * Tasks dispatched to the priority DSQ are always consumed before those - * dispatched to the shared DSQ, so tasks in shared DSQ may be starved by those - * in the priority DSQ. - * - * To mitigate this, store the timestamp of the last task consumption from - * the shared DSQ. If the starvation_thresh_ns threshold is exceeded without - * consuming a task, the scheduler will be forced to consume a task from the - * corresponding DSQ. - */ -const volatile u64 starvation_thresh_ns = 1000ULL * NSEC_PER_MSEC; -static u64 starvation_shared_ts; - /* * Scheduling statistics. */ -volatile u64 nr_kthread_dispatches, nr_direct_dispatches, - nr_prio_dispatches, nr_shared_dispatches; +volatile u64 nr_kthread_dispatches, nr_direct_dispatches, nr_shared_dispatches; /* * Amount of currently running tasks. */ -volatile u64 nr_running, nr_interactive, nr_shared_waiting, nr_prio_waiting; +volatile u64 nr_running, nr_interactive; /* * Amount of online CPUs. @@ -169,26 +147,31 @@ struct task_ctx { struct bpf_cpumask __kptr *l2_cpumask; struct bpf_cpumask __kptr *l3_cpumask; - /* - * Total execution time of the task. - */ - u64 sum_exec_runtime; - /* * Voluntary context switches metrics. */ u64 nvcsw; u64 nvcsw_ts; + u64 avg_nvcsw; /* - * Task's latency priority. + * Frequency with which a task is blocked (consumer). */ - u64 lat_weight; + u64 blocked_freq; + u64 last_blocked_at; + + /* + * Frequency with which a task wakes other tasks (producer). + */ + u64 waker_freq; + u64 last_woke_at; /* * Task's average used time slice. */ u64 avg_runtime; + u64 sum_runtime; + u64 last_run_at; /* * Task's deadline. @@ -276,75 +259,184 @@ static u64 calc_avg_clamp(u64 old_val, u64 new_val, u64 low, u64 high) } /* - * Return the dynamic priority multiplier (only applied in lowlatency mode). - * - * The multiplier is evaluated in function of the task's average rate of - * voluntary context switches per second. + * Evaluate the average frequency of an event over time. */ -static u64 task_dyn_prio(struct task_struct *p) +static u64 update_freq(u64 freq, u64 delta) { - struct task_ctx *tctx; + u64 new_freq; - if (!lowlatency) - return 1; - tctx = try_lookup_task_ctx(p); - if (!tctx) - return 1; - return MAX(tctx->lat_weight, 1); + new_freq = NSEC_PER_SEC / delta; + return calc_avg(freq, new_freq); } /* - * Return task's dynamic priority. + * Return the total amount of tasks that are currently waiting to be scheduled. 
*/ -static u64 task_prio(const struct task_struct *p) +static u64 nr_tasks_waiting(void) { - return p->scx.weight * task_dyn_prio(p); + return scx_bpf_dsq_nr_queued(SHARED_DSQ) + 1; +} + +/* + * Return task's dynamic weight. + */ +static u64 task_weight(const struct task_struct *p, const struct task_ctx *tctx) +{ + /* + * Scale the static task weight by the average amount of voluntary + * context switches to determine the dynamic weight. + */ + u64 prio = p->scx.weight * CLAMP(tctx->avg_nvcsw, 1, nvcsw_max_thresh); + + return CLAMP(prio, 1, MAX_TASK_WEIGHT); +} + +/* + * Return a value proportionally scaled to the task's priority. + */ +static u64 scale_up_fair(const struct task_struct *p, + const struct task_ctx *tctx, u64 value) +{ + return value * task_weight(p, tctx) / 100; +} + +/* + * Return a value inversely proportional to the task's priority. + */ +static u64 scale_inverse_fair(const struct task_struct *p, + const struct task_ctx *tctx, u64 value) +{ + return value * 100 / task_weight(p, tctx); } /* * Return the task's allowed lag: used to determine how early its vruntime can * be. */ -static u64 task_lag(const struct task_struct *p) +static u64 task_lag(const struct task_struct *p, const struct task_ctx *tctx) { - return slice_lag * task_prio(p) / 100; + return scale_up_fair(p, tctx, slice_lag); } /* - * Return a value inversely proportional to the task's weight. + * ** Taken directly from fair.c in the Linux kernel ** + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) */ -static u64 scale_inverse_fair(const struct task_struct *p, u64 value) +const int sched_prio_to_weight[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +static u64 max_sched_prio(void) { - return value * 100 / task_prio(p); + return ARRAY_SIZE(sched_prio_to_weight); } /* - * Compute the deadline component of a task (this value will be added to the - * task's vruntime to determine the actual deadline). + * Convert task priority to weight (following fair.c logic). */ -static s64 task_compute_dl(const struct task_struct *p, - const struct task_ctx *tctx) +static u64 sched_prio_to_latency_weight(u64 prio) { - /* - * Return the deadline as a function of the average runtime and the - * evaluated task's dynamic priority. - */ - return scale_inverse_fair(p, tctx->avg_runtime); + u64 max_prio = max_sched_prio(); + + if (prio >= max_prio) { + scx_bpf_error("invalid priority"); + return 0; + } + + return sched_prio_to_weight[max_prio - prio - 1]; } /* - * Return task's evaluated vruntime. + * Evaluate task's deadline. + * + * Reuse a logic similar to scx_rusty or scx_lavd and evaluate the deadline as + * a function of the waiting and wake-up events and the average task's runtime. 
*/ static u64 task_deadline(struct task_struct *p, struct task_ctx *tctx) { - u64 min_vruntime = vtime_now - task_lag(p); + u64 waker_freq, blocked_freq; + u64 lat_prio, lat_weight; + u64 avg_run_scaled, avg_run; + u64 freq_factor; + + /* + * Limit the wait and wake-up frequencies to prevent spikes. + */ + waker_freq = CLAMP(tctx->waker_freq, 1, MAX_WAKEUP_FREQ); + blocked_freq = CLAMP(tctx->blocked_freq, 1, MAX_WAKEUP_FREQ); + + /* + * We want to prioritize producers (waker tasks) more than consumers + * (blocked tasks), using the following formula: + * + * freq_factor = blocked_freq * waker_freq^2 + * + * This seems to improve the overall responsiveness of + * producer/consumer pipelines. + */ + freq_factor = blocked_freq * waker_freq * waker_freq; + + /* + * Evaluate the "latency priority" as a function of the wake-up, block + * frequencies and average runtime, using the following formula: + * + * lat_prio = log(freq_factor / avg_run_scaled) + * + * Frequencies can grow very quickly, almost exponential, so use + * log2_u64() to get a more linear metric that can be used as a + * priority. + * + * The avg_run_scaled component is used to scale the latency priority + * proportionally to the task's weight and inversely proportional to + * its runtime, so that a task with a higher weight / shorter runtime + * gets a higher latency priority than a task with a lower weight / + * higher runtime. + */ + avg_run_scaled = scale_inverse_fair(p, tctx, tctx->avg_runtime); + avg_run = log2_u64(avg_run_scaled + 1); + + lat_prio = log2_u64(freq_factor); + lat_prio = MIN(lat_prio, max_sched_prio()); + + if (lat_prio >= avg_run) + lat_prio -= avg_run; + else + lat_prio = 0; + + /* + * Lastly, translate the latency priority into a weight and apply it to + * the task's average runtime to determine the task's deadline. + */ + lat_weight = sched_prio_to_latency_weight(lat_prio); + + return tctx->avg_runtime * 100 / lat_weight; +} + +/* + * Return task's evaluated deadline applied to its vruntime. + */ +static u64 task_vtime(struct task_struct *p, struct task_ctx *tctx) +{ + u64 min_vruntime = vtime_now - task_lag(p, tctx); /* * Limit the vruntime to to avoid excessively penalizing tasks. */ if (vtime_before(p->scx.dsq_vtime, min_vruntime)) { p->scx.dsq_vtime = min_vruntime; - tctx->deadline = p->scx.dsq_vtime + task_compute_dl(p, tctx); + tctx->deadline = p->scx.dsq_vtime + task_deadline(p, tctx); } return tctx->deadline; @@ -356,28 +448,11 @@ static u64 task_deadline(struct task_struct *p, struct task_ctx *tctx) */ static inline void task_refill_slice(struct task_struct *p) { - u64 curr_prio_waiting = scx_bpf_dsq_nr_queued(PRIO_DSQ); - u64 curr_shared_waiting = scx_bpf_dsq_nr_queued(SHARED_DSQ); - u64 scale_factor; - - /* - * Refresh the amount of waiting tasks to get a more accurate scaling - * factor for the time slice. - */ - nr_prio_waiting = calc_avg(nr_prio_waiting, curr_prio_waiting); - nr_shared_waiting = calc_avg(nr_shared_waiting, curr_shared_waiting); - /* * Scale the time slice of an inversely proportional factor of the - * total amount of tasks that are waiting (use a more immediate metric - * in lowlatency mode and an average in normal mode). + * total amount of tasks that are waiting. 
*/ - if (lowlatency) - scale_factor = curr_shared_waiting + 1; - else - scale_factor = nr_prio_waiting + nr_shared_waiting + 1; - - p->scx.slice = CLAMP(slice_max / scale_factor, slice_min, slice_max); + p->scx.slice = CLAMP(slice_max / nr_tasks_waiting(), slice_min, slice_max); } static void task_set_domain(struct task_struct *p, s32 cpu, @@ -459,8 +534,7 @@ static void task_set_domain(struct task_struct *p, s32 cpu, static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) { const struct cpumask *idle_smtmask, *idle_cpumask; - struct bpf_cpumask *primary, *l2_domain, *l3_domain; - struct bpf_cpumask *p_mask, *l2_mask, *l3_mask; + const struct cpumask *primary, *p_mask, *l2_mask, *l3_mask; struct task_ctx *tctx; bool is_prev_llc_affine = false; s32 cpu; @@ -481,7 +555,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo if (!tctx) return -ENOENT; - primary = primary_cpumask; + primary = cast_mask(primary_cpumask); if (!primary) return -EINVAL; @@ -494,21 +568,21 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo /* * Task's scheduling domains. */ - p_mask = tctx->cpumask; + p_mask = cast_mask(tctx->cpumask); if (!p_mask) { scx_bpf_error("cpumask not initialized"); cpu = -EINVAL; goto out_put_cpumask; } - l2_mask = tctx->l2_cpumask; + l2_mask = cast_mask(tctx->l2_cpumask); if (!l2_mask) { scx_bpf_error("l2 cpumask not initialized"); cpu = -EINVAL; goto out_put_cpumask; } - l3_mask = tctx->l3_cpumask; + l3_mask = cast_mask(tctx->l3_cpumask); if (!l3_mask) { scx_bpf_error("l3 cpumask not initialized"); cpu = -EINVAL; @@ -519,7 +593,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo * Check if the previously used CPU is still in the L3 task domain. If * not, we may want to move the task back to its original L3 domain. */ - is_prev_llc_affine = bpf_cpumask_test_cpu(prev_cpu, cast_mask(l3_mask)); + is_prev_llc_affine = bpf_cpumask_test_cpu(prev_cpu, l3_mask); /* * If the current task is waking up another task and releasing the CPU @@ -528,7 +602,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo */ if (wake_flags & SCX_WAKE_SYNC) { struct task_struct *current = (void *)bpf_get_current_task_btf(); - struct bpf_cpumask *curr_l3_domain; + const struct cpumask *curr_l3_domain; struct cpu_ctx *cctx; bool share_llc, has_idle; @@ -542,7 +616,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo goto out_put_cpumask; } - curr_l3_domain = cctx->l3_cpumask; + curr_l3_domain = cast_mask(cctx->l3_cpumask); if (!curr_l3_domain) curr_l3_domain = primary; @@ -550,7 +624,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo * If both the waker and wakee share the same L3 cache keep * using the same CPU if possible. */ - share_llc = bpf_cpumask_test_cpu(prev_cpu, cast_mask(curr_l3_domain)); + share_llc = bpf_cpumask_test_cpu(prev_cpu, curr_l3_domain); if (share_llc && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { cpu = prev_cpu; @@ -563,9 +637,9 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo * the wakee on the same CPU as the waker (since it's going to * block and release the current CPU). 
*/ - has_idle = bpf_cpumask_intersects(cast_mask(curr_l3_domain), idle_cpumask); + has_idle = bpf_cpumask_intersects(curr_l3_domain, idle_cpumask); if (has_idle && - bpf_cpumask_test_cpu(cpu, cast_mask(p_mask)) && + bpf_cpumask_test_cpu(cpu, p_mask) && !(current->flags & PF_EXITING) && scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu) == 0) { *is_idle = true; @@ -593,7 +667,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo * Search for any full-idle CPU in the primary domain that * shares the same L2 cache. */ - cpu = scx_bpf_pick_idle_cpu(cast_mask(l2_mask), SCX_PICK_IDLE_CORE); + cpu = scx_bpf_pick_idle_cpu(l2_mask, SCX_PICK_IDLE_CORE); if (cpu >= 0) { *is_idle = true; goto out_put_cpumask; @@ -603,7 +677,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo * Search for any full-idle CPU in the primary domain that * shares the same L3 cache. */ - cpu = scx_bpf_pick_idle_cpu(cast_mask(l3_mask), SCX_PICK_IDLE_CORE); + cpu = scx_bpf_pick_idle_cpu(l3_mask, SCX_PICK_IDLE_CORE); if (cpu >= 0) { *is_idle = true; goto out_put_cpumask; @@ -612,7 +686,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo /* * Search for any other full-idle core in the primary domain. */ - cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), SCX_PICK_IDLE_CORE); + cpu = scx_bpf_pick_idle_cpu(p_mask, SCX_PICK_IDLE_CORE); if (cpu >= 0) { *is_idle = true; goto out_put_cpumask; @@ -634,7 +708,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo * Search for any idle CPU in the primary domain that shares the same * L2 cache. */ - cpu = scx_bpf_pick_idle_cpu(cast_mask(l2_mask), 0); + cpu = scx_bpf_pick_idle_cpu(l2_mask, 0); if (cpu >= 0) { *is_idle = true; goto out_put_cpumask; @@ -644,7 +718,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo * Search for any idle CPU in the primary domain that shares the same * L3 cache. */ - cpu = scx_bpf_pick_idle_cpu(cast_mask(l3_mask), 0); + cpu = scx_bpf_pick_idle_cpu(l3_mask, 0); if (cpu >= 0) { *is_idle = true; goto out_put_cpumask; @@ -653,7 +727,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo /* * Search for any idle CPU in the scheduling domain. */ - cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0); + cpu = scx_bpf_pick_idle_cpu(p_mask, 0); if (cpu >= 0) { *is_idle = true; goto out_put_cpumask; @@ -666,7 +740,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo if (is_prev_llc_affine) cpu = prev_cpu; else - cpu = scx_bpf_pick_any_cpu(cast_mask(l3_mask), 0); + cpu = scx_bpf_pick_any_cpu(l3_mask, 0); out_put_cpumask: scx_bpf_put_cpumask(idle_cpumask); @@ -716,14 +790,6 @@ static void kick_task_cpu(struct task_struct *p) scx_bpf_kick_cpu(cpu, 0); } -static bool is_task_interactive(const struct task_struct *p, - const struct task_ctx *tctx) -{ - if (!tctx->is_interactive) - return false; - return scx_bpf_dsq_nr_queued(PRIO_DSQ) < 100; -} - /* * Dispatch all the other tasks that were not dispatched directly in * select_cpu(). @@ -731,7 +797,6 @@ static bool is_task_interactive(const struct task_struct *p, void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags) { struct task_ctx *tctx; - s32 dsq_id; tctx = try_lookup_task_ctx(p); if (!tctx) @@ -753,19 +818,10 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags) /* * Dispatch interactive tasks to the priority DSQ and regular tasks to * the shared DSQ. 
- * - * When lowlatency is enabled, the separate priority DSQ is disabled, - * so in this case always dispatch to the shared DSQ. */ - if (!lowlatency && is_task_interactive(p, tctx)) { - dsq_id = PRIO_DSQ; - __sync_fetch_and_add(&nr_prio_dispatches, 1); - } else { - dsq_id = SHARED_DSQ; - __sync_fetch_and_add(&nr_shared_dispatches, 1); - } - scx_bpf_dispatch_vtime(p, dsq_id, SCX_SLICE_DFL, - task_deadline(p, tctx), enq_flags); + scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, + task_vtime(p, tctx), enq_flags); + __sync_fetch_and_add(&nr_shared_dispatches, 1); /* * If there is an idle CPU available for the task, wake it up so it can @@ -774,76 +830,13 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags) kick_task_cpu(p); } -/* - * Consume a task from the priority DSQ, transferring it to the local CPU DSQ. - * - * Return true if a task is consumed, false otherwise. - */ -static bool consume_prio_task(u64 now) -{ - return scx_bpf_consume(PRIO_DSQ); -} - -/* - * Consume a task from the shared DSQ, transferring it to the local CPU DSQ. - * - * Return true if a task is consumed, false otherwise. - */ -static bool consume_regular_task(u64 now) -{ - bool ret; - - ret = scx_bpf_consume(SHARED_DSQ); - if (ret) - starvation_shared_ts = now; - - return ret; -} - -/* - * Consume tasks that are potentially starving. - * - * In order to limit potential starvation conditions the scheduler uses a - * time-based threshold to ensure that at least one task from the - * lower-priority DSQs is periodically consumed. - */ -static bool consume_starving_tasks(u64 now) -{ - if (!starvation_thresh_ns) - return false; - - if (vtime_before(starvation_shared_ts + starvation_thresh_ns, now)) - if (consume_regular_task(now)) - return true; - - return false; -} - -/* - * Consume regular tasks from the per-CPU DSQ or a shared DSQ, transferring - * them to the local CPU DSQ. - * - * Return true if at least a task is consumed, false otherwise. - */ -static bool consume_shared_tasks(s32 cpu, u64 now) -{ - /* - * The priority DSQ can starve the shared DSQ, so to mitigate this - * starvation we have the starvation_thresh_ns, see also - * consume_starving_tasks(). - */ - if (consume_prio_task(now) || consume_regular_task(now)) - return true; - return false; -} - void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev) { - u64 now = bpf_ktime_get_ns(); - - if (consume_starving_tasks(now)) - return; - if (consume_shared_tasks(cpu, now)) + /* + * Consume regular tasks from the shared DSQ, transferring them to the + * local CPU DSQ. + */ + if (scx_bpf_consume(SHARED_DSQ)) return; /* * If the current task expired its time slice and no other task wants @@ -922,6 +915,7 @@ void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p) tctx = try_lookup_task_ctx(p); if (!tctx) return; + tctx->last_run_at = bpf_ktime_get_ns(); /* * Adjust target CPU frequency before the task starts to run. @@ -989,18 +983,15 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable) /* * Update task's average runtime. */ - slice = p->se.sum_exec_runtime - tctx->sum_exec_runtime; - if (lowlatency) - slice = CLAMP(slice, slice_min, slice_max); - tctx->sum_exec_runtime = p->se.sum_exec_runtime; - tctx->avg_runtime = calc_avg(tctx->avg_runtime, slice); + slice = now - tctx->last_run_at; + tctx->sum_runtime += slice; + tctx->avg_runtime = calc_avg(tctx->avg_runtime, tctx->sum_runtime); /* * Update task vruntime charging the weighted used time slice. 
*/ - slice = scale_inverse_fair(p, slice); - p->scx.dsq_vtime += slice; - tctx->deadline = p->scx.dsq_vtime + task_compute_dl(p, tctx); + p->scx.dsq_vtime += scale_inverse_fair(p, tctx, slice); + tctx->deadline = p->scx.dsq_vtime + task_deadline(p, tctx); /* * Refresh voluntary context switch metrics. @@ -1011,7 +1002,7 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable) delta_t = (s64)(now - tctx->nvcsw_ts); if (delta_t > NSEC_PER_SEC) { u64 avg_nvcsw = tctx->nvcsw * NSEC_PER_SEC / delta_t; - u64 max_lat_weight = nvcsw_max_thresh * 100; + u64 max_nvcsw = nvcsw_max_thresh * 100; tctx->nvcsw = 0; tctx->nvcsw_ts = now; @@ -1021,8 +1012,7 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable) * of voluntary context switches (limited to to prevent * excessive spikes). */ - tctx->lat_weight = calc_avg_clamp(tctx->lat_weight, avg_nvcsw, - 0, max_lat_weight); + tctx->avg_nvcsw = calc_avg_clamp(tctx->avg_nvcsw, avg_nvcsw, 0, max_nvcsw); /* * Classify the task based on the average of voluntary context @@ -1032,10 +1022,53 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable) * it is classified as interactive, otherwise the task is * classified as regular. */ - tctx->is_interactive = tctx->lat_weight >= nvcsw_max_thresh; + tctx->is_interactive = tctx->avg_nvcsw >= nvcsw_max_thresh; } } +void BPF_STRUCT_OPS(bpfland_runnable, struct task_struct *p, u64 enq_flags) +{ + u64 now = bpf_ktime_get_ns(), delta; + struct task_struct *waker; + struct task_ctx *tctx; + + tctx = try_lookup_task_ctx(p); + if (!tctx) + return; + tctx->sum_runtime = 0; + + waker = bpf_get_current_task_btf(); + tctx = try_lookup_task_ctx(waker); + if (!tctx) + return; + + delta = MAX(now - tctx->last_woke_at, 1); + tctx->waker_freq = update_freq(tctx->waker_freq, delta); + tctx->last_woke_at = now; +} + +void BPF_STRUCT_OPS(bpfland_quiescent, struct task_struct *p, u64 deq_flags) +{ + u64 now = bpf_ktime_get_ns(), delta; + struct task_ctx *tctx; + + tctx = try_lookup_task_ctx(p); + if (!tctx) + return; + + delta = MAX(now - tctx->last_blocked_at, 1); + tctx->blocked_freq = update_freq(tctx->blocked_freq, delta); + tctx->last_blocked_at = now; +} + +void BPF_STRUCT_OPS(bpfland_set_cpumask, struct task_struct *p, + const struct cpumask *cpumask) +{ + s32 cpu = bpf_get_smp_processor_id(); + + task_set_domain(p, cpu, cpumask); +} + void BPF_STRUCT_OPS(bpfland_enable, struct task_struct *p) { u64 now = bpf_ktime_get_ns(); @@ -1048,17 +1081,11 @@ void BPF_STRUCT_OPS(bpfland_enable, struct task_struct *p) tctx = try_lookup_task_ctx(p); if (!tctx) return; - tctx->sum_exec_runtime = p->se.sum_exec_runtime; tctx->nvcsw_ts = now; - tctx->deadline = vtime_now; -} + tctx->last_woke_at = now; + tctx->last_blocked_at = now; -void BPF_STRUCT_OPS(bpfland_set_cpumask, struct task_struct *p, - const struct cpumask *cpumask) -{ - s32 cpu = bpf_get_smp_processor_id(); - - task_set_domain(p, cpu, cpumask); + tctx->deadline = p->scx.dsq_vtime + task_deadline(p, tctx); } s32 BPF_STRUCT_OPS(bpfland_init_task, struct task_struct *p, @@ -1216,16 +1243,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init) nr_online_cpus = get_nr_online_cpus(); /* - * Create the global priority and shared DSQs. - * - * Allocate a new DSQ id that does not clash with any valid CPU id. + * Create the global shared DSQ. 
*/ - err = scx_bpf_create_dsq(PRIO_DSQ, -1); - if (err) { - scx_bpf_error("failed to create priority DSQ: %d", err); - return err; - } - err = scx_bpf_create_dsq(SHARED_DSQ, -1); if (err) { scx_bpf_error("failed to create shared DSQ: %d", err); @@ -1251,8 +1270,10 @@ SCX_OPS_DEFINE(bpfland_ops, .dispatch = (void *)bpfland_dispatch, .running = (void *)bpfland_running, .stopping = (void *)bpfland_stopping, - .enable = (void *)bpfland_enable, + .runnable = (void *)bpfland_runnable, + .quiescent = (void *)bpfland_quiescent, .set_cpumask = (void *)bpfland_set_cpumask, + .enable = (void *)bpfland_enable, .init_task = (void *)bpfland_init_task, .init = (void *)bpfland_init, .exit = (void *)bpfland_exit, diff --git a/scheds/rust/scx_bpfland/src/main.rs b/scheds/rust/scx_bpfland/src/main.rs index 043c07c..ca05230 100644 --- a/scheds/rust/scx_bpfland/src/main.rs +++ b/scheds/rust/scx_bpfland/src/main.rs @@ -138,15 +138,6 @@ struct Opts { #[clap(short = 'l', long, allow_hyphen_values = true, default_value = "20000")] slice_us_lag: i64, - /// With lowlatency enabled, instead of classifying tasks as interactive or non-interactive, - /// they all get a dynamic priority, which is adjusted in function of their average rate of - /// voluntary context switches. - /// - /// This option guarantess less spikey behavior and it can be particularly useful in soft - /// real-time scenarios, such as audio processing, multimedia, etc. - #[clap(short = 'L', long, action = clap::ArgAction::SetTrue)] - lowlatency: bool, - /// Enable kthreads prioritization. /// /// Enabling this can improve system performance, but it may also introduce interactivity @@ -181,11 +172,6 @@ struct Opts { #[clap(short = 'c', long, default_value = "10")] nvcsw_max_thresh: u64, - /// Prevent starvation by making sure that at least one lower priority task is scheduled every - /// starvation_thresh_us (0 = disable starvation prevention). - #[clap(short = 't', long, default_value = "1000")] - starvation_thresh_us: u64, - /// Enable stats monitoring with the specified interval. #[clap(long)] stats: Option, @@ -259,12 +245,10 @@ impl<'a> Scheduler<'a> { // Override default BPF scheduling parameters. skel.maps.rodata_data.debug = opts.debug; skel.maps.rodata_data.smt_enabled = smt_enabled; - skel.maps.rodata_data.lowlatency = opts.lowlatency; skel.maps.rodata_data.local_kthreads = opts.local_kthreads; skel.maps.rodata_data.slice_max = opts.slice_us * 1000; skel.maps.rodata_data.slice_min = opts.slice_us_min * 1000; skel.maps.rodata_data.slice_lag = opts.slice_us_lag * 1000; - skel.maps.rodata_data.starvation_thresh_ns = opts.starvation_thresh_us * 1000; skel.maps.rodata_data.nvcsw_max_thresh = opts.nvcsw_max_thresh; // Load the BPF program for validation. 
@@ -556,11 +540,8 @@ impl<'a> Scheduler<'a> { nr_running: self.skel.maps.bss_data.nr_running, nr_cpus: self.skel.maps.bss_data.nr_online_cpus, nr_interactive: self.skel.maps.bss_data.nr_interactive, - nr_prio_waiting: self.skel.maps.bss_data.nr_prio_waiting, - nr_shared_waiting: self.skel.maps.bss_data.nr_shared_waiting, nr_kthread_dispatches: self.skel.maps.bss_data.nr_kthread_dispatches, nr_direct_dispatches: self.skel.maps.bss_data.nr_direct_dispatches, - nr_prio_dispatches: self.skel.maps.bss_data.nr_prio_dispatches, nr_shared_dispatches: self.skel.maps.bss_data.nr_shared_dispatches, } } diff --git a/scheds/rust/scx_bpfland/src/stats.rs b/scheds/rust/scx_bpfland/src/stats.rs index 73e3a08..23869eb 100644 --- a/scheds/rust/scx_bpfland/src/stats.rs +++ b/scheds/rust/scx_bpfland/src/stats.rs @@ -21,16 +21,10 @@ pub struct Metrics { pub nr_cpus: u64, #[stat(desc = "Number of running interactive tasks")] pub nr_interactive: u64, - #[stat(desc = "Average amount of regular tasks waiting to be dispatched")] - pub nr_shared_waiting: u64, - #[stat(desc = "Average amount of interactive tasks waiting to be dispatched")] - pub nr_prio_waiting: u64, #[stat(desc = "Number of kthread direct dispatches")] pub nr_kthread_dispatches: u64, #[stat(desc = "Number of task direct dispatches")] pub nr_direct_dispatches: u64, - #[stat(desc = "Number of interactive task dispatches")] - pub nr_prio_dispatches: u64, #[stat(desc = "Number of regular task dispatches")] pub nr_shared_dispatches: u64, } @@ -39,16 +33,13 @@ impl Metrics { fn format(&self, w: &mut W) -> Result<()> { writeln!( w, - "[{}] tasks -> r: {:>2}/{:<2} i: {:<2} pw: {:<4} w: {:<4} | dispatch -> k: {:<5} d: {:<5} p: {:<5} s: {:<5}", + "[{}] tasks -> r: {:>2}/{:<2} i: {:<2} | dispatch -> k: {:<5} d: {:<5} s: {:<5}", crate::SCHEDULER_NAME, self.nr_running, self.nr_cpus, self.nr_interactive, - self.nr_prio_waiting, - self.nr_shared_waiting, self.nr_kthread_dispatches, self.nr_direct_dispatches, - self.nr_prio_dispatches, self.nr_shared_dispatches )?; Ok(()) @@ -58,7 +49,6 @@ impl Metrics { Self { nr_kthread_dispatches: self.nr_kthread_dispatches - rhs.nr_kthread_dispatches, nr_direct_dispatches: self.nr_direct_dispatches - rhs.nr_direct_dispatches, - nr_prio_dispatches: self.nr_prio_dispatches - rhs.nr_prio_dispatches, nr_shared_dispatches: self.nr_shared_dispatches - rhs.nr_shared_dispatches, ..self.clone() } From af2cb1abbed15df7661ece67377b1db181b128f9 Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Wed, 6 Nov 2024 08:33:29 -0800 Subject: [PATCH 09/11] scx_mitosis: add RCU-like synchronization scx_mitosis relied on the implicit assumption that after a sched tick, all outstanding scheduling events had completed but this might not actually be correct. This feels like a natural use-case for RCU, but there is no way to directly make use of RCU in BPF. Instead, this commit implements an RCU-like synchronization mechanism. 
Signed-off-by: Dan Schatzberg --- scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 132 +++++++++++++++++- 1 file changed, 126 insertions(+), 6 deletions(-) diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index 42e8fcb..da87a95 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -235,6 +235,91 @@ static inline const struct cpumask *lookup_cell_cpumask(int idx) return (const struct cpumask *)cpumaskw->cpumask; } +/* + * This is an RCU-like implementation to keep track of scheduling events so we + * can establish when cell assignments have propagated completely. + */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 1); +} percpu_critical_sections SEC(".maps"); + +/* Same implementation for enter/exit */ +static __always_inline int critical_section() +{ + u32 zero = 0; + u32 *data; + + if (!(data = bpf_map_lookup_elem(&percpu_critical_sections, &zero))) { + scx_bpf_error("no percpu_critical_sections"); + return -1; + } + + /* + * Bump the counter, the LSB indicates we are in a critical section and the + * rest of the bits keep track of how many critical sections. + */ + WRITE_ONCE(*data, *data + 1); + return 0; +} + +#define critical_section_enter() critical_section() +#define critical_section_exit() critical_section() + +u32 critical_section_state[MAX_CPUS]; +/* + * Write side will record the current state and then poll to check that the + * generation has advanced (somewhat like call_rcu) + */ +static __always_inline int critical_section_record() +{ + u32 zero = 0; + u32 *data; + int nr_cpus = nr_possible_cpus; + if (nr_cpus > MAX_CPUS) + nr_cpus = MAX_CPUS; + + for (int i = 0; i < nr_cpus; ++i) { + if (!(data = bpf_map_lookup_percpu_elem( + &percpu_critical_sections, &zero, i))) { + scx_bpf_error("no percpu_critical_sections"); + return -1; + } + + critical_section_state[i] = READ_ONCE(*data); + } + return 0; +} + +static __always_inline int critical_section_poll() +{ + u32 zero = 0; + u32 *data; + + int nr_cpus = nr_possible_cpus; + if (nr_cpus > MAX_CPUS) + nr_cpus = MAX_CPUS; + + for (int i = 0; i < nr_cpus; ++i) { + /* If not in a critical section at the time of record, then it passes */ + if (!(critical_section_state[i] & 1)) + continue; + + if (!(data = bpf_map_lookup_percpu_elem( + &percpu_critical_sections, &zero, i))) { + scx_bpf_error("no percpu_critical_sections"); + return -1; + } + + if (READ_ONCE(*data) == critical_section_state[i]) + return 1; + } + + return 0; +} + /* * Along with a user_global_seq bump, indicates that cgroup->cell assignment * changed @@ -264,6 +349,16 @@ int BPF_PROG(sched_tick_fentry) * scheduler tick. This is a crude way of mimicing RCU synchronization. */ if (READ_ONCE(draining)) { + if (critical_section_poll()) + return 0; + /* FIXME: If a cell is being destroyed, we need to make sure that dsq is + * drained before removing it from all the cpus + * + * Additionally, the handling of pinned tasks is broken here - we send + * them to a cell DSQ if there's overlap of the cell's CPUs and the + * task's cpumask but if the cell's CPU change we might stall the + * task indefinitely. + */ bpf_for(cpu_idx, 0, nr_possible_cpus) { if (!(cpu_ctx = lookup_cpu_ctx(cpu_idx))) @@ -422,6 +517,11 @@ int BPF_PROG(sched_tick_fentry) /* Bump the global seq last to ensure that prior stores are now visible. 
This synchronizes with the read of global_seq */ barrier(); WRITE_ONCE(global_seq, global_seq + 1); + /* + * On subsequent ticks we'll check that all in-flight enqueues are done so + * we can clear the prev_cell for each cpu. Record the state here. + */ + critical_section_record(); return 0; } @@ -611,8 +711,17 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu, if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) return prev_cpu; - if (maybe_refresh_cell(p, tctx) < 0) - return prev_cpu; + /* + * This is a lightweight (RCU-like) critical section covering from when we + * refresh cell information to when we enqueue onto the task's assigned + * cell's DSQ. This allows us to publish new cell assignments and establish + * a point at which all future enqueues will be on the new assignments. + */ + critical_section_enter(); + if (maybe_refresh_cell(p, tctx) < 0) { + cpu = prev_cpu; + goto out; + } if ((cpu = pick_idle_cpu(p, prev_cpu, cctx, tctx)) >= 0) { cstat_inc(CSTAT_LOCAL, tctx->cell, cctx); @@ -622,10 +731,12 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu, scx_bpf_error( "select_cpu returned cpu %d belonging to cell %d but task belongs to cell %d", cpu, cctx->cell, tctx->cell); - return cpu; + goto out; } - return prev_cpu; + cpu = prev_cpu; +out: + critical_section_exit(); } static __always_inline bool pick_idle_cpu_and_kick(struct task_struct *p, @@ -661,11 +772,18 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) return; + /* + * This is a lightweight (RCU-like) critical section covering from when we + * refresh cell information to when we enqueue onto the task's assigned + * cell's DSQ. This allows us to publish new cell assignments and establish + * a point at which all future enqueues will be on the new assignments. 
+ */ + critical_section_enter(); if (maybe_refresh_cell(p, tctx) < 0) - return; + goto out; if (!(cell = lookup_cell(tctx->cell))) - return; + goto out; /* * Limit the amount of budget that an idling task can accumulate @@ -689,6 +807,8 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) */ if (!(enq_flags & SCX_ENQ_WAKEUP)) pick_idle_cpu_and_kick(p, task_cpu, cctx, tctx); +out: + critical_section_exit(); } void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) From ad457271396f1b7da7fbca3f2b6eebe798aab3f0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 6 Nov 2024 06:54:26 -1000 Subject: [PATCH 10/11] version: v1.0.6 --- meson.build | 2 +- rust/scx_loader/Cargo.toml | 2 +- rust/scx_rustland_core/Cargo.toml | 6 +++--- rust/scx_stats/Cargo.toml | 2 +- rust/scx_stats/scx_stats_derive/Cargo.toml | 4 ++-- rust/scx_utils/Cargo.toml | 4 ++-- scheds/rust/scx_bpfland/Cargo.toml | 10 +++++----- scheds/rust/scx_lavd/Cargo.toml | 10 +++++----- scheds/rust/scx_layered/Cargo.toml | 10 +++++----- scheds/rust/scx_mitosis/Cargo.toml | 4 ++-- scheds/rust/scx_rlfifo/Cargo.toml | 10 +++++----- scheds/rust/scx_rustland/Cargo.toml | 14 +++++++------- scheds/rust/scx_rusty/Cargo.toml | 10 +++++----- 13 files changed, 44 insertions(+), 44 deletions(-) diff --git a/meson.build b/meson.build index dd29e52..cf5dd5c 100644 --- a/meson.build +++ b/meson.build @@ -1,5 +1,5 @@ project('sched_ext schedulers', 'c', - version: '1.0.5', + version: '1.0.6', license: 'GPL-2.0', meson_version : '>= 1.2.0',) diff --git a/rust/scx_loader/Cargo.toml b/rust/scx_loader/Cargo.toml index a4b761a..b0474fd 100644 --- a/rust/scx_loader/Cargo.toml +++ b/rust/scx_loader/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scx_loader" -version = "1.0.5" +version = "1.0.6" authors = ["Vladislav Nepogodin "] edition = "2021" description = "DBUS on-demand loader of sched-ext schedulers" diff --git a/rust/scx_rustland_core/Cargo.toml b/rust/scx_rustland_core/Cargo.toml index 987da27..949c30e 100644 --- a/rust/scx_rustland_core/Cargo.toml +++ b/rust/scx_rustland_core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scx_rustland_core" -version = "2.2.2" +version = "2.2.3" edition = "2021" authors = ["Andrea Righi "] license = "GPL-2.0-only" @@ -12,12 +12,12 @@ anyhow = "1.0.65" plain = "0.2.3" libbpf-rs = "0.24.1" libc = "0.2.137" -scx_utils = { path = "../scx_utils", version = "1.0.5" } +scx_utils = { path = "../scx_utils", version = "1.0.6" } [build-dependencies] tar = "0.4" walkdir = "2.4" -scx_utils = { path = "../scx_utils", version = "1.0.5" } +scx_utils = { path = "../scx_utils", version = "1.0.6" } [lib] name = "scx_rustland_core" diff --git a/rust/scx_stats/Cargo.toml b/rust/scx_stats/Cargo.toml index beefba7..cafdee4 100644 --- a/rust/scx_stats/Cargo.toml +++ b/rust/scx_stats/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scx_stats" -version = "1.0.5" +version = "1.0.6" edition = "2021" authors = ["Tejun Heo "] license = "GPL-2.0-only" diff --git a/rust/scx_stats/scx_stats_derive/Cargo.toml b/rust/scx_stats/scx_stats_derive/Cargo.toml index cfc21b9..4d0d41d 100644 --- a/rust/scx_stats/scx_stats_derive/Cargo.toml +++ b/rust/scx_stats/scx_stats_derive/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scx_stats_derive" -version = "1.0.5" +version = "1.0.6" edition = "2021" authors = ["Tejun Heo "] license = "GPL-2.0-only" @@ -13,6 +13,6 @@ proc-macro = true [dependencies] proc-macro2 = "1.0" quote = "1.0" -scx_stats = { path = "..", version = "1.0.5" } +scx_stats = { path = "..", version = "1.0.6" } serde_json 
= "1.0" syn = { version = "2.0", features = ["extra-traits", "full"] } diff --git a/rust/scx_utils/Cargo.toml b/rust/scx_utils/Cargo.toml index 99bd45a..3007e87 100644 --- a/rust/scx_utils/Cargo.toml +++ b/rust/scx_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scx_utils" -version = "1.0.5" +version = "1.0.6" edition = "2021" authors = ["Tejun Heo "] license = "GPL-2.0-only" @@ -22,7 +22,7 @@ log = "0.4.17" nvml-wrapper = { version = "0.10.0", optional = true } paste = "1.0" regex = "1.10" -scx_stats = { path = "../scx_stats", version = "1.0.5" } +scx_stats = { path = "../scx_stats", version = "1.0.6" } serde = { version = "1.0", features = ["derive"] } sscanf = "0.4" tar = "0.4" diff --git a/scheds/rust/scx_bpfland/Cargo.toml b/scheds/rust/scx_bpfland/Cargo.toml index 2ac29a7..6c416f7 100644 --- a/scheds/rust/scx_bpfland/Cargo.toml +++ b/scheds/rust/scx_bpfland/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scx_bpfland" -version = "1.0.5" +version = "1.0.6" authors = ["Andrea Righi "] edition = "2021" description = "A vruntime-based sched_ext scheduler that prioritizes interactive workloads. https://github.com/sched-ext/scx/tree/main" @@ -13,14 +13,14 @@ clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } crossbeam = "0.8.4" libbpf-rs = "0.24.1" log = "0.4.17" -scx_stats = { path = "../../../rust/scx_stats", version = "1.0.5" } -scx_stats_derive = { path = "../../../rust/scx_stats/scx_stats_derive", version = "1.0.5" } -scx_utils = { path = "../../../rust/scx_utils", version = "1.0.5" } +scx_stats = { path = "../../../rust/scx_stats", version = "1.0.6" } +scx_stats_derive = { path = "../../../rust/scx_stats/scx_stats_derive", version = "1.0.6" } +scx_utils = { path = "../../../rust/scx_utils", version = "1.0.6" } serde = { version = "1.0", features = ["derive"] } simplelog = "0.12" [build-dependencies] -scx_utils = { path = "../../../rust/scx_utils", version = "1.0.5" } +scx_utils = { path = "../../../rust/scx_utils", version = "1.0.6" } [features] enable_backtrace = [] diff --git a/scheds/rust/scx_lavd/Cargo.toml b/scheds/rust/scx_lavd/Cargo.toml index 3ed285d..9efef6e 100644 --- a/scheds/rust/scx_lavd/Cargo.toml +++ b/scheds/rust/scx_lavd/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scx_lavd" -version = "1.0.5" +version = "1.0.6" authors = ["Changwoo Min ", "Igalia"] edition = "2021" description = "A Latency-criticality Aware Virtual Deadline (LAVD) scheduler based on sched_ext, which is a Linux kernel feature which enables implementing kernel thread schedulers in BPF and dynamically loading them. 
https://github.com/sched-ext/scx/tree/main" @@ -19,9 +19,9 @@ libbpf-rs = "0.24.1" libc = "0.2.137" log = "0.4.17" ordered-float = "3.4.0" -scx_stats = { path = "../../../rust/scx_stats", version = "1.0.5" } -scx_stats_derive = { path = "../../../rust/scx_stats/scx_stats_derive", version = "1.0.5" } -scx_utils = { path = "../../../rust/scx_utils", version = "1.0.5" } +scx_stats = { path = "../../../rust/scx_stats", version = "1.0.6" } +scx_stats_derive = { path = "../../../rust/scx_stats/scx_stats_derive", version = "1.0.6" } +scx_utils = { path = "../../../rust/scx_utils", version = "1.0.6" } serde = { version = "1.0", features = ["derive"] } simplelog = "0.12" static_assertions = "1.1.0" @@ -29,7 +29,7 @@ plain = "0.2.3" gpoint = "0.2" [build-dependencies] -scx_utils = { path = "../../../rust/scx_utils", version = "1.0.5" } +scx_utils = { path = "../../../rust/scx_utils", version = "1.0.6" } [features] enable_backtrace = [] diff --git a/scheds/rust/scx_layered/Cargo.toml b/scheds/rust/scx_layered/Cargo.toml index 8bd62c5..e753e1f 100644 --- a/scheds/rust/scx_layered/Cargo.toml +++ b/scheds/rust/scx_layered/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scx_layered" -version = "1.0.5" +version = "1.0.6" authors = ["Tejun Heo ", "Meta"] edition = "2021" description = "A highly configurable multi-layer BPF / user space hybrid scheduler used within sched_ext, which is a Linux kernel feature which enables implementing kernel thread schedulers in BPF and dynamically loading them. https://github.com/sched-ext/scx/tree/main" @@ -19,15 +19,15 @@ lazy_static = "1.4" libbpf-rs = "0.24.1" libc = "0.2.137" log = "0.4.17" -scx_stats = { path = "../../../rust/scx_stats", version = "1.0.5" } -scx_stats_derive = { path = "../../../rust/scx_stats/scx_stats_derive", version = "1.0.5" } -scx_utils = { path = "../../../rust/scx_utils", version = "1.0.5" } +scx_stats = { path = "../../../rust/scx_stats", version = "1.0.6" } +scx_stats_derive = { path = "../../../rust/scx_stats/scx_stats_derive", version = "1.0.6" } +scx_utils = { path = "../../../rust/scx_utils", version = "1.0.6" } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" simplelog = "0.12" [build-dependencies] -scx_utils = { path = "../../../rust/scx_utils", version = "1.0.5" } +scx_utils = { path = "../../../rust/scx_utils", version = "1.0.6" } [features] enable_backtrace = [] diff --git a/scheds/rust/scx_mitosis/Cargo.toml b/scheds/rust/scx_mitosis/Cargo.toml index c0206e0..71cd23a 100644 --- a/scheds/rust/scx_mitosis/Cargo.toml +++ b/scheds/rust/scx_mitosis/Cargo.toml @@ -19,13 +19,13 @@ libbpf-rs = "0.24.1" libc = "0.2.137" log = "0.4.17" maplit = "1.0.2" -scx_utils = { path = "../../../rust/scx_utils", version = "1.0.5" } +scx_utils = { path = "../../../rust/scx_utils", version = "1.0.6" } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" simplelog = "0.12" [build-dependencies] -scx_utils = { path = "../../../rust/scx_utils", version = "1.0.5" } +scx_utils = { path = "../../../rust/scx_utils", version = "1.0.6" } [features] enable_backtrace = [] diff --git a/scheds/rust/scx_rlfifo/Cargo.toml b/scheds/rust/scx_rlfifo/Cargo.toml index fdb21d4..0d107d3 100644 --- a/scheds/rust/scx_rlfifo/Cargo.toml +++ b/scheds/rust/scx_rlfifo/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scx_rlfifo" -version = "1.0.5" +version = "1.0.6" authors = ["Andrea Righi "] edition = "2021" description = "A simple FIFO scheduler in Rust that runs in user-space" @@ -12,12 +12,12 @@ plain = "0.2.3" ctrlc = { version = "3.1", features = 
["termination"] } libbpf-rs = "0.24.1" libc = "0.2.137" -scx_utils = { path = "../../../rust/scx_utils", version = "1.0.5" } -scx_rustland_core = { path = "../../../rust/scx_rustland_core", version = "2.2.2" } +scx_utils = { path = "../../../rust/scx_utils", version = "1.0.6" } +scx_rustland_core = { path = "../../../rust/scx_rustland_core", version = "2.2.3" } [build-dependencies] -scx_utils = { path = "../../../rust/scx_utils", version = "1.0.5" } -scx_rustland_core = { path = "../../../rust/scx_rustland_core", version = "2.2.2" } +scx_utils = { path = "../../../rust/scx_utils", version = "1.0.6" } +scx_rustland_core = { path = "../../../rust/scx_rustland_core", version = "2.2.3" } [features] enable_backtrace = [] diff --git a/scheds/rust/scx_rustland/Cargo.toml b/scheds/rust/scx_rustland/Cargo.toml index 5e1ab94..6c716bc 100644 --- a/scheds/rust/scx_rustland/Cargo.toml +++ b/scheds/rust/scx_rustland/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scx_rustland" -version = "1.0.5" +version = "1.0.6" authors = ["Andrea Righi "] edition = "2021" description = "A BPF component (dispatcher) that implements the low level sched-ext functionalities and a user-space counterpart (scheduler), written in Rust, that implements the actual scheduling policy. This is used within sched_ext, which is a Linux kernel feature which enables implementing kernel thread schedulers in BPF and dynamically loading them. https://github.com/sched-ext/scx/tree/main" @@ -17,15 +17,15 @@ libc = "0.2.137" log = "0.4.17" ordered-float = "3.4.0" serde = { version = "1.0", features = ["derive"] } -scx_stats = { path = "../../../rust/scx_stats", version = "1.0.5" } -scx_stats_derive = { path = "../../../rust/scx_stats/scx_stats_derive", version = "1.0.5" } -scx_utils = { path = "../../../rust/scx_utils", version = "1.0.5" } -scx_rustland_core = { path = "../../../rust/scx_rustland_core", version = "2.2.2" } +scx_stats = { path = "../../../rust/scx_stats", version = "1.0.6" } +scx_stats_derive = { path = "../../../rust/scx_stats/scx_stats_derive", version = "1.0.6" } +scx_utils = { path = "../../../rust/scx_utils", version = "1.0.6" } +scx_rustland_core = { path = "../../../rust/scx_rustland_core", version = "2.2.3" } simplelog = "0.12" [build-dependencies] -scx_utils = { path = "../../../rust/scx_utils", version = "1.0.5" } -scx_rustland_core = { path = "../../../rust/scx_rustland_core", version = "2.2.2" } +scx_utils = { path = "../../../rust/scx_utils", version = "1.0.6" } +scx_rustland_core = { path = "../../../rust/scx_rustland_core", version = "2.2.3" } [features] enable_backtrace = [] diff --git a/scheds/rust/scx_rusty/Cargo.toml b/scheds/rust/scx_rusty/Cargo.toml index 18962fc..9b68dfe 100644 --- a/scheds/rust/scx_rusty/Cargo.toml +++ b/scheds/rust/scx_rusty/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scx_rusty" -version = "1.0.5" +version = "1.0.6" authors = ["Dan Schatzberg ", "Meta"] edition = "2021" description = "A multi-domain, BPF / user space hybrid scheduler used within sched_ext, which is a Linux kernel feature which enables implementing kernel thread schedulers in BPF and dynamically loading them. 
https://github.com/sched-ext/scx/tree/main" @@ -17,16 +17,16 @@ libbpf-rs = "0.24.1" libc = "0.2.137" log = "0.4.17" ordered-float = "3.4.0" -scx_stats = { path = "../../../rust/scx_stats", version = "1.0.5" } -scx_stats_derive = { path = "../../../rust/scx_stats/scx_stats_derive", version = "1.0.5" } -scx_utils = { path = "../../../rust/scx_utils", version = "1.0.5" } +scx_stats = { path = "../../../rust/scx_stats", version = "1.0.6" } +scx_stats_derive = { path = "../../../rust/scx_stats/scx_stats_derive", version = "1.0.6" } +scx_utils = { path = "../../../rust/scx_utils", version = "1.0.6" } serde = { version = "1.0", features = ["derive"] } simplelog = "0.12" sorted-vec = "0.8.3" static_assertions = "1.1.0" [build-dependencies] -scx_utils = { path = "../../../rust/scx_utils", version = "1.0.5" } +scx_utils = { path = "../../../rust/scx_utils", version = "1.0.6" } [features] enable_backtrace = [] From 2fca9e38a580d72c75d3b3a33f64ec3edce4a5e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 6 Nov 2024 09:27:52 -1000 Subject: [PATCH 11/11] version: v1.0.6 Cargo.lock updates --- Cargo.lock | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 84f6fc7..0a2cb0b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1685,7 +1685,7 @@ dependencies = [ [[package]] name = "scx_bpfland" -version = "1.0.5" +version = "1.0.6" dependencies = [ "anyhow", "clap", @@ -1702,7 +1702,7 @@ dependencies = [ [[package]] name = "scx_lavd" -version = "1.0.5" +version = "1.0.6" dependencies = [ "anyhow", "bitvec", @@ -1728,7 +1728,7 @@ dependencies = [ [[package]] name = "scx_layered" -version = "1.0.5" +version = "1.0.6" dependencies = [ "anyhow", "bitvec", @@ -1752,7 +1752,7 @@ dependencies = [ [[package]] name = "scx_loader" -version = "1.0.5" +version = "1.0.6" dependencies = [ "anyhow", "clap", @@ -1792,7 +1792,7 @@ dependencies = [ [[package]] name = "scx_rlfifo" -version = "1.0.5" +version = "1.0.6" dependencies = [ "anyhow", "ctrlc", @@ -1805,7 +1805,7 @@ dependencies = [ [[package]] name = "scx_rustland" -version = "1.0.5" +version = "1.0.6" dependencies = [ "anyhow", "clap", @@ -1826,7 +1826,7 @@ dependencies = [ [[package]] name = "scx_rustland_core" -version = "2.2.2" +version = "2.2.3" dependencies = [ "anyhow", "libbpf-rs", @@ -1839,7 +1839,7 @@ dependencies = [ [[package]] name = "scx_rusty" -version = "1.0.5" +version = "1.0.6" dependencies = [ "anyhow", "chrono", @@ -1862,7 +1862,7 @@ dependencies = [ [[package]] name = "scx_stats" -version = "1.0.5" +version = "1.0.6" dependencies = [ "anyhow", "crossbeam", @@ -1879,7 +1879,7 @@ dependencies = [ [[package]] name = "scx_stats_derive" -version = "1.0.5" +version = "1.0.6" dependencies = [ "proc-macro2", "quote", @@ -1890,7 +1890,7 @@ dependencies = [ [[package]] name = "scx_utils" -version = "1.0.5" +version = "1.0.6" dependencies = [ "anyhow", "bindgen",