scx_bpfland: drop lowlatency mode and the priority DSQ

Schedule all tasks using a single global DSQ. This gives better
control over potential starvation conditions.

With this change, scx_bpfland adopts logic similar to scx_rusty and
scx_lavd, prioritizing tasks based on the frequency of their wait and
wake-up events, rather than relying exclusively on their average rate
of voluntary context switches.
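
For reference, the new deadline heuristic can be summarized with the
standalone sketch below (plain C, userspace-compilable; the constants
and the weight table mirror the BPF code in this commit, while the
helper names and the sample numbers in main() are illustrative only):

#include <inttypes.h>
#include <stdio.h>

#define MAX_WAKEUP_FREQ  1024ULL
#define MAX_TASK_WEIGHT  10000ULL
#define NVCSW_MAX_THRESH 10ULL

#define MAX(x, y)          ((x) > (y) ? (x) : (y))
#define MIN(x, y)          ((x) < (y) ? (x) : (y))
#define CLAMP(val, lo, hi) MIN(MAX(val, lo), hi)

/* Same table as the kernel's sched_prio_to_weight[] (nice -20..19). */
static const uint64_t sched_prio_to_weight[40] = {
    /* -20 */ 88761, 71755, 56483, 46273, 36291,
    /* -15 */ 29154, 23254, 18705, 14949, 11916,
    /* -10 */  9548,  7620,  6100,  4904,  3906,
    /*  -5 */  3121,  2501,  1991,  1586,  1277,
    /*   0 */  1024,   820,   655,   526,   423,
    /*   5 */   335,   272,   215,   172,   137,
    /*  10 */   110,    87,    70,    56,    45,
    /*  15 */    36,    29,    23,    18,    15,
};

/* Integer log2, standing in for the BPF log2_u64() helper. */
static uint64_t log2_u64(uint64_t v)
{
    uint64_t r = 0;

    while (v >>= 1)
        r++;
    return r;
}

/*
 * weight: static task weight (100 == nice 0)
 * avg_nvcsw: average rate of voluntary context switches
 * avg_runtime: average used time slice (ns)
 * waker_freq / blocked_freq: wake-up and wait frequencies (events/sec)
 */
static uint64_t task_deadline(uint64_t weight, uint64_t avg_nvcsw,
                              uint64_t avg_runtime,
                              uint64_t waker_freq, uint64_t blocked_freq)
{
    uint64_t dyn_weight, avg_run_scaled, avg_run;
    uint64_t freq_factor, lat_prio, lat_weight;

    /* Dynamic weight: static weight scaled by the nvcsw average. */
    dyn_weight = weight * CLAMP(avg_nvcsw, 1, NVCSW_MAX_THRESH);
    dyn_weight = CLAMP(dyn_weight, 1, MAX_TASK_WEIGHT);

    /* Producers (wakers) count more than consumers (blocked tasks). */
    waker_freq = CLAMP(waker_freq, 1, MAX_WAKEUP_FREQ);
    blocked_freq = CLAMP(blocked_freq, 1, MAX_WAKEUP_FREQ);
    freq_factor = blocked_freq * waker_freq * waker_freq;

    /* Latency priority: log of the frequencies vs. the scaled runtime. */
    avg_run_scaled = avg_runtime * 100 / dyn_weight;
    avg_run = log2_u64(avg_run_scaled + 1);
    lat_prio = MIN(log2_u64(freq_factor), 39);
    lat_prio = lat_prio > avg_run ? lat_prio - avg_run : 0;

    /* Translate the priority into a weight and derive the deadline. */
    lat_weight = sched_prio_to_weight[40 - lat_prio - 1];
    return avg_runtime * 100 / lat_weight;
}

int main(void)
{
    /* A frequently waking producer vs. a CPU-bound task, both nice 0. */
    printf("producer deadline: %" PRIu64 " ns\n",
           task_deadline(100, 10, 200000, 500, 500));
    printf("cpu hog deadline:  %" PRIu64 " ns\n",
           task_deadline(100, 1, 10000000, 1, 1));
    return 0;
}

The producer ends up with a deadline hundreds of times shorter than the
CPU hog's, even though both tasks have the same static weight, and all
tasks are ordered by this deadline in the single shared DSQ.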

Tasks are still classified as interactive / non-interactive based on
their rate of voluntary context switches, but this classification now
only affects the cpufreq logic.
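
As a rough sketch of the classification path (plain C; the one-second
window, the clamping and the nvcsw_max_thresh comparison mirror the
scheduler code, while the exact calc_avg() weighting and the numbers in
main() are assumptions):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC     1000000000ULL
#define NVCSW_MAX_THRESH 10ULL   /* default nvcsw_max_thresh */

/* Moving average; the 3/4-1/4 weighting here is an assumption. */
static uint64_t calc_avg(uint64_t old_val, uint64_t new_val)
{
    return (old_val - (old_val >> 2)) + (new_val >> 2);
}

/*
 * Refresh the average rate of voluntary context switches (nvcsw events
 * observed over delta_t_ns) and classify the task. The result only
 * drives the cpufreq side; it no longer selects a separate DSQ.
 */
static bool update_interactive(uint64_t *avg_nvcsw, uint64_t nvcsw,
                               uint64_t delta_t_ns)
{
    uint64_t rate = nvcsw * NSEC_PER_SEC / delta_t_ns;
    uint64_t max = NVCSW_MAX_THRESH * 100;

    *avg_nvcsw = calc_avg(*avg_nvcsw, rate);
    if (*avg_nvcsw > max)
        *avg_nvcsw = max;

    return *avg_nvcsw >= NVCSW_MAX_THRESH;
}

int main(void)
{
    uint64_t avg_hi = 0, avg_lo = 0;
    bool hi = false, lo = false;

    /* Simulate five one-second windows for two tasks. */
    for (int i = 0; i < 5; i++) {
        hi = update_interactive(&avg_hi, 50, NSEC_PER_SEC);
        lo = update_interactive(&avg_lo, 2, NSEC_PER_SEC);
    }
    printf("50 nvcsw/s -> interactive=%d, 2 nvcsw/s -> interactive=%d\n",
           hi, lo);
    return 0;
}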

Signed-off-by: Andrea Righi <arighi@nvidia.com>
Andrea Righi 2024-11-05 10:15:41 +01:00
parent efc41dd936
commit 78101e4688
4 changed files with 252 additions and 259 deletions


@ -13,6 +13,7 @@
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define CLAMP(val, lo, hi) MIN(MAX(val, lo), hi)
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
enum consts {
NSEC_PER_USEC = 1000ULL,


@ -17,14 +17,19 @@ char _license[] SEC("license") = "GPL";
const volatile bool debug;
/*
* Priority DSQ used to dispatch interactive tasks.
* Maximum task weight.
*/
#define PRIO_DSQ 0
#define MAX_TASK_WEIGHT 10000
/*
* Maximum frequency of task wakeup events / sec.
*/
#define MAX_WAKEUP_FREQ 1024
/*
* DSQ used to dispatch regular tasks.
*/
#define SHARED_DSQ 1
#define SHARED_DSQ 0
/*
* Default task time slice.
@ -55,17 +60,6 @@ const volatile s64 slice_lag = 20ULL * NSEC_PER_MSEC;
*/
const volatile bool local_kthreads;
/*
* With lowlatency enabled, instead of classifying tasks as interactive or
* non-interactive, they all get a dynamic priority, which is adjusted in
* function of their average rate of voluntary context switches.
*
* This option guarantess less spikey behavior and it can be particularly
* useful in soft real-time scenarios, such as audio processing, multimedia,
* etc.
*/
const volatile bool lowlatency;
/*
* Maximum threshold of voluntary context switches.
*/
@ -77,31 +71,15 @@ const volatile u64 nvcsw_max_thresh = 10ULL;
*/
volatile s64 cpufreq_perf_lvl;
/*
* Time threshold to prevent task starvation.
*
* Tasks dispatched to the priority DSQ are always consumed before those
* dispatched to the shared DSQ, so tasks in shared DSQ may be starved by those
* in the priority DSQ.
*
* To mitigate this, store the timestamp of the last task consumption from
* the shared DSQ. If the starvation_thresh_ns threshold is exceeded without
* consuming a task, the scheduler will be forced to consume a task from the
* corresponding DSQ.
*/
const volatile u64 starvation_thresh_ns = 1000ULL * NSEC_PER_MSEC;
static u64 starvation_shared_ts;
/*
* Scheduling statistics.
*/
volatile u64 nr_kthread_dispatches, nr_direct_dispatches,
nr_prio_dispatches, nr_shared_dispatches;
volatile u64 nr_kthread_dispatches, nr_direct_dispatches, nr_shared_dispatches;
/*
* Amount of currently running tasks.
*/
volatile u64 nr_running, nr_interactive, nr_shared_waiting, nr_prio_waiting;
volatile u64 nr_running, nr_interactive;
/*
* Amount of online CPUs.
@ -169,26 +147,31 @@ struct task_ctx {
struct bpf_cpumask __kptr *l2_cpumask;
struct bpf_cpumask __kptr *l3_cpumask;
/*
* Total execution time of the task.
*/
u64 sum_exec_runtime;
/*
* Voluntary context switches metrics.
*/
u64 nvcsw;
u64 nvcsw_ts;
u64 avg_nvcsw;
/*
* Task's latency priority.
* Frequency with which a task is blocked (consumer).
*/
u64 lat_weight;
u64 blocked_freq;
u64 last_blocked_at;
/*
* Frequency with which a task wakes other tasks (producer).
*/
u64 waker_freq;
u64 last_woke_at;
/*
* Task's average used time slice.
*/
u64 avg_runtime;
u64 sum_runtime;
u64 last_run_at;
/*
* Task's deadline.
@ -276,75 +259,184 @@ static u64 calc_avg_clamp(u64 old_val, u64 new_val, u64 low, u64 high)
}
/*
* Return the dynamic priority multiplier (only applied in lowlatency mode).
*
* The multiplier is evaluated in function of the task's average rate of
* voluntary context switches per second.
* Evaluate the average frequency of an event over time.
*/
static u64 task_dyn_prio(struct task_struct *p)
static u64 update_freq(u64 freq, u64 delta)
{
struct task_ctx *tctx;
u64 new_freq;
if (!lowlatency)
return 1;
tctx = try_lookup_task_ctx(p);
if (!tctx)
return 1;
return MAX(tctx->lat_weight, 1);
new_freq = NSEC_PER_SEC / delta;
return calc_avg(freq, new_freq);
}
/*
* Return task's dynamic priority.
* Return the total amount of tasks that are currently waiting to be scheduled.
*/
static u64 task_prio(const struct task_struct *p)
static u64 nr_tasks_waiting(void)
{
return p->scx.weight * task_dyn_prio(p);
return scx_bpf_dsq_nr_queued(SHARED_DSQ) + 1;
}
/*
* Return task's dynamic weight.
*/
static u64 task_weight(const struct task_struct *p, const struct task_ctx *tctx)
{
/*
* Scale the static task weight by the average amount of voluntary
* context switches to determine the dynamic weight.
*/
u64 prio = p->scx.weight * CLAMP(tctx->avg_nvcsw, 1, nvcsw_max_thresh);
return CLAMP(prio, 1, MAX_TASK_WEIGHT);
}
/*
* Return a value proportionally scaled to the task's priority.
*/
static u64 scale_up_fair(const struct task_struct *p,
const struct task_ctx *tctx, u64 value)
{
return value * task_weight(p, tctx) / 100;
}
/*
* Return a value inversely proportional to the task's priority.
*/
static u64 scale_inverse_fair(const struct task_struct *p,
const struct task_ctx *tctx, u64 value)
{
return value * 100 / task_weight(p, tctx);
}
/*
* Return the task's allowed lag: used to determine how early its vruntime can
* be.
*/
static u64 task_lag(const struct task_struct *p)
static u64 task_lag(const struct task_struct *p, const struct task_ctx *tctx)
{
return slice_lag * task_prio(p) / 100;
return scale_up_fair(p, tctx, slice_lag);
}
/*
* Return a value inversely proportional to the task's weight.
* ** Taken directly from fair.c in the Linux kernel **
*
* The "10% effect" is relative and cumulative: from _any_ nice level,
* if you go up 1 level, it's -10% CPU usage, if you go down 1 level
* it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
* If a task goes up by ~10% and another task goes down by ~10% then
* the relative distance between them is ~25%.)
*/
static u64 scale_inverse_fair(const struct task_struct *p, u64 value)
const int sched_prio_to_weight[40] = {
/* -20 */ 88761, 71755, 56483, 46273, 36291,
/* -15 */ 29154, 23254, 18705, 14949, 11916,
/* -10 */ 9548, 7620, 6100, 4904, 3906,
/* -5 */ 3121, 2501, 1991, 1586, 1277,
/* 0 */ 1024, 820, 655, 526, 423,
/* 5 */ 335, 272, 215, 172, 137,
/* 10 */ 110, 87, 70, 56, 45,
/* 15 */ 36, 29, 23, 18, 15,
};
static u64 max_sched_prio(void)
{
return value * 100 / task_prio(p);
return ARRAY_SIZE(sched_prio_to_weight);
}
/*
* Compute the deadline component of a task (this value will be added to the
* task's vruntime to determine the actual deadline).
* Convert task priority to weight (following fair.c logic).
*/
static s64 task_compute_dl(const struct task_struct *p,
const struct task_ctx *tctx)
static u64 sched_prio_to_latency_weight(u64 prio)
{
/*
* Return the deadline as a function of the average runtime and the
* evaluated task's dynamic priority.
*/
return scale_inverse_fair(p, tctx->avg_runtime);
u64 max_prio = max_sched_prio();
if (prio >= max_prio) {
scx_bpf_error("invalid priority");
return 0;
}
return sched_prio_to_weight[max_prio - prio - 1];
}
/*
* Return task's evaluated vruntime.
* Evaluate task's deadline.
*
* Reuse a logic similar to scx_rusty or scx_lavd and evaluate the deadline as
* a function of the waiting and wake-up events and the average task's runtime.
*/
static u64 task_deadline(struct task_struct *p, struct task_ctx *tctx)
{
u64 min_vruntime = vtime_now - task_lag(p);
u64 waker_freq, blocked_freq;
u64 lat_prio, lat_weight;
u64 avg_run_scaled, avg_run;
u64 freq_factor;
/*
* Limit the wait and wake-up frequencies to prevent spikes.
*/
waker_freq = CLAMP(tctx->waker_freq, 1, MAX_WAKEUP_FREQ);
blocked_freq = CLAMP(tctx->blocked_freq, 1, MAX_WAKEUP_FREQ);
/*
* We want to prioritize producers (waker tasks) more than consumers
* (blocked tasks), using the following formula:
*
* freq_factor = blocked_freq * waker_freq^2
*
* This seems to improve the overall responsiveness of
* producer/consumer pipelines.
*/
freq_factor = blocked_freq * waker_freq * waker_freq;
/*
* Evaluate the "latency priority" as a function of the wake-up, block
* frequencies and average runtime, using the following formula:
*
* lat_prio = log(freq_factor / avg_run_scaled)
*
* Frequencies can grow very quickly, almost exponentially, so use
* log2_u64() to get a more linear metric that can be used as a
* priority.
*
* The avg_run_scaled component is used to scale the latency priority
* proportionally to the task's weight and inversely proportional to
* its runtime, so that a task with a higher weight / shorter runtime
* gets a higher latency priority than a task with a lower weight /
* higher runtime.
*/
avg_run_scaled = scale_inverse_fair(p, tctx, tctx->avg_runtime);
avg_run = log2_u64(avg_run_scaled + 1);
lat_prio = log2_u64(freq_factor);
lat_prio = MIN(lat_prio, max_sched_prio());
if (lat_prio >= avg_run)
lat_prio -= avg_run;
else
lat_prio = 0;
/*
* Lastly, translate the latency priority into a weight and apply it to
* the task's average runtime to determine the task's deadline.
*/
lat_weight = sched_prio_to_latency_weight(lat_prio);
return tctx->avg_runtime * 100 / lat_weight;
}
/*
* Return task's evaluated deadline applied to its vruntime.
*/
static u64 task_vtime(struct task_struct *p, struct task_ctx *tctx)
{
u64 min_vruntime = vtime_now - task_lag(p, tctx);
/*
* Limit the vruntime to avoid excessively penalizing tasks.
*/
if (vtime_before(p->scx.dsq_vtime, min_vruntime)) {
p->scx.dsq_vtime = min_vruntime;
tctx->deadline = p->scx.dsq_vtime + task_compute_dl(p, tctx);
tctx->deadline = p->scx.dsq_vtime + task_deadline(p, tctx);
}
return tctx->deadline;
@ -356,28 +448,11 @@ static u64 task_deadline(struct task_struct *p, struct task_ctx *tctx)
*/
static inline void task_refill_slice(struct task_struct *p)
{
u64 curr_prio_waiting = scx_bpf_dsq_nr_queued(PRIO_DSQ);
u64 curr_shared_waiting = scx_bpf_dsq_nr_queued(SHARED_DSQ);
u64 scale_factor;
/*
* Refresh the amount of waiting tasks to get a more accurate scaling
* factor for the time slice.
*/
nr_prio_waiting = calc_avg(nr_prio_waiting, curr_prio_waiting);
nr_shared_waiting = calc_avg(nr_shared_waiting, curr_shared_waiting);
/*
* Scale the time slice of an inversely proportional factor of the
* total amount of tasks that are waiting (use a more immediate metric
* in lowlatency mode and an average in normal mode).
* total amount of tasks that are waiting.
*/
if (lowlatency)
scale_factor = curr_shared_waiting + 1;
else
scale_factor = nr_prio_waiting + nr_shared_waiting + 1;
p->scx.slice = CLAMP(slice_max / scale_factor, slice_min, slice_max);
p->scx.slice = CLAMP(slice_max / nr_tasks_waiting(), slice_min, slice_max);
}
static void task_set_domain(struct task_struct *p, s32 cpu,
@ -459,8 +534,7 @@ static void task_set_domain(struct task_struct *p, s32 cpu,
static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle)
{
const struct cpumask *idle_smtmask, *idle_cpumask;
struct bpf_cpumask *primary, *l2_domain, *l3_domain;
struct bpf_cpumask *p_mask, *l2_mask, *l3_mask;
const struct cpumask *primary, *p_mask, *l2_mask, *l3_mask;
struct task_ctx *tctx;
bool is_prev_llc_affine = false;
s32 cpu;
@ -481,7 +555,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
if (!tctx)
return -ENOENT;
primary = primary_cpumask;
primary = cast_mask(primary_cpumask);
if (!primary)
return -EINVAL;
@ -494,21 +568,21 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
/*
* Task's scheduling domains.
*/
p_mask = tctx->cpumask;
p_mask = cast_mask(tctx->cpumask);
if (!p_mask) {
scx_bpf_error("cpumask not initialized");
cpu = -EINVAL;
goto out_put_cpumask;
}
l2_mask = tctx->l2_cpumask;
l2_mask = cast_mask(tctx->l2_cpumask);
if (!l2_mask) {
scx_bpf_error("l2 cpumask not initialized");
cpu = -EINVAL;
goto out_put_cpumask;
}
l3_mask = tctx->l3_cpumask;
l3_mask = cast_mask(tctx->l3_cpumask);
if (!l3_mask) {
scx_bpf_error("l3 cpumask not initialized");
cpu = -EINVAL;
@ -519,7 +593,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
* Check if the previously used CPU is still in the L3 task domain. If
* not, we may want to move the task back to its original L3 domain.
*/
is_prev_llc_affine = bpf_cpumask_test_cpu(prev_cpu, cast_mask(l3_mask));
is_prev_llc_affine = bpf_cpumask_test_cpu(prev_cpu, l3_mask);
/*
* If the current task is waking up another task and releasing the CPU
@ -528,7 +602,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
*/
if (wake_flags & SCX_WAKE_SYNC) {
struct task_struct *current = (void *)bpf_get_current_task_btf();
struct bpf_cpumask *curr_l3_domain;
const struct cpumask *curr_l3_domain;
struct cpu_ctx *cctx;
bool share_llc, has_idle;
@ -542,7 +616,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
goto out_put_cpumask;
}
curr_l3_domain = cctx->l3_cpumask;
curr_l3_domain = cast_mask(cctx->l3_cpumask);
if (!curr_l3_domain)
curr_l3_domain = primary;
@ -550,7 +624,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
* If both the waker and wakee share the same L3 cache keep
* using the same CPU if possible.
*/
share_llc = bpf_cpumask_test_cpu(prev_cpu, cast_mask(curr_l3_domain));
share_llc = bpf_cpumask_test_cpu(prev_cpu, curr_l3_domain);
if (share_llc &&
scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
cpu = prev_cpu;
@ -563,9 +637,9 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
* the wakee on the same CPU as the waker (since it's going to
* block and release the current CPU).
*/
has_idle = bpf_cpumask_intersects(cast_mask(curr_l3_domain), idle_cpumask);
has_idle = bpf_cpumask_intersects(curr_l3_domain, idle_cpumask);
if (has_idle &&
bpf_cpumask_test_cpu(cpu, cast_mask(p_mask)) &&
bpf_cpumask_test_cpu(cpu, p_mask) &&
!(current->flags & PF_EXITING) &&
scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu) == 0) {
*is_idle = true;
@ -593,7 +667,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
* Search for any full-idle CPU in the primary domain that
* shares the same L2 cache.
*/
cpu = scx_bpf_pick_idle_cpu(cast_mask(l2_mask), SCX_PICK_IDLE_CORE);
cpu = scx_bpf_pick_idle_cpu(l2_mask, SCX_PICK_IDLE_CORE);
if (cpu >= 0) {
*is_idle = true;
goto out_put_cpumask;
@ -603,7 +677,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
* Search for any full-idle CPU in the primary domain that
* shares the same L3 cache.
*/
cpu = scx_bpf_pick_idle_cpu(cast_mask(l3_mask), SCX_PICK_IDLE_CORE);
cpu = scx_bpf_pick_idle_cpu(l3_mask, SCX_PICK_IDLE_CORE);
if (cpu >= 0) {
*is_idle = true;
goto out_put_cpumask;
@ -612,7 +686,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
/*
* Search for any other full-idle core in the primary domain.
*/
cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), SCX_PICK_IDLE_CORE);
cpu = scx_bpf_pick_idle_cpu(p_mask, SCX_PICK_IDLE_CORE);
if (cpu >= 0) {
*is_idle = true;
goto out_put_cpumask;
@ -634,7 +708,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
* Search for any idle CPU in the primary domain that shares the same
* L2 cache.
*/
cpu = scx_bpf_pick_idle_cpu(cast_mask(l2_mask), 0);
cpu = scx_bpf_pick_idle_cpu(l2_mask, 0);
if (cpu >= 0) {
*is_idle = true;
goto out_put_cpumask;
@ -644,7 +718,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
* Search for any idle CPU in the primary domain that shares the same
* L3 cache.
*/
cpu = scx_bpf_pick_idle_cpu(cast_mask(l3_mask), 0);
cpu = scx_bpf_pick_idle_cpu(l3_mask, 0);
if (cpu >= 0) {
*is_idle = true;
goto out_put_cpumask;
@ -653,7 +727,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
/*
* Search for any idle CPU in the scheduling domain.
*/
cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0);
cpu = scx_bpf_pick_idle_cpu(p_mask, 0);
if (cpu >= 0) {
*is_idle = true;
goto out_put_cpumask;
@ -666,7 +740,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
if (is_prev_llc_affine)
cpu = prev_cpu;
else
cpu = scx_bpf_pick_any_cpu(cast_mask(l3_mask), 0);
cpu = scx_bpf_pick_any_cpu(l3_mask, 0);
out_put_cpumask:
scx_bpf_put_cpumask(idle_cpumask);
@ -716,14 +790,6 @@ static void kick_task_cpu(struct task_struct *p)
scx_bpf_kick_cpu(cpu, 0);
}
static bool is_task_interactive(const struct task_struct *p,
const struct task_ctx *tctx)
{
if (!tctx->is_interactive)
return false;
return scx_bpf_dsq_nr_queued(PRIO_DSQ) < 100;
}
/*
* Dispatch all the other tasks that were not dispatched directly in
* select_cpu().
@ -731,7 +797,6 @@ static bool is_task_interactive(const struct task_struct *p,
void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
{
struct task_ctx *tctx;
s32 dsq_id;
tctx = try_lookup_task_ctx(p);
if (!tctx)
@ -753,19 +818,10 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
/*
* Dispatch interactive tasks to the priority DSQ and regular tasks to
* the shared DSQ.
*
* When lowlatency is enabled, the separate priority DSQ is disabled,
* so in this case always dispatch to the shared DSQ.
*/
if (!lowlatency && is_task_interactive(p, tctx)) {
dsq_id = PRIO_DSQ;
__sync_fetch_and_add(&nr_prio_dispatches, 1);
} else {
dsq_id = SHARED_DSQ;
__sync_fetch_and_add(&nr_shared_dispatches, 1);
}
scx_bpf_dispatch_vtime(p, dsq_id, SCX_SLICE_DFL,
task_deadline(p, tctx), enq_flags);
scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL,
task_vtime(p, tctx), enq_flags);
__sync_fetch_and_add(&nr_shared_dispatches, 1);
/*
* If there is an idle CPU available for the task, wake it up so it can
@ -774,76 +830,13 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
kick_task_cpu(p);
}
/*
* Consume a task from the priority DSQ, transferring it to the local CPU DSQ.
*
* Return true if a task is consumed, false otherwise.
*/
static bool consume_prio_task(u64 now)
{
return scx_bpf_consume(PRIO_DSQ);
}
/*
* Consume a task from the shared DSQ, transferring it to the local CPU DSQ.
*
* Return true if a task is consumed, false otherwise.
*/
static bool consume_regular_task(u64 now)
{
bool ret;
ret = scx_bpf_consume(SHARED_DSQ);
if (ret)
starvation_shared_ts = now;
return ret;
}
/*
* Consume tasks that are potentially starving.
*
* In order to limit potential starvation conditions the scheduler uses a
* time-based threshold to ensure that at least one task from the
* lower-priority DSQs is periodically consumed.
*/
static bool consume_starving_tasks(u64 now)
{
if (!starvation_thresh_ns)
return false;
if (vtime_before(starvation_shared_ts + starvation_thresh_ns, now))
if (consume_regular_task(now))
return true;
return false;
}
/*
* Consume regular tasks from the per-CPU DSQ or a shared DSQ, transferring
* them to the local CPU DSQ.
*
* Return true if at least a task is consumed, false otherwise.
*/
static bool consume_shared_tasks(s32 cpu, u64 now)
{
/*
* The priority DSQ can starve the shared DSQ, so to mitigate this
* starvation we have the starvation_thresh_ns, see also
* consume_starving_tasks().
*/
if (consume_prio_task(now) || consume_regular_task(now))
return true;
return false;
}
void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
{
u64 now = bpf_ktime_get_ns();
if (consume_starving_tasks(now))
return;
if (consume_shared_tasks(cpu, now))
/*
* Consume regular tasks from the shared DSQ, transferring them to the
* local CPU DSQ.
*/
if (scx_bpf_consume(SHARED_DSQ))
return;
/*
* If the current task expired its time slice and no other task wants
@ -922,6 +915,7 @@ void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
tctx = try_lookup_task_ctx(p);
if (!tctx)
return;
tctx->last_run_at = bpf_ktime_get_ns();
/*
* Adjust target CPU frequency before the task starts to run.
@ -989,18 +983,15 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
/*
* Update task's average runtime.
*/
slice = p->se.sum_exec_runtime - tctx->sum_exec_runtime;
if (lowlatency)
slice = CLAMP(slice, slice_min, slice_max);
tctx->sum_exec_runtime = p->se.sum_exec_runtime;
tctx->avg_runtime = calc_avg(tctx->avg_runtime, slice);
slice = now - tctx->last_run_at;
tctx->sum_runtime += slice;
tctx->avg_runtime = calc_avg(tctx->avg_runtime, tctx->sum_runtime);
/*
* Update task vruntime charging the weighted used time slice.
*/
slice = scale_inverse_fair(p, slice);
p->scx.dsq_vtime += slice;
tctx->deadline = p->scx.dsq_vtime + task_compute_dl(p, tctx);
p->scx.dsq_vtime += scale_inverse_fair(p, tctx, slice);
tctx->deadline = p->scx.dsq_vtime + task_deadline(p, tctx);
/*
* Refresh voluntary context switch metrics.
@ -1011,7 +1002,7 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
delta_t = (s64)(now - tctx->nvcsw_ts);
if (delta_t > NSEC_PER_SEC) {
u64 avg_nvcsw = tctx->nvcsw * NSEC_PER_SEC / delta_t;
u64 max_lat_weight = nvcsw_max_thresh * 100;
u64 max_nvcsw = nvcsw_max_thresh * 100;
tctx->nvcsw = 0;
tctx->nvcsw_ts = now;
@ -1021,8 +1012,7 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
* of voluntary context switches (limited to prevent
* excessive spikes).
*/
tctx->lat_weight = calc_avg_clamp(tctx->lat_weight, avg_nvcsw,
0, max_lat_weight);
tctx->avg_nvcsw = calc_avg_clamp(tctx->avg_nvcsw, avg_nvcsw, 0, max_nvcsw);
/*
* Classify the task based on the average of voluntary context
@ -1032,10 +1022,53 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
* it is classified as interactive, otherwise the task is
* classified as regular.
*/
tctx->is_interactive = tctx->lat_weight >= nvcsw_max_thresh;
tctx->is_interactive = tctx->avg_nvcsw >= nvcsw_max_thresh;
}
}
void BPF_STRUCT_OPS(bpfland_runnable, struct task_struct *p, u64 enq_flags)
{
u64 now = bpf_ktime_get_ns(), delta;
struct task_struct *waker;
struct task_ctx *tctx;
tctx = try_lookup_task_ctx(p);
if (!tctx)
return;
tctx->sum_runtime = 0;
waker = bpf_get_current_task_btf();
tctx = try_lookup_task_ctx(waker);
if (!tctx)
return;
delta = MAX(now - tctx->last_woke_at, 1);
tctx->waker_freq = update_freq(tctx->waker_freq, delta);
tctx->last_woke_at = now;
}
void BPF_STRUCT_OPS(bpfland_quiescent, struct task_struct *p, u64 deq_flags)
{
u64 now = bpf_ktime_get_ns(), delta;
struct task_ctx *tctx;
tctx = try_lookup_task_ctx(p);
if (!tctx)
return;
delta = MAX(now - tctx->last_blocked_at, 1);
tctx->blocked_freq = update_freq(tctx->blocked_freq, delta);
tctx->last_blocked_at = now;
}
void BPF_STRUCT_OPS(bpfland_set_cpumask, struct task_struct *p,
const struct cpumask *cpumask)
{
s32 cpu = bpf_get_smp_processor_id();
task_set_domain(p, cpu, cpumask);
}
void BPF_STRUCT_OPS(bpfland_enable, struct task_struct *p)
{
u64 now = bpf_ktime_get_ns();
@ -1048,17 +1081,11 @@ void BPF_STRUCT_OPS(bpfland_enable, struct task_struct *p)
tctx = try_lookup_task_ctx(p);
if (!tctx)
return;
tctx->sum_exec_runtime = p->se.sum_exec_runtime;
tctx->nvcsw_ts = now;
tctx->deadline = vtime_now;
}
tctx->last_woke_at = now;
tctx->last_blocked_at = now;
void BPF_STRUCT_OPS(bpfland_set_cpumask, struct task_struct *p,
const struct cpumask *cpumask)
{
s32 cpu = bpf_get_smp_processor_id();
task_set_domain(p, cpu, cpumask);
tctx->deadline = p->scx.dsq_vtime + task_deadline(p, tctx);
}
s32 BPF_STRUCT_OPS(bpfland_init_task, struct task_struct *p,
@ -1216,16 +1243,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
nr_online_cpus = get_nr_online_cpus();
/*
* Create the global priority and shared DSQs.
*
* Allocate a new DSQ id that does not clash with any valid CPU id.
* Create the global shared DSQ.
*/
err = scx_bpf_create_dsq(PRIO_DSQ, -1);
if (err) {
scx_bpf_error("failed to create priority DSQ: %d", err);
return err;
}
err = scx_bpf_create_dsq(SHARED_DSQ, -1);
if (err) {
scx_bpf_error("failed to create shared DSQ: %d", err);
@ -1251,8 +1270,10 @@ SCX_OPS_DEFINE(bpfland_ops,
.dispatch = (void *)bpfland_dispatch,
.running = (void *)bpfland_running,
.stopping = (void *)bpfland_stopping,
.enable = (void *)bpfland_enable,
.runnable = (void *)bpfland_runnable,
.quiescent = (void *)bpfland_quiescent,
.set_cpumask = (void *)bpfland_set_cpumask,
.enable = (void *)bpfland_enable,
.init_task = (void *)bpfland_init_task,
.init = (void *)bpfland_init,
.exit = (void *)bpfland_exit,


@ -138,15 +138,6 @@ struct Opts {
#[clap(short = 'l', long, allow_hyphen_values = true, default_value = "20000")]
slice_us_lag: i64,
/// With lowlatency enabled, instead of classifying tasks as interactive or non-interactive,
/// they all get a dynamic priority, which is adjusted in function of their average rate of
/// voluntary context switches.
///
/// This option guarantess less spikey behavior and it can be particularly useful in soft
/// real-time scenarios, such as audio processing, multimedia, etc.
#[clap(short = 'L', long, action = clap::ArgAction::SetTrue)]
lowlatency: bool,
/// Enable kthreads prioritization.
///
/// Enabling this can improve system performance, but it may also introduce interactivity
@ -181,11 +172,6 @@ struct Opts {
#[clap(short = 'c', long, default_value = "10")]
nvcsw_max_thresh: u64,
/// Prevent starvation by making sure that at least one lower priority task is scheduled every
/// starvation_thresh_us (0 = disable starvation prevention).
#[clap(short = 't', long, default_value = "1000")]
starvation_thresh_us: u64,
/// Enable stats monitoring with the specified interval.
#[clap(long)]
stats: Option<f64>,
@ -259,12 +245,10 @@ impl<'a> Scheduler<'a> {
// Override default BPF scheduling parameters.
skel.maps.rodata_data.debug = opts.debug;
skel.maps.rodata_data.smt_enabled = smt_enabled;
skel.maps.rodata_data.lowlatency = opts.lowlatency;
skel.maps.rodata_data.local_kthreads = opts.local_kthreads;
skel.maps.rodata_data.slice_max = opts.slice_us * 1000;
skel.maps.rodata_data.slice_min = opts.slice_us_min * 1000;
skel.maps.rodata_data.slice_lag = opts.slice_us_lag * 1000;
skel.maps.rodata_data.starvation_thresh_ns = opts.starvation_thresh_us * 1000;
skel.maps.rodata_data.nvcsw_max_thresh = opts.nvcsw_max_thresh;
// Load the BPF program for validation.
@ -556,11 +540,8 @@ impl<'a> Scheduler<'a> {
nr_running: self.skel.maps.bss_data.nr_running,
nr_cpus: self.skel.maps.bss_data.nr_online_cpus,
nr_interactive: self.skel.maps.bss_data.nr_interactive,
nr_prio_waiting: self.skel.maps.bss_data.nr_prio_waiting,
nr_shared_waiting: self.skel.maps.bss_data.nr_shared_waiting,
nr_kthread_dispatches: self.skel.maps.bss_data.nr_kthread_dispatches,
nr_direct_dispatches: self.skel.maps.bss_data.nr_direct_dispatches,
nr_prio_dispatches: self.skel.maps.bss_data.nr_prio_dispatches,
nr_shared_dispatches: self.skel.maps.bss_data.nr_shared_dispatches,
}
}


@ -21,16 +21,10 @@ pub struct Metrics {
pub nr_cpus: u64,
#[stat(desc = "Number of running interactive tasks")]
pub nr_interactive: u64,
#[stat(desc = "Average amount of regular tasks waiting to be dispatched")]
pub nr_shared_waiting: u64,
#[stat(desc = "Average amount of interactive tasks waiting to be dispatched")]
pub nr_prio_waiting: u64,
#[stat(desc = "Number of kthread direct dispatches")]
pub nr_kthread_dispatches: u64,
#[stat(desc = "Number of task direct dispatches")]
pub nr_direct_dispatches: u64,
#[stat(desc = "Number of interactive task dispatches")]
pub nr_prio_dispatches: u64,
#[stat(desc = "Number of regular task dispatches")]
pub nr_shared_dispatches: u64,
}
@ -39,16 +33,13 @@ impl Metrics {
fn format<W: Write>(&self, w: &mut W) -> Result<()> {
writeln!(
w,
"[{}] tasks -> r: {:>2}/{:<2} i: {:<2} pw: {:<4} w: {:<4} | dispatch -> k: {:<5} d: {:<5} p: {:<5} s: {:<5}",
"[{}] tasks -> r: {:>2}/{:<2} i: {:<2} | dispatch -> k: {:<5} d: {:<5} s: {:<5}",
crate::SCHEDULER_NAME,
self.nr_running,
self.nr_cpus,
self.nr_interactive,
self.nr_prio_waiting,
self.nr_shared_waiting,
self.nr_kthread_dispatches,
self.nr_direct_dispatches,
self.nr_prio_dispatches,
self.nr_shared_dispatches
)?;
Ok(())
@ -58,7 +49,6 @@ impl Metrics {
Self {
nr_kthread_dispatches: self.nr_kthread_dispatches - rhs.nr_kthread_dispatches,
nr_direct_dispatches: self.nr_direct_dispatches - rhs.nr_direct_dispatches,
nr_prio_dispatches: self.nr_prio_dispatches - rhs.nr_prio_dispatches,
nr_shared_dispatches: self.nr_shared_dispatches - rhs.nr_shared_dispatches,
..self.clone()
}