Merge pull request #410 from sched-ext/bpfland-smooth-perf

scx_bpfland: enhance performance consistency and predictability
Andrea Righi 2024-07-04 21:37:07 +02:00 committed by GitHub
commit 86d2f50230
3 changed files with 191 additions and 162 deletions

View File

@ -15,7 +15,9 @@
#define CLAMP(val, lo, hi) MIN(MAX(val, lo), hi)
enum consts {
NSEC_PER_SEC = 1000000000ULL,
NSEC_PER_USEC = 1000ULL,
NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC),
NSEC_PER_SEC = (1000ULL * NSEC_PER_MSEC),
};
#ifndef __VMLINUX_H__
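
For reference, here is a minimal standalone sketch (not part of the patch) showing how the new derived constants compose and how they express the defaults used later in this change: a 5 ms default slice, a 500 us minimum slice, and a 5 ms starvation threshold. The main() and asserts are illustrative only.

#include <assert.h>

enum consts {
	NSEC_PER_USEC = 1000ULL,
	NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC),
	NSEC_PER_SEC  = (1000ULL * NSEC_PER_MSEC),
};

int main(void)
{
	/* 5 ms default slice, 500 us minimum slice, 5 ms starvation threshold */
	assert(5ULL * NSEC_PER_MSEC == 5000000ULL);
	assert(500ULL * NSEC_PER_USEC == 500000ULL);
	assert(NSEC_PER_SEC == 1000000000ULL);
	return 0;
}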

View File

@ -34,12 +34,12 @@ const volatile bool debug;
/*
* Default task time slice.
*/
const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u64 slice_ns = 5ULL * NSEC_PER_MSEC;
/*
* Time slice used when the system is overcommitted.
*/
const volatile u64 slice_ns_min = 500000;
const volatile u64 slice_ns_min = 500ULL * NSEC_PER_USEC;
/*
* Maximum time slice lag.
@ -50,22 +50,35 @@ const volatile u64 slice_ns_min = 500000;
*/
const volatile u64 slice_ns_lag;
/*
* Enable built-in idle selection logic.
*/
const volatile bool builtin_idle;
/*
* Threshold of voluntary context switches used to classify a task as
* interactive.
*/
const volatile u64 nvcsw_thresh = 10;
const volatile u64 nvcsw_thresh = 10ULL;
/*
* Time threshold to prevent task starvation.
*
* The scheduler processes tasks from various DSQs in the following order:
*
* per-CPU DSQs => priority DSQ => shared DSQ
*
* Tasks in the shared DSQ may be starved by those in the priority DSQ, which
* in turn may be starved by tasks in any per-CPU DSQ.
*
* To mitigate this, store the timestamp of the last task consumption from
* both the priority DSQ and the shared DSQ. If the starvation_thresh_ns
* threshold is exceeded without consuming a task, the scheduler will be
* forced to consume a task from the corresponding DSQ.
*/
const volatile u64 starvation_thresh_ns = 5ULL * NSEC_PER_MSEC;
static u64 starvation_shared_ts;
static u64 starvation_prio_ts;
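
To make the comment above concrete, here is an illustrative sketch of the threshold test (not part of the patch): dsq_is_starving() is a hypothetical helper name, vtime_before() is the scheduler's existing time comparison, and the real logic lives in consume_starving_tasks() further down in this diff.

/*
 * Hypothetical helper: a DSQ is considered starving when more than
 * starvation_thresh_ns has elapsed since a task was last consumed from it.
 */
static bool dsq_is_starving(u64 last_consume_ts, u64 now)
{
	if (!starvation_thresh_ns)
		return false;
	return vtime_before(last_consume_ts + starvation_thresh_ns, now);
}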
/*
* Scheduling statistics.
*/
volatile u64 nr_direct_dispatches, nr_kthread_dispatches,
nr_shared_dispatches, nr_prio_dispatches;
volatile u64 nr_direct_dispatches, nr_shared_dispatches, nr_prio_dispatches;
/*
* Amount of currently running tasks.
@ -166,6 +179,19 @@ static int calloc_cpumask(struct bpf_cpumask **p_cpumask)
return 0;
}
/*
* Set the state of a CPU in a cpumask.
*/
static bool set_cpu_state(struct bpf_cpumask *cpumask, s32 cpu, bool state)
{
if (!cpumask)
return false;
if (state)
return bpf_cpumask_test_and_set_cpu(cpu, cpumask);
else
return bpf_cpumask_test_and_clear_cpu(cpu, cpumask);
}
/*
* Exponential weighted moving average (EWMA).
*
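
The body of this averaging helper falls outside the displayed hunks. As an illustration of the technique only (the exact weighting bpfland uses is not visible here), a fixed-weight EWMA typically looks like the sketch below, which keeps 3/4 of the old average and blends in 1/4 of the new sample.

/*
 * Illustrative only: 0.75/0.25 fixed-weight EWMA, using the same u64 type
 * as the surrounding BPF code. Not part of the patch.
 */
static u64 calc_ewma_example(u64 old_avg, u64 new_sample)
{
	return (old_avg - (old_avg >> 2)) + (new_sample >> 2);
}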
@ -212,9 +238,7 @@ static inline u64 task_vtime(struct task_struct *p)
*/
static inline u64 task_slice(struct task_struct *p)
{
u64 slice = p->scx.slice;
return MAX(slice, slice_ns_min);
return MAX(p->scx.slice, slice_ns_min);
}
/*
@ -230,22 +254,6 @@ static u64 cpu_to_dsq(s32 cpu)
return (u64)cpu;
}
/*
* Dispatch a per-CPU kthread directly to the local CPU DSQ.
*/
static void dispatch_kthread(struct task_struct *p, u64 enq_flags)
{
u64 slice = task_slice(p);
/*
* Use the local CPU DSQ directly for per-CPU kthreads, to give them
* maximum priority.
*/
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice, enq_flags);
__sync_fetch_and_add(&nr_kthread_dispatches, 1);
}
/*
* Dispatch a task directly to the assigned CPU DSQ (used when an idle CPU is
* found).
@ -253,6 +261,7 @@ static void dispatch_kthread(struct task_struct *p, u64 enq_flags)
static int dispatch_direct_cpu(struct task_struct *p, s32 cpu)
{
u64 slice = task_slice(p);
u64 vtime = task_vtime(p);
u64 dsq_id = cpu_to_dsq(cpu);
/*
@ -262,14 +271,7 @@ static int dispatch_direct_cpu(struct task_struct *p, s32 cpu)
if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
return -EINVAL;
/*
* We don't need to use vtime here, because we basically dispatch one
* task at a time when the corresponding CPU is idle.
*
* We could also use SCX_DSQ_LOCAL, but we want to distinguish regular
* tasks from per-CPU kthreads to give more priority to the latter.
*/
scx_bpf_dispatch(p, dsq_id, slice, 0);
scx_bpf_dispatch_vtime(p, dsq_id, slice, vtime, 0);
__sync_fetch_and_add(&nr_direct_dispatches, 1);
/*
@ -286,54 +288,20 @@ static int dispatch_direct_cpu(struct task_struct *p, s32 cpu)
return 0;
}
/*
* Dispatch a regular task.
*/
static void dispatch_task(struct task_struct *p, u64 enq_flags)
{
u64 vtime = task_vtime(p);
u64 slice = task_slice(p);
struct task_ctx *tctx;
tctx = lookup_task_ctx(p);
if (!tctx)
return;
/*
* Always dispatch interactive tasks to the priority DSQ and regular
* tasks to the shared DSQ.
*/
if (tctx->is_interactive) {
scx_bpf_dispatch_vtime(p, PRIO_DSQ, slice, vtime, enq_flags);
__sync_fetch_and_add(&nr_prio_dispatches, 1);
} else {
scx_bpf_dispatch_vtime(p, SHARED_DSQ, slice, vtime, enq_flags);
__sync_fetch_and_add(&nr_shared_dispatches, 1);
}
}
/*
* Find an idle CPU in the system.
*
* NOTE: the idle CPU selection doesn't need to be formally perfect; it is
* totally fine to accept racy conditions and potentially make mistakes by
* picking CPUs that are not idle or are even offline. The logic has been
* designed to handle these mistakes in favor of a more efficient response
* and reduced scheduling overhead.
*/
static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
const struct cpumask *online_cpumask, *idle_smtmask, *idle_cpumask;
bool prev_in_cand;
s32 cpu;
if (builtin_idle) {
bool is_idle = false;
/*
* Find an idle CPU using the sched_ext built-in idle selection
* logic.
*/
cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
if (is_idle)
return cpu;
return -ENOENT;
}
/*
* Acquire the CPU masks to determine the online and idle CPUs in the
* system.
@ -342,14 +310,28 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
idle_smtmask = scx_bpf_get_idle_smtmask();
idle_cpumask = scx_bpf_get_idle_cpumask();
prev_in_cand = bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr);
/*
* For tasks that can run only on a single CPU, we can simply verify if
* their only allowed CPU is idle.
*/
if (p->nr_cpus_allowed == 1) {
cpu = bpf_cpumask_first(p->cpus_ptr);
if (scx_bpf_test_and_clear_cpu_idle(cpu))
goto out_put_cpumask;
else
goto out_not_found;
}
/*
* Find the best idle CPU, prioritizing full idle cores in SMT systems.
*/
if (smt_enabled) {
/*
* If the task can still run on the previously used CPU and
* it's a full-idle core, keep using it.
*/
if (prev_in_cand &&
if (bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr) &&
bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) &&
scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
cpu = prev_cpu;
@ -366,10 +348,11 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
}
/*
* If a full-idle core can't be found (or if it's not an SMT system)
* If a full-idle core can't be found (or if this is not an SMT system)
* try to re-use the same CPU, even if it's not in a full-idle core.
*/
if (prev_in_cand && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
if (bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr) &&
scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
cpu = prev_cpu;
goto out_put_cpumask;
}
@ -390,16 +373,16 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
struct task_ctx *tctx;
/*
* If we are waking up a task and we couldn't find any idle CPU
* to use, at least set the task as interactive, so that it can
* be dispatched as soon as possible on the first CPU
* available.
* If we are waking up a task and we can't use the current CPU,
* at least set the task as interactive, so that it can be
* dispatched as soon as possible on the first CPU available.
*/
tctx = lookup_task_ctx(p);
if (tctx)
tctx->is_interactive = true;
}
out_not_found:
/*
* If all the previous attempts have failed, dispatch the task to the
* first CPU that will become available.
@ -425,10 +408,18 @@ s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p, s32 prev_cpu, u64
return prev_cpu;
}
/*
* Dispatch all the other tasks that were not dispatched directly in
* select_cpu().
*/
void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
{
u64 vtime = task_vtime(p);
u64 slice = task_slice(p);
struct task_ctx *tctx;
/*
* Always dispatch per-CPU kthreads immediately.
* Always dispatch per-CPU kthreads directly on their target CPU.
*
* This allows us to prioritize critical kernel threads that may
* potentially slow down the entire system if they are blocked for too
@ -440,15 +431,26 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
* this scheduler is desktop usage, this shouldn't be a problem.
*/
if (is_kthread(p) && p->nr_cpus_allowed == 1) {
dispatch_kthread(p, enq_flags);
s32 cpu = scx_bpf_task_cpu(p);
dispatch_direct_cpu(p, cpu);
return;
}
tctx = lookup_task_ctx(p);
if (!tctx)
return;
/*
* Dispatch all the other tasks that were not dispatched directly in
* select_cpu().
* Always dispatch interactive tasks to the priority DSQ and regular
* tasks to the shared DSQ.
*/
dispatch_task(p, enq_flags);
if (tctx->is_interactive) {
scx_bpf_dispatch_vtime(p, PRIO_DSQ, slice, vtime, enq_flags);
__sync_fetch_and_add(&nr_prio_dispatches, 1);
} else {
scx_bpf_dispatch_vtime(p, SHARED_DSQ, slice, vtime, enq_flags);
__sync_fetch_and_add(&nr_shared_dispatches, 1);
}
}
/*
@ -456,18 +458,17 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
*
* These tasks will be consumed on other active CPUs to prevent indefinite
* stalling.
*
* Return true if one task is consumed, false otherwise.
*/
static int dispatch_offline_cpus(s32 cpu)
static bool consume_offline_cpus(s32 cpu)
{
u64 cpu_max = scx_bpf_nr_cpu_ids();
struct bpf_cpumask *offline;
int ret = -ENOENT;
bpf_rcu_read_lock();
offline = offline_cpumask;
if (!offline)
goto out_rcu;
return false;
/*
* Cycle through all the CPUs and evenly consume tasks from the DSQs of
@ -482,22 +483,81 @@ static int dispatch_offline_cpus(s32 cpu)
* This CPU is offline: if a task has been dispatched there,
* consume it immediately on the current CPU.
*/
if (scx_bpf_consume(cpu_to_dsq(cpu))) {
ret = 0;
goto out_rcu;
}
if (scx_bpf_consume(cpu_to_dsq(cpu)))
return true;
}
out_rcu:
bpf_rcu_read_unlock();
return false;
}
/*
* Consume a task from the priority DSQ, transferring it to the local CPU DSQ.
*
* Return true if a task is consumed, false otherwise.
*/
static bool consume_prio_task(u64 now)
{
bool ret;
ret = scx_bpf_consume(PRIO_DSQ);
if (ret)
starvation_prio_ts = now;
return ret;
}
/*
* Consume a task from the shared DSQ, transferring it to the local CPU DSQ.
*
* Return true if a task is consumed, false otherwise.
*/
static bool consume_regular_task(u64 now)
{
bool ret;
ret = scx_bpf_consume(SHARED_DSQ);
if (ret)
starvation_shared_ts = now;
return ret;
}
/*
* Consume tasks that are potentially starving.
*
* In order to limit potential starvation conditions, the scheduler uses a
* time-based threshold to ensure that at least one task from the
* lower-priority DSQs is periodically consumed.
*/
static bool consume_starving_tasks(u64 now)
{
if (!starvation_thresh_ns)
return false;
if (vtime_before(starvation_shared_ts + starvation_thresh_ns, now))
if (consume_regular_task(now))
return true;
if (vtime_before(starvation_prio_ts + starvation_thresh_ns, now))
if (consume_prio_task(now))
return true;
return false;
}
void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
{
u64 now = bpf_ktime_get_ns();
/*
* First consume directly dispatched tasks, so that they can
* immediately use the CPU assigned in select_cpu().
* Make sure we are not starving tasks from the lower-priority DSQs.
*/
if (consume_starving_tasks(now))
return;
/*
* Consume directly dispatched tasks, so that they can immediately use
* the CPU assigned in select_cpu().
*/
if (scx_bpf_consume(cpu_to_dsq(cpu)))
return;
@ -506,28 +566,19 @@ void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
* Also try to steal tasks directly dispatched to CPUs that have gone
* offline (this helps prevent indefinite task stalls).
*/
if (!dispatch_offline_cpus(cpu))
if (consume_offline_cpus(cpu))
return;
/*
* Then always consume interactive tasks before regular tasks.
*
* This is fine and we shouldn't have starvation, because interactive
* tasks are classified by their amount of voluntary context switches,
* so they should naturally release the CPU quickly and give a chance
* to the regular tasks to run.
*
* TODO: Add a tunable setting to limit the number of priority tasks
* dispatched. Once this limit is reached, at least one regular task
* must be dispatched.
*/
if (scx_bpf_consume(PRIO_DSQ))
if (consume_prio_task(now))
return;
/*
* Lastly, consume regular tasks from the shared DSQ.
*/
scx_bpf_consume(SHARED_DSQ);
consume_regular_task(now);
}
void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
@ -536,6 +587,13 @@ void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
if (vtime_before(vtime_now, p->scx.dsq_vtime))
vtime_now = p->scx.dsq_vtime;
/*
* Ensure time slice never exceeds slice_ns when a task is started on a
* CPU.
*/
if (p->scx.slice > slice_ns)
p->scx.slice = slice_ns;
__sync_fetch_and_add(&nr_running, 1);
}
@ -604,37 +662,16 @@ void BPF_STRUCT_OPS(bpfland_enable, struct task_struct *p)
p->scx.dsq_vtime = vtime_now;
}
/*
* Set the offline state of a CPU, updating the global offline cpumask.
*/
static void set_cpu_offline(s32 cpu, bool state)
{
struct bpf_cpumask *offline;
bpf_rcu_read_lock();
offline = offline_cpumask;
if (!offline)
goto out_rcu;
if (state)
bpf_cpumask_set_cpu(cpu, offline);
else
bpf_cpumask_clear_cpu(cpu, offline);
out_rcu:
bpf_rcu_read_unlock();
}
void BPF_STRUCT_OPS(bpfland_cpu_online, s32 cpu)
{
/* Set the CPU state to online */
set_cpu_offline(cpu, false);
set_cpu_state(offline_cpumask, cpu, false);
}
void BPF_STRUCT_OPS(bpfland_cpu_offline, s32 cpu)
{
/* Set the CPU state to offline */
set_cpu_offline(cpu, true);
set_cpu_state(offline_cpumask, cpu, true);
}
s32 BPF_STRUCT_OPS(bpfland_init_task, struct task_struct *p,
@ -649,7 +686,7 @@ s32 BPF_STRUCT_OPS(bpfland_init_task, struct task_struct *p,
s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
{
struct bpf_cpumask *offline;
struct bpf_cpumask *mask;
int err;
s32 cpu;
@ -678,14 +715,12 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
}
/* Initialize the offline CPU mask */
bpf_rcu_read_lock();
err = calloc_cpumask(&offline_cpumask);
offline = offline_cpumask;
if (err || !offline)
err -ENOMEM;
bpf_rcu_read_unlock();
mask = offline_cpumask;
if (!mask)
err = -ENOMEM;
return 0;
return err;
}
void BPF_STRUCT_OPS(bpfland_exit, struct scx_exit_info *ei)

View File

@ -59,7 +59,7 @@ struct Opts {
exit_dump_len: u32,
/// Maximum scheduling slice duration in microseconds.
#[clap(short = 's', long, default_value = "20000")]
#[clap(short = 's', long, default_value = "5000")]
slice_us: u64,
/// Minimum scheduling slice duration in microseconds.
@ -78,9 +78,10 @@ struct Opts {
#[clap(short = 'c', long, default_value = "10")]
nvcsw_thresh: u64,
/// Enable direct dispatch via sched_ext built-in idle selection logic.
#[clap(short = 'i', long, action = clap::ArgAction::SetTrue)]
builtin_idle: bool,
/// Prevent starvation by making sure that at least one lower-priority task is scheduled every
/// starvation_thresh_us (0 = disable starvation prevention).
#[clap(short = 't', long, default_value = "5000")]
starvation_thresh_us: u64,
/// Enable the Prometheus endpoint for metrics on port 9000.
#[clap(short = 'p', long, action = clap::ArgAction::SetTrue)]
@ -101,7 +102,6 @@ struct Opts {
struct Metrics {
nr_running: Gauge,
nr_kthread_dispatches: Gauge,
nr_direct_dispatches: Gauge,
nr_prio_dispatches: Gauge,
nr_shared_dispatches: Gauge,
@ -113,9 +113,6 @@ impl Metrics {
nr_running: gauge!(
"nr_running", "info" => "Number of running tasks"
),
nr_kthread_dispatches: gauge!(
"nr_kthread_dispatches", "info" => "Number of kthread dispatches"
),
nr_direct_dispatches: gauge!(
"nr_direct_dispatches", "info" => "Number of direct dispatches"
),
@ -173,8 +170,8 @@ impl<'a> Scheduler<'a> {
skel.rodata_mut().slice_ns = opts.slice_us * 1000;
skel.rodata_mut().slice_ns_min = opts.slice_us_min * 1000;
skel.rodata_mut().slice_ns_lag = opts.slice_us_lag * 1000;
skel.rodata_mut().starvation_thresh_ns = opts.starvation_thresh_us * 1000;
skel.rodata_mut().nvcsw_thresh = opts.nvcsw_thresh;
skel.rodata_mut().builtin_idle = opts.builtin_idle;
// Attach the scheduler.
let mut skel = scx_ops_load!(skel, bpfland_ops, uei)?;
@ -196,18 +193,14 @@ impl<'a> Scheduler<'a> {
}
fn update_stats(&mut self) {
let nr_running = self.skel.bss().nr_running;
let nr_cpus = libbpf_rs::num_possible_cpus().unwrap();
let nr_kthread_dispatches = self.skel.bss().nr_kthread_dispatches;
let nr_running = self.skel.bss().nr_running;
let nr_direct_dispatches = self.skel.bss().nr_direct_dispatches;
let nr_prio_dispatches = self.skel.bss().nr_prio_dispatches;
let nr_shared_dispatches = self.skel.bss().nr_shared_dispatches;
// Update Prometheus statistics.
self.metrics.nr_running.set(nr_running as f64);
self.metrics
.nr_kthread_dispatches
.set(nr_kthread_dispatches as f64);
self.metrics
.nr_direct_dispatches
.set(nr_direct_dispatches as f64);
@ -219,10 +212,9 @@ impl<'a> Scheduler<'a> {
.set(nr_shared_dispatches as f64);
// Log scheduling statistics.
info!("running={}/{} nr_kthread_dispatches={} nr_direct_dispatches={} nr_prio_dispatches={} nr_shared_dispatches={}",
info!("running={}/{} direct_dispatches={} prio_dispatches={} shared_dispatches={}",
nr_running,
nr_cpus,
nr_kthread_dispatches,
nr_direct_dispatches,
nr_prio_dispatches,
nr_shared_dispatches);