Merge pull request #755 from sched-ext/bpfland-prevent-kthread-stall

scx_bpfland: prevent per-CPU DSQ stall with per-CPU kthreads
commit e3e381dc8e
Author: Andrea Righi
Date:   2024-10-09 05:28:59 +00:00 (committed by GitHub)
3 changed files with 71 additions and 66 deletions
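In short: the fix adds a dedicated fast path in the enqueue callback so that per-CPU kthreads can no longer stall behind a per-CPU DSQ that nobody is consuming. A condensed sketch of that path, lifted from the BPF diff below (is_kthread(), local_kthreads and nr_kthread_dispatches are all defined in the scheduler source):

void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
{
        /*
         * Per-CPU kthreads (or all kthreads when the -k / local_kthreads
         * option is set) skip the per-CPU DSQs: dispatch them to the local
         * DSQ of their assigned CPU and let them preempt the current task.
         */
        if (is_kthread(p) && (local_kthreads || p->nr_cpus_allowed == 1)) {
                scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL,
                                 enq_flags | SCX_ENQ_PREEMPT);
                __sync_fetch_and_add(&nr_kthread_dispatches, 1);
                return;
        }

        /* ... regular enqueue path, see the full diff below ... */
}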


@@ -46,15 +46,12 @@ const volatile u64 slice_ns_min = 500ULL * NSEC_PER_USEC;
const volatile s64 slice_ns_lag;
/*
* When enabled always dispatch per-CPU kthreads directly on their CPU DSQ.
* When enabled always dispatch all kthreads directly.
*
* This allows to prioritize critical kernel threads that may potentially slow
* down the entire system if they are blocked for too long (i.e., ksoftirqd/N,
* rcuop/N, etc.).
*
* NOTE: this could cause interactivity problems or unfairness if there are too
* many softirqs being scheduled (e.g., in presence of high RX network RX
* traffic).
* down the entire system if they are blocked for too long, but it may also
* introduce interactivity issues or unfairness in scenarios with high kthread
* activity, such as heavy I/O or network traffic.
*/
const volatile bool local_kthreads;
@@ -121,7 +118,8 @@ static u64 starvation_prio_ts;
/*
* Scheduling statistics.
*/
volatile u64 nr_direct_dispatches, nr_shared_dispatches, nr_prio_dispatches;
volatile u64 nr_kthread_dispatches, nr_direct_dispatches,
nr_prio_dispatches, nr_shared_dispatches;
/*
* Amount of currently running tasks.
@@ -494,21 +492,20 @@ static u64 cpu_to_dsq(s32 cpu)
static int dispatch_direct_cpu(struct task_struct *p, s32 cpu, u64 enq_flags)
{
struct bpf_cpumask *offline;
u64 deadline = task_deadline(p);
u64 dsq_id = cpu_to_dsq(cpu);
s32 dsq_id;
/*
* Make sure we can dispatch the task to the target CPU according to
* its cpumask.
* Make sure the CPU is valid and usable by the task.
*/
if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) {
scx_bpf_error("%d %s can't be dispatched to CPU %d",
p->pid, p->comm, cpu);
if (cpu < 0 || !bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
return -EINVAL;
}
scx_bpf_dispatch_vtime(p, dsq_id, SCX_SLICE_DFL, deadline, enq_flags);
dsq_id = cpu_to_dsq(cpu);
/*
* Dispatch the task to the per-CPU DSQ.
*/
scx_bpf_dispatch_vtime(p, dsq_id, SCX_SLICE_DFL,
task_deadline(p), enq_flags);
/*
* If the CPU has gone offline notify that the task needs to be
* consumed from another CPU.
@@ -582,6 +579,25 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
struct cpu_ctx *cctx;
s32 cpu;
/*
* If the task isn't allowed to use its previously used CPU it means
* that it's changing affinity. In this case just pick a random CPU in
* its new allowed CPU domain.
*/
if (!bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr))
prev_cpu = bpf_cpumask_any_distribute(p->cpus_ptr);
/*
* For tasks that can run only on a single CPU, we can simply verify if
* their only allowed CPU is still idle.
*/
if (p->nr_cpus_allowed == 1 || p->migration_disabled) {
if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
return prev_cpu;
return -ENOENT;
}
tctx = try_lookup_task_ctx(p);
if (!tctx)
return -ENOENT;
@@ -593,31 +609,6 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
if (!primary)
return -ENOENT;
/*
* If the task isn't allowed to use its previously used CPU it means
* that it's rapidly changing affinity. In this case it's pointless to
* find an optimal idle CPU, just return and let the task being
* dispatched to a global DSQ.
*/
if (!bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr))
return -ENOENT;
/*
* For tasks that can run only on a single CPU, we can simply verify if
* their only allowed CPU is still idle.
*/
if (p->nr_cpus_allowed == 1) {
if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
return prev_cpu;
/*
* If local_kthreads is enabled, always dispatch per-CPU
* kthreads directly, even if their allowed CPU is not idle.
*/
if (local_kthreads && is_kthread(p))
return prev_cpu;
return -ENOENT;
}
/*
* Acquire the CPU masks to determine the online and idle CPUs in the
* system.
@@ -844,7 +835,7 @@ s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p, s32 prev_cpu, u64
return prev_cpu;
cpu = pick_idle_cpu(p, prev_cpu, wake_flags);
if (cpu >= 0 && !dispatch_direct_cpu(p, cpu, 0))
if (!dispatch_direct_cpu(p, cpu, 0))
__sync_fetch_and_add(&nr_direct_dispatches, 1);
else
cpu = prev_cpu;
@@ -861,8 +852,24 @@ s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p, s32 prev_cpu, u64
void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
{
struct task_ctx *tctx;
u64 deadline = task_deadline(p);
s32 cpu = scx_bpf_task_cpu(p);
s32 cpu, dsq_id;
/*
* Special case for per-CPU kthreads: we want to run them as soon as
* possible, as they are usually important for system performance and
* responsiveness, so dispatch them immediately on the local DSQ of
* their assigned CPU and allow them to preempt the currently running
* task, if present.
*
* If local_kthreads is enabled, consider all kthreads as critical and
* always dispatch them directly.
*/
if (is_kthread(p) && (local_kthreads || p->nr_cpus_allowed == 1)) {
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL,
enq_flags | SCX_ENQ_PREEMPT);
__sync_fetch_and_add(&nr_kthread_dispatches, 1);
return;
}
/*
* During ttwu, the kernel may decide to skip ->select_task_rq() (e.g.,
@@ -877,8 +884,8 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
*/
tctx = try_lookup_task_ctx(p);
if (tctx && !tctx->select_cpu_done) {
cpu = pick_idle_cpu(p, cpu, 0);
if (cpu >= 0 && !dispatch_direct_cpu(p, cpu, 0)) {
cpu = pick_idle_cpu(p, scx_bpf_task_cpu(p), 0);
if (!dispatch_direct_cpu(p, cpu, 0)) {
__sync_fetch_and_add(&nr_direct_dispatches, 1);
return;
}
@@ -894,14 +901,14 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
* and simply rely on the vruntime logic.
*/
if (is_task_interactive(p)) {
scx_bpf_dispatch_vtime(p, prio_dsq_id, SCX_SLICE_DFL,
deadline, enq_flags);
dsq_id = prio_dsq_id;
__sync_fetch_and_add(&nr_prio_dispatches, 1);
} else {
scx_bpf_dispatch_vtime(p, shared_dsq_id, SCX_SLICE_DFL,
deadline, enq_flags);
dsq_id = shared_dsq_id;
__sync_fetch_and_add(&nr_shared_dispatches, 1);
}
scx_bpf_dispatch_vtime(p, dsq_id, SCX_SLICE_DFL,
task_deadline(p), enq_flags);
/*
* If there are idle CPUs that are usable by the task, wake them up to
@@ -1022,7 +1029,7 @@ void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
u64 now = bpf_ktime_get_ns();
/*
* Try also to steal tasks directly dispatched to CPUs that have gone
* Try to steal tasks directly dispatched to CPUs that have gone
* offline (this allows to prevent indefinite task stalls).
*/
if (consume_offline_cpus(cpu))
@@ -1486,7 +1493,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
}
/*
* Create the global priority DSQ (for interactive tasks).
* Create the global priority and shared DSQs.
*
* Allocate a new DSQ id that does not clash with any valid CPU id.
*/
@@ -1496,12 +1503,6 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
scx_bpf_error("failed to create priority DSQ: %d", err);
return err;
}
/*
* Create the global shared DSQ (for regular tasks).
*
* Allocate a new DSQ id that does not clash with any valid CPU id..
*/
shared_dsq_id = nr_cpu_ids++;
err = scx_bpf_create_dsq(shared_dsq_id, -1);
if (err) {

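Note on the caller side of dispatch_direct_cpu(): the function now validates the target CPU itself, returning -EINVAL when the CPU is negative or not in the task's cpumask, so callers can drop the explicit "cpu >= 0 &&" guard and simply test the return value. A condensed sketch of the resulting pattern, taken from the select_cpu hunk above:

        cpu = pick_idle_cpu(p, prev_cpu, wake_flags);

        /* returns 0 on success, a negative error (e.g., -EINVAL) otherwise */
        if (!dispatch_direct_cpu(p, cpu, 0))
                __sync_fetch_and_add(&nr_direct_dispatches, 1);
        else
                cpu = prev_cpu; /* fall back to the previously used CPU */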

@@ -146,12 +146,11 @@ struct Opts {
#[clap(short = 'L', long, action = clap::ArgAction::SetTrue)]
lowlatency: bool,
/// Enable per-CPU kthreads prioritization.
/// Enable kthreads prioritization.
///
/// Enabling this can enhance the performance of interrupt-driven workloads (e.g., networking
/// throughput) over regular system/user workloads. However, it may also introduce
/// interactivity issues or unfairness under heavy interrupt-driven loads, such as high RX
/// network traffic.
/// Enabling this can improve system performance, but it may also introduce interactivity
/// issues or unfairness in scenarios with high kthread activity, such as heavy I/O or network
/// traffic.
#[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
local_kthreads: bool,
@@ -587,6 +586,7 @@ impl<'a> Scheduler<'a> {
nr_interactive: self.skel.maps.bss_data.nr_interactive,
nr_waiting: self.skel.maps.bss_data.nr_waiting,
nvcsw_avg_thresh: self.skel.maps.bss_data.nvcsw_avg_thresh,
nr_kthread_dispatches: self.skel.maps.bss_data.nr_kthread_dispatches,
nr_direct_dispatches: self.skel.maps.bss_data.nr_direct_dispatches,
nr_prio_dispatches: self.skel.maps.bss_data.nr_prio_dispatches,
nr_shared_dispatches: self.skel.maps.bss_data.nr_shared_dispatches,


@@ -25,6 +25,8 @@ pub struct Metrics {
pub nr_waiting: u64,
#[stat(desc = "Average of voluntary context switches")]
pub nvcsw_avg_thresh: u64,
#[stat(desc = "Number of kthread direct dispatches")]
pub nr_kthread_dispatches: u64,
#[stat(desc = "Number of task direct dispatches")]
pub nr_direct_dispatches: u64,
#[stat(desc = "Number of interactive task dispatches")]
@@ -37,13 +39,14 @@ impl Metrics {
fn format<W: Write>(&self, w: &mut W) -> Result<()> {
writeln!(
w,
"[{}] tasks -> run: {:>2}/{:<2} int: {:<2} wait: {:<4} | nvcsw: {:<4} | dispatch -> dir: {:<5} prio: {:<5} shr: {:<5}",
"[{}] tasks -> run: {:>2}/{:<2} int: {:<2} wait: {:<4} | nvcsw: {:<4} | dispatch -> kth: {:<5} dir: {:<5} pri: {:<5} shr: {:<5}",
crate::SCHEDULER_NAME,
self.nr_running,
self.nr_cpus,
self.nr_interactive,
self.nr_waiting,
self.nvcsw_avg_thresh,
self.nr_kthread_dispatches,
self.nr_direct_dispatches,
self.nr_prio_dispatches,
self.nr_shared_dispatches
@@ -53,6 +56,7 @@ impl Metrics {
fn delta(&self, rhs: &Self) -> Self {
Self {
nr_kthread_dispatches: self.nr_kthread_dispatches - rhs.nr_kthread_dispatches,
nr_direct_dispatches: self.nr_direct_dispatches - rhs.nr_direct_dispatches,
nr_prio_dispatches: self.nr_prio_dispatches - rhs.nr_prio_dispatches,
nr_shared_dispatches: self.nr_shared_dispatches - rhs.nr_shared_dispatches,