Merge pull request #692 from sched-ext/htejun/sync-kernel

Sync from kernel and re-enable scx_flatcg and scx_pair
2024-11-28 13:40:28 +00:00 · 2024-09-25 13:04:23 -10:00 · 2024-09-25 13:04:23 -10:00 · 818e829e01
commit 818e829e01
parent bd2e90b0b6 540576ac30
10 changed files with 143711 additions and 30 deletions
--- a/meson-scripts/test_sched
+++ b/meson-scripts/test_sched
@ -13,7 +13,6 @@ GUEST_TIMEOUT=60
 # List of schedulers to test
 #
 # TODO:
-#   - scx_flatcg, scx_pair: excluded until cgroup support lands upstream
 #   - scx_mitosis: not ready yet
 #
 declare -A SCHEDS
@ -25,6 +24,8 @@ else
    SCHEDS["scx_simple"]=""
    SCHEDS["scx_central"]=""
    SCHEDS["scx_nest"]=""
+    SCHEDS["scx_flatcg"]=""
+    SCHEDS["scx_pair"]=""
    SCHEDS["scx_rusty"]=""
    SCHEDS["scx_rustland"]=""
    SCHEDS["scx_bpfland"]=""
--- a/scheds/c/meson.build
+++ b/scheds/c/meson.build
@ -1,6 +1,5 @@
-# scx_flatcg and scx_pair are temporarily excluded until cgroup support
-# lands in the upstream kernel.
-c_scheds = ['scx_simple', 'scx_qmap', 'scx_central', 'scx_userland', 'scx_nest']
+c_scheds = ['scx_simple', 'scx_qmap', 'scx_central', 'scx_userland', 'scx_nest',
+            'scx_flatcg', 'scx_pair']

 foreach sched: c_scheds
  thread_dep = dependency('threads')
--- a/scheds/c/scx_central.bpf.c
+++ b/scheds/c/scx_central.bpf.c
@ -198,7 +198,7 @@ void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev)

 			/* central's gimme is never set */
 			gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids);
-			if (gimme && !*gimme)
+			if (!gimme || !*gimme)
 				continue;

 			if (dispatch_to_cpu(cpu))
--- a/scheds/c/scx_flatcg.bpf.c
+++ b/scheds/c/scx_flatcg.bpf.c
@ -382,7 +382,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
 		return;
 	}

-	cgrp = scx_bpf_task_cgroup(p);
+	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
 	cgc = find_cgrp_ctx(cgrp);
 	if (!cgc)
 		goto out_release;
@ -508,7 +508,7 @@ void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags)
 {
 	struct cgroup *cgrp;

-	cgrp = scx_bpf_task_cgroup(p);
+	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
 	update_active_weight_sums(cgrp, true);
 	bpf_cgroup_release(cgrp);
 }
@ -521,7 +521,7 @@ void BPF_STRUCT_OPS(fcg_running, struct task_struct *p)
 	if (fifo_sched)
 		return;

-	cgrp = scx_bpf_task_cgroup(p);
+	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
 	cgc = find_cgrp_ctx(cgrp);
 	if (cgc) {
 		/*
@ -564,7 +564,7 @@ void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
 	if (!taskc->bypassed_at)
 		return;

-	cgrp = scx_bpf_task_cgroup(p);
+	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
 	cgc = find_cgrp_ctx(cgrp);
 	if (cgc) {
 		__sync_fetch_and_add(&cgc->cvtime_delta,
@ -578,7 +578,7 @@ void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags)
 {
 	struct cgroup *cgrp;

-	cgrp = scx_bpf_task_cgroup(p);
+	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
 	update_active_weight_sums(cgrp, false);
 	bpf_cgroup_release(cgrp);
 }
@ -944,5 +944,5 @@ SCX_OPS_DEFINE(flatcg_ops,
 	       .cgroup_exit		= (void *)fcg_cgroup_exit,
 	       .cgroup_move		= (void *)fcg_cgroup_move,
 	       .exit			= (void *)fcg_exit,
-	       .flags			= SCX_OPS_CGROUP_KNOB_WEIGHT | SCX_OPS_ENQ_EXITING,
+	       .flags			= SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING,
 	       .name			= "flatcg");
--- a/scheds/c/scx_qmap.bpf.c
+++ b/scheds/c/scx_qmap.bpf.c
@ -27,6 +27,8 @@
 enum consts {
 	ONE_SEC_IN_NS		= 1000000000,
 	SHARED_DSQ		= 0,
+	HIGHPRI_DSQ		= 1,
+	HIGHPRI_WEIGHT		= 8668,		/* this is what -20 maps to */
 };

 char _license[] SEC("license") = "GPL";
@ -36,10 +38,12 @@ const volatile u32 stall_user_nth;
 const volatile u32 stall_kernel_nth;
 const volatile u32 dsp_inf_loop_after;
 const volatile u32 dsp_batch;
+const volatile bool highpri_boosting;
 const volatile bool print_shared_dsq;
 const volatile s32 disallow_tgid;
 const volatile bool suppress_dump;

+u64 nr_highpri_queued;
 u32 test_error_cnt;

 UEI_DEFINE(uei);
@ -95,6 +99,7 @@ static u64 core_sched_tail_seqs[5];
 /* Per-task scheduling context */
 struct task_ctx {
 	bool	force_local;	/* Dispatch directly to local_dsq */
+	bool	highpri;
 	u64	core_sched_seq;
 };

@ -122,6 +127,7 @@ struct {
 /* Statistics */
 u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
 u64 nr_core_sched_execed;
+u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
 u32 cpuperf_min, cpuperf_avg, cpuperf_max;
 u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

@ -140,17 +146,25 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
 	return -1;
 }

+static struct task_ctx *lookup_task_ctx(struct task_struct *p)
+{
+	struct task_ctx *tctx;
+
+	if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
+		scx_bpf_error("task_ctx lookup failed");
+		return NULL;
+	}
+	return tctx;
+}
+
 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
 		   s32 prev_cpu, u64 wake_flags)
 {
 	struct task_ctx *tctx;
 	s32 cpu;

-	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
-	if (!tctx) {
-		scx_bpf_error("task_ctx lookup failed");
+	if (!(tctx = lookup_task_ctx(p)))
 		return -ESRCH;
-	}

 	cpu = pick_direct_dispatch_cpu(p, prev_cpu);

@ -197,16 +211,12 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 	if (test_error_cnt && !--test_error_cnt)
 		scx_bpf_error("test triggering error");

-	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
-	if (!tctx) {
-		scx_bpf_error("task_ctx lookup failed");
+	if (!(tctx = lookup_task_ctx(p)))
 		return;
-	}

 	/*
 	 * All enqueued tasks must have their core_sched_seq updated for correct
-	 * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in
-	 * qmap_ops.flags.
+	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
 	 */
 	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

@ -214,7 +224,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 	 * If qmap_select_cpu() is telling us to or this is the last runnable
 	 * task on the CPU, enqueue locally.
 	 */
-	if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) {
+	if (tctx->force_local) {
 		tctx->force_local = false;
 		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
 		return;
@ -256,6 +266,10 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 		return;
 	}

+	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
+		tctx->highpri = true;
+		__sync_fetch_and_add(&nr_highpri_queued, 1);
+	}
 	__sync_fetch_and_add(&nr_enqueued, 1);
 }

@ -272,24 +286,95 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)

 static void update_core_sched_head_seq(struct task_struct *p)
 {
-	struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
 	int idx = weight_to_idx(p->scx.weight);
+	struct task_ctx *tctx;

-	if (tctx)
+	if ((tctx = lookup_task_ctx(p)))
 		core_sched_head_seqs[idx] = tctx->core_sched_seq;
-	else
-		scx_bpf_error("task_ctx lookup failed");
+}
+
+/*
+ * To demonstrate the use of scx_bpf_dispatch_from_dsq(), implement silly
+ * selective priority boosting mechanism by scanning SHARED_DSQ looking for
+ * highpri tasks, moving them to HIGHPRI_DSQ and then consuming them first. This
+ * makes minor difference only when dsp_batch is larger than 1.
+ *
+ * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and
+ * non-rq-lock holding BPF programs. As demonstration, this function is called
+ * from qmap_dispatch() and monitor_timerfn().
+ */
+static bool dispatch_highpri(bool from_timer)
+{
+	struct task_struct *p;
+	s32 this_cpu = bpf_get_smp_processor_id();
+
+	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
+	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
+		static u64 highpri_seq;
+		struct task_ctx *tctx;
+
+		if (!(tctx = lookup_task_ctx(p)))
+			return false;
+
+		if (tctx->highpri) {
+			/* exercise the set_*() and vtime interface too */
+			__COMPAT_scx_bpf_dispatch_from_dsq_set_slice(
+				BPF_FOR_EACH_ITER, slice_ns * 2);
+			__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(
+				BPF_FOR_EACH_ITER, highpri_seq++);
+			__COMPAT_scx_bpf_dispatch_vtime_from_dsq(
+				BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
+		}
+	}
+
+	/*
+	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
+	 * is found.
+	 */
+	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
+		bool dispatched = false;
+		s32 cpu;
+
+		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
+			cpu = this_cpu;
+		else
+			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
+
+		if (__COMPAT_scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
+						       SCX_DSQ_LOCAL_ON | cpu,
+						       SCX_ENQ_PREEMPT)) {
+			if (cpu == this_cpu) {
+				dispatched = true;
+				__sync_fetch_and_add(&nr_expedited_local, 1);
+			} else {
+				__sync_fetch_and_add(&nr_expedited_remote, 1);
+			}
+			if (from_timer)
+				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
+		} else {
+			__sync_fetch_and_add(&nr_expedited_lost, 1);
+		}
+
+		if (dispatched)
+			return true;
+	}
+
+	return false;
 }

 void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 {
 	struct task_struct *p;
 	struct cpu_ctx *cpuc;
+	struct task_ctx *tctx;
 	u32 zero = 0, batch = dsp_batch ?: 1;
 	void *fifo;
 	s32 i, pid;

-	if (scx_bpf_consume(SHARED_DSQ))
+	if (dispatch_highpri(false))
+		return;
+
+	if (!nr_highpri_queued && scx_bpf_consume(SHARED_DSQ))
 		return;

 	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
@ -326,6 +411,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)

 		/* Dispatch or advance. */
 		bpf_repeat(BPF_MAX_LOOPS) {
+			struct task_ctx *tctx;
+
 			if (bpf_map_pop_elem(fifo, &pid))
 				break;

@ -333,13 +420,25 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 			if (!p)
 				continue;

+			if (!(tctx = lookup_task_ctx(p))) {
+				bpf_task_release(p);
+				return;
+			}
+
+			if (tctx->highpri)
+				__sync_fetch_and_sub(&nr_highpri_queued, 1);
+
 			update_core_sched_head_seq(p);
 			__sync_fetch_and_add(&nr_dispatched, 1);
+
 			scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
 			bpf_task_release(p);
+
 			batch--;
 			cpuc->dsp_cnt--;
 			if (!batch || !scx_bpf_dispatch_nr_slots()) {
+				if (dispatch_highpri(false))
+					return;
 				scx_bpf_consume(SHARED_DSQ);
 				return;
 			}
@ -349,6 +448,21 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)

 		cpuc->dsp_cnt = 0;
 	}
+
+	/*
+	 * No other tasks. @prev will keep running. Update its core_sched_seq as
+	 * if the task were enqueued and dispatched immediately.
+	 */
+	if (prev) {
+		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
+		if (!tctx) {
+			scx_bpf_error("task_ctx lookup failed");
+			return;
+		}
+
+		tctx->core_sched_seq =
+			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
+	}
 }

 void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
@ -649,6 +763,10 @@ static void dump_shared_dsq(void)

 static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
 {
+	bpf_rcu_read_lock();
+	dispatch_highpri(true);
+	bpf_rcu_read_unlock();
+
 	monitor_cpuperf();

 	if (print_shared_dsq)
@ -670,6 +788,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
 	if (ret)
 		return ret;

+	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
+	if (ret)
+		return ret;
+
 	timer = bpf_map_lookup_elem(&monitor_timer, &key);
 	if (!timer)
 		return -ESRCH;
@ -701,6 +823,5 @@ SCX_OPS_DEFINE(qmap_ops,
 	       .cpu_offline		= (void *)qmap_cpu_offline,
 	       .init			= (void *)qmap_init,
 	       .exit			= (void *)qmap_exit,
-	       .flags			= SCX_OPS_ENQ_LAST,
 	       .timeout_ms		= 5000U,
 	       .name			= "qmap");
--- a/scheds/c/scx_qmap.c
+++ b/scheds/c/scx_qmap.c
@ -29,6 +29,7 @@ const char help_fmt[] =
 "  -l COUNT      Trigger dispatch infinite looping after COUNT dispatches\n"
 "  -b COUNT      Dispatch upto COUNT tasks together\n"
 "  -P            Print out DSQ content to trace_pipe every second, use with -b\n"
+"  -H            Boost nice -20 tasks in SHARED_DSQ, use with -b\n"
 "  -d PID        Disallow a process from switching into SCHED_EXT (-1 for self)\n"
 "  -D LEN        Set scx_exit_info.dump buffer length\n"
 "  -S            Suppress qmap-specific debug dump\n"
@ -63,7 +64,7 @@ int main(int argc, char **argv)

 	skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);

-	while ((opt = getopt(argc, argv, "s:e:t:T:l:b:Pd:D:Spvh")) != -1) {
+	while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) {
 		switch (opt) {
 		case 's':
 			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@ -86,6 +87,9 @@ int main(int argc, char **argv)
 		case 'P':
 			skel->rodata->print_shared_dsq = true;
 			break;
+		case 'H':
+			skel->rodata->highpri_boosting = true;
+			break;
 		case 'd':
 			skel->rodata->disallow_tgid = strtol(optarg, NULL, 0);
 			if (skel->rodata->disallow_tgid < 0)
@ -121,6 +125,11 @@ int main(int argc, char **argv)
 		       skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
 		       skel->bss->nr_core_sched_execed,
 		       skel->bss->nr_ddsp_from_enq);
+		printf("         exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n",
+		       skel->bss->nr_expedited_local,
+		       skel->bss->nr_expedited_remote,
+		       skel->bss->nr_expedited_from_timer,
+		       skel->bss->nr_expedited_lost);
 		if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
 			printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
 			       skel->bss->cpuperf_min,
--- a/scheds/include/scx/common.bpf.h
+++ b/scheds/include/scx/common.bpf.h
@ -41,6 +41,10 @@ void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vt
 u32 scx_bpf_dispatch_nr_slots(void) __ksym;
 void scx_bpf_dispatch_cancel(void) __ksym;
 bool scx_bpf_consume(u64 dsq_id) __ksym;
+void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym;
+void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym;
+bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
 u32 scx_bpf_reenqueue_local(void) __ksym;
 void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
 s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
@ -67,6 +71,13 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
 bool scx_bpf_task_running(const struct task_struct *p) __ksym;
 s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
 struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
+struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
+
+/*
+ * Use the following as @it__iter when calling
+ * scx_bpf_dispatch[_vtime]_from_dsq() from within bpf_for_each() loops.
+ */
+#define BPF_FOR_EACH_ITER	(&___it)

 static inline __attribute__((format(printf, 1, 2)))
 void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
--- a/scheds/include/scx/compat.bpf.h
+++ b/scheds/include/scx/compat.bpf.h
@ -15,6 +15,25 @@
 	__ret;									\
 })

+/* v6.12: 819513666966 ("sched_ext: Add cgroup support") */
+#define __COMPAT_scx_bpf_task_cgroup(p)						\
+	(bpf_ksym_exists(scx_bpf_task_cgroup) ?					\
+	 scx_bpf_task_cgroup((p)) : NULL)
+
+/* v6.12: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") */
+#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it, slice)			\
+	(bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice) ?			\
+	 scx_bpf_dispatch_from_dsq_set_slice((it), (slice)) : (void)0)
+#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it, vtime)			\
+	(bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime) ?			\
+	 scx_bpf_dispatch_from_dsq_set_vtime((it), (vtime)) : (void)0)
+#define __COMPAT_scx_bpf_dispatch_from_dsq(it, p, dsq_id, enq_flags)		\
+	(bpf_ksym_exists(scx_bpf_dispatch_from_dsq) ?				\
+	 scx_bpf_dispatch_from_dsq((it), (p), (dsq_id), (enq_flags)) : false)
+#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it, p, dsq_id, enq_flags)	\
+	(bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq) ?			\
+	 scx_bpf_dispatch_vtime_from_dsq((it), (p), (dsq_id), (enq_flags)) : false)
+
 /*
 * Define sched_ext_ops. This may be expanded to define multiple variants for
 * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
--- a/scheds/include/vmlinux/vmlinux-v6.12-rc0-ga748db0c8c6a.h
+++ b/scheds/include/vmlinux/vmlinux-v6.12-rc0-ga748db0c8c6a.h
--- a/scheds/include/vmlinux/vmlinux.h
+++ b/scheds/include/vmlinux/vmlinux.h
@ -1 +1 @@
-vmlinux-v6.10-rc2-g1edab907b57d.h
+vmlinux-v6.12-rc0-ga748db0c8c6a.h