Merge pull request #253 from sched-ext/htejun/sync-kernel

Sync to the latest kernel
2024-11-26 04:30:23 +00:00 · 2024-04-29 10:16:35 -10:00 · 2024-04-29 10:16:35 -10:00 · b1bb2a5c5f
commit b1bb2a5c5f
parent 3ee64a1301 c77d101655
20 changed files with 115548 additions and 114769 deletions
--- a/rust/scx_utils/src/builder.rs
+++ b/rust/scx_utils/src/builder.rs
@ -45,7 +45,7 @@ impl Builder {
        let bindings = bindgen::Builder::default()
            .header("bindings.h")
            .allowlist_type("scx_exit_kind")
-            .allowlist_type("scx_internal_consts")
+            .allowlist_type("scx_consts")
            .parse_callbacks(Box::new(bindgen::CargoCallbacks))
            .generate()
            .expect("Unable to generate bindings");
--- a/rust/scx_utils/src/lib.rs
+++ b/rust/scx_utils/src/lib.rs
@ -43,7 +43,7 @@ pub use builder::Builder;

 mod user_exit_info;
 pub use user_exit_info::ScxExitKind;
-pub use user_exit_info::ScxInternalConsts;
+pub use user_exit_info::ScxConsts;
 pub use user_exit_info::UeiDumpPtr;
 pub use user_exit_info::UserExitInfo;
 pub use user_exit_info::UEI_DUMP_PTR_MUTEX;
--- a/rust/scx_utils/src/user_exit_info.rs
+++ b/rust/scx_utils/src/user_exit_info.rs
@ -29,8 +29,8 @@ pub enum ScxExitKind {
    ErrorStall = bindings::scx_exit_kind_SCX_EXIT_ERROR_STALL as isize,
 }

-pub enum ScxInternalConsts {
-    ExitDumpDflLen = bindings::scx_internal_consts_SCX_EXIT_DUMP_DFL_LEN as isize,
+pub enum ScxConsts {
+    ExitDumpDflLen = bindings::scx_consts_SCX_EXIT_DUMP_DFL_LEN as isize, // fallback used by uei_set_size! when exit_dump_len is 0
 }

 /// Takes a reference to C struct user_exit_info and reads it into
@ -65,7 +65,7 @@ macro_rules! uei_set_size {
    ($skel: expr, $ops: ident, $uei:ident) => {{
        scx_utils::paste! {
            let len = match $skel.struct_ops.$ops().exit_dump_len {
-                0 => scx_utils::ScxInternalConsts::ExitDumpDflLen as u32,
+                0 => scx_utils::ScxConsts::ExitDumpDflLen as u32,
                v => v,
            };
            $skel.rodata_mut().[<$uei _dump_len>] = len;
--- a/scheds/c/scx_central.c
+++ b/scheds/c/scx_central.c
@ -24,10 +24,19 @@ const char help_fmt[] =
 "\n"
 "  -s SLICE_US   Override slice duration\n"
 "  -c CPU        Override the central CPU (default: 0)\n"
+"  -v            Print libbpf debug messages\n"
 "  -h            Display this help and exit\n";

+static bool verbose;
 static volatile int exit_req;

+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !verbose)	/* drop debug-level messages unless -v set verbose */
+		return 0;
+	return vfprintf(stderr, format, args);	/* every other message goes to stderr */
+}
+
 static void sigint_handler(int dummy)
 {
 	exit_req = 1;
@ -37,22 +46,20 @@ int main(int argc, char **argv)
 {
 	struct scx_central *skel;
 	struct bpf_link *link;
-	__u64 seq = 0;
+	__u64 seq = 0, ecode;
 	__s32 opt;
 	cpu_set_t *cpuset;

+	libbpf_set_print(libbpf_print_fn);
 	signal(SIGINT, sigint_handler);
 	signal(SIGTERM, sigint_handler);
-
-	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
-
-	skel = scx_central__open();
-	SCX_BUG_ON(!skel, "Failed to open skel");
+restart:
+	skel = SCX_OPS_OPEN(central_ops, scx_central);

 	skel->rodata->central_cpu = 0;
 	skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();

-	while ((opt = getopt(argc, argv, "s:c:ph")) != -1) {
+	while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) {
 		switch (opt) {
 		case 's':
 			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@ -60,6 +67,9 @@ int main(int argc, char **argv)
 		case 'c':
 			skel->rodata->central_cpu = strtoul(optarg, NULL, 0);
 			break;
+		case 'v':
+			verbose = true;
+			break;
 		default:
 			fprintf(stderr, help_fmt, basename(argv[0]));
 			return opt != 'h';
@ -116,7 +126,10 @@ int main(int argc, char **argv)
 	}

 	bpf_link__destroy(link);
-	UEI_REPORT(skel, uei);
+	ecode = UEI_REPORT(skel, uei);
 	scx_central__destroy(skel);
+
+	if (UEI_ECODE_RESTART(ecode))
+		goto restart;
 	return 0;
 }
--- a/scheds/c/scx_flatcg.c
+++ b/scheds/c/scx_flatcg.c
@ -26,15 +26,24 @@ const char help_fmt[] =
 "\n"
 "See the top-level comment in .bpf.c for more details.\n"
 "\n"
-"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f]\n"
+"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-v]\n"
 "\n"
 "  -s SLICE_US   Override slice duration\n"
 "  -i INTERVAL   Report interval\n"
 "  -f            Use FIFO scheduling instead of weighted vtime scheduling\n"
+"  -v            Print libbpf debug messages\n"
 "  -h            Display this help and exit\n";

+static bool verbose;
 static volatile int exit_req;

+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !verbose)	/* drop debug-level messages unless -v set verbose */
+		return 0;
+	return vfprintf(stderr, format, args);	/* every other message goes to stderr */
+}
+
 static void sigint_handler(int dummy)
 {
 	exit_req = 1;
@ -119,18 +128,17 @@ int main(int argc, char **argv)
 	__u64 last_stats[FCG_NR_STATS] = {};
 	unsigned long seq = 0;
 	__s32 opt;
+	__u64 ecode;

+	libbpf_set_print(libbpf_print_fn);
 	signal(SIGINT, sigint_handler);
 	signal(SIGTERM, sigint_handler);
-
-	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
-
-	skel = scx_flatcg__open();
-	SCX_BUG_ON(!skel, "Failed to open skel");
+restart:
+	skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg);

 	skel->rodata->nr_cpus = libbpf_num_possible_cpus();

-	while ((opt = getopt(argc, argv, "s:i:dfph")) != -1) {
+	while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) {
 		double v;

 		switch (opt) {
@ -149,6 +157,9 @@ int main(int argc, char **argv)
 		case 'f':
 			skel->rodata->fifo_sched = true;
 			break;
+		case 'v':
+			verbose = true;
+			break;
 		case 'h':
 		default:
 			fprintf(stderr, help_fmt, basename(argv[0]));
@ -213,7 +224,10 @@ int main(int argc, char **argv)
 	}

 	bpf_link__destroy(link);
-	UEI_REPORT(skel, uei);
+	ecode = UEI_REPORT(skel, uei);
 	scx_flatcg__destroy(skel);
+
+	if (UEI_ECODE_RESTART(ecode))
+		goto restart;
 	return 0;
 }
--- a/scheds/c/scx_nest.c
+++ b/scheds/c/scx_nest.c
@ -29,10 +29,19 @@ const char help_fmt[] =
 "  -i ITERS      Number of successive placement failures tolerated before trying to aggressively expand primary nest (default 2), or 0 to disable\n"
 "  -s SLICE_US   Override slice duration in us (default 20000us / 20ms)\n"
 "  -I            First try to find a fully idle core, and then any idle core, when searching nests. Default behavior is to ignore hypertwins and check for any idle core.\n"
+"  -v            Print libbpf debug messages\n"
 "  -h            Display this help and exit\n";

+static bool verbose;
 static volatile int exit_req;

+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !verbose)	/* drop debug-level messages unless -v set verbose */
+		return 0;
+	return vfprintf(stderr, format, args);	/* every other message goes to stderr */
+}
+
 static void sigint_handler(int nest)
 {
 	exit_req = 1;
@ -152,19 +161,18 @@ int main(int argc, char **argv)
 	struct scx_nest *skel;
 	struct bpf_link *link;
 	__u32 opt;
+	__u64 ecode;

+	libbpf_set_print(libbpf_print_fn);
 	signal(SIGINT, sigint_handler);
 	signal(SIGTERM, sigint_handler);
-
-	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
-
-	skel = scx_nest__open();
-	SCX_BUG_ON(!skel, "Failed to open skel");
+restart:
+	skel = SCX_OPS_OPEN(nest_ops, scx_nest);

 	skel->rodata->nr_cpus = libbpf_num_possible_cpus();
 	skel->rodata->sampling_cadence_ns = SAMPLING_CADENCE_S * 1000 * 1000 * 1000;

-	while ((opt = getopt(argc, argv, "hId:m:i:s:")) != -1) {
+	while ((opt = getopt(argc, argv, "d:m:i:Is:vh")) != -1) {
 		switch (opt) {
 		case 'd':
 			skel->rodata->p_remove_ns = strtoull(optarg, NULL, 0) * 1000;
@ -181,6 +189,9 @@ int main(int argc, char **argv)
 		case 's':
 			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
 			break;
+		case 'v':
+			verbose = true;
+			break;
 		default:
 			fprintf(stderr, help_fmt, basename(argv[0]));
 			return opt != 'h';
@ -216,7 +227,10 @@ int main(int argc, char **argv)
 	}

 	bpf_link__destroy(link);
-	UEI_REPORT(skel, uei);
+	ecode = UEI_REPORT(skel, uei);
 	scx_nest__destroy(skel);
+
+	if (UEI_ECODE_RESTART(ecode))
+		goto restart;
 	return 0;
 }
--- a/scheds/c/scx_pair.c
+++ b/scheds/c/scx_pair.c
@ -23,10 +23,19 @@ const char help_fmt[] =
 "Usage: %s [-S STRIDE]\n"
 "\n"
 "  -S STRIDE     Override CPU pair stride (default: nr_cpus_ids / 2)\n"
+"  -v            Print libbpf debug messages\n"
 "  -h            Display this help and exit\n";

+static bool verbose;
 static volatile int exit_req;

+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !verbose)	/* drop debug-level messages unless -v set verbose */
+		return 0;
+	return vfprintf(stderr, format, args);	/* every other message goes to stderr */
+}
+
 static void sigint_handler(int dummy)
 {
 	exit_req = 1;
@ -36,27 +45,28 @@ int main(int argc, char **argv)
 {
 	struct scx_pair *skel;
 	struct bpf_link *link;
-	__u64 seq = 0;
+	__u64 seq = 0, ecode;
 	__s32 stride, i, opt, outer_fd;

+	libbpf_set_print(libbpf_print_fn);
 	signal(SIGINT, sigint_handler);
 	signal(SIGTERM, sigint_handler);
-
-	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
-
-	skel = scx_pair__open();
-	SCX_BUG_ON(!skel, "Failed to open skel");
+restart:
+	skel = SCX_OPS_OPEN(pair_ops, scx_pair);

 	skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();

-	/* pair up the earlier half to the latter by default, override with -s */
+	/* pair up the earlier half to the latter by default, override with -S */
 	stride = skel->rodata->nr_cpu_ids / 2;

-	while ((opt = getopt(argc, argv, "S:ph")) != -1) {
+	while ((opt = getopt(argc, argv, "S:vh")) != -1) {
 		switch (opt) {
 		case 'S':
 			stride = strtoul(optarg, NULL, 0);
 			break;
+		case 'v':
+			verbose = true;
+			break;
 		default:
 			fprintf(stderr, help_fmt, basename(argv[0]));
 			return opt != 'h';
@ -158,7 +168,10 @@ int main(int argc, char **argv)
 	}

 	bpf_link__destroy(link);
-	UEI_REPORT(skel, uei);
+	ecode = UEI_REPORT(skel, uei);
 	scx_pair__destroy(skel);
+
+	if (UEI_ECODE_RESTART(ecode))
+		goto restart;
 	return 0;
 }
--- a/scheds/c/scx_qmap.bpf.c
+++ b/scheds/c/scx_qmap.bpf.c
@ -23,6 +23,12 @@
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
 #include <scx/common.bpf.h>
+#include <string.h>
+
+enum consts {
+	ONE_SEC_IN_NS		= 1000000000,	/* delay used when (re)arming the monitor timer */
+	SHARED_DSQ		= 0,		/* id of the DSQ created in qmap_init() */
+};

 char _license[] SEC("license") = "GPL";

@ -30,6 +36,9 @@ const volatile u64 slice_ns = SCX_SLICE_DFL;
 const volatile u32 stall_user_nth;
 const volatile u32 stall_kernel_nth;
 const volatile u32 dsp_inf_loop_after;
+const volatile u32 dsp_batch;
+const volatile bool print_shared_dsq;
+const volatile char exp_prefix[17];
 const volatile s32 disallow_tgid;
 const volatile bool switch_partial;

@ -62,6 +71,18 @@ struct {
 	},
 };

+/*
+ * If enabled, the CPU performance target is picked from the queue index using
+ * the following table: index N (0-4) maps to N/4 of SCX_CPUPERF_ONE.
+ * qmap_tick() indexes this table with weight_to_idx() of the running average
+ * weight.
+ */
+static const u32 qidx_to_cpuperf_target[] = {
+	[0] = SCX_CPUPERF_ONE * 0 / 4,
+	[1] = SCX_CPUPERF_ONE * 1 / 4,
+	[2] = SCX_CPUPERF_ONE * 2 / 4,
+	[3] = SCX_CPUPERF_ONE * 3 / 4,
+	[4] = SCX_CPUPERF_ONE * 4 / 4,
+};
+
 /*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
@ -86,17 +107,25 @@ struct {
 	__type(value, struct task_ctx);
 } task_ctx_stor SEC(".maps");

-/* Per-cpu dispatch index and remaining count */
+struct cpu_ctx {
+	u64	dsp_idx;	/* dispatch index: key of the queue_arr FIFO being drained */
+	u64	dsp_cnt;	/* remaining count before dsp_idx is advanced */
+	u32	avg_weight;	/* running average of task weights, updated in qmap_tick() */
+	u32	cpuperf_target;	/* target last passed to scx_bpf_cpuperf_set() by qmap_tick() */
+};
+
 struct {
 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
-	__uint(max_entries, 2);
+	__uint(max_entries, 1);
 	__type(key, u32);
-	__type(value, u64);
-} dispatch_idx_cnt SEC(".maps");
+	__type(value, struct cpu_ctx);
+} cpu_ctx_stor SEC(".maps");

 /* Statistics */
 u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued;
-u64 nr_core_sched_execed;
+u64 nr_core_sched_execed, nr_expedited;
+u32 cpuperf_min, cpuperf_avg, cpuperf_max;
+u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
 		   s32 prev_cpu, u64 wake_flags)
@ -189,7 +218,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 	if (enq_flags & SCX_ENQ_REENQ) {
 		s32 cpu;

-		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, 0, enq_flags);
+		scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags);
 		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
 		if (cpu >= 0)
 			scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE);
@ -204,7 +233,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)

-	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
+	/* Queue on the selected FIFO. If the FIFO overflows, punt to the shared DSQ. */
 	if (bpf_map_push_elem(ring, &pid, 0)) {
-		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, enq_flags);
+		scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
 		return;
 	}

@ -233,18 +262,49 @@ static void update_core_sched_head_seq(struct task_struct *p)
 		scx_bpf_error("task_ctx lookup failed");
 }

+static bool consume_shared_dsq(void)
+{
+	struct task_struct *p;
+	bool consumed;
+
+	if (exp_prefix[0] == '\0')	/* no prefix configured: plain consume */
+		return scx_bpf_consume(SHARED_DSQ);
+
+	/*
+	 * To demonstrate the use of scx_bpf_consume_task(), implement a silly
+	 * selective priority boosting mechanism: scan SHARED_DSQ for tasks
+	 * whose comm matches exp_prefix and consume them first, counting each
+	 * one in nr_expedited. This makes a difference only when dsp_batch is
+	 * larger than 1.
+	 *
+	 * NOTE(review): comm[] below has no initializer and memcpy() fills only
+	 * its first sizeof(exp_prefix) - 1 bytes, while bpf_strncmp() is given
+	 * the full sizeof(exp_prefix) - confirm the last byte is handled as
+	 * intended.
+	 */
+	consumed = false;
+	__COMPAT_DSQ_FOR_EACH(p, SHARED_DSQ, 0) {
+		char comm[sizeof(exp_prefix)];
+
+		memcpy(comm, p->comm, sizeof(exp_prefix) - 1);
+
+		if (!bpf_strncmp(comm, sizeof(exp_prefix),
+				 (const char *)exp_prefix) &&
+		    __COMPAT_scx_bpf_consume_task(BPF_FOR_EACH_ITER, p)) {
+			consumed = true;
+			__sync_fetch_and_add(&nr_expedited, 1);
+		}
+	}
+
+	return consumed || scx_bpf_consume(SHARED_DSQ);	/* plain consume only if nothing matched */
+}
+
 void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 {
-	u32 zero = 0, one = 1;
-	u64 *idx = bpf_map_lookup_elem(&dispatch_idx_cnt, &zero);
-	u64 *cnt = bpf_map_lookup_elem(&dispatch_idx_cnt, &one);
+	struct task_struct *p;
+	struct cpu_ctx *cpuc;
+	u32 zero = 0, batch = dsp_batch ?: 1;
 	void *fifo;
-	s32 pid;
-	int i;
+	s32 i, pid;
+
+	if (consume_shared_dsq())
+		return;

 	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
-		struct task_struct *p;
-
 		/*
 		 * PID 2 should be kthreadd which should mostly be idle and off
 		 * the scheduler. Let's keep dispatching it to force the kernel
@ -252,49 +312,80 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 		 */
 		p = bpf_task_from_pid(2);
 		if (p) {
-			scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0);
+			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0);
 			bpf_task_release(p);
 			return;
 		}
 	}

-	if (!idx || !cnt) {
-		scx_bpf_error("failed to lookup idx[%p], cnt[%p]", idx, cnt);
+	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
+		scx_bpf_error("failed to look up cpu_ctx");
 		return;
 	}

 	for (i = 0; i < 5; i++) {
 		/* Advance the dispatch cursor and pick the fifo. */
-		if (!*cnt) {
-			*idx = (*idx + 1) % 5;
-			*cnt = 1 << *idx;
+		if (!cpuc->dsp_cnt) {
+			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
+			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
 		}
-		(*cnt)--;

-		fifo = bpf_map_lookup_elem(&queue_arr, idx);
+		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
 		if (!fifo) {
-			scx_bpf_error("failed to find ring %llu", *idx);
+			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
 			return;
 		}

 		/* Dispatch or advance. */
-		if (!bpf_map_pop_elem(fifo, &pid)) {
-			struct task_struct *p;
+		bpf_repeat(BPF_MAX_LOOPS) {
+			if (bpf_map_pop_elem(fifo, &pid))
+				break;

 			p = bpf_task_from_pid(pid);
-			if (p) {
-				update_core_sched_head_seq(p);
-				__sync_fetch_and_add(&nr_dispatched, 1);
-				scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0);
-				bpf_task_release(p);
+			if (!p)
+				continue;
+
+			update_core_sched_head_seq(p);
+			__sync_fetch_and_add(&nr_dispatched, 1);
+			scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
+			bpf_task_release(p);
+			batch--;
+			cpuc->dsp_cnt--;
+			if (!batch || !scx_bpf_dispatch_nr_slots()) {
+				consume_shared_dsq();
 				return;
 			}
+			if (!cpuc->dsp_cnt)
+				break;
 		}

-		*cnt = 0;
+		cpuc->dsp_cnt = 0;
 	}
 }

+void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
+{
+	struct cpu_ctx *cpuc;
+	u32 zero = 0;
+	int idx;
+
+	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
+		scx_bpf_error("failed to look up cpu_ctx");
+		return;
+	}
+
+	/*
+	 * Use the running avg of weights to select the target cpuperf level.
+	 * This is a demonstration of the cpuperf feature rather than a
+	 * practical strategy to regulate CPU frequency.
+	 *
+	 * The new average is 3/4 of the previous value plus 1/4 of @p's
+	 * weight (integer arithmetic). weight_to_idx() turns it into an index
+	 * into qidx_to_cpuperf_target[]; the looked-up target is stored in
+	 * the cpu_ctx and passed to scx_bpf_cpuperf_set() for
+	 * scx_bpf_task_cpu(@p).
+	 */
+	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
+	idx = weight_to_idx(cpuc->avg_weight);
+	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];
+
+	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
+}
+
 /*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
@ -371,11 +462,189 @@ s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
 		return -ENOMEM;
 }

-s32 BPF_STRUCT_OPS(qmap_init)
+/*
+ * Print out the online and possible CPU map using bpf_printk() as a
+ * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
+ *
+ * One character is written per CPU id below scx_bpf_nr_cpu_ids(): 'O' if the
+ * CPU is set in the online mask, 'X' if it is set only in the possible mask
+ * and ' ' otherwise, with a '|' inserted after every eighth CPU. The loop
+ * stops as soon as MEMBER_VPTR() returns NULL - presumably once idx runs past
+ * the 128-byte buffer. Nothing is printed when __COMPAT_HAS_CPUMASKS is false.
+ */
+static void print_cpus(void)
 {
+	const struct cpumask *possible, *online;
+	s32 cpu;
+	char buf[128] = "", *p;
+	int idx;
+
+	if (!__COMPAT_HAS_CPUMASKS)
+		return;
+
+	possible = scx_bpf_get_possible_cpumask();
+	online = scx_bpf_get_online_cpumask();
+
+	idx = 0;
+	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
+		if (!(p = MEMBER_VPTR(buf, [idx++])))
+			break;
+		if (bpf_cpumask_test_cpu(cpu, online))
+			*p++ = 'O';
+		else if (bpf_cpumask_test_cpu(cpu, possible))
+			*p++ = 'X';
+		else
+			*p++ = ' ';
+
+		if ((cpu & 7) == 7) {	/* last CPU of a group of eight */
+			if (!(p = MEMBER_VPTR(buf, [idx++])))
+				break;
+			*p++ = '|';
+		}
+	}
+	buf[sizeof(buf) - 1] = '\0';	/* force termination even if the buffer was filled */
+
+	scx_bpf_put_cpumask(online);
+	scx_bpf_put_cpumask(possible);
+
+	bpf_printk("CPUS: |%s", buf);
+}
+
+void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
+{
+	bpf_printk("CPU %d coming online", cpu);
+	/* @cpu is already online at this point, so the map below should show it as 'O' */
+	print_cpus();
+}
+
+void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
+{
+	bpf_printk("CPU %d going offline", cpu);
+	/* @cpu is still online at this point, so the map below should still show it as 'O' */
+	print_cpus();
+}
+
+struct monitor_timer {
+	struct bpf_timer timer;	/* initialized and armed in qmap_init() */
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);	/* single slot, looked up with key 0 in qmap_init() */
+	__type(key, u32);
+	__type(value, struct monitor_timer);
+} central_timer SEC(".maps");
+
+/*
+ * Collect the min, avg and max performance levels of the online CPUs, plus the
+ * min, avg and max of the per-CPU cpuperf targets, and store them in the
+ * cpuperf_* globals to demonstrate the cpuperf interface. Called from
+ * monitor_timerfn(), which re-arms itself with a one second delay.
+ *
+ * If a per-CPU cpu_ctx lookup fails, scx_bpf_error() is raised and the
+ * globals are left untouched.
+ *
+ * NOTE(review): cap_sum and nr_online_cpus are used as divisors without a
+ * zero check - confirm that is acceptable when no CPU is counted.
+ */
+static void monitor_cpuperf(void)
+{
+	u32 zero = 0;
+	u32 nr_cpu_ids = scx_bpf_nr_cpu_ids();
+	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
+	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
+	const struct cpumask *online;
+	int i, nr_online_cpus = 0;
+
+	online = scx_bpf_get_online_cpumask();
+
+	bpf_for(i, 0, nr_cpu_ids) {
+		struct cpu_ctx *cpuc;
+		u32 cap, cur;
+
+		if (!bpf_cpumask_test_cpu(i, online))	/* only CPUs in the online mask are sampled */
+			continue;
+		nr_online_cpus++;
+
+		/* collect the capacity and current cpuperf */
+		cap = scx_bpf_cpuperf_cap(i);
+		cur = scx_bpf_cpuperf_cur(i);
+
+		cur_min = cur < cur_min ? cur : cur_min;
+		cur_max = cur > cur_max ? cur : cur_max;
+
+		/*
+		 * $cur is relative to $cap. Scale it down accordingly so that
+		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
+		 * makes sense.
+		 */
+		cur_sum += cur * cap / SCX_CPUPERF_ONE;
+		cap_sum += cap;
+
+		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
+			scx_bpf_error("failed to look up cpu_ctx");
+			goto out;	/* skip the stores below but still put the cpumask */
+		}
+
+		/* collect target */
+		cur = cpuc->cpuperf_target;	/* cur is reused to hold the target from here on */
+		target_sum += cur;
+		target_min = cur < target_min ? cur : target_min;
+		target_max = cur > target_max ? cur : target_max;
+	}
+
+	cpuperf_min = cur_min;
+	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;	/* capacity-weighted average, rescaled to SCX_CPUPERF_ONE */
+	cpuperf_max = cur_max;
+
+	cpuperf_target_min = target_min;
+	cpuperf_target_avg = target_sum / nr_online_cpus;
+	cpuperf_target_max = target_max;
+out:
+	scx_bpf_put_cpumask(online);
+}
+
+/*
+ * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
+ * scx_bpf_dsq_nr_queued() and the DSQ iterator. Each task is printed with
+ * bpf_printk() as comm[pid], iterating with SCX_DSQ_ITER_REV. Raise the
+ * dispatch batch count to see meaningful dumps in the trace pipe.
+ */
+static void dump_shared_dsq(void)
+{
+	struct task_struct *p;
+	s32 nr;
+
+	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
+		return;	/* nothing queued, nothing to dump */
+
+	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);
+
+	bpf_rcu_read_lock();
+	__COMPAT_DSQ_FOR_EACH(p, SHARED_DSQ, SCX_DSQ_ITER_REV)
+		bpf_printk("%s[%d]", p->comm, p->pid);
+	bpf_rcu_read_unlock();
+}
+
+static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
+	monitor_cpuperf();
+
+	if (print_shared_dsq)	/* set from user space by the -P option */
+		dump_shared_dsq();
+
+	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);	/* re-arm so the callback runs again in one second */
+	return 0;
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
+{
+	u32 key = 0;
+	struct bpf_timer *timer;
+	s32 ret;
+
 	if (!switch_partial)
 		__COMPAT_scx_bpf_switch_all();
-	return 0;
+
+	print_cpus();
+
+	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);	/* the DSQ used by enqueue/dispatch in place of SCX_DSQ_GLOBAL */
+	if (ret)
+		return ret;
+
+	timer = bpf_map_lookup_elem(&central_timer, &key);	/* value is struct monitor_timer, whose first member is the bpf_timer */
+	if (!timer)
+		return -ESRCH;
+
+	bpf_timer_init(timer, &central_timer, CLOCK_MONOTONIC);	/* NOTE(review): return value is not checked */
+	bpf_timer_set_callback(timer, monitor_timerfn);
+
+	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);	/* first monitor run one second from now */
 }

 void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
@ -388,9 +657,12 @@ SCX_OPS_DEFINE(qmap_ops,
 	       .enqueue			= (void *)qmap_enqueue,
 	       .dequeue			= (void *)qmap_dequeue,
 	       .dispatch		= (void *)qmap_dispatch,
+	       .tick			= (void *)qmap_tick,
 	       .core_sched_before	= (void *)qmap_core_sched_before,
 	       .cpu_release		= (void *)qmap_cpu_release,
 	       .init_task		= (void *)qmap_init_task,
+	       .cpu_online		= (void *)qmap_cpu_online,
+	       .cpu_offline		= (void *)qmap_cpu_offline,
 	       .init			= (void *)qmap_init,
 	       .exit			= (void *)qmap_exit,
 	       .flags			= SCX_OPS_ENQ_LAST,
--- a/scheds/c/scx_qmap.c
+++ b/scheds/c/scx_qmap.c
@ -19,21 +19,34 @@ const char help_fmt[] =
 "\n"
 "See the top-level comment in .bpf.c for more details.\n"
 "\n"
-"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-d PID]\n"
-"       [-D LEN] [-p]\n"
+"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
+"       [-P] [-E PREFIX] [-d PID] [-D LEN] [-p] [-v]\n"
 "\n"
 "  -s SLICE_US   Override slice duration\n"
 "  -e COUNT      Trigger scx_bpf_error() after COUNT enqueues\n"
 "  -t COUNT      Stall every COUNT'th user thread\n"
 "  -T COUNT      Stall every COUNT'th kernel thread\n"
 "  -l COUNT      Trigger dispatch infinite looping after COUNT dispatches\n"
+"  -b COUNT      Dispatch upto COUNT tasks together\n"
+"  -P            Print out DSQ content to trace_pipe every second, use with -b\n"
+"  -E PREFIX     Expedite consumption of threads w/ matching comm, use with -b\n"
+"                (e.g. match shell on a loaded system)\n"
 "  -d PID        Disallow a process from switching into SCHED_EXT (-1 for self)\n"
 "  -D LEN        Set scx_exit_info.dump buffer length\n"
 "  -p            Switch only tasks on SCHED_EXT policy intead of all\n"
+"  -v            Print libbpf debug messages\n"
 "  -h            Display this help and exit\n";

+static bool verbose;
 static volatile int exit_req;

+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !verbose)	/* drop debug-level messages unless -v set verbose */
+		return 0;
+	return vfprintf(stderr, format, args);	/* every other message goes to stderr */
+}
+
 static void sigint_handler(int dummy)
 {
 	exit_req = 1;
@ -45,15 +58,13 @@ int main(int argc, char **argv)
 	struct bpf_link *link;
 	int opt;

+	libbpf_set_print(libbpf_print_fn);
 	signal(SIGINT, sigint_handler);
 	signal(SIGTERM, sigint_handler);

-	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+	skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);

-	skel = scx_qmap__open();
-	SCX_BUG_ON(!skel, "Failed to open skel");
-
-	while ((opt = getopt(argc, argv, "s:e:t:T:l:d:D:ph")) != -1) {
+	while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PE:d:D:pvh")) != -1) {
 		switch (opt) {
 		case 's':
 			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@ -70,6 +81,16 @@ int main(int argc, char **argv)
 		case 'l':
 			skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0);
 			break;
+		case 'b':
+			skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
+			break;
+		case 'P':
+			skel->rodata->print_shared_dsq = true;
+			break;
+		case 'E':
+			strncpy(skel->rodata->exp_prefix, optarg,
+				sizeof(skel->rodata->exp_prefix) - 1);
+			break;
 		case 'd':
 			skel->rodata->disallow_tgid = strtol(optarg, NULL, 0);
 			if (skel->rodata->disallow_tgid < 0)
@ -82,12 +103,19 @@ int main(int argc, char **argv)
 			skel->rodata->switch_partial = true;
 			skel->struct_ops.qmap_ops->flags |= __COMPAT_SCX_OPS_SWITCH_PARTIAL;
 			break;
+		case 'v':
+			verbose = true;
+			break;
 		default:
 			fprintf(stderr, help_fmt, basename(argv[0]));
 			return opt != 'h';
 		}
 	}

+	if (!__COMPAT_HAS_DSQ_ITER &&
+	    (skel->rodata->print_shared_dsq || strlen(skel->rodata->exp_prefix)))
+		fprintf(stderr, "kernel doesn't support DSQ iteration\n");
+
 	SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
 	link = SCX_OPS_ATTACH(skel, qmap_ops);

@ -95,10 +123,18 @@ int main(int argc, char **argv)
 		long nr_enqueued = skel->bss->nr_enqueued;
 		long nr_dispatched = skel->bss->nr_dispatched;

-		printf("enq=%lu, dsp=%lu, delta=%ld, reenq=%" PRIu64 ", deq=%" PRIu64 ", core=%" PRIu64 "\n",
+		printf("stats  : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64" exp=%"PRIu64"\n",
 		       nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
 		       skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
-		       skel->bss->nr_core_sched_execed);
+		       skel->bss->nr_core_sched_execed, skel->bss->nr_expedited);
+		if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
+			printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
+			       skel->bss->cpuperf_min,
+			       skel->bss->cpuperf_avg,
+			       skel->bss->cpuperf_max,
+			       skel->bss->cpuperf_target_min,
+			       skel->bss->cpuperf_target_avg,
+			       skel->bss->cpuperf_target_max);
 		fflush(stdout);
 		sleep(1);
 	}
@ -106,5 +142,9 @@ int main(int argc, char **argv)
 	bpf_link__destroy(link);
 	UEI_REPORT(skel, uei);
 	scx_qmap__destroy(skel);
+	/*
+	 * scx_qmap implements ops.cpu_on/offline() and doesn't need to restart
+	 * on CPU hotplug events.
+	 */
 	return 0;
 }
--- a/scheds/c/scx_simple.bpf.c
+++ b/scheds/c/scx_simple.bpf.c
@ -129,7 +129,6 @@ void BPF_STRUCT_OPS(simple_enable, struct task_struct *p)

 s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init)
 {
-	__COMPAT_scx_bpf_switch_all();
 	return scx_bpf_create_dsq(SHARED_DSQ, -1);
 }

--- a/scheds/c/scx_simple.c
+++ b/scheds/c/scx_simple.c
@ -17,13 +17,22 @@ const char help_fmt[] =
 "\n"
 "See the top-level comment in .bpf.c for more details.\n"
 "\n"
-"Usage: %s [-f]\n"
+"Usage: %s [-f] [-v]\n"
 "\n"
 "  -f            Use FIFO scheduling instead of weighted vtime scheduling\n"
+"  -v            Print libbpf debug messages\n"
 "  -h            Display this help and exit\n";

+static bool verbose;
 static volatile int exit_req;

+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !verbose)	/* drop debug-level messages unless -v set verbose */
+		return 0;
+	return vfprintf(stderr, format, args);	/* every other message goes to stderr */
+}
+
 static void sigint_handler(int simple)
 {
 	exit_req = 1;
@ -54,20 +63,22 @@ int main(int argc, char **argv)
 	struct scx_simple *skel;
 	struct bpf_link *link;
 	__u32 opt;
+	__u64 ecode;

+	libbpf_set_print(libbpf_print_fn);
 	signal(SIGINT, sigint_handler);
 	signal(SIGTERM, sigint_handler);
+restart:
+	skel = SCX_OPS_OPEN(simple_ops, scx_simple);

-	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
-
-	skel = scx_simple__open();
-	SCX_BUG_ON(!skel, "Failed to open skel");
-
-	while ((opt = getopt(argc, argv, "fh")) != -1) {
+	while ((opt = getopt(argc, argv, "fvh")) != -1) {
 		switch (opt) {
 		case 'f':
 			skel->rodata->fifo_sched = true;
 			break;
+		case 'v':
+			verbose = true;
+			break;
 		default:
 			fprintf(stderr, help_fmt, basename(argv[0]));
 			return opt != 'h';
@ -87,7 +98,10 @@ int main(int argc, char **argv)
 	}

 	bpf_link__destroy(link);
-	UEI_REPORT(skel, uei);
+	ecode = UEI_REPORT(skel, uei);
 	scx_simple__destroy(skel);
+
+	if (UEI_ECODE_RESTART(ecode))
+		goto restart;
 	return 0;
 }
--- a/scheds/c/scx_userland.c
+++ b/scheds/c/scx_userland.c
@ -41,6 +41,7 @@ const char help_fmt[] =
 "Usage: %s [-b BATCH]\n"
 "\n"
 "  -b BATCH      The number of tasks to batch when dispatching (default: 8)\n"
+"  -v            Print libbpf debug messages\n"
 "  -h            Display this help and exit\n";

 /* Defined in UAPI */
@ -49,6 +50,7 @@ const char help_fmt[] =
 /* Number of tasks to batch when dispatching to user space. */
 static __u32 batch_size = 8;

+static bool verbose;
 static volatile int exit_req;
 static int enqueued_fd, dispatched_fd;

@ -96,6 +98,13 @@ static int pid_max;

 static double min_vruntime;

+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !verbose)	/* drop debug-level messages unless -v set verbose */
+		return 0;
+	return vfprintf(stderr, format, args);	/* every other message goes to stderr */
+}
+
 static void sigint_handler(int userland)
 {
 	exit_req = 1;
@ -337,7 +346,7 @@ static void print_example_warning(const char *sched)
 	printf(warning_fmt, sched);
 }

-static void bootstrap(int argc, char **argv)
+static void pre_bootstrap(int argc, char **argv)
 {
 	int err;
 	__u32 opt;
@ -349,9 +358,9 @@ static void bootstrap(int argc, char **argv)
 	if (err)
 		exit(err);

+	libbpf_set_print(libbpf_print_fn);
 	signal(SIGINT, sigint_handler);
 	signal(SIGTERM, sigint_handler);
-	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);

 	/*
 	 * Enforce that the user scheduler task is managed by sched_ext. The
@ -363,11 +372,14 @@ static void bootstrap(int argc, char **argv)
 	err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param);
 	SCX_BUG_ON(err, "Failed to set scheduler to SCHED_EXT");

-	while ((opt = getopt(argc, argv, "b:ph")) != -1) {
+	while ((opt = getopt(argc, argv, "b:vh")) != -1) {
 		switch (opt) {
 		case 'b':
 			batch_size = strtoul(optarg, NULL, 0);
 			break;
+		case 'v':
+			verbose = true;
+			break;
 		default:
 			fprintf(stderr, help_fmt, basename(argv[0]));
 			exit(opt != 'h');
@ -381,9 +393,11 @@ static void bootstrap(int argc, char **argv)
 	 */
 	err = mlockall(MCL_CURRENT | MCL_FUTURE);
 	SCX_BUG_ON(err, "Failed to prefault and lock address space");
+}

-	skel = scx_userland__open();
-	SCX_BUG_ON(!skel, "Failed to open skel");
+static void bootstrap(char *comm)
+{
+	skel = SCX_OPS_OPEN(userland_ops, scx_userland);

 	skel->rodata->num_possible_cpus = libbpf_num_possible_cpus();
 	assert(skel->rodata->num_possible_cpus > 0);
@ -399,7 +413,7 @@ static void bootstrap(int argc, char **argv)

 	SCX_BUG_ON(spawn_stats_thread(), "Failed to spawn stats thread");

-	print_example_warning(basename(argv[0]));
+	print_example_warning(basename(comm));
 	ops_link = SCX_OPS_ATTACH(skel, userland_ops);
 }

@ -428,12 +442,19 @@ static void sched_main_loop(void)

 int main(int argc, char **argv)
 {
-	bootstrap(argc, argv);
+	__u64 ecode;
+
+	pre_bootstrap(argc, argv);	/* runs once; it sits above the restart label */
+restart:
+	bootstrap(argv[0]);	/* opens the skeleton and attaches the ops on every (re)start */
 	sched_main_loop();

 	exit_req = 1;
 	bpf_link__destroy(ops_link);
-	UEI_REPORT(skel, uei);
+	ecode = UEI_REPORT(skel, uei);
 	scx_userland__destroy(skel);
+
+	if (UEI_ECODE_RESTART(ecode))
+		goto restart;	/* NOTE(review): exit_req is still 1 here - confirm the restarted run copes */
 	return 0;
 }
--- a/scheds/include/scx/common.bpf.h
+++ b/scheds/include/scx/common.bpf.h
@ -28,9 +28,54 @@ static inline void ___vmlinux_h_sanity_check___(void)
 		       "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole");
 }

+s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
+s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
+void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym;
+void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym;
+u32 scx_bpf_dispatch_nr_slots(void) __ksym;
+void scx_bpf_dispatch_cancel(void) __ksym;
+bool scx_bpf_consume(u64 dsq_id) __ksym;
+bool __scx_bpf_consume_task(unsigned long it, struct task_struct *p) __ksym __weak;
+u32 scx_bpf_reenqueue_local(void) __ksym;
+void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
+s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
+void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
+int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, bool rev) __ksym __weak;
+struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
+void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
+void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak;
 void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym;
-void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
-		       unsigned long long *data, u32 data__sz) __ksym __weak;
+u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak;
+u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak;
+void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak;
+u32 scx_bpf_nr_cpu_ids(void) __ksym __weak;
+const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak;
+const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak;
+void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak;
+const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym;
+const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym;
+void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym;
+bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
+s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
+s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
+bool scx_bpf_task_running(const struct task_struct *p) __ksym;
+s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
+struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
+
+/*
+ * Use the following as @it when calling scx_bpf_consume_task() from within
+ * bpf_for_each() loops.
+ */
+#define BPF_FOR_EACH_ITER	(&___it)
+
+/*
+ * hopefully temporary wrapper to work around BPF restriction
+ *
+ * Copies sizeof(unsigned long) bytes from @it with bpf_probe_read_kernel()
+ * and passes the resulting value, together with @p, to
+ * __scx_bpf_consume_task(), returning whatever that kfunc returns.
+ *
+ * NOTE(review): the bpf_probe_read_kernel() return value is not checked, so
+ * ptr is used as-is even if the read fails. __scx_bpf_consume_task() is
+ * declared __weak; presumably callers gate on bpf_ksym_exists() - confirm.
+ */
+static inline bool scx_bpf_consume_task(struct bpf_iter_scx_dsq *it,
+					struct task_struct *p)
+{
+	unsigned long ptr;
+	bpf_probe_read_kernel(&ptr, sizeof(ptr), it);
+	return __scx_bpf_consume_task(ptr, p);
+}

 static inline __attribute__((format(printf, 1, 2)))
 void ___scx_bpf_exit_format_checker(const char *fmt, ...) {}
@ -40,18 +85,18 @@ void ___scx_bpf_exit_format_checker(const char *fmt, ...) {}
 * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to
 * refer to the initialized list of inputs to the bstr kfunc.
 */
-#define scx_bpf_exit_preamble(fmt, args...)				\
-	static char ___fmt[] = fmt;					\
-	/*								\
-	 * Note that __param[] must have at least one			\
-	 * element to keep the verifier happy.				\
-	 */								\
-	unsigned long long ___param[___bpf_narg(args) ?: 1] = {};	\
-									\
-	_Pragma("GCC diagnostic push")					\
-	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")		\
-	___bpf_fill(___param, args);					\
-	_Pragma("GCC diagnostic pop")					\
+#define scx_bpf_exit_preamble(fmt, args...)					\
+	static char ___fmt[] = fmt;						\
+	/*									\
+	 * Note that ___param[] must have at least one				\
+	 * element to keep the verifier happy.					\
+	 */									\
+	unsigned long long ___param[___bpf_narg(args) ?: 1] = {};		\
+										\
+	_Pragma("GCC diagnostic push")						\
+	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")			\
+	___bpf_fill(___param, args);						\
+	_Pragma("GCC diagnostic pop")						\

 /*
 * scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments
@ -78,30 +123,6 @@ void ___scx_bpf_exit_format_checker(const char *fmt, ...) {}
 	___scx_bpf_exit_format_checker(fmt, ##args);				\
 })

-s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
-bool scx_bpf_consume(u64 dsq_id) __ksym;
-void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym;
-void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym;
-u32 scx_bpf_dispatch_nr_slots(void) __ksym;
-void scx_bpf_dispatch_cancel(void) __ksym;
-void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
-s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
-bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
-s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
-s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
-const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym;
-const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym;
-void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym;
-void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
-s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
-bool scx_bpf_task_running(const struct task_struct *p) __ksym;
-s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
-struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
-u32 scx_bpf_reenqueue_local(void) __ksym;
-u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym;
-u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym;
-void scx_bpf_cpuperf_set(u32 cpu, u32 perf) __ksym __weak;
-
 #define BPF_STRUCT_OPS(name, args...)						\
 SEC("struct_ops/"#name)								\
 BPF_PROG(name, ##args)
@ -156,7 +177,8 @@ BPF_PROG(name, ##args)
 * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of
 * `MEMBER_VPTR(ptr, ->member)`.
 */
-#define MEMBER_VPTR(base, member) (typeof((base) member) *)({			\
+#define MEMBER_VPTR(base, member) (typeof((base) member) *)			\
+({										\
 	u64 __base = (u64)&(base);						\
 	u64 __addr = (u64)&((base) member) - __base;				\
 	_Static_assert(sizeof(base) >= sizeof((base) member),			\
@ -186,18 +208,19 @@ BPF_PROG(name, ##args)
 * size of the array to compute the max, which will result in rejection by
 * the verifier.
 */
-#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *)({	  \
-	u64 __base = (u64)arr;				  \
-	u64 __addr = (u64)&(arr[i]) - __base;		  \
-	asm volatile (					  \
-		"if %0 <= %[max] goto +2\n"		  \
-		"%0 = 0\n"				  \
-		"goto +1\n"				  \
-		"%0 += %1\n"				  \
-		: "+r"(__addr)				  \
-		: "r"(__base),				  \
-		  [max]"r"(sizeof(arr[0]) * ((n) - 1)));  \
-	__addr;						  \
+#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *)				\
+({										\
+	u64 __base = (u64)arr;							\
+	u64 __addr = (u64)&(arr[i]) - __base;					\
+	asm volatile (								\
+		"if %0 <= %[max] goto +2\n"					\
+		"%0 = 0\n"							\
+		"goto +1\n"							\
+		"%0 += %1\n"							\
+		: "+r"(__addr)							\
+		: "r"(__base),							\
+		  [max]"r"(sizeof(arr[0]) * ((n) - 1)));			\
+	__addr;									\
 })

 /*
@ -227,7 +250,7 @@ int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,

 struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym;

-extern void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym;
+void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym;
 #define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL)

 /* task */
--- a/scheds/include/scx/compat.bpf.h
+++ b/scheds/include/scx/compat.bpf.h
@ -18,13 +18,15 @@
 /*
 * %SCX_KICK_IDLE is a later addition. To support both before and after, use
 * %__COMPAT_SCX_KICK_IDLE which becomes 0 on kernels which don't support it.
+ * Users can use %SCX_KICK_IDLE directly in the future.
 */
 #define __COMPAT_SCX_KICK_IDLE							\
 	__COMPAT_ENUM_OR_ZERO(enum scx_kick_flags, SCX_KICK_IDLE)

 /*
 * scx_switch_all() was replaced by %SCX_OPS_SWITCH_PARTIAL. See
- * %__COMPAT_SCX_OPS_SWITCH_PARTIAL in compat.h.
+ * %__COMPAT_SCX_OPS_SWITCH_PARTIAL in compat.h. This can be dropped in the
+ * future.
 */
 void scx_bpf_switch_all(void) __ksym __weak;

@ -34,6 +36,67 @@ static inline void __COMPAT_scx_bpf_switch_all(void)
 		scx_bpf_switch_all();
 }

+/*
+ * scx_bpf_exit() is a new addition. Fall back to scx_bpf_error() if
+ * unavailable. Users can use scx_bpf_exit() directly in the future.
+ */
+#define __COMPAT_scx_bpf_exit(code, fmt, args...)				\
+({										\
+	if (bpf_ksym_exists(scx_bpf_exit_bstr))					\
+		scx_bpf_exit((code), fmt, args);				\
+	else									\
+		scx_bpf_error(fmt, args);					\
+})
+
+/*
+ * scx_bpf_nr_cpu_ids(), scx_bpf_get_possible/online_cpumask() are new. No good
+ * way to noop these kfuncs. Provide a test macro. Users can assume existence in
+ * the future.
+ */
+#define __COMPAT_HAS_CPUMASKS							\
+	bpf_ksym_exists(scx_bpf_nr_cpu_ids)
+
+/*
+ * cpuperf is new. The following become no-ops on older kernels. Callers can be
+ * updated to call cpuperf kfuncs directly in the future.
+ */
+static inline u32 __COMPAT_scx_bpf_cpuperf_cap(s32 cpu)
+{
+	if (bpf_ksym_exists(scx_bpf_cpuperf_cap))
+		return scx_bpf_cpuperf_cap(cpu);
+	else
+		return 1024;
+}
+
+static inline u32 __COMPAT_scx_bpf_cpuperf_cur(s32 cpu)
+{
+	if (bpf_ksym_exists(scx_bpf_cpuperf_cur))
+		return scx_bpf_cpuperf_cur(cpu);
+	else
+		return 1024;
+}
+
+static inline void __COMPAT_scx_bpf_cpuperf_set(s32 cpu, u32 perf)
+{
+	if (bpf_ksym_exists(scx_bpf_cpuperf_set))
+		return scx_bpf_cpuperf_set(cpu, perf);
+}
+
+/*
+ * Iteration and scx_bpf_consume_task() are new. The following become noop on
+ * older kernels. The users can switch to bpf_for_each(scx_dsq) and directly
+ * call scx_bpf_consume_task() in the future.
+ *
+ * __COMPAT_DSQ_FOR_EACH() expands to an unbraced "if" in front of
+ * bpf_for_each(), so the statement written after the macro becomes the guarded
+ * loop body and an "else" placed right after it could bind to this hidden
+ * "if".
+ *
+ * NOTE(review): __COMPAT_scx_bpf_consume_task() returns false unconditionally
+ * and never calls scx_bpf_consume_task(), even when the kfunc is available -
+ * confirm this stub is intentional.
+ */
+#define __COMPAT_DSQ_FOR_EACH(p, dsq_id, flags)					\
+	if (bpf_ksym_exists(bpf_iter_scx_dsq_new))				\
+		bpf_for_each(scx_dsq, (p), (dsq_id), (flags))
+
+static inline bool __COMPAT_scx_bpf_consume_task(struct bpf_iter_scx_dsq *it,
+						 struct task_struct *p)
+{
+	return false;
+}
+
 /*
 * Define sched_ext_ops. This may be expanded to define multiple variants for
 * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
--- a/scheds/include/scx/compat.h
+++ b/scheds/include/scx/compat.h
@ -8,6 +8,9 @@
 #define __SCX_COMPAT_H

 #include <bpf/btf.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>

 struct btf *__COMPAT_vmlinux_btf __attribute__((weak));

@ -69,6 +72,12 @@ static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v
 	__val;									\
 })

+static inline bool __COMPAT_has_ksym(const char *ksym)
+{
+	__COMPAT_load_vmlinux_btf();
+	return btf__find_by_name(__COMPAT_vmlinux_btf, ksym) >= 0;
+}
+
 static inline bool __COMPAT_struct_has_field(const char *type, const char *field)
 {
 	const struct btf_type *t;
@ -101,27 +110,79 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field
 * An ops flag, %SCX_OPS_SWITCH_PARTIAL, replaced scx_bpf_switch_all() which had
 * to be called from ops.init(). To support both before and after, use both
 * %__COMPAT_SCX_OPS_SWITCH_PARTIAL and %__COMPAT_scx_bpf_switch_all() defined
- * in compat.bpf.h.
+ * in compat.bpf.h. Users can switch to directly using %SCX_OPS_SWITCH_PARTIAL
+ * in the future.
 */
 #define __COMPAT_SCX_OPS_SWITCH_PARTIAL						\
 	__COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL")

+/*
+ * scx_bpf_nr_cpu_ids(), scx_bpf_get_possible/online_cpumask() are new. Users
+ * will be able to assume existence in the future.
+ */
+#define __COMPAT_HAS_CPUMASKS							\
+	__COMPAT_has_ksym("scx_bpf_nr_cpu_ids")
+
+/*
+ * DSQ iterator is new. Users will be able to assume existence in the future.
+ */
+#define __COMPAT_HAS_DSQ_ITER							\
+	__COMPAT_has_ksym("bpf_iter_scx_dsq_new")
+
+/*
+ * Read the hotplug sequence number from /sys/kernel/sched_ext/hotplug_seq.
+ *
+ * Returns the parsed value, or -ENOENT if the file cannot be opened. A failed
+ * or empty read, or a value that turns negative once stored in a long, trips
+ * SCX_BUG_ON().
+ */
+static inline long scx_hotplug_seq(void)
+{
+	int fd;
+	char buf[32];
+	ssize_t len;
+	long val;
+
+	fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY);
+	if (fd < 0)
+		return -ENOENT;
+
+	/* leave room for the terminating NUL added below */
+	len = read(fd, buf, sizeof(buf) - 1);
+	/* len is ssize_t, so it must be printed with %zd */
+	SCX_BUG_ON(len <= 0, "read failed (%zd)", len);
+	buf[len] = 0;
+	close(fd);
+
+	/*
+	 * The strtoul() result is stored in a signed long; a negative value
+	 * here means the number did not fit. val is signed, hence %ld.
+	 */
+	val = strtoul(buf, NULL, 10);
+	SCX_BUG_ON(val < 0, "invalid num hotplug events: %ld", val);
+
+	return val;
+}
+
 /*
 * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE()
 * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load
 * and attach it, backward compatibility is automatically maintained where
 * reasonable.
 *
- * - sched_ext_ops.exit_dump_len was added later. On kernels which don't support
- *   it, the value is ignored and a warning is triggered if the value is
- *   requested to be non-zero.
+ * - ops.tick(): Ignored on older kernels with a warning.
+ * - ops.exit_dump_len: Cleared to zero on older kernels with a warning.
+ * - ops.hotplug_seq: Ignored on older kernels.
 */
+#define SCX_OPS_OPEN(__ops_name, __scx_name) ({					\
+	struct __scx_name *__skel;						\
+										\
+	__skel = __scx_name##__open();						\
+	SCX_BUG_ON(!__skel, "Could not open " #__scx_name);			\
+	/*									\
+	 * Record the current hotplug sequence number when the running		\
+	 * kernel's sched_ext_ops has the field. NOTE(review):			\
+	 * scx_hotplug_seq() returns -ENOENT if the sysfs file cannot be	\
+	 * opened - confirm that value is acceptable for ops.hotplug_seq.	\
+	 */									\
+	if (__COMPAT_struct_has_field("sched_ext_ops", "hotplug_seq"))		\
+		__skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq();	\
+	__skel; 								\
+})
+
 #define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({		\
 	UEI_SET_SIZE(__skel, __ops_name, __uei_name);				\
-	if (__COMPAT_struct_has_field("sched_ext_ops", "exit_dump_len") &&	\
+	if (!__COMPAT_struct_has_field("sched_ext_ops", "exit_dump_len") &&	\
 	    (__skel)->struct_ops.__ops_name->exit_dump_len) {			\
 		fprintf(stderr, "WARNING: kernel doesn't support setting exit dump len\n"); \
-		(__skel)->struct_ops.__ops_name->exit_dump_len = 0;	\
+		(__skel)->struct_ops.__ops_name->exit_dump_len = 0;		\
+	}									\
+	if (!__COMPAT_struct_has_field("sched_ext_ops", "tick") &&		\
+	    (__skel)->struct_ops.__ops_name->tick) {				\
+		fprintf(stderr, "WARNING: kernel doesn't support ops.tick()\n"); \
+		(__skel)->struct_ops.__ops_name->tick = NULL;			\
 	}									\
 	SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel");	\
 })
--- a/scheds/include/scx/user_exit_info.h
+++ b/scheds/include/scx/user_exit_info.h
@ -77,7 +77,35 @@ struct user_exit_info {
 	if (__uei->msg[0] != '\0')						\
 		fprintf(stderr, " (%s)", __uei->msg);				\
 	fputs("\n", stderr);							\
+	__uei->exit_code;							\
 })

+/*
+ * We can't import vmlinux.h while compiling user C code. Let's duplicate
+ * scx_exit_code definition.
+ */
+enum scx_exit_code {
+	/* Reasons */
+	SCX_ECODE_RSN_HOTPLUG		= 1LLU << 32,
+
+	/* Actions */
+	SCX_ECODE_ACT_RESTART		= 1LLU << 48,
+};
+
+enum uei_ecode_mask {
+	UEI_ECODE_USER_MASK		= ((1LLU << 32) - 1),
+	UEI_ECODE_SYS_RSN_MASK		= ((1LLU << 16) - 1) << 32,
+	UEI_ECODE_SYS_ACT_MASK		= ((1LLU << 16) - 1) << 48,
+};
+
+/*
+ * These macros interpret the ecode returned from UEI_REPORT().
+ */
+#define UEI_ECODE_USER(__ecode)		((__ecode) & UEI_ECODE_USER_MASK)
+#define UEI_ECODE_SYS_RSN(__ecode)	((__ecode) & UEI_ECODE_SYS_RSN_MASK)
+#define UEI_ECODE_SYS_ACT(__ecode)	((__ecode) & UEI_ECODE_SYS_ACT_MASK)
+
+#define UEI_ECODE_RESTART(__ecode)	(UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART)
+
 #endif	/* __bpf__ */
 #endif	/* __USER_EXIT_INFO_H */
--- a/scheds/include/vmlinux/vmlinux-v6.9-g73f4013eb1eb.h
+++ b/scheds/include/vmlinux/vmlinux-v6.9-g73f4013eb1eb.h
--- a/scheds/include/vmlinux/vmlinux.h
+++ b/scheds/include/vmlinux/vmlinux.h
@ -1 +1 @@
-vmlinux-v6.9-g5dc95302301f.h
+vmlinux-v6.9-g73f4013eb1eb.h
--- a/scheds/rust/scx_layered/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_layered/src/bpf/main.bpf.c
@ -763,8 +763,8 @@ void BPF_STRUCT_OPS(layered_running, struct task_struct *p)
 		}
 	}

-	if (bpf_ksym_exists(scx_bpf_cpuperf_set) && layer->perf > 0)
-		scx_bpf_cpuperf_set(cpu, layer->perf);
+	if (layer->perf > 0)
+		__COMPAT_scx_bpf_cpuperf_set(cpu, layer->perf);

 	cctx->maybe_idle = false;
 }
--- a/scheds/rust/scx_rusty/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_rusty/src/bpf/main.bpf.c
@ -1421,18 +1421,12 @@ static s32 initialize_cpu(s32 cpu)

 void BPF_STRUCT_OPS(rusty_cpu_online, s32 cpu)
 {
-	if (bpf_ksym_exists(scx_bpf_exit_bstr))
-		scx_bpf_exit(RUSTY_EXIT_HOTPLUG, "CPU %d went online", cpu);
-	else
-		scx_bpf_error("CPU %d went online", cpu);
+	__COMPAT_scx_bpf_exit(RUSTY_EXIT_HOTPLUG, "CPU %d went online", cpu);
 }

 void BPF_STRUCT_OPS(rusty_cpu_offline, s32 cpu)
 {
-	if (bpf_ksym_exists(scx_bpf_exit_bstr))
-		scx_bpf_exit(RUSTY_EXIT_HOTPLUG, "CPU %d went offline", cpu);
-	else
-		scx_bpf_error("CPU %d went offline", cpu);
+	__COMPAT_scx_bpf_exit(RUSTY_EXIT_HOTPLUG, "CPU %d went offline", cpu);
 }

 s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init)