scx: Sync from kernel, some schedulers are broken

Sync from kernel to receive new vmlinux.h and the updates to common headers.
This includes the following updates:

- scx_bpf_switch_all() is replaced by the SCX_OPS_SWITCH_PARTIAL flag.

- sched_ext_ops.exit_dump_len added to allow customizing the dump buffer size.

- scx_bpf_exit() added.

- Common headers updated to provide backward compatibility in a way that
  hides most of the complexity from scheduler implementations.

scx_simple, qmap, central and flatcg are updated accordingly. Other
schedulers are broken for the moment.
Tejun Heo 2024-03-07 08:05:18 -10:00
parent 04c9e7fe9d
commit 9447cb27b2
15 changed files with 141510 additions and 183 deletions
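For orientation before the per-file diffs, here is a minimal sketch of the BPF-side pattern the updated schedulers now follow. This is illustrative only and not part of this commit; the <scx/common.bpf.h> include path (assumed to pull in compat.bpf.h, as the in-tree schedulers do) and the "minimal" names are assumptions:

/* minimal.bpf.c -- illustrative sketch, not part of this commit */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

/* UEI_DEFINE() replaces the old "struct user_exit_info uei;" declaration */
UEI_DEFINE(uei);

void BPF_STRUCT_OPS(minimal_exit, struct scx_exit_info *ei)
{
	/* UEI_RECORD() replaces uei_record(&uei, ei) */
	UEI_RECORD(uei, ei);
}

/*
 * SCX_OPS_DEFINE() replaces the open-coded SEC(".struct_ops.link")
 * struct sched_ext_ops definition; note that .name is now passed as the
 * final argument instead of closing the struct initializer.
 */
SCX_OPS_DEFINE(minimal_ops,
	       .exit = (void *)minimal_exit,
	       .name = "minimal");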

scheds/c/scx_central.bpf.c

@@ -55,7 +55,6 @@ enum {
TIMER_INTERVAL_NS = 1 * MS_TO_NS,
};
const volatile bool switch_partial;
const volatile s32 central_cpu;
const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */
const volatile u64 slice_ns = SCX_SLICE_DFL;
@@ -65,7 +64,7 @@ u64 nr_total, nr_locals, nr_queued, nr_lost_pids;
u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries;
u64 nr_overflows;
struct user_exit_info uei;
UEI_DEFINE(uei);
struct {
__uint(type, BPF_MAP_TYPE_QUEUE);
@@ -176,7 +175,7 @@ static bool dispatch_to_cpu(s32 cpu)
scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0);
if (cpu != central_cpu)
__COMPAT_scx_bpf_kick_cpu_IDLE(cpu);
scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE);
bpf_task_release(p);
return true;
@@ -306,9 +305,6 @@ int BPF_STRUCT_OPS_SLEEPABLE(central_init)
struct bpf_timer *timer;
int ret;
if (!switch_partial)
scx_bpf_switch_all();
ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1);
if (ret)
return ret;
@@ -344,15 +340,14 @@ int BPF_STRUCT_OPS_SLEEPABLE(central_init)
void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei)
{
uei_record(&uei, ei);
UEI_RECORD(uei, ei);
}
SEC(".struct_ops.link")
struct sched_ext_ops central_ops = {
SCX_OPS_DEFINE(central_ops,
/*
* We are offloading all scheduling decisions to the central CPU and
* thus being the last task on a given CPU doesn't mean anything
* special. Enqueue the last tasks like any other tasks.
* We are offloading all scheduling decisions to the central CPU
* and thus being the last task on a given CPU doesn't mean
* anything special. Enqueue the last tasks like any other tasks.
*/
.flags = SCX_OPS_ENQ_LAST,
@@ -363,5 +358,4 @@ struct sched_ext_ops central_ops = {
.stopping = (void *)central_stopping,
.init = (void *)central_init,
.exit = (void *)central_exit,
.name = "central",
};
.name = "central");

scheds/c/scx_central.c

@@ -24,7 +24,6 @@ const char help_fmt[] =
"\n"
" -s SLICE_US Override slice duration\n"
" -c CPU Override the central CPU (default: 0)\n"
" -p Switch only tasks on SCHED_EXT policy intead of all\n"
" -h Display this help and exit\n";
static volatile int exit_req;
@@ -61,9 +60,6 @@ int main(int argc, char **argv)
case 'c':
skel->rodata->central_cpu = strtoul(optarg, NULL, 0);
break;
case 'p':
skel->rodata->switch_partial = true;
break;
default:
fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h';
@@ -74,7 +70,7 @@ int main(int argc, char **argv)
RESIZE_ARRAY(data, cpu_gimme_task, skel->rodata->nr_cpu_ids);
RESIZE_ARRAY(data, cpu_started_at, skel->rodata->nr_cpu_ids);
SCX_BUG_ON(scx_central__load(skel), "Failed to load skel");
SCX_OPS_LOAD(skel, central_ops, scx_central, uei);
/*
* Affinitize the loading thread to the central CPU, as:
@@ -96,13 +92,12 @@ int main(int argc, char **argv)
skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1);
CPU_FREE(cpuset);
link = bpf_map__attach_struct_ops(skel->maps.central_ops);
SCX_BUG_ON(!link, "Failed to attach struct_ops");
link = SCX_OPS_ATTACH(skel, central_ops);
if (!skel->data->timer_pinned)
printf("WARNING : BPF_F_TIMER_CPU_PIN not available, timer not pinned to central\n");
while (!exit_req && !uei_exited(&skel->bss->uei)) {
while (!exit_req && !UEI_EXITED(skel, uei)) {
printf("[SEQ %llu]\n", seq++);
printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n",
skel->bss->nr_total,
@@ -121,7 +116,7 @@ int main(int argc, char **argv)
}
bpf_link__destroy(link);
uei_print(&skel->bss->uei);
UEI_REPORT(skel, uei);
scx_central__destroy(skel);
return 0;
}

scheds/c/scx_flatcg.bpf.c

@@ -56,10 +56,9 @@ char _license[] SEC("license") = "GPL";
const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */
const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL;
const volatile bool fifo_sched;
const volatile bool switch_partial;
u64 cvtime_now;
struct user_exit_info uei;
UEI_DEFINE(uei);
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
@@ -917,20 +916,12 @@ void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p,
p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta;
}
s32 BPF_STRUCT_OPS(fcg_init)
{
if (!switch_partial)
scx_bpf_switch_all();
return 0;
}
void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei)
{
uei_record(&uei, ei);
UEI_RECORD(uei, ei);
}
SEC(".struct_ops.link")
struct sched_ext_ops flatcg_ops = {
SCX_OPS_DEFINE(flatcg_ops,
.select_cpu = (void *)fcg_select_cpu,
.enqueue = (void *)fcg_enqueue,
.dispatch = (void *)fcg_dispatch,
@@ -943,8 +934,6 @@ struct sched_ext_ops flatcg_ops = {
.cgroup_init = (void *)fcg_cgroup_init,
.cgroup_exit = (void *)fcg_cgroup_exit,
.cgroup_move = (void *)fcg_cgroup_move,
.init = (void *)fcg_init,
.exit = (void *)fcg_exit,
.flags = SCX_OPS_CGROUP_KNOB_WEIGHT | SCX_OPS_ENQ_EXITING,
.name = "flatcg",
};
.name = "flatcg");

scheds/c/scx_flatcg.c

@@ -31,7 +31,6 @@ const char help_fmt[] =
" -s SLICE_US Override slice duration\n"
" -i INTERVAL Report interval\n"
" -f Use FIFO scheduling instead of weighted vtime scheduling\n"
" -p Switch only tasks on SCHED_EXT policy intead of all\n"
" -h Display this help and exit\n";
static volatile int exit_req;
@@ -150,9 +149,6 @@ int main(int argc, char **argv)
case 'f':
skel->rodata->fifo_sched = true;
break;
case 'p':
skel->rodata->switch_partial = true;
break;
case 'h':
default:
fprintf(stderr, help_fmt, basename(argv[0]));
@@ -165,12 +161,10 @@ int main(int argc, char **argv)
(double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0,
dump_cgrps);
SCX_BUG_ON(scx_flatcg__load(skel), "Failed to load skel");
SCX_OPS_LOAD(skel, flatcg_ops, scx_flatcg, uei);
link = SCX_OPS_ATTACH(skel, flatcg_ops);
link = bpf_map__attach_struct_ops(skel->maps.flatcg_ops);
SCX_BUG_ON(!link, "Failed to attach struct_ops");
while (!exit_req && !uei_exited(&skel->bss->uei)) {
while (!exit_req && !UEI_EXITED(skel, uei)) {
__u64 acc_stats[FCG_NR_STATS];
__u64 stats[FCG_NR_STATS];
float cpu_util;
@@ -219,7 +213,7 @@ int main(int argc, char **argv)
}
bpf_link__destroy(link);
uei_print(&skel->bss->uei);
UEI_REPORT(skel, uei);
scx_flatcg__destroy(skel);
return 0;
}

scheds/c/scx_qmap.bpf.c

@@ -35,7 +35,7 @@ const volatile s32 disallow_tgid;
u32 test_error_cnt;
struct user_exit_info uei;
UEI_DEFINE(uei);
struct qmap {
__uint(type, BPF_MAP_TYPE_QUEUE);
@@ -192,7 +192,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, 0, enq_flags);
cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
if (cpu >= 0)
__COMPAT_scx_bpf_kick_cpu_IDLE(cpu);
scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE);
return;
}
@@ -374,17 +374,16 @@ s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
s32 BPF_STRUCT_OPS(qmap_init)
{
if (!switch_partial)
scx_bpf_switch_all();
__COMPAT_scx_bpf_switch_all();
return 0;
}
void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
uei_record(&uei, ei);
UEI_RECORD(uei, ei);
}
SEC(".struct_ops.link")
struct sched_ext_ops qmap_ops = {
SCX_OPS_DEFINE(qmap_ops,
.select_cpu = (void *)qmap_select_cpu,
.enqueue = (void *)qmap_enqueue,
.dequeue = (void *)qmap_dequeue,
@@ -396,5 +395,4 @@ struct sched_ext_ops qmap_ops = {
.exit = (void *)qmap_exit,
.flags = SCX_OPS_ENQ_LAST,
.timeout_ms = 5000U,
.name = "qmap",
};
.name = "qmap");

scheds/c/scx_qmap.c

@@ -19,7 +19,8 @@ const char help_fmt[] =
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-d PID] [-p]\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-d PID]\n"
" [-D LEN] [-p]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
@@ -27,6 +28,7 @@ const char help_fmt[] =
" -T COUNT Stall every COUNT'th kernel thread\n"
" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
" -D LEN Set scx_exit_info.dump buffer length\n"
" -p Switch only tasks on SCHED_EXT policy intead of all\n"
" -h Display this help and exit\n";
@@ -51,7 +53,7 @@ int main(int argc, char **argv)
skel = scx_qmap__open();
SCX_BUG_ON(!skel, "Failed to open skel");
while ((opt = getopt(argc, argv, "s:e:t:T:l:d:ph")) != -1) {
while ((opt = getopt(argc, argv, "s:e:t:T:l:d:D:ph")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -73,8 +75,12 @@ int main(int argc, char **argv)
if (skel->rodata->disallow_tgid < 0)
skel->rodata->disallow_tgid = getpid();
break;
case 'D':
skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0);
break;
case 'p':
skel->rodata->switch_partial = true;
skel->struct_ops.qmap_ops->flags |= __COMPAT_SCX_OPS_SWITCH_PARTIAL;
break;
default:
fprintf(stderr, help_fmt, basename(argv[0]));
@@ -82,12 +88,10 @@ int main(int argc, char **argv)
}
}
SCX_BUG_ON(scx_qmap__load(skel), "Failed to load skel");
SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
link = SCX_OPS_ATTACH(skel, qmap_ops);
link = bpf_map__attach_struct_ops(skel->maps.qmap_ops);
SCX_BUG_ON(!link, "Failed to attach struct_ops");
while (!exit_req && !uei_exited(&skel->bss->uei)) {
while (!exit_req && !UEI_EXITED(skel, uei)) {
long nr_enqueued = skel->bss->nr_enqueued;
long nr_dispatched = skel->bss->nr_dispatched;
@@ -100,7 +104,7 @@ int main(int argc, char **argv)
}
bpf_link__destroy(link);
uei_print(&skel->bss->uei);
UEI_REPORT(skel, uei);
scx_qmap__destroy(skel);
return 0;
}

scheds/c/scx_simple.bpf.c

@@ -25,10 +25,9 @@
char _license[] SEC("license") = "GPL";
const volatile bool fifo_sched;
const volatile bool switch_partial;
static u64 vtime_now;
struct user_exit_info uei;
UEI_DEFINE(uei);
#define SHARED_DSQ 0
@@ -130,19 +129,15 @@ void BPF_STRUCT_OPS(simple_enable, struct task_struct *p)
s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init)
{
if (!switch_partial)
scx_bpf_switch_all();
return scx_bpf_create_dsq(SHARED_DSQ, -1);
}
void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
{
uei_record(&uei, ei);
UEI_RECORD(uei, ei);
}
SEC(".struct_ops.link")
struct sched_ext_ops simple_ops = {
SCX_OPS_DEFINE(simple_ops,
.select_cpu = (void *)simple_select_cpu,
.enqueue = (void *)simple_enqueue,
.dispatch = (void *)simple_dispatch,
@@ -151,5 +146,4 @@ struct sched_ext_ops simple_ops = {
.enable = (void *)simple_enable,
.init = (void *)simple_init,
.exit = (void *)simple_exit,
.name = "simple",
};
.name = "simple");

scheds/c/scx_simple.c

@@ -20,7 +20,6 @@ const char help_fmt[] =
"Usage: %s [-f] [-p]\n"
"\n"
" -f Use FIFO scheduling instead of weighted vtime scheduling\n"
" -p Switch only tasks on SCHED_EXT policy intead of all\n"
" -h Display this help and exit\n";
static volatile int exit_req;
@@ -69,21 +68,16 @@ int main(int argc, char **argv)
case 'f':
skel->rodata->fifo_sched = true;
break;
case 'p':
skel->rodata->switch_partial = true;
break;
default:
fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h';
}
}
SCX_BUG_ON(scx_simple__load(skel), "Failed to load skel");
SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei);
link = SCX_OPS_ATTACH(skel, simple_ops);
link = bpf_map__attach_struct_ops(skel->maps.simple_ops);
SCX_BUG_ON(!link, "Failed to attach struct_ops");
while (!exit_req && !uei_exited(&skel->bss->uei)) {
while (!exit_req && !UEI_EXITED(skel, uei)) {
__u64 stats[2];
read_stats(skel, stats);
@@ -93,7 +87,7 @@ int main(int argc, char **argv)
}
bpf_link__destroy(link);
uei_print(&skel->bss->uei);
UEI_REPORT(skel, uei);
scx_simple__destroy(skel);
return 0;
}

scheds/include/scx/common.bpf.h

@@ -29,31 +29,55 @@ static inline void ___vmlinux_h_sanity_check___(void)
}
void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym;
void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
unsigned long long *data, u32 data__sz) __ksym;
static inline __attribute__((format(printf, 1, 2)))
void ___scx_bpf_error_format_checker(const char *fmt, ...) {}
void ___scx_bpf_exit_format_checker(const char *fmt, ...) {}
/*
* scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments
* instead of an array of u64. Note that __param[] must have at least one
* element to keep the verifier happy.
* Helper macro for initializing the fmt and variadic argument inputs to both
* bstr exit kfuncs. Callers to this function should use ___fmt and ___param to
* refer to the initialized list of inputs to the bstr kfunc.
*/
#define scx_bpf_error(fmt, args...) \
({ \
#define scx_bpf_exit_preamble(fmt, args...) \
static char ___fmt[] = fmt; \
/* \
* Note that __param[] must have at least one \
* element to keep the verifier happy. \
*/ \
unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \
\
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
___bpf_fill(___param, args); \
_Pragma("GCC diagnostic pop") \
\
scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \
\
___scx_bpf_error_format_checker(fmt, ##args); \
/*
* scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments
* instead of an array of u64. Using this macro will cause the scheduler to
* exit cleanly with the specified exit code being passed to user space.
*/
#define scx_bpf_exit(code, fmt, args...) \
({ \
scx_bpf_exit_preamble(fmt, args) \
scx_bpf_exit_bstr(code, ___fmt, ___param, sizeof(___param)); \
___scx_bpf_exit_format_checker(fmt, ##args); \
})
/*
* scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments
* instead of an array of u64. Invoking this macro will cause the scheduler to
* exit in an erroneous state, with diagnostic information being passed to the
* user.
*/
#define scx_bpf_error(fmt, args...) \
({ \
scx_bpf_exit_preamble(fmt, args) \
scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \
___scx_bpf_exit_format_checker(fmt, ##args); \
})
void scx_bpf_switch_all(void) __ksym;
s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
bool scx_bpf_consume(u64 dsq_id) __ksym;
void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym;
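A short hypothetical helper showing how the scx_bpf_exit() and scx_bpf_error() macros above are meant to be invoked; the function and counter names are made up for illustration:

static __always_inline void report_and_stop(s32 cpu, u64 nr_errors)
{
	if (nr_errors < 10)
		/* clean exit: the exit code and formatted message reach user space */
		scx_bpf_exit(nr_errors, "cpu %d stopping after %llu recoverable errors",
			     cpu, nr_errors);
	else
		/* error exit: the scheduler is marked as having exited in error */
		scx_bpf_error("cpu %d saw too many errors (%llu)", cpu, nr_errors);
}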

scheds/include/scx/common.h

@@ -15,8 +15,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "user_exit_info.h"
#include <errno.h>
typedef uint8_t u8;
typedef uint16_t u16;
@@ -66,4 +65,7 @@ typedef int64_t s64;
bpf_map__initial_value(skel->maps.elfsec##_##arr, &__sz); \
} while (0)
#include "user_exit_info.h"
#include "compat.h"
#endif /* __SCHED_EXT_COMMON_H */

scheds/include/scx/compat.bpf.h

@@ -7,12 +7,41 @@
#ifndef __SCX_COMPAT_BPF_H
#define __SCX_COMPAT_BPF_H
static inline void __COMPAT_scx_bpf_kick_cpu_IDLE(s32 cpu)
#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \
({ \
__type __ret = 0; \
if (bpf_core_enum_value_exists(__type, __ent)) \
__ret = __ent; \
__ret; \
})
/*
* %SCX_KICK_IDLE is a later addition. To support both before and after, use
* %__COMPAT_SCX_KICK_IDLE which becomes 0 on kernels which don't support it.
*/
#define __COMPAT_SCX_KICK_IDLE \
__COMPAT_ENUM_OR_ZERO(enum scx_kick_flags, SCX_KICK_IDLE)
/*
* scx_switch_all() was replaced by %SCX_OPS_SWITCH_PARTIAL. See
* %__COMPAT_SCX_OPS_SWITCH_PARTIAL in compat.h.
*/
void scx_bpf_switch_all(void) __ksym __weak;
static inline void __COMPAT_scx_bpf_switch_all(void)
{
if (bpf_core_enum_value_exists(enum scx_kick_flags, SCX_KICK_IDLE))
scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
else
scx_bpf_kick_cpu(cpu, 0);
if (!bpf_core_enum_value_exists(enum scx_ops_flags, SCX_OPS_SWITCH_PARTIAL))
scx_bpf_switch_all();
}
#endif
/*
* Define sched_ext_ops. This may be expanded to define multiple variants for
* backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
*/
#define SCX_OPS_DEFINE(__name, ...) \
SEC(".struct_ops.link") \
struct sched_ext_ops __name = { \
__VA_ARGS__, \
};
#endif /* __SCX_COMPAT_BPF_H */
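A usage sketch for the two compatibility helpers defined above; everything other than the helpers themselves is illustrative:

/* kick an idle CPU only when the running kernel understands SCX_KICK_IDLE */
static void kick_cpu_if_idle(s32 cpu)
{
	/* __COMPAT_SCX_KICK_IDLE evaluates to 0 on kernels without SCX_KICK_IDLE */
	scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE);
}

s32 BPF_STRUCT_OPS_SLEEPABLE(example_init)
{
	/*
	 * On old kernels this calls the (weak) scx_bpf_switch_all() kfunc; on
	 * new kernels it is a no-op because all tasks are switched by default
	 * unless SCX_OPS_SWITCH_PARTIAL is set from user space.
	 */
	__COMPAT_scx_bpf_switch_all();
	return 0;
}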

scheds/include/scx/compat.h (new file, 136 lines)

@@ -0,0 +1,136 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2024 Tejun Heo <tj@kernel.org>
* Copyright (c) 2024 David Vernet <dvernet@meta.com>
*/
#ifndef __SCX_COMPAT_H
#define __SCX_COMPAT_H
#include <bpf/btf.h>
struct btf *__COMPAT_vmlinux_btf __attribute__((weak));
static inline void __COMPAT_load_vmlinux_btf(void)
{
if (!__COMPAT_vmlinux_btf) {
__COMPAT_vmlinux_btf = btf__load_vmlinux_btf();
SCX_BUG_ON(!__COMPAT_vmlinux_btf, "btf__load_vmlinux_btf()");
}
}
static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v)
{
const struct btf_type *t;
const char *n;
s32 tid;
int i;
__COMPAT_load_vmlinux_btf();
tid = btf__find_by_name_kind(__COMPAT_vmlinux_btf, type, BTF_KIND_ENUM);
if (tid < 0)
return false;
t = btf__type_by_id(__COMPAT_vmlinux_btf, tid);
SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid);
if (btf_is_enum(t)) {
struct btf_enum *e = btf_enum(t);
for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off);
SCX_BUG_ON(!n, "btf__name_by_offset()");
if (!strcmp(n, name)) {
*v = e[i].val;
return true;
}
}
} else if (btf_is_enum64(t)) {
struct btf_enum64 *e = btf_enum64(t);
for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off);
SCX_BUG_ON(!n, "btf__name_by_offset()");
if (!strcmp(n, name)) {
*v = btf_enum64_value(&e[i]);
return true;
}
}
}
return false;
}
#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \
({ \
u64 __val = 0; \
__COMPAT_read_enum(__type, __ent, &__val); \
__val; \
})
static inline bool __COMPAT_struct_has_field(const char *type, const char *field)
{
const struct btf_type *t;
const struct btf_member *m;
const char *n;
s32 tid;
int i;
__COMPAT_load_vmlinux_btf();
tid = btf__find_by_name_kind(__COMPAT_vmlinux_btf, type, BTF_KIND_STRUCT);
if (tid < 0)
return false;
t = btf__type_by_id(__COMPAT_vmlinux_btf, tid);
SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid);
m = btf_members(t);
for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
n = btf__name_by_offset(__COMPAT_vmlinux_btf, m[i].name_off);
SCX_BUG_ON(!n, "btf__name_by_offset()");
if (!strcmp(n, field))
return true;
}
return false;
}
/*
* An ops flag, %SCX_OPS_SWITCH_PARTIAL, replaced scx_bpf_switch_all() which had
* to be called from ops.init(). To support both before and after, use both
* %__COMPAT_SCX_OPS_SWITCH_PARTIAL and %__COMPAT_scx_bpf_switch_all() defined
* in compat.bpf.h.
*/
#define __COMPAT_SCX_OPS_SWITCH_PARTIAL \
__COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL")
/*
* struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE()
* is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load
* and attach it, backward compatibility is automatically maintained where
* reasonable.
*
* - sched_ext_ops.exit_dump_len was added later. On kernels which don't support
* it, the value is ignored and a warning is triggered if the value is
* requested to be non-zero.
*/
#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \
UEI_SET_SIZE(__skel, __ops_name, __uei_name); \
if (__COMPAT_struct_has_field("sched_ext_ops", "exit_dump_len") && \
(__skel)->struct_ops.__ops_name->exit_dump_len) { \
fprintf(stderr, "WARNING: kernel doesn't support setting exit dump len\n"); \
(__skel)->struct_ops.__ops_name->exit_dump_len = 0; \
} \
SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \
})
#define SCX_OPS_ATTACH(__skel, __ops_name) ({ \
struct bpf_link *__link; \
__link = bpf_map__attach_struct_ops((__skel)->maps.__ops_name); \
SCX_BUG_ON(!__link, "Failed to attach struct_ops"); \
__link; \
})
#endif /* __SCX_COMPAT_H */
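On the userspace side, the loaders updated by this commit all follow roughly the sequence below. This is a sketch modeled on the scx_qmap.c changes; the skeleton header name, include paths, and the 64 KB dump length are assumptions for illustration:

#include <unistd.h>
#include <scx/common.h>
#include "scx_qmap.bpf.skel.h"	/* generated skeleton; exact name is an assumption */

static volatile int exit_req;	/* set from a signal handler in the real schedulers */

int main(void)
{
	struct scx_qmap *skel;
	struct bpf_link *link;

	skel = scx_qmap__open();
	SCX_BUG_ON(!skel, "Failed to open skel");

	/*
	 * Optional knobs. On older kernels __COMPAT_SCX_OPS_SWITCH_PARTIAL reads
	 * as 0, and a non-zero exit_dump_len triggers a warning and is reset by
	 * SCX_OPS_LOAD().
	 */
	skel->struct_ops.qmap_ops->exit_dump_len = 64 * 1024;
	skel->struct_ops.qmap_ops->flags |= __COMPAT_SCX_OPS_SWITCH_PARTIAL;

	SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);	/* sizes the dump buffer, then loads */
	link = SCX_OPS_ATTACH(skel, qmap_ops);

	while (!exit_req && !UEI_EXITED(skel, uei))
		sleep(1);

	bpf_link__destroy(link);
	UEI_REPORT(skel, uei);
	scx_qmap__destroy(skel);
	return 0;
}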

scheds/include/scx/user_exit_info.h

@@ -11,16 +11,16 @@
#define __USER_EXIT_INFO_H
enum uei_sizes {
UEI_REASON_SIZE = 128,
UEI_MSG_SIZE = 1024,
UEI_DUMP_SIZE = 32768,
UEI_REASON_LEN = 128,
UEI_MSG_LEN = 1024,
UEI_DUMP_DFL_LEN = 32768,
};
struct user_exit_info {
int kind;
char reason[UEI_REASON_SIZE];
char msg[UEI_MSG_SIZE];
char dump[UEI_DUMP_SIZE];
s64 exit_code;
char reason[UEI_REASON_LEN];
char msg[UEI_MSG_LEN];
};
#ifdef __bpf__
@@ -28,40 +28,56 @@ struct user_exit_info {
#include "vmlinux.h"
#include <bpf/bpf_core_read.h>
static inline void uei_record(struct user_exit_info *uei,
const struct scx_exit_info *ei)
{
bpf_probe_read_kernel_str(uei->reason, sizeof(uei->reason), ei->reason);
bpf_probe_read_kernel_str(uei->msg, sizeof(uei->msg), ei->msg);
bpf_probe_read_kernel_str(uei->dump, sizeof(uei->dump), ei->dump);
/* use __sync to force memory barrier */
__sync_val_compare_and_swap(&uei->kind, uei->kind, ei->kind);
}
#define UEI_DEFINE(__name) \
char RESIZABLE_ARRAY(data, __name##_dump); \
const volatile u32 __name##_dump_len; \
struct user_exit_info __name SEC(".data")
#define UEI_RECORD(__uei_name, __ei) ({ \
bpf_probe_read_kernel_str(__uei_name.reason, \
sizeof(__uei_name.reason), (__ei)->reason); \
bpf_probe_read_kernel_str(__uei_name.msg, \
sizeof(__uei_name.msg), (__ei)->msg); \
bpf_probe_read_kernel_str(__uei_name##_dump, \
__uei_name##_dump_len, (__ei)->dump); \
if (bpf_core_field_exists((__ei)->exit_code)) \
__uei_name.exit_code = (__ei)->exit_code; \
/* use __sync to force memory barrier */ \
__sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \
(__ei)->kind); \
})
#else /* !__bpf__ */
#include <stdio.h>
#include <stdbool.h>
static inline bool uei_exited(struct user_exit_info *uei)
{
/* use __sync to force memory barrier */
return __sync_val_compare_and_swap(&uei->kind, -1, -1);
}
/* no need to call the following explicitly if SCX_OPS_LOAD() is used */
#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \
u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \
(__skel)->rodata->__uei_name##_dump_len = __len; \
RESIZE_ARRAY(data, __uei_name##_dump, __len); \
})
static inline void uei_print(const struct user_exit_info *uei)
{
if (uei->dump[0] != '\0') {
fputs("\nDEBUG DUMP\n", stderr);
fputs("================================================================================\n\n", stderr);
fputs(uei->dump, stderr);
fputs("\n================================================================================\n\n", stderr);
}
fprintf(stderr, "EXIT: %s", uei->reason);
if (uei->msg[0] != '\0')
fprintf(stderr, " (%s)", uei->msg);
fputs("\n", stderr);
}
#define UEI_EXITED(__skel, __uei_name) ({ \
/* use __sync to force memory barrier */ \
__sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \
})
#define UEI_REPORT(__skel, __uei_name) ({ \
struct user_exit_info *__uei = &(__skel)->data->__uei_name; \
char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \
if (__uei_dump[0] != '\0') { \
fputs("\nDEBUG DUMP\n", stderr); \
fputs("================================================================================\n\n", stderr); \
fputs(__uei_dump, stderr); \
fputs("\n================================================================================\n\n", stderr); \
} \
fprintf(stderr, "EXIT: %s", __uei->reason); \
if (__uei->msg[0] != '\0') \
fprintf(stderr, " (%s)", __uei->msg); \
fputs("\n", stderr); \
})
#endif /* __bpf__ */
#endif /* __USER_EXIT_INFO_H */

scheds/include/vmlinux/vmlinux-v6.9-ge34c7df6e8fa.h

File diff suppressed because it is too large.

scheds/include/vmlinux/vmlinux.h (symlink)

@@ -1 +1 @@
vmlinux-v6.7-g6851d5f5be95.h
vmlinux-v6.9-ge34c7df6e8fa.h