Merge pull request #253 from sched-ext/htejun/sync-kernel

Sync to the latest kernel
This commit is contained in:
Tejun Heo 2024-04-29 10:16:35 -10:00 committed by GitHub
commit b1bb2a5c5f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
20 changed files with 115548 additions and 114769 deletions

View File

@ -45,7 +45,7 @@ impl Builder {
let bindings = bindgen::Builder::default() let bindings = bindgen::Builder::default()
.header("bindings.h") .header("bindings.h")
.allowlist_type("scx_exit_kind") .allowlist_type("scx_exit_kind")
.allowlist_type("scx_internal_consts") .allowlist_type("scx_consts")
.parse_callbacks(Box::new(bindgen::CargoCallbacks)) .parse_callbacks(Box::new(bindgen::CargoCallbacks))
.generate() .generate()
.expect("Unable to generate bindings"); .expect("Unable to generate bindings");

View File

@ -43,7 +43,7 @@ pub use builder::Builder;
mod user_exit_info; mod user_exit_info;
pub use user_exit_info::ScxExitKind; pub use user_exit_info::ScxExitKind;
pub use user_exit_info::ScxInternalConsts; pub use user_exit_info::ScxConsts;
pub use user_exit_info::UeiDumpPtr; pub use user_exit_info::UeiDumpPtr;
pub use user_exit_info::UserExitInfo; pub use user_exit_info::UserExitInfo;
pub use user_exit_info::UEI_DUMP_PTR_MUTEX; pub use user_exit_info::UEI_DUMP_PTR_MUTEX;

View File

@ -29,8 +29,8 @@ pub enum ScxExitKind {
ErrorStall = bindings::scx_exit_kind_SCX_EXIT_ERROR_STALL as isize, ErrorStall = bindings::scx_exit_kind_SCX_EXIT_ERROR_STALL as isize,
} }
pub enum ScxInternalConsts { pub enum ScxConsts {
ExitDumpDflLen = bindings::scx_internal_consts_SCX_EXIT_DUMP_DFL_LEN as isize, ExitDumpDflLen = bindings::scx_consts_SCX_EXIT_DUMP_DFL_LEN as isize,
} }
/// Takes a reference to C struct user_exit_info and reads it into /// Takes a reference to C struct user_exit_info and reads it into
@ -65,7 +65,7 @@ macro_rules! uei_set_size {
($skel: expr, $ops: ident, $uei:ident) => {{ ($skel: expr, $ops: ident, $uei:ident) => {{
scx_utils::paste! { scx_utils::paste! {
let len = match $skel.struct_ops.$ops().exit_dump_len { let len = match $skel.struct_ops.$ops().exit_dump_len {
0 => scx_utils::ScxInternalConsts::ExitDumpDflLen as u32, 0 => scx_utils::ScxConsts::ExitDumpDflLen as u32,
v => v, v => v,
}; };
$skel.rodata_mut().[<$uei _dump_len>] = len; $skel.rodata_mut().[<$uei _dump_len>] = len;

View File

@ -24,10 +24,19 @@ const char help_fmt[] =
"\n" "\n"
" -s SLICE_US Override slice duration\n" " -s SLICE_US Override slice duration\n"
" -c CPU Override the central CPU (default: 0)\n" " -c CPU Override the central CPU (default: 0)\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n"; " -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req; static volatile int exit_req;
/*
 * libbpf print callback. Everything is forwarded to stderr, except that
 * LIBBPF_DEBUG messages are dropped while the verbose flag is unset.
 * Returns vfprintf()'s result for forwarded messages, 0 for dropped ones.
 */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	if (verbose || level != LIBBPF_DEBUG)
		return vfprintf(stderr, format, args);

	return 0;
}
static void sigint_handler(int dummy) static void sigint_handler(int dummy)
{ {
exit_req = 1; exit_req = 1;
@ -37,22 +46,20 @@ int main(int argc, char **argv)
{ {
struct scx_central *skel; struct scx_central *skel;
struct bpf_link *link; struct bpf_link *link;
__u64 seq = 0; __u64 seq = 0, ecode;
__s32 opt; __s32 opt;
cpu_set_t *cpuset; cpu_set_t *cpuset;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler); signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler); signal(SIGTERM, sigint_handler);
restart:
libbpf_set_strict_mode(LIBBPF_STRICT_ALL); skel = SCX_OPS_OPEN(central_ops, scx_central);
skel = scx_central__open();
SCX_BUG_ON(!skel, "Failed to open skel");
skel->rodata->central_cpu = 0; skel->rodata->central_cpu = 0;
skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
while ((opt = getopt(argc, argv, "s:c:ph")) != -1) { while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) {
switch (opt) { switch (opt) {
case 's': case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@ -60,6 +67,9 @@ int main(int argc, char **argv)
case 'c': case 'c':
skel->rodata->central_cpu = strtoul(optarg, NULL, 0); skel->rodata->central_cpu = strtoul(optarg, NULL, 0);
break; break;
case 'v':
verbose = true;
break;
default: default:
fprintf(stderr, help_fmt, basename(argv[0])); fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h'; return opt != 'h';
@ -116,7 +126,10 @@ int main(int argc, char **argv)
} }
bpf_link__destroy(link); bpf_link__destroy(link);
UEI_REPORT(skel, uei); ecode = UEI_REPORT(skel, uei);
scx_central__destroy(skel); scx_central__destroy(skel);
if (UEI_ECODE_RESTART(ecode))
goto restart;
return 0; return 0;
} }

View File

@ -26,15 +26,24 @@ const char help_fmt[] =
"\n" "\n"
"See the top-level comment in .bpf.c for more details.\n" "See the top-level comment in .bpf.c for more details.\n"
"\n" "\n"
"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f]\n" "Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-v]\n"
"\n" "\n"
" -s SLICE_US Override slice duration\n" " -s SLICE_US Override slice duration\n"
" -i INTERVAL Report interval\n" " -i INTERVAL Report interval\n"
" -f Use FIFO scheduling instead of weighted vtime scheduling\n" " -f Use FIFO scheduling instead of weighted vtime scheduling\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n"; " -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req; static volatile int exit_req;
/*
 * libbpf print callback. Everything is forwarded to stderr, except that
 * LIBBPF_DEBUG messages are dropped while the verbose flag is unset.
 * Returns vfprintf()'s result for forwarded messages, 0 for dropped ones.
 */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	if (verbose || level != LIBBPF_DEBUG)
		return vfprintf(stderr, format, args);

	return 0;
}
static void sigint_handler(int dummy) static void sigint_handler(int dummy)
{ {
exit_req = 1; exit_req = 1;
@ -119,18 +128,17 @@ int main(int argc, char **argv)
__u64 last_stats[FCG_NR_STATS] = {}; __u64 last_stats[FCG_NR_STATS] = {};
unsigned long seq = 0; unsigned long seq = 0;
__s32 opt; __s32 opt;
__u64 ecode;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler); signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler); signal(SIGTERM, sigint_handler);
restart:
libbpf_set_strict_mode(LIBBPF_STRICT_ALL); skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg);
skel = scx_flatcg__open();
SCX_BUG_ON(!skel, "Failed to open skel");
skel->rodata->nr_cpus = libbpf_num_possible_cpus(); skel->rodata->nr_cpus = libbpf_num_possible_cpus();
while ((opt = getopt(argc, argv, "s:i:dfph")) != -1) { while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) {
double v; double v;
switch (opt) { switch (opt) {
@ -149,6 +157,9 @@ int main(int argc, char **argv)
case 'f': case 'f':
skel->rodata->fifo_sched = true; skel->rodata->fifo_sched = true;
break; break;
case 'v':
verbose = true;
break;
case 'h': case 'h':
default: default:
fprintf(stderr, help_fmt, basename(argv[0])); fprintf(stderr, help_fmt, basename(argv[0]));
@ -213,7 +224,10 @@ int main(int argc, char **argv)
} }
bpf_link__destroy(link); bpf_link__destroy(link);
UEI_REPORT(skel, uei); ecode = UEI_REPORT(skel, uei);
scx_flatcg__destroy(skel); scx_flatcg__destroy(skel);
if (UEI_ECODE_RESTART(ecode))
goto restart;
return 0; return 0;
} }

View File

@ -29,10 +29,19 @@ const char help_fmt[] =
" -i ITERS Number of successive placement failures tolerated before trying to aggressively expand primary nest (default 2), or 0 to disable\n" " -i ITERS Number of successive placement failures tolerated before trying to aggressively expand primary nest (default 2), or 0 to disable\n"
" -s SLICE_US Override slice duration in us (default 20000us / 20ms)\n" " -s SLICE_US Override slice duration in us (default 20000us / 20ms)\n"
" -I First try to find a fully idle core, and then any idle core, when searching nests. Default behavior is to ignore hypertwins and check for any idle core.\n" " -I First try to find a fully idle core, and then any idle core, when searching nests. Default behavior is to ignore hypertwins and check for any idle core.\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n"; " -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req; static volatile int exit_req;
/*
 * libbpf print callback. Everything is forwarded to stderr, except that
 * LIBBPF_DEBUG messages are dropped while the verbose flag is unset.
 * Returns vfprintf()'s result for forwarded messages, 0 for dropped ones.
 */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	if (verbose || level != LIBBPF_DEBUG)
		return vfprintf(stderr, format, args);

	return 0;
}
static void sigint_handler(int nest) static void sigint_handler(int nest)
{ {
exit_req = 1; exit_req = 1;
@ -152,19 +161,18 @@ int main(int argc, char **argv)
struct scx_nest *skel; struct scx_nest *skel;
struct bpf_link *link; struct bpf_link *link;
__u32 opt; __u32 opt;
__u64 ecode;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler); signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler); signal(SIGTERM, sigint_handler);
restart:
libbpf_set_strict_mode(LIBBPF_STRICT_ALL); skel = SCX_OPS_OPEN(nest_ops, scx_nest);
skel = scx_nest__open();
SCX_BUG_ON(!skel, "Failed to open skel");
skel->rodata->nr_cpus = libbpf_num_possible_cpus(); skel->rodata->nr_cpus = libbpf_num_possible_cpus();
skel->rodata->sampling_cadence_ns = SAMPLING_CADENCE_S * 1000 * 1000 * 1000; skel->rodata->sampling_cadence_ns = SAMPLING_CADENCE_S * 1000 * 1000 * 1000;
while ((opt = getopt(argc, argv, "hId:m:i:s:")) != -1) { while ((opt = getopt(argc, argv, "d:m:i:Is:vh")) != -1) {
switch (opt) { switch (opt) {
case 'd': case 'd':
skel->rodata->p_remove_ns = strtoull(optarg, NULL, 0) * 1000; skel->rodata->p_remove_ns = strtoull(optarg, NULL, 0) * 1000;
@ -181,6 +189,9 @@ int main(int argc, char **argv)
case 's': case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
break; break;
case 'v':
verbose = true;
break;
default: default:
fprintf(stderr, help_fmt, basename(argv[0])); fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h'; return opt != 'h';
@ -216,7 +227,10 @@ int main(int argc, char **argv)
} }
bpf_link__destroy(link); bpf_link__destroy(link);
UEI_REPORT(skel, uei); ecode = UEI_REPORT(skel, uei);
scx_nest__destroy(skel); scx_nest__destroy(skel);
if (UEI_ECODE_RESTART(ecode))
goto restart;
return 0; return 0;
} }

View File

@ -23,10 +23,19 @@ const char help_fmt[] =
"Usage: %s [-S STRIDE]\n" "Usage: %s [-S STRIDE]\n"
"\n" "\n"
" -S STRIDE Override CPU pair stride (default: nr_cpus_ids / 2)\n" " -S STRIDE Override CPU pair stride (default: nr_cpus_ids / 2)\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n"; " -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req; static volatile int exit_req;
/*
 * libbpf print callback. Everything is forwarded to stderr, except that
 * LIBBPF_DEBUG messages are dropped while the verbose flag is unset.
 * Returns vfprintf()'s result for forwarded messages, 0 for dropped ones.
 */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	if (verbose || level != LIBBPF_DEBUG)
		return vfprintf(stderr, format, args);

	return 0;
}
static void sigint_handler(int dummy) static void sigint_handler(int dummy)
{ {
exit_req = 1; exit_req = 1;
@ -36,27 +45,28 @@ int main(int argc, char **argv)
{ {
struct scx_pair *skel; struct scx_pair *skel;
struct bpf_link *link; struct bpf_link *link;
__u64 seq = 0; __u64 seq = 0, ecode;
__s32 stride, i, opt, outer_fd; __s32 stride, i, opt, outer_fd;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler); signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler); signal(SIGTERM, sigint_handler);
restart:
libbpf_set_strict_mode(LIBBPF_STRICT_ALL); skel = SCX_OPS_OPEN(pair_ops, scx_pair);
skel = scx_pair__open();
SCX_BUG_ON(!skel, "Failed to open skel");
skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
/* pair up the earlier half to the latter by default, override with -s */ /* pair up the earlier half to the latter by default, override with -s */
stride = skel->rodata->nr_cpu_ids / 2; stride = skel->rodata->nr_cpu_ids / 2;
while ((opt = getopt(argc, argv, "S:ph")) != -1) { while ((opt = getopt(argc, argv, "S:vh")) != -1) {
switch (opt) { switch (opt) {
case 'S': case 'S':
stride = strtoul(optarg, NULL, 0); stride = strtoul(optarg, NULL, 0);
break; break;
case 'v':
verbose = true;
break;
default: default:
fprintf(stderr, help_fmt, basename(argv[0])); fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h'; return opt != 'h';
@ -158,7 +168,10 @@ int main(int argc, char **argv)
} }
bpf_link__destroy(link); bpf_link__destroy(link);
UEI_REPORT(skel, uei); ecode = UEI_REPORT(skel, uei);
scx_pair__destroy(skel); scx_pair__destroy(skel);
if (UEI_ECODE_RESTART(ecode))
goto restart;
return 0; return 0;
} }

View File

@ -23,6 +23,12 @@
* Copyright (c) 2022 David Vernet <dvernet@meta.com> * Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/ */
#include <scx/common.bpf.h> #include <scx/common.bpf.h>
#include <string.h>
/* Constants local to this scheduler. */
enum consts {
/* one second in nanoseconds; used as the monitor timer period */
ONE_SEC_IN_NS = 1000000000,
/* ID of the DSQ created in qmap_init() that tasks are dispatched to */
SHARED_DSQ = 0,
};
char _license[] SEC("license") = "GPL"; char _license[] SEC("license") = "GPL";
@ -30,6 +36,9 @@ const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u32 stall_user_nth; const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth; const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after; const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool print_shared_dsq;
const volatile char exp_prefix[17];
const volatile s32 disallow_tgid; const volatile s32 disallow_tgid;
const volatile bool switch_partial; const volatile bool switch_partial;
@ -62,6 +71,18 @@ struct {
}, },
}; };
/*
 * Queue index to CPU performance target lookup table. Index N maps to N/4 of
 * SCX_CPUPERF_ONE, i.e. index 0 requests the lowest level and index 4 requests
 * the full SCX_CPUPERF_ONE. qmap_tick() indexes this table to pick the target
 * it passes to scx_bpf_cpuperf_set().
 */
static const u32 qidx_to_cpuperf_target[] = {
[0] = SCX_CPUPERF_ONE * 0 / 4,
[1] = SCX_CPUPERF_ONE * 1 / 4,
[2] = SCX_CPUPERF_ONE * 2 / 4,
[3] = SCX_CPUPERF_ONE * 3 / 4,
[4] = SCX_CPUPERF_ONE * 4 / 4,
};
/* /*
* Per-queue sequence numbers to implement core-sched ordering. * Per-queue sequence numbers to implement core-sched ordering.
* *
@ -86,17 +107,25 @@ struct {
__type(value, struct task_ctx); __type(value, struct task_ctx);
} task_ctx_stor SEC(".maps"); } task_ctx_stor SEC(".maps");
/* Per-cpu dispatch index and remaining count */ struct cpu_ctx {
u64 dsp_idx; /* dispatch index */
u64 dsp_cnt; /* remaining count */
u32 avg_weight;
u32 cpuperf_target;
};
struct { struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, 2); __uint(max_entries, 1);
__type(key, u32); __type(key, u32);
__type(value, u64); __type(value, struct cpu_ctx);
} dispatch_idx_cnt SEC(".maps"); } cpu_ctx_stor SEC(".maps");
/* Statistics */ /* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued;
u64 nr_core_sched_execed; u64 nr_core_sched_execed, nr_expedited;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags) s32 prev_cpu, u64 wake_flags)
@ -189,7 +218,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
if (enq_flags & SCX_ENQ_REENQ) { if (enq_flags & SCX_ENQ_REENQ) {
s32 cpu; s32 cpu;
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, 0, enq_flags); scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags);
cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
if (cpu >= 0) if (cpu >= 0)
scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE); scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE);
@ -204,7 +233,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */ /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
if (bpf_map_push_elem(ring, &pid, 0)) { if (bpf_map_push_elem(ring, &pid, 0)) {
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, enq_flags); scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
return; return;
} }
@ -233,18 +262,49 @@ static void update_core_sched_head_seq(struct task_struct *p)
scx_bpf_error("task_ctx lookup failed"); scx_bpf_error("task_ctx lookup failed");
} }
/*
 * Consume from SHARED_DSQ, optionally expediting tasks whose comm matches
 * exp_prefix.
 *
 * With an empty exp_prefix this is a plain scx_bpf_consume(SHARED_DSQ).
 * Otherwise SHARED_DSQ is scanned first and every matching task that can be
 * consumed is consumed and counted in nr_expedited; only if none was consumed
 * does it fall back to scx_bpf_consume(SHARED_DSQ).
 *
 * Returns true if at least one task was consumed.
 */
static bool consume_shared_dsq(void)
{
struct task_struct *p;
bool consumed;
/* no expedite prefix configured, nothing to scan for */
if (exp_prefix[0] == '\0')
return scx_bpf_consume(SHARED_DSQ);
/*
 * To demonstrate the use of scx_bpf_consume_task(), implement a silly
 * selective priority boosting mechanism by scanning SHARED_DSQ looking
 * for matching comms and consuming them first. This makes a difference
 * only when dsp_batch is larger than 1.
 */
consumed = false;
__COMPAT_DSQ_FOR_EACH(p, SHARED_DSQ, 0) {
char comm[sizeof(exp_prefix)];
/*
 * Copy all but the last byte of the buffer from p->comm.
 * NOTE(review): comm[sizeof(exp_prefix) - 1] is never written here;
 * confirm the comparison below cannot reach it.
 */
memcpy(comm, p->comm, sizeof(exp_prefix) - 1);
/*
 * NOTE(review): bpf_strncmp() presumably has strncmp() semantics, in
 * which case this matches the whole comm rather than just a leading
 * prefix - confirm this is the intended behavior for -E PREFIX.
 */
if (!bpf_strncmp(comm, sizeof(exp_prefix),
(const char *)exp_prefix) &&
__COMPAT_scx_bpf_consume_task(BPF_FOR_EACH_ITER, p)) {
consumed = true;
__sync_fetch_and_add(&nr_expedited, 1);
}
}
/* fall back to regular consumption if nothing was expedited */
return consumed || scx_bpf_consume(SHARED_DSQ);
}
void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{ {
u32 zero = 0, one = 1; struct task_struct *p;
u64 *idx = bpf_map_lookup_elem(&dispatch_idx_cnt, &zero); struct cpu_ctx *cpuc;
u64 *cnt = bpf_map_lookup_elem(&dispatch_idx_cnt, &one); u32 zero = 0, batch = dsp_batch ?: 1;
void *fifo; void *fifo;
s32 pid; s32 i, pid;
int i;
if (consume_shared_dsq())
return;
if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
struct task_struct *p;
/* /*
* PID 2 should be kthreadd which should mostly be idle and off * PID 2 should be kthreadd which should mostly be idle and off
* the scheduler. Let's keep dispatching it to force the kernel * the scheduler. Let's keep dispatching it to force the kernel
@ -252,49 +312,80 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
*/ */
p = bpf_task_from_pid(2); p = bpf_task_from_pid(2);
if (p) { if (p) {
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0); scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0);
bpf_task_release(p); bpf_task_release(p);
return; return;
} }
} }
if (!idx || !cnt) { if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
scx_bpf_error("failed to lookup idx[%p], cnt[%p]", idx, cnt); scx_bpf_error("failed to look up cpu_ctx");
return; return;
} }
for (i = 0; i < 5; i++) { for (i = 0; i < 5; i++) {
/* Advance the dispatch cursor and pick the fifo. */ /* Advance the dispatch cursor and pick the fifo. */
if (!*cnt) { if (!cpuc->dsp_cnt) {
*idx = (*idx + 1) % 5; cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
*cnt = 1 << *idx; cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
} }
(*cnt)--;
fifo = bpf_map_lookup_elem(&queue_arr, idx); fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
if (!fifo) { if (!fifo) {
scx_bpf_error("failed to find ring %llu", *idx); scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
return; return;
} }
/* Dispatch or advance. */ /* Dispatch or advance. */
if (!bpf_map_pop_elem(fifo, &pid)) { bpf_repeat(BPF_MAX_LOOPS) {
struct task_struct *p; if (bpf_map_pop_elem(fifo, &pid))
break;
p = bpf_task_from_pid(pid); p = bpf_task_from_pid(pid);
if (p) { if (!p)
update_core_sched_head_seq(p); continue;
__sync_fetch_and_add(&nr_dispatched, 1);
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0); update_core_sched_head_seq(p);
bpf_task_release(p); __sync_fetch_and_add(&nr_dispatched, 1);
scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
bpf_task_release(p);
batch--;
cpuc->dsp_cnt--;
if (!batch || !scx_bpf_dispatch_nr_slots()) {
consume_shared_dsq();
return; return;
} }
if (!cpuc->dsp_cnt)
break;
} }
*cnt = 0; cpuc->dsp_cnt = 0;
} }
} }
/*
 * ops.tick() callback. Folds the weight of task @p into the cpu_ctx's running
 * average and sets a CPU performance target derived from that average.
 */
void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
struct cpu_ctx *cpuc;
u32 zero = 0;
int idx;
/* cpu_ctx_stor is a single-entry per-CPU array, so the key is always 0 */
if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
scx_bpf_error("failed to look up cpu_ctx");
return;
}
/*
 * Use the running avg of weights to select the target cpuperf level.
 * This is a demonstration of the cpuperf feature rather than a
 * practical strategy to regulate CPU frequency.
 */
/* new average = 3/4 of the old average + 1/4 of the current task's weight */
cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
/* NOTE(review): assumes weight_to_idx() stays within the table's 0..4 range - confirm */
idx = weight_to_idx(cpuc->avg_weight);
/* remember the target so that monitor_cpuperf() can report it */
cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];
scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}
/* /*
* The distance from the head of the queue scaled by the weight of the queue. * The distance from the head of the queue scaled by the weight of the queue.
* The lower the number, the older the task and the higher the priority. * The lower the number, the older the task and the higher the priority.
@ -371,11 +462,189 @@ s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
return -ENOMEM; return -ENOMEM;
} }
s32 BPF_STRUCT_OPS(qmap_init) /*
* Print out the online and possible CPU map using bpf_printk() as a
* demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
*/
static void print_cpus(void)
{ {
const struct cpumask *possible, *online;
s32 cpu;
char buf[128] = "", *p;
int idx;
if (!__COMPAT_HAS_CPUMASKS)
return;
possible = scx_bpf_get_possible_cpumask();
online = scx_bpf_get_online_cpumask();
idx = 0;
bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
if (!(p = MEMBER_VPTR(buf, [idx++])))
break;
if (bpf_cpumask_test_cpu(cpu, online))
*p++ = 'O';
else if (bpf_cpumask_test_cpu(cpu, possible))
*p++ = 'X';
else
*p++ = ' ';
if ((cpu & 7) == 7) {
if (!(p = MEMBER_VPTR(buf, [idx++])))
break;
*p++ = '|';
}
}
buf[sizeof(buf) - 1] = '\0';
scx_bpf_put_cpumask(online);
scx_bpf_put_cpumask(possible);
bpf_printk("CPUS: |%s", buf);
}
/*
 * ops.cpu_online() callback. Logs the event with bpf_printk() and then prints
 * the CPU map through print_cpus().
 */
void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
bpf_printk("CPU %d coming online", cpu);
/* @cpu is already online at this point */
print_cpus();
}
/*
 * ops.cpu_offline() callback. Logs the event with bpf_printk() and then prints
 * the CPU map through print_cpus().
 */
void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
bpf_printk("CPU %d going offline", cpu);
/* @cpu is still online at this point */
print_cpus();
}
/* Map value wrapping the bpf_timer that drives monitor_timerfn(). */
struct monitor_timer {
struct bpf_timer timer;
};
/*
 * Single-entry array map holding the monitor timer. qmap_init() looks up
 * entry 0, initializes the timer and arms it.
 * NOTE(review): the map is named central_timer although it stores a
 * struct monitor_timer - the name looks like a leftover; confirm.
 */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 1);
__type(key, u32);
__type(value, struct monitor_timer);
} central_timer SEC(".maps");
/*
 * Collect the min, avg and max of the current performance levels and of the
 * cpuperf targets across online CPUs to demonstrate the cpuperf interface.
 * The results are stored in the cpuperf_* and cpuperf_target_* globals; this
 * function itself does not print anything. It is called from
 * monitor_timerfn().
 *
 * If a cpu_ctx lookup fails, the scheduler is errored out and the globals are
 * left untouched.
 */
static void monitor_cpuperf(void)
{
u32 zero = 0;
u32 nr_cpu_ids = scx_bpf_nr_cpu_ids();
u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
const struct cpumask *online;
int i, nr_online_cpus = 0;
/* acquired here, released at the out label on every path */
online = scx_bpf_get_online_cpumask();
bpf_for(i, 0, nr_cpu_ids) {
struct cpu_ctx *cpuc;
u32 cap, cur;
/* only online CPUs contribute to the statistics */
if (!bpf_cpumask_test_cpu(i, online))
continue;
nr_online_cpus++;
/* collect the capacity and current cpuperf */
cap = scx_bpf_cpuperf_cap(i);
cur = scx_bpf_cpuperf_cur(i);
cur_min = cur < cur_min ? cur : cur_min;
cur_max = cur > cur_max ? cur : cur_max;
/*
 * $cur is relative to $cap. Scale it down accordingly so that
 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
 * makes sense.
 */
cur_sum += cur * cap / SCX_CPUPERF_ONE;
cap_sum += cap;
/* read CPU i's cpu_ctx to pick up the target recorded by qmap_tick() */
if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
scx_bpf_error("failed to look up cpu_ctx");
goto out;
}
/* collect target */
cur = cpuc->cpuperf_target;
target_sum += cur;
target_min = cur < target_min ? cur : target_min;
target_max = cur > target_max ? cur : target_max;
}
/* publish the results through the globals */
cpuperf_min = cur_min;
/* capacity-weighted average, scaled back to the SCX_CPUPERF_ONE range */
cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
cpuperf_max = cur_max;
cpuperf_target_min = target_min;
/* plain average of the targets over the online CPUs */
cpuperf_target_avg = target_sum / nr_online_cpus;
cpuperf_target_max = target_max;
out:
scx_bpf_put_cpumask(online);
}
/*
 * Dump the tasks currently queued in the shared DSQ with bpf_printk() to
 * demonstrate the usage of scx_bpf_dsq_nr_queued() and the DSQ iterator. Raise
 * the dispatch batch count to see meaningful dumps in the trace pipe.
 *
 * Nothing is printed when the DSQ is empty.
 */
static void dump_shared_dsq(void)
{
struct task_struct *p;
s32 nr;
/* skip the dump entirely when nothing is queued */
if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
return;
bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);
/* the iteration runs inside an RCU read-side section */
bpf_rcu_read_lock();
/* SCX_DSQ_ITER_REV requests the reverse order mentioned in the message above */
__COMPAT_DSQ_FOR_EACH(p, SHARED_DSQ, SCX_DSQ_ITER_REV)
bpf_printk("%s[%d]", p->comm, p->pid);
bpf_rcu_read_unlock();
}
/*
 * Monitor timer callback. Refreshes the cpuperf statistics, dumps the shared
 * DSQ when print_shared_dsq is set, and re-arms @timer for another
 * ONE_SEC_IN_NS. @map and @key are unused. Always returns 0.
 */
static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
monitor_cpuperf();
if (print_shared_dsq)
dump_shared_dsq();
/* re-arm so the monitor keeps running periodically */
bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
return 0;
}
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
u32 key = 0;
struct bpf_timer *timer;
s32 ret;
if (!switch_partial) if (!switch_partial)
__COMPAT_scx_bpf_switch_all(); __COMPAT_scx_bpf_switch_all();
return 0;
print_cpus();
ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
if (ret)
return ret;
timer = bpf_map_lookup_elem(&central_timer, &key);
if (!timer)
return -ESRCH;
bpf_timer_init(timer, &central_timer, CLOCK_MONOTONIC);
bpf_timer_set_callback(timer, monitor_timerfn);
return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
} }
void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
@ -388,9 +657,12 @@ SCX_OPS_DEFINE(qmap_ops,
.enqueue = (void *)qmap_enqueue, .enqueue = (void *)qmap_enqueue,
.dequeue = (void *)qmap_dequeue, .dequeue = (void *)qmap_dequeue,
.dispatch = (void *)qmap_dispatch, .dispatch = (void *)qmap_dispatch,
.tick = (void *)qmap_tick,
.core_sched_before = (void *)qmap_core_sched_before, .core_sched_before = (void *)qmap_core_sched_before,
.cpu_release = (void *)qmap_cpu_release, .cpu_release = (void *)qmap_cpu_release,
.init_task = (void *)qmap_init_task, .init_task = (void *)qmap_init_task,
.cpu_online = (void *)qmap_cpu_online,
.cpu_offline = (void *)qmap_cpu_offline,
.init = (void *)qmap_init, .init = (void *)qmap_init,
.exit = (void *)qmap_exit, .exit = (void *)qmap_exit,
.flags = SCX_OPS_ENQ_LAST, .flags = SCX_OPS_ENQ_LAST,

View File

@ -19,21 +19,34 @@ const char help_fmt[] =
"\n" "\n"
"See the top-level comment in .bpf.c for more details.\n" "See the top-level comment in .bpf.c for more details.\n"
"\n" "\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-d PID]\n" "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
" [-D LEN] [-p]\n" " [-P] [-E PREFIX] [-d PID] [-D LEN] [-p] [-v]\n"
"\n" "\n"
" -s SLICE_US Override slice duration\n" " -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
" -t COUNT Stall every COUNT'th user thread\n" " -t COUNT Stall every COUNT'th user thread\n"
" -T COUNT Stall every COUNT'th kernel thread\n" " -T COUNT Stall every COUNT'th kernel thread\n"
" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" " -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
" -b COUNT Dispatch upto COUNT tasks together\n"
" -P Print out DSQ content to trace_pipe every second, use with -b\n"
" -E PREFIX Expedite consumption of threads w/ matching comm, use with -b\n"
" (e.g. match shell on a loaded system)\n"
" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
" -D LEN Set scx_exit_info.dump buffer length\n" " -D LEN Set scx_exit_info.dump buffer length\n"
" -p Switch only tasks on SCHED_EXT policy intead of all\n" " -p Switch only tasks on SCHED_EXT policy intead of all\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n"; " -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req; static volatile int exit_req;
/*
 * libbpf print callback. Everything is forwarded to stderr, except that
 * LIBBPF_DEBUG messages are dropped while the verbose flag is unset.
 * Returns vfprintf()'s result for forwarded messages, 0 for dropped ones.
 */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	if (verbose || level != LIBBPF_DEBUG)
		return vfprintf(stderr, format, args);

	return 0;
}
static void sigint_handler(int dummy) static void sigint_handler(int dummy)
{ {
exit_req = 1; exit_req = 1;
@ -45,15 +58,13 @@ int main(int argc, char **argv)
struct bpf_link *link; struct bpf_link *link;
int opt; int opt;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler); signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler); signal(SIGTERM, sigint_handler);
libbpf_set_strict_mode(LIBBPF_STRICT_ALL); skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
skel = scx_qmap__open(); while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PE:d:D:pvh")) != -1) {
SCX_BUG_ON(!skel, "Failed to open skel");
while ((opt = getopt(argc, argv, "s:e:t:T:l:d:D:ph")) != -1) {
switch (opt) { switch (opt) {
case 's': case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@ -70,6 +81,16 @@ int main(int argc, char **argv)
case 'l': case 'l':
skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0); skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0);
break; break;
case 'b':
skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
break;
case 'P':
skel->rodata->print_shared_dsq = true;
break;
case 'E':
strncpy(skel->rodata->exp_prefix, optarg,
sizeof(skel->rodata->exp_prefix) - 1);
break;
case 'd': case 'd':
skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); skel->rodata->disallow_tgid = strtol(optarg, NULL, 0);
if (skel->rodata->disallow_tgid < 0) if (skel->rodata->disallow_tgid < 0)
@ -82,12 +103,19 @@ int main(int argc, char **argv)
skel->rodata->switch_partial = true; skel->rodata->switch_partial = true;
skel->struct_ops.qmap_ops->flags |= __COMPAT_SCX_OPS_SWITCH_PARTIAL; skel->struct_ops.qmap_ops->flags |= __COMPAT_SCX_OPS_SWITCH_PARTIAL;
break; break;
case 'v':
verbose = true;
break;
default: default:
fprintf(stderr, help_fmt, basename(argv[0])); fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h'; return opt != 'h';
} }
} }
if (!__COMPAT_HAS_DSQ_ITER &&
(skel->rodata->print_shared_dsq || strlen(skel->rodata->exp_prefix)))
fprintf(stderr, "kernel doesn't support DSQ iteration\n");
SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei); SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
link = SCX_OPS_ATTACH(skel, qmap_ops); link = SCX_OPS_ATTACH(skel, qmap_ops);
@ -95,10 +123,18 @@ int main(int argc, char **argv)
long nr_enqueued = skel->bss->nr_enqueued; long nr_enqueued = skel->bss->nr_enqueued;
long nr_dispatched = skel->bss->nr_dispatched; long nr_dispatched = skel->bss->nr_dispatched;
printf("enq=%lu, dsp=%lu, delta=%ld, reenq=%" PRIu64 ", deq=%" PRIu64 ", core=%" PRIu64 "\n", printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64" exp=%"PRIu64"\n",
nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
skel->bss->nr_reenqueued, skel->bss->nr_dequeued, skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
skel->bss->nr_core_sched_execed); skel->bss->nr_core_sched_execed, skel->bss->nr_expedited);
if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
skel->bss->cpuperf_min,
skel->bss->cpuperf_avg,
skel->bss->cpuperf_max,
skel->bss->cpuperf_target_min,
skel->bss->cpuperf_target_avg,
skel->bss->cpuperf_target_max);
fflush(stdout); fflush(stdout);
sleep(1); sleep(1);
} }
@ -106,5 +142,9 @@ int main(int argc, char **argv)
bpf_link__destroy(link); bpf_link__destroy(link);
UEI_REPORT(skel, uei); UEI_REPORT(skel, uei);
scx_qmap__destroy(skel); scx_qmap__destroy(skel);
/*
* scx_qmap implements ops.cpu_on/offline() and doesn't need to restart
* on CPU hotplug events.
*/
return 0; return 0;
} }

View File

@ -129,7 +129,6 @@ void BPF_STRUCT_OPS(simple_enable, struct task_struct *p)
s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init)
{ {
__COMPAT_scx_bpf_switch_all();
return scx_bpf_create_dsq(SHARED_DSQ, -1); return scx_bpf_create_dsq(SHARED_DSQ, -1);
} }

View File

@ -17,13 +17,22 @@ const char help_fmt[] =
"\n" "\n"
"See the top-level comment in .bpf.c for more details.\n" "See the top-level comment in .bpf.c for more details.\n"
"\n" "\n"
"Usage: %s [-f]\n" "Usage: %s [-f] [-v]\n"
"\n" "\n"
" -f Use FIFO scheduling instead of weighted vtime scheduling\n" " -f Use FIFO scheduling instead of weighted vtime scheduling\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n"; " -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req; static volatile int exit_req;
/*
 * libbpf print callback. Everything is forwarded to stderr, except that
 * LIBBPF_DEBUG messages are dropped unless the verbose flag is set.
 */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	if (verbose || level != LIBBPF_DEBUG)
		return vfprintf(stderr, format, args);

	return 0;
}
static void sigint_handler(int simple) static void sigint_handler(int simple)
{ {
exit_req = 1; exit_req = 1;
@ -54,20 +63,22 @@ int main(int argc, char **argv)
struct scx_simple *skel; struct scx_simple *skel;
struct bpf_link *link; struct bpf_link *link;
__u32 opt; __u32 opt;
__u64 ecode;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler); signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler); signal(SIGTERM, sigint_handler);
restart:
skel = SCX_OPS_OPEN(simple_ops, scx_simple);
libbpf_set_strict_mode(LIBBPF_STRICT_ALL); while ((opt = getopt(argc, argv, "fvh")) != -1) {
skel = scx_simple__open();
SCX_BUG_ON(!skel, "Failed to open skel");
while ((opt = getopt(argc, argv, "fh")) != -1) {
switch (opt) { switch (opt) {
case 'f': case 'f':
skel->rodata->fifo_sched = true; skel->rodata->fifo_sched = true;
break; break;
case 'v':
verbose = true;
break;
default: default:
fprintf(stderr, help_fmt, basename(argv[0])); fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h'; return opt != 'h';
@ -87,7 +98,10 @@ int main(int argc, char **argv)
} }
bpf_link__destroy(link); bpf_link__destroy(link);
UEI_REPORT(skel, uei); ecode = UEI_REPORT(skel, uei);
scx_simple__destroy(skel); scx_simple__destroy(skel);
if (UEI_ECODE_RESTART(ecode))
goto restart;
return 0; return 0;
} }

View File

@ -41,6 +41,7 @@ const char help_fmt[] =
"Usage: %s [-b BATCH]\n" "Usage: %s [-b BATCH]\n"
"\n" "\n"
" -b BATCH The number of tasks to batch when dispatching (default: 8)\n" " -b BATCH The number of tasks to batch when dispatching (default: 8)\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n"; " -h Display this help and exit\n";
/* Defined in UAPI */ /* Defined in UAPI */
@ -49,6 +50,7 @@ const char help_fmt[] =
/* Number of tasks to batch when dispatching to user space. */ /* Number of tasks to batch when dispatching to user space. */
static __u32 batch_size = 8; static __u32 batch_size = 8;
static bool verbose;
static volatile int exit_req; static volatile int exit_req;
static int enqueued_fd, dispatched_fd; static int enqueued_fd, dispatched_fd;
@ -96,6 +98,13 @@ static int pid_max;
static double min_vruntime; static double min_vruntime;
/*
 * libbpf print callback. Everything is forwarded to stderr, except that
 * LIBBPF_DEBUG messages are dropped unless the verbose flag is set.
 */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	if (verbose || level != LIBBPF_DEBUG)
		return vfprintf(stderr, format, args);

	return 0;
}
static void sigint_handler(int userland) static void sigint_handler(int userland)
{ {
exit_req = 1; exit_req = 1;
@ -337,7 +346,7 @@ static void print_example_warning(const char *sched)
printf(warning_fmt, sched); printf(warning_fmt, sched);
} }
static void bootstrap(int argc, char **argv) static void pre_bootstrap(int argc, char **argv)
{ {
int err; int err;
__u32 opt; __u32 opt;
@ -349,9 +358,9 @@ static void bootstrap(int argc, char **argv)
if (err) if (err)
exit(err); exit(err);
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler); signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler); signal(SIGTERM, sigint_handler);
libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
/* /*
* Enforce that the user scheduler task is managed by sched_ext. The * Enforce that the user scheduler task is managed by sched_ext. The
@ -363,11 +372,14 @@ static void bootstrap(int argc, char **argv)
err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param); err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param);
SCX_BUG_ON(err, "Failed to set scheduler to SCHED_EXT"); SCX_BUG_ON(err, "Failed to set scheduler to SCHED_EXT");
while ((opt = getopt(argc, argv, "b:ph")) != -1) { while ((opt = getopt(argc, argv, "b:vh")) != -1) {
switch (opt) { switch (opt) {
case 'b': case 'b':
batch_size = strtoul(optarg, NULL, 0); batch_size = strtoul(optarg, NULL, 0);
break; break;
case 'v':
verbose = true;
break;
default: default:
fprintf(stderr, help_fmt, basename(argv[0])); fprintf(stderr, help_fmt, basename(argv[0]));
exit(opt != 'h'); exit(opt != 'h');
@ -381,9 +393,11 @@ static void bootstrap(int argc, char **argv)
*/ */
err = mlockall(MCL_CURRENT | MCL_FUTURE); err = mlockall(MCL_CURRENT | MCL_FUTURE);
SCX_BUG_ON(err, "Failed to prefault and lock address space"); SCX_BUG_ON(err, "Failed to prefault and lock address space");
}
skel = scx_userland__open(); static void bootstrap(char *comm)
SCX_BUG_ON(!skel, "Failed to open skel"); {
skel = SCX_OPS_OPEN(userland_ops, scx_userland);
skel->rodata->num_possible_cpus = libbpf_num_possible_cpus(); skel->rodata->num_possible_cpus = libbpf_num_possible_cpus();
assert(skel->rodata->num_possible_cpus > 0); assert(skel->rodata->num_possible_cpus > 0);
@ -399,7 +413,7 @@ static void bootstrap(int argc, char **argv)
SCX_BUG_ON(spawn_stats_thread(), "Failed to spawn stats thread"); SCX_BUG_ON(spawn_stats_thread(), "Failed to spawn stats thread");
print_example_warning(basename(argv[0])); print_example_warning(basename(comm));
ops_link = SCX_OPS_ATTACH(skel, userland_ops); ops_link = SCX_OPS_ATTACH(skel, userland_ops);
} }
@ -428,12 +442,19 @@ static void sched_main_loop(void)
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
bootstrap(argc, argv); __u64 ecode;
pre_bootstrap(argc, argv);
restart:
bootstrap(argv[0]);
sched_main_loop(); sched_main_loop();
exit_req = 1; exit_req = 1;
bpf_link__destroy(ops_link); bpf_link__destroy(ops_link);
UEI_REPORT(skel, uei); ecode = UEI_REPORT(skel, uei);
scx_userland__destroy(skel); scx_userland__destroy(skel);
if (UEI_ECODE_RESTART(ecode))
goto restart;
return 0; return 0;
} }

View File

@ -28,9 +28,54 @@ static inline void ___vmlinux_h_sanity_check___(void)
"bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole");
} }
s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym;
void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym;
u32 scx_bpf_dispatch_nr_slots(void) __ksym;
void scx_bpf_dispatch_cancel(void) __ksym;
bool scx_bpf_consume(u64 dsq_id) __ksym;
bool __scx_bpf_consume_task(unsigned long it, struct task_struct *p) __ksym __weak;
u32 scx_bpf_reenqueue_local(void) __ksym;
void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, bool rev) __ksym __weak;
struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak;
void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym;
void scx_bpf_exit_bstr(s64 exit_code, char *fmt, u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak;
unsigned long long *data, u32 data__sz) __ksym __weak; u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak;
void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak;
u32 scx_bpf_nr_cpu_ids(void) __ksym __weak;
const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak;
const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak;
void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak;
const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym;
const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym;
void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym;
bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
bool scx_bpf_task_running(const struct task_struct *p) __ksym;
s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
/*
* Use the following as @it when calling scx_bpf_consume_task() from within
* bpf_for_each() loops.
*/
#define BPF_FOR_EACH_ITER (&___it)
/*
 * Hopefully temporary wrapper to work around BPF restriction.
 *
 * Copies the first sizeof(unsigned long) bytes that @it points to into a local
 * with bpf_probe_read_kernel() and passes that value, together with @p, to
 * __scx_bpf_consume_task(), whose result is returned.
 */
static inline bool scx_bpf_consume_task(struct bpf_iter_scx_dsq *it,
struct task_struct *p)
{
unsigned long ptr;
/* NOTE(review): the return value of the read is not checked - confirm OK */
bpf_probe_read_kernel(&ptr, sizeof(ptr), it);
return __scx_bpf_consume_task(ptr, p);
}
static inline __attribute__((format(printf, 1, 2))) static inline __attribute__((format(printf, 1, 2)))
void ___scx_bpf_exit_format_checker(const char *fmt, ...) {} void ___scx_bpf_exit_format_checker(const char *fmt, ...) {}
@ -40,18 +85,18 @@ void ___scx_bpf_exit_format_checker(const char *fmt, ...) {}
* bstr exit kfuncs. Callers to this function should use ___fmt and ___param to * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to
* refer to the initialized list of inputs to the bstr kfunc. * refer to the initialized list of inputs to the bstr kfunc.
*/ */
#define scx_bpf_exit_preamble(fmt, args...) \ #define scx_bpf_exit_preamble(fmt, args...) \
static char ___fmt[] = fmt; \ static char ___fmt[] = fmt; \
/* \ /* \
* Note that __param[] must have at least one \ * Note that __param[] must have at least one \
* element to keep the verifier happy. \ * element to keep the verifier happy. \
*/ \ */ \
unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \
\ \
_Pragma("GCC diagnostic push") \ _Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
___bpf_fill(___param, args); \ ___bpf_fill(___param, args); \
_Pragma("GCC diagnostic pop") \ _Pragma("GCC diagnostic pop") \
/* /*
* scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments * scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments
@ -78,30 +123,6 @@ void ___scx_bpf_exit_format_checker(const char *fmt, ...) {}
___scx_bpf_exit_format_checker(fmt, ##args); \ ___scx_bpf_exit_format_checker(fmt, ##args); \
}) })
s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
bool scx_bpf_consume(u64 dsq_id) __ksym;
void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym;
void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym;
u32 scx_bpf_dispatch_nr_slots(void) __ksym;
void scx_bpf_dispatch_cancel(void) __ksym;
void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym;
const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym;
void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym;
void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
bool scx_bpf_task_running(const struct task_struct *p) __ksym;
s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
u32 scx_bpf_reenqueue_local(void) __ksym;
u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym;
u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym;
void scx_bpf_cpuperf_set(u32 cpu, u32 perf) __ksym __weak;
#define BPF_STRUCT_OPS(name, args...) \ #define BPF_STRUCT_OPS(name, args...) \
SEC("struct_ops/"#name) \ SEC("struct_ops/"#name) \
BPF_PROG(name, ##args) BPF_PROG(name, ##args)
@ -156,7 +177,8 @@ BPF_PROG(name, ##args)
* be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of
* `MEMBER_VPTR(ptr, ->member)`. * `MEMBER_VPTR(ptr, ->member)`.
*/ */
#define MEMBER_VPTR(base, member) (typeof((base) member) *)({ \ #define MEMBER_VPTR(base, member) (typeof((base) member) *) \
({ \
u64 __base = (u64)&(base); \ u64 __base = (u64)&(base); \
u64 __addr = (u64)&((base) member) - __base; \ u64 __addr = (u64)&((base) member) - __base; \
_Static_assert(sizeof(base) >= sizeof((base) member), \ _Static_assert(sizeof(base) >= sizeof((base) member), \
@ -186,18 +208,19 @@ BPF_PROG(name, ##args)
* size of the array to compute the max, which will result in rejection by * size of the array to compute the max, which will result in rejection by
* the verifier. * the verifier.
*/ */
#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *)({ \ #define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \
u64 __base = (u64)arr; \ ({ \
u64 __addr = (u64)&(arr[i]) - __base; \ u64 __base = (u64)arr; \
asm volatile ( \ u64 __addr = (u64)&(arr[i]) - __base; \
"if %0 <= %[max] goto +2\n" \ asm volatile ( \
"%0 = 0\n" \ "if %0 <= %[max] goto +2\n" \
"goto +1\n" \ "%0 = 0\n" \
"%0 += %1\n" \ "goto +1\n" \
: "+r"(__addr) \ "%0 += %1\n" \
: "r"(__base), \ : "+r"(__addr) \
[max]"r"(sizeof(arr[0]) * ((n) - 1))); \ : "r"(__base), \
__addr; \ [max]"r"(sizeof(arr[0]) * ((n) - 1))); \
__addr; \
}) })
/* /*
@ -227,7 +250,7 @@ int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym;
extern void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym; void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym;
#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL) #define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL)
/* task */ /* task */

View File

@ -18,13 +18,15 @@
/* /*
* %SCX_KICK_IDLE is a later addition. To support both before and after, use * %SCX_KICK_IDLE is a later addition. To support both before and after, use
* %__COMPAT_SCX_KICK_IDLE which becomes 0 on kernels which don't support it. * %__COMPAT_SCX_KICK_IDLE which becomes 0 on kernels which don't support it.
* Users can use %SCX_KICK_IDLE directly in the future.
*/ */
#define __COMPAT_SCX_KICK_IDLE \ #define __COMPAT_SCX_KICK_IDLE \
__COMPAT_ENUM_OR_ZERO(enum scx_kick_flags, SCX_KICK_IDLE) __COMPAT_ENUM_OR_ZERO(enum scx_kick_flags, SCX_KICK_IDLE)
/* /*
* scx_switch_all() was replaced by %SCX_OPS_SWITCH_PARTIAL. See * scx_switch_all() was replaced by %SCX_OPS_SWITCH_PARTIAL. See
* %__COMPAT_SCX_OPS_SWITCH_PARTIAL in compat.h. * %__COMPAT_SCX_OPS_SWITCH_PARTIAL in compat.h. This can be dropped in the
* future.
*/ */
void scx_bpf_switch_all(void) __ksym __weak; void scx_bpf_switch_all(void) __ksym __weak;
@ -34,6 +36,67 @@ static inline void __COMPAT_scx_bpf_switch_all(void)
scx_bpf_switch_all(); scx_bpf_switch_all();
} }
/*
 * scx_bpf_exit() is a new addition. Fall back to scx_bpf_error() if
 * unavailable. Users can use scx_bpf_exit() directly in the future.
 *
 * @code is only used on the scx_bpf_exit() path. The variadic arguments are
 * forwarded with ##args so that the macro also expands cleanly when it is
 * invoked with a format string and no further arguments.
 */
#define __COMPAT_scx_bpf_exit(code, fmt, args...)				\
({										\
	if (bpf_ksym_exists(scx_bpf_exit_bstr))					\
		scx_bpf_exit((code), fmt, ##args);				\
	else									\
		scx_bpf_error(fmt, ##args);					\
})
/*
* scx_bpf_nr_cpu_ids(), scx_bpf_get_possible/online_cpumask() are new. No good
* way to noop these kfuncs. Provide a test macro. Users can assume existence in
* the future.
*/
#define __COMPAT_HAS_CPUMASKS \
bpf_ksym_exists(scx_bpf_nr_cpu_ids)
/*
* cpuperf is new. On older kernels, the following getters return 1024 and the
* setter becomes a no-op. Callers can be updated to call cpuperf kfuncs
* directly in the future.
*/
/*
 * Return scx_bpf_cpuperf_cap(@cpu) when that kfunc is available, and 1024
 * otherwise.
 */
static inline u32 __COMPAT_scx_bpf_cpuperf_cap(s32 cpu)
{
	if (!bpf_ksym_exists(scx_bpf_cpuperf_cap))
		return 1024;

	return scx_bpf_cpuperf_cap(cpu);
}
/*
 * Return scx_bpf_cpuperf_cur(@cpu) when that kfunc is available, and 1024
 * otherwise.
 */
static inline u32 __COMPAT_scx_bpf_cpuperf_cur(s32 cpu)
{
	if (!bpf_ksym_exists(scx_bpf_cpuperf_cur))
		return 1024;

	return scx_bpf_cpuperf_cur(cpu);
}
/*
 * Forward @cpu and @perf to scx_bpf_cpuperf_set() when that kfunc is
 * available; otherwise do nothing.
 */
static inline void __COMPAT_scx_bpf_cpuperf_set(s32 cpu, u32 perf)
{
	/* Plain call: a void function must not return an expression in ISO C. */
	if (bpf_ksym_exists(scx_bpf_cpuperf_set))
		scx_bpf_cpuperf_set(cpu, perf);
}
/*
* Iteration and scx_bpf_consume_task() are new. The following become noop on
* older kernels. The users can switch to bpf_for_each(scx_dsq) and directly
* call scx_bpf_consume_task() in the future.
*/
#define __COMPAT_DSQ_FOR_EACH(p, dsq_id, flags) \
if (bpf_ksym_exists(bpf_iter_scx_dsq_new)) \
bpf_for_each(scx_dsq, (p), (dsq_id), (flags))
/*
 * Compat stand-in for scx_bpf_consume_task(). Neither @it nor @p is used and
 * the result is always false.
 *
 * NOTE(review): this returns false unconditionally, i.e. it never forwards to
 * scx_bpf_consume_task() even when the kfunc exists - confirm this is
 * intentional.
 */
static inline bool __COMPAT_scx_bpf_consume_task(struct bpf_iter_scx_dsq *it,
struct task_struct *p)
{
return false;
}
/* /*
* Define sched_ext_ops. This may be expanded to define multiple variants for * Define sched_ext_ops. This may be expanded to define multiple variants for
* backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().

View File

@ -8,6 +8,9 @@
#define __SCX_COMPAT_H #define __SCX_COMPAT_H
#include <bpf/btf.h> #include <bpf/btf.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
struct btf *__COMPAT_vmlinux_btf __attribute__((weak)); struct btf *__COMPAT_vmlinux_btf __attribute__((weak));
@ -69,6 +72,12 @@ static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v
__val; \ __val; \
}) })
/*
 * Report whether the vmlinux BTF has an entry named @ksym. The BTF is loaded
 * through __COMPAT_load_vmlinux_btf() first; a non-negative result from
 * btf__find_by_name() is treated as "present".
 */
static inline bool __COMPAT_has_ksym(const char *ksym)
{
__COMPAT_load_vmlinux_btf();
return btf__find_by_name(__COMPAT_vmlinux_btf, ksym) >= 0;
}
static inline bool __COMPAT_struct_has_field(const char *type, const char *field) static inline bool __COMPAT_struct_has_field(const char *type, const char *field)
{ {
const struct btf_type *t; const struct btf_type *t;
@ -101,27 +110,79 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field
* An ops flag, %SCX_OPS_SWITCH_PARTIAL, replaced scx_bpf_switch_all() which had * An ops flag, %SCX_OPS_SWITCH_PARTIAL, replaced scx_bpf_switch_all() which had
* to be called from ops.init(). To support both before and after, use both * to be called from ops.init(). To support both before and after, use both
* %__COMPAT_SCX_OPS_SWITCH_PARTIAL and %__COMPAT_scx_bpf_switch_all() defined * %__COMPAT_SCX_OPS_SWITCH_PARTIAL and %__COMPAT_scx_bpf_switch_all() defined
* in compat.bpf.h. * in compat.bpf.h. Users can switch to directly using %SCX_OPS_SWITCH_PARTIAL
* in the future.
*/ */
#define __COMPAT_SCX_OPS_SWITCH_PARTIAL \ #define __COMPAT_SCX_OPS_SWITCH_PARTIAL \
__COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL") __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL")
/*
* scx_bpf_nr_cpu_ids(), scx_bpf_get_possible/online_cpumask() are new. Users
* will be able to assume existence in the future.
*/
#define __COMPAT_HAS_CPUMASKS \
__COMPAT_has_ksym("scx_bpf_nr_cpu_ids")
/*
* DSQ iterator is new. Users will be able to assume existence in the future.
*/
#define __COMPAT_HAS_DSQ_ITER \
__COMPAT_has_ksym("bpf_iter_scx_dsq_new")
/*
 * Read the value exposed in /sys/kernel/sched_ext/hotplug_seq.
 *
 * Returns -ENOENT when the file cannot be opened. Otherwise returns the file
 * content parsed as a base-10 number. A failed or empty read, or a parsed
 * value that is negative, is reported through SCX_BUG_ON().
 */
static inline long scx_hotplug_seq(void)
{
int fd;
char buf[32];
ssize_t len;
long val;
fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY);
/* any open failure is reported as -ENOENT, whatever errno says */
if (fd < 0)
return -ENOENT;
/* leave room for the terminating NUL added below */
len = read(fd, buf, sizeof(buf) - 1);
SCX_BUG_ON(len <= 0, "read failed (%ld)", len);
buf[len] = 0;
close(fd);
/* strtoul() result is stored in a signed long, so huge values turn negative */
val = strtoul(buf, NULL, 10);
SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val);
return val;
}
/* /*
* struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE() * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE()
* is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load
* and attach it, backward compatibility is automatically maintained where * and attach it, backward compatibility is automatically maintained where
* reasonable. * reasonable.
* *
* - sched_ext_ops.exit_dump_len was added later. On kernels which don't support * - ops.tick(): Ignored on older kernels with a warning.
* it, the value is ignored and a warning is triggered if the value is * - ops.exit_dump_len: Cleared to zero on older kernels with a warning.
* requested to be non-zero. * - ops.hotplug_seq: Ignored on older kernels.
*/ */
/*
 * Open the skeleton @__scx_name with <__scx_name>__open() and evaluate to the
 * resulting skeleton pointer. A NULL result is reported through SCX_BUG_ON().
 * When struct sched_ext_ops has a hotplug_seq field, the hotplug_seq member of
 * the @__ops_name struct_ops is initialized from scx_hotplug_seq().
 */
#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \
struct __scx_name *__skel; \
\
__skel = __scx_name##__open(); \
SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \
\
if (__COMPAT_struct_has_field("sched_ext_ops", "hotplug_seq")) \
__skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \
__skel; \
})
#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \ #define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \
UEI_SET_SIZE(__skel, __ops_name, __uei_name); \ UEI_SET_SIZE(__skel, __ops_name, __uei_name); \
if (__COMPAT_struct_has_field("sched_ext_ops", "exit_dump_len") && \ if (!__COMPAT_struct_has_field("sched_ext_ops", "exit_dump_len") && \
(__skel)->struct_ops.__ops_name->exit_dump_len) { \ (__skel)->struct_ops.__ops_name->exit_dump_len) { \
fprintf(stderr, "WARNING: kernel doesn't support setting exit dump len\n"); \ fprintf(stderr, "WARNING: kernel doesn't support setting exit dump len\n"); \
(__skel)->struct_ops.__ops_name->exit_dump_len = 0; \ (__skel)->struct_ops.__ops_name->exit_dump_len = 0; \
} \
if (!__COMPAT_struct_has_field("sched_ext_ops", "tick") && \
(__skel)->struct_ops.__ops_name->tick) { \
fprintf(stderr, "WARNING: kernel doesn't support ops.tick()\n"); \
(__skel)->struct_ops.__ops_name->tick = NULL; \
} \ } \
SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \ SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \
}) })

View File

@ -77,7 +77,35 @@ struct user_exit_info {
if (__uei->msg[0] != '\0') \ if (__uei->msg[0] != '\0') \
fprintf(stderr, " (%s)", __uei->msg); \ fprintf(stderr, " (%s)", __uei->msg); \
fputs("\n", stderr); \ fputs("\n", stderr); \
__uei->exit_code; \
}) })
/*
* We can't import vmlinux.h while compiling user C code. Let's duplicate
* scx_exit_code definition.
*/
enum scx_exit_code {
/* Reasons */
SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
/* Actions */
SCX_ECODE_ACT_RESTART = 1LLU << 48,
};
/* Bit masks that split a 64-bit exit code into three fields. */
enum uei_ecode_mask {
/* bits 0-31 */
UEI_ECODE_USER_MASK = ((1LLU << 32) - 1),
/* bits 32-47 */
UEI_ECODE_SYS_RSN_MASK = ((1LLU << 16) - 1) << 32,
/* bits 48-63 */
UEI_ECODE_SYS_ACT_MASK = ((1LLU << 16) - 1) << 48,
};
/*
* These macros interpret the ecode returned from UEI_REPORT().
*/
#define UEI_ECODE_USER(__ecode) ((__ecode) & UEI_ECODE_USER_MASK)
#define UEI_ECODE_SYS_RSN(__ecode) ((__ecode) & UEI_ECODE_SYS_RSN_MASK)
#define UEI_ECODE_SYS_ACT(__ecode) ((__ecode) & UEI_ECODE_SYS_ACT_MASK)
#define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART)
#endif /* __bpf__ */ #endif /* __bpf__ */
#endif /* __USER_EXIT_INFO_H */ #endif /* __USER_EXIT_INFO_H */

View File

@ -1 +1 @@
vmlinux-v6.9-g5dc95302301f.h vmlinux-v6.9-g73f4013eb1eb.h

View File

@ -763,8 +763,8 @@ void BPF_STRUCT_OPS(layered_running, struct task_struct *p)
} }
} }
if (bpf_ksym_exists(scx_bpf_cpuperf_set) && layer->perf > 0) if (layer->perf > 0)
scx_bpf_cpuperf_set(cpu, layer->perf); __COMPAT_scx_bpf_cpuperf_set(cpu, layer->perf);
cctx->maybe_idle = false; cctx->maybe_idle = false;
} }

View File

@ -1421,18 +1421,12 @@ static s32 initialize_cpu(s32 cpu)
void BPF_STRUCT_OPS(rusty_cpu_online, s32 cpu) void BPF_STRUCT_OPS(rusty_cpu_online, s32 cpu)
{ {
if (bpf_ksym_exists(scx_bpf_exit_bstr)) __COMPAT_scx_bpf_exit(RUSTY_EXIT_HOTPLUG, "CPU %d went online", cpu);
scx_bpf_exit(RUSTY_EXIT_HOTPLUG, "CPU %d went online", cpu);
else
scx_bpf_error("CPU %d went online", cpu);
} }
void BPF_STRUCT_OPS(rusty_cpu_offline, s32 cpu) void BPF_STRUCT_OPS(rusty_cpu_offline, s32 cpu)
{ {
if (bpf_ksym_exists(scx_bpf_exit_bstr)) __COMPAT_scx_bpf_exit(RUSTY_EXIT_HOTPLUG, "CPU %d went offline", cpu);
scx_bpf_exit(RUSTY_EXIT_HOTPLUG, "CPU %d went offline", cpu);
else
scx_bpf_error("CPU %d went offline", cpu);
} }
s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init) s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init)