Sync from kernel (670bdab6073)

And fix build breakage in scx_utils due to an enum type rename.
Tejun Heo 2024-04-29 09:55:40 -10:00
parent 3ee64a1301
commit cf66e58118
15 changed files with 115473 additions and 114736 deletions

View File

@ -45,7 +45,7 @@ impl Builder {
let bindings = bindgen::Builder::default()
.header("bindings.h")
.allowlist_type("scx_exit_kind")
.allowlist_type("scx_internal_consts")
.allowlist_type("scx_consts")
.parse_callbacks(Box::new(bindgen::CargoCallbacks))
.generate()
.expect("Unable to generate bindings");

View File

@ -43,7 +43,7 @@ pub use builder::Builder;
mod user_exit_info;
pub use user_exit_info::ScxExitKind;
pub use user_exit_info::ScxInternalConsts;
pub use user_exit_info::ScxConsts;
pub use user_exit_info::UeiDumpPtr;
pub use user_exit_info::UserExitInfo;
pub use user_exit_info::UEI_DUMP_PTR_MUTEX;

View File

@ -29,8 +29,8 @@ pub enum ScxExitKind {
ErrorStall = bindings::scx_exit_kind_SCX_EXIT_ERROR_STALL as isize,
}
pub enum ScxInternalConsts {
ExitDumpDflLen = bindings::scx_internal_consts_SCX_EXIT_DUMP_DFL_LEN as isize,
pub enum ScxConsts {
ExitDumpDflLen = bindings::scx_consts_SCX_EXIT_DUMP_DFL_LEN as isize,
}
/// Takes a reference to C struct user_exit_info and reads it into
@ -65,7 +65,7 @@ macro_rules! uei_set_size {
($skel: expr, $ops: ident, $uei:ident) => {{
scx_utils::paste! {
let len = match $skel.struct_ops.$ops().exit_dump_len {
0 => scx_utils::ScxInternalConsts::ExitDumpDflLen as u32,
0 => scx_utils::ScxConsts::ExitDumpDflLen as u32,
v => v,
};
$skel.rodata_mut().[<$uei _dump_len>] = len;

View File

@ -24,10 +24,19 @@ const char help_fmt[] =
"\n"
" -s SLICE_US Override slice duration\n"
" -c CPU Override the central CPU (default: 0)\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req;
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
if (level == LIBBPF_DEBUG && !verbose)
return 0;
return vfprintf(stderr, format, args);
}
static void sigint_handler(int dummy)
{
exit_req = 1;
@ -37,22 +46,20 @@ int main(int argc, char **argv)
{
struct scx_central *skel;
struct bpf_link *link;
__u64 seq = 0;
__u64 seq = 0, ecode;
__s32 opt;
cpu_set_t *cpuset;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
skel = scx_central__open();
SCX_BUG_ON(!skel, "Failed to open skel");
restart:
skel = SCX_OPS_OPEN(central_ops, scx_central);
skel->rodata->central_cpu = 0;
skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
while ((opt = getopt(argc, argv, "s:c:ph")) != -1) {
while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@ -60,6 +67,9 @@ int main(int argc, char **argv)
case 'c':
skel->rodata->central_cpu = strtoul(optarg, NULL, 0);
break;
case 'v':
verbose = true;
break;
default:
fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h';
@ -116,7 +126,10 @@ int main(int argc, char **argv)
}
bpf_link__destroy(link);
UEI_REPORT(skel, uei);
ecode = UEI_REPORT(skel, uei);
scx_central__destroy(skel);
if (UEI_ECODE_RESTART(ecode))
goto restart;
return 0;
}

View File

@ -26,15 +26,24 @@ const char help_fmt[] =
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f]\n"
"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-v]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -i INTERVAL Report interval\n"
" -f Use FIFO scheduling instead of weighted vtime scheduling\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req;
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
if (level == LIBBPF_DEBUG && !verbose)
return 0;
return vfprintf(stderr, format, args);
}
static void sigint_handler(int dummy)
{
exit_req = 1;
@ -119,18 +128,17 @@ int main(int argc, char **argv)
__u64 last_stats[FCG_NR_STATS] = {};
unsigned long seq = 0;
__s32 opt;
__u64 ecode;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
skel = scx_flatcg__open();
SCX_BUG_ON(!skel, "Failed to open skel");
restart:
skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg);
skel->rodata->nr_cpus = libbpf_num_possible_cpus();
while ((opt = getopt(argc, argv, "s:i:dfph")) != -1) {
while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) {
double v;
switch (opt) {
@ -149,6 +157,9 @@ int main(int argc, char **argv)
case 'f':
skel->rodata->fifo_sched = true;
break;
case 'v':
verbose = true;
break;
case 'h':
default:
fprintf(stderr, help_fmt, basename(argv[0]));
@ -213,7 +224,10 @@ int main(int argc, char **argv)
}
bpf_link__destroy(link);
UEI_REPORT(skel, uei);
ecode = UEI_REPORT(skel, uei);
scx_flatcg__destroy(skel);
if (UEI_ECODE_RESTART(ecode))
goto restart;
return 0;
}

View File

@ -23,6 +23,12 @@
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
#include <scx/common.bpf.h>
#include <string.h>
enum consts {
ONE_SEC_IN_NS = 1000000000,
SHARED_DSQ = 0,
};
char _license[] SEC("license") = "GPL";
@ -30,6 +36,9 @@ const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool print_shared_dsq;
const volatile char exp_prefix[17];
const volatile s32 disallow_tgid;
const volatile bool switch_partial;
@ -62,6 +71,18 @@ struct {
},
};
/*
* If enabled, the CPU performance target is set according to the queue index
* using the following table.
*/
static const u32 qidx_to_cpuperf_target[] = {
[0] = SCX_CPUPERF_ONE * 0 / 4,
[1] = SCX_CPUPERF_ONE * 1 / 4,
[2] = SCX_CPUPERF_ONE * 2 / 4,
[3] = SCX_CPUPERF_ONE * 3 / 4,
[4] = SCX_CPUPERF_ONE * 4 / 4,
};
/*
* Per-queue sequence numbers to implement core-sched ordering.
*
@ -86,17 +107,25 @@ struct {
__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");
/* Per-cpu dispatch index and remaining count */
struct cpu_ctx {
u64 dsp_idx; /* dispatch index */
u64 dsp_cnt; /* remaining count */
u32 avg_weight;
u32 cpuperf_target;
};
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, 2);
__uint(max_entries, 1);
__type(key, u32);
__type(value, u64);
} dispatch_idx_cnt SEC(".maps");
__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");
/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued;
u64 nr_core_sched_execed;
u64 nr_core_sched_execed, nr_expedited;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
@ -189,7 +218,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
if (enq_flags & SCX_ENQ_REENQ) {
s32 cpu;
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, 0, enq_flags);
scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags);
cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
if (cpu >= 0)
scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE);
@ -204,7 +233,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
if (bpf_map_push_elem(ring, &pid, 0)) {
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, enq_flags);
scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
return;
}
@ -233,18 +262,49 @@ static void update_core_sched_head_seq(struct task_struct *p)
scx_bpf_error("task_ctx lookup failed");
}
static bool consume_shared_dsq(void)
{
struct task_struct *p;
bool consumed;
if (exp_prefix[0] == '\0')
return scx_bpf_consume(SHARED_DSQ);
/*
* To demonstrate the use of scx_bpf_consume_task(), implement a silly
* selective priority boosting mechanism by scanning SHARED_DSQ looking
* for matching comms and consuming them first. This makes a difference only
* when dsp_batch is larger than 1.
*/
consumed = false;
__COMPAT_DSQ_FOR_EACH(p, SHARED_DSQ, 0) {
char comm[sizeof(exp_prefix)];
memcpy(comm, p->comm, sizeof(exp_prefix) - 1);
if (!bpf_strncmp(comm, sizeof(exp_prefix),
(const char *)exp_prefix) &&
__COMPAT_scx_bpf_consume_task(BPF_FOR_EACH_ITER, p)) {
consumed = true;
__sync_fetch_and_add(&nr_expedited, 1);
}
}
return consumed || scx_bpf_consume(SHARED_DSQ);
}
void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
u32 zero = 0, one = 1;
u64 *idx = bpf_map_lookup_elem(&dispatch_idx_cnt, &zero);
u64 *cnt = bpf_map_lookup_elem(&dispatch_idx_cnt, &one);
struct task_struct *p;
struct cpu_ctx *cpuc;
u32 zero = 0, batch = dsp_batch ?: 1;
void *fifo;
s32 pid;
int i;
s32 i, pid;
if (consume_shared_dsq())
return;
if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
struct task_struct *p;
/*
* PID 2 should be kthreadd which should mostly be idle and off
* the scheduler. Let's keep dispatching it to force the kernel
@ -252,49 +312,80 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
*/
p = bpf_task_from_pid(2);
if (p) {
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0);
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0);
bpf_task_release(p);
return;
}
}
if (!idx || !cnt) {
scx_bpf_error("failed to lookup idx[%p], cnt[%p]", idx, cnt);
if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
scx_bpf_error("failed to look up cpu_ctx");
return;
}
for (i = 0; i < 5; i++) {
/* Advance the dispatch cursor and pick the fifo. */
if (!*cnt) {
*idx = (*idx + 1) % 5;
*cnt = 1 << *idx;
if (!cpuc->dsp_cnt) {
cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
}
(*cnt)--;
fifo = bpf_map_lookup_elem(&queue_arr, idx);
fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
if (!fifo) {
scx_bpf_error("failed to find ring %llu", *idx);
scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
return;
}
/* Dispatch or advance. */
if (!bpf_map_pop_elem(fifo, &pid)) {
struct task_struct *p;
bpf_repeat(BPF_MAX_LOOPS) {
if (bpf_map_pop_elem(fifo, &pid))
break;
p = bpf_task_from_pid(pid);
if (p) {
update_core_sched_head_seq(p);
__sync_fetch_and_add(&nr_dispatched, 1);
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0);
bpf_task_release(p);
if (!p)
continue;
update_core_sched_head_seq(p);
__sync_fetch_and_add(&nr_dispatched, 1);
scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
bpf_task_release(p);
batch--;
cpuc->dsp_cnt--;
if (!batch || !scx_bpf_dispatch_nr_slots()) {
consume_shared_dsq();
return;
}
if (!cpuc->dsp_cnt)
break;
}
*cnt = 0;
cpuc->dsp_cnt = 0;
}
}
void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
struct cpu_ctx *cpuc;
u32 zero = 0;
int idx;
if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
scx_bpf_error("failed to look up cpu_ctx");
return;
}
/*
* Use the running avg of weights to select the target cpuperf level.
* This is a demonstration of the cpuperf feature rather than a
* practical strategy to regulate CPU frequency.
*/
cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
idx = weight_to_idx(cpuc->avg_weight);
cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];
scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}
/*
* The distance from the head of the queue scaled by the weight of the queue.
* The lower the number, the older the task and the higher the priority.
@ -371,11 +462,189 @@ s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
return -ENOMEM;
}
s32 BPF_STRUCT_OPS(qmap_init)
/*
* Print out the online and possible CPU map using bpf_printk() as a
* demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
*/
static void print_cpus(void)
{
const struct cpumask *possible, *online;
s32 cpu;
char buf[128] = "", *p;
int idx;
if (!__COMPAT_HAS_CPUMASKS)
return;
possible = scx_bpf_get_possible_cpumask();
online = scx_bpf_get_online_cpumask();
idx = 0;
bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
if (!(p = MEMBER_VPTR(buf, [idx++])))
break;
if (bpf_cpumask_test_cpu(cpu, online))
*p++ = 'O';
else if (bpf_cpumask_test_cpu(cpu, possible))
*p++ = 'X';
else
*p++ = ' ';
if ((cpu & 7) == 7) {
if (!(p = MEMBER_VPTR(buf, [idx++])))
break;
*p++ = '|';
}
}
buf[sizeof(buf) - 1] = '\0';
scx_bpf_put_cpumask(online);
scx_bpf_put_cpumask(possible);
bpf_printk("CPUS: |%s", buf);
}
void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
bpf_printk("CPU %d coming online", cpu);
/* @cpu is already online at this point */
print_cpus();
}
void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
bpf_printk("CPU %d going offline", cpu);
/* @cpu is still online at this point */
print_cpus();
}
struct monitor_timer {
struct bpf_timer timer;
};
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 1);
__type(key, u32);
__type(value, struct monitor_timer);
} central_timer SEC(".maps");
/*
* Print out the min, avg and max performance levels of CPUs every second to
* demonstrate the cpuperf interface.
*/
static void monitor_cpuperf(void)
{
u32 zero = 0;
u32 nr_cpu_ids = scx_bpf_nr_cpu_ids();
u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
const struct cpumask *online;
int i, nr_online_cpus = 0;
online = scx_bpf_get_online_cpumask();
bpf_for(i, 0, nr_cpu_ids) {
struct cpu_ctx *cpuc;
u32 cap, cur;
if (!bpf_cpumask_test_cpu(i, online))
continue;
nr_online_cpus++;
/* collect the capacity and current cpuperf */
cap = scx_bpf_cpuperf_cap(i);
cur = scx_bpf_cpuperf_cur(i);
cur_min = cur < cur_min ? cur : cur_min;
cur_max = cur > cur_max ? cur : cur_max;
/*
* $cur is relative to $cap. Scale it down accordingly so that
* it's in the same scale as other CPUs and $cur_sum/$cap_sum
* makes sense.
*/
cur_sum += cur * cap / SCX_CPUPERF_ONE;
cap_sum += cap;
if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
scx_bpf_error("failed to look up cpu_ctx");
goto out;
}
/* collect target */
cur = cpuc->cpuperf_target;
target_sum += cur;
target_min = cur < target_min ? cur : target_min;
target_max = cur > target_max ? cur : target_max;
}
cpuperf_min = cur_min;
cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
cpuperf_max = cur_max;
cpuperf_target_min = target_min;
cpuperf_target_avg = target_sum / nr_online_cpus;
cpuperf_target_max = target_max;
out:
scx_bpf_put_cpumask(online);
}
/*
* Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
* scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
* see meaningful dumps in the trace pipe.
*/
static void dump_shared_dsq(void)
{
struct task_struct *p;
s32 nr;
if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
return;
bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);
bpf_rcu_read_lock();
__COMPAT_DSQ_FOR_EACH(p, SHARED_DSQ, SCX_DSQ_ITER_REV)
bpf_printk("%s[%d]", p->comm, p->pid);
bpf_rcu_read_unlock();
}
static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
monitor_cpuperf();
if (print_shared_dsq)
dump_shared_dsq();
bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
return 0;
}
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
u32 key = 0;
struct bpf_timer *timer;
s32 ret;
if (!switch_partial)
__COMPAT_scx_bpf_switch_all();
return 0;
print_cpus();
ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
if (ret)
return ret;
timer = bpf_map_lookup_elem(&central_timer, &key);
if (!timer)
return -ESRCH;
bpf_timer_init(timer, &central_timer, CLOCK_MONOTONIC);
bpf_timer_set_callback(timer, monitor_timerfn);
return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
}
void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
@ -388,9 +657,12 @@ SCX_OPS_DEFINE(qmap_ops,
.enqueue = (void *)qmap_enqueue,
.dequeue = (void *)qmap_dequeue,
.dispatch = (void *)qmap_dispatch,
.tick = (void *)qmap_tick,
.core_sched_before = (void *)qmap_core_sched_before,
.cpu_release = (void *)qmap_cpu_release,
.init_task = (void *)qmap_init_task,
.cpu_online = (void *)qmap_cpu_online,
.cpu_offline = (void *)qmap_cpu_offline,
.init = (void *)qmap_init,
.exit = (void *)qmap_exit,
.flags = SCX_OPS_ENQ_LAST,

View File

@ -19,21 +19,34 @@ const char help_fmt[] =
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-d PID]\n"
" [-D LEN] [-p]\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
" [-P] [-E PREFIX] [-d PID] [-D LEN] [-p] [-v]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
" -t COUNT Stall every COUNT'th user thread\n"
" -T COUNT Stall every COUNT'th kernel thread\n"
" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
" -b COUNT Dispatch upto COUNT tasks together\n"
" -P Print out DSQ content to trace_pipe every second, use with -b\n"
" -E PREFIX Expedite consumption of threads w/ matching comm, use with -b\n"
" (e.g. match shell on a loaded system)\n"
" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
" -D LEN Set scx_exit_info.dump buffer length\n"
" -p Switch only tasks on SCHED_EXT policy intead of all\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req;
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
if (level == LIBBPF_DEBUG && !verbose)
return 0;
return vfprintf(stderr, format, args);
}
static void sigint_handler(int dummy)
{
exit_req = 1;
@ -45,15 +58,13 @@ int main(int argc, char **argv)
struct bpf_link *link;
int opt;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
skel = scx_qmap__open();
SCX_BUG_ON(!skel, "Failed to open skel");
while ((opt = getopt(argc, argv, "s:e:t:T:l:d:D:ph")) != -1) {
while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PE:d:D:pvh")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@ -70,6 +81,16 @@ int main(int argc, char **argv)
case 'l':
skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0);
break;
case 'b':
skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
break;
case 'P':
skel->rodata->print_shared_dsq = true;
break;
case 'E':
strncpy(skel->rodata->exp_prefix, optarg,
sizeof(skel->rodata->exp_prefix) - 1);
break;
case 'd':
skel->rodata->disallow_tgid = strtol(optarg, NULL, 0);
if (skel->rodata->disallow_tgid < 0)
@ -82,12 +103,19 @@ int main(int argc, char **argv)
skel->rodata->switch_partial = true;
skel->struct_ops.qmap_ops->flags |= __COMPAT_SCX_OPS_SWITCH_PARTIAL;
break;
case 'v':
verbose = true;
break;
default:
fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h';
}
}
if (!__COMPAT_HAS_DSQ_ITER &&
(skel->rodata->print_shared_dsq || strlen(skel->rodata->exp_prefix)))
fprintf(stderr, "kernel doesn't support DSQ iteration\n");
SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
link = SCX_OPS_ATTACH(skel, qmap_ops);
@ -95,10 +123,18 @@ int main(int argc, char **argv)
long nr_enqueued = skel->bss->nr_enqueued;
long nr_dispatched = skel->bss->nr_dispatched;
printf("enq=%lu, dsp=%lu, delta=%ld, reenq=%" PRIu64 ", deq=%" PRIu64 ", core=%" PRIu64 "\n",
printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64" exp=%"PRIu64"\n",
nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
skel->bss->nr_core_sched_execed);
skel->bss->nr_core_sched_execed, skel->bss->nr_expedited);
if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
skel->bss->cpuperf_min,
skel->bss->cpuperf_avg,
skel->bss->cpuperf_max,
skel->bss->cpuperf_target_min,
skel->bss->cpuperf_target_avg,
skel->bss->cpuperf_target_max);
fflush(stdout);
sleep(1);
}
@ -106,5 +142,9 @@ int main(int argc, char **argv)
bpf_link__destroy(link);
UEI_REPORT(skel, uei);
scx_qmap__destroy(skel);
/*
* scx_qmap implements ops.cpu_on/offline() and doesn't need to restart
* on CPU hotplug events.
*/
return 0;
}

View File

@ -129,7 +129,6 @@ void BPF_STRUCT_OPS(simple_enable, struct task_struct *p)
s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init)
{
__COMPAT_scx_bpf_switch_all();
return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

View File

@ -17,13 +17,22 @@ const char help_fmt[] =
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-f]\n"
"Usage: %s [-f] [-v]\n"
"\n"
" -f Use FIFO scheduling instead of weighted vtime scheduling\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req;
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
if (level == LIBBPF_DEBUG && !verbose)
return 0;
return vfprintf(stderr, format, args);
}
static void sigint_handler(int simple)
{
exit_req = 1;
@ -54,20 +63,22 @@ int main(int argc, char **argv)
struct scx_simple *skel;
struct bpf_link *link;
__u32 opt;
__u64 ecode;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
restart:
skel = SCX_OPS_OPEN(simple_ops, scx_simple);
libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
skel = scx_simple__open();
SCX_BUG_ON(!skel, "Failed to open skel");
while ((opt = getopt(argc, argv, "fh")) != -1) {
while ((opt = getopt(argc, argv, "fvh")) != -1) {
switch (opt) {
case 'f':
skel->rodata->fifo_sched = true;
break;
case 'v':
verbose = true;
break;
default:
fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h';
@ -87,7 +98,10 @@ int main(int argc, char **argv)
}
bpf_link__destroy(link);
UEI_REPORT(skel, uei);
ecode = UEI_REPORT(skel, uei);
scx_simple__destroy(skel);
if (UEI_ECODE_RESTART(ecode))
goto restart;
return 0;
}

View File

@ -28,9 +28,54 @@ static inline void ___vmlinux_h_sanity_check___(void)
"bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole");
}
s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym;
void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym;
u32 scx_bpf_dispatch_nr_slots(void) __ksym;
void scx_bpf_dispatch_cancel(void) __ksym;
bool scx_bpf_consume(u64 dsq_id) __ksym;
bool __scx_bpf_consume_task(unsigned long it, struct task_struct *p) __ksym __weak;
u32 scx_bpf_reenqueue_local(void) __ksym;
void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, bool rev) __ksym __weak;
struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak;
void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym;
void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
unsigned long long *data, u32 data__sz) __ksym __weak;
u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak;
u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak;
void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak;
u32 scx_bpf_nr_cpu_ids(void) __ksym __weak;
const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak;
const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak;
void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak;
const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym;
const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym;
void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym;
bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
bool scx_bpf_task_running(const struct task_struct *p) __ksym;
s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
/*
* Use the following as @it when calling scx_bpf_consume_task() from within
* bpf_for_each() loops.
*/
#define BPF_FOR_EACH_ITER (&___it)
/* hopefully temporary wrapper to work around BPF restriction */
static inline bool scx_bpf_consume_task(struct bpf_iter_scx_dsq *it,
struct task_struct *p)
{
unsigned long ptr;
bpf_probe_read_kernel(&ptr, sizeof(ptr), it);
return __scx_bpf_consume_task(ptr, p);
}
static inline __attribute__((format(printf, 1, 2)))
void ___scx_bpf_exit_format_checker(const char *fmt, ...) {}
@ -40,18 +85,18 @@ void ___scx_bpf_exit_format_checker(const char *fmt, ...) {}
* bstr exit kfuncs. Callers to this function should use ___fmt and ___param to
* refer to the initialized list of inputs to the bstr kfunc.
*/
#define scx_bpf_exit_preamble(fmt, args...) \
static char ___fmt[] = fmt; \
/* \
* Note that __param[] must have at least one \
* element to keep the verifier happy. \
*/ \
unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \
\
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
___bpf_fill(___param, args); \
_Pragma("GCC diagnostic pop") \
#define scx_bpf_exit_preamble(fmt, args...) \
static char ___fmt[] = fmt; \
/* \
* Note that __param[] must have at least one \
* element to keep the verifier happy. \
*/ \
unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \
\
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
___bpf_fill(___param, args); \
_Pragma("GCC diagnostic pop") \
/*
* scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments
@ -78,30 +123,6 @@ void ___scx_bpf_exit_format_checker(const char *fmt, ...) {}
___scx_bpf_exit_format_checker(fmt, ##args); \
})
s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
bool scx_bpf_consume(u64 dsq_id) __ksym;
void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym;
void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym;
u32 scx_bpf_dispatch_nr_slots(void) __ksym;
void scx_bpf_dispatch_cancel(void) __ksym;
void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym;
const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym;
void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym;
void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
bool scx_bpf_task_running(const struct task_struct *p) __ksym;
s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
u32 scx_bpf_reenqueue_local(void) __ksym;
u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym;
u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym;
void scx_bpf_cpuperf_set(u32 cpu, u32 perf) __ksym __weak;
#define BPF_STRUCT_OPS(name, args...) \
SEC("struct_ops/"#name) \
BPF_PROG(name, ##args)
@ -156,7 +177,8 @@ BPF_PROG(name, ##args)
* be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of
* `MEMBER_VPTR(ptr, ->member)`.
*/
#define MEMBER_VPTR(base, member) (typeof((base) member) *)({ \
#define MEMBER_VPTR(base, member) (typeof((base) member) *) \
({ \
u64 __base = (u64)&(base); \
u64 __addr = (u64)&((base) member) - __base; \
_Static_assert(sizeof(base) >= sizeof((base) member), \
@ -186,18 +208,19 @@ BPF_PROG(name, ##args)
* size of the array to compute the max, which will result in rejection by
* the verifier.
*/
#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *)({ \
u64 __base = (u64)arr; \
u64 __addr = (u64)&(arr[i]) - __base; \
asm volatile ( \
"if %0 <= %[max] goto +2\n" \
"%0 = 0\n" \
"goto +1\n" \
"%0 += %1\n" \
: "+r"(__addr) \
: "r"(__base), \
[max]"r"(sizeof(arr[0]) * ((n) - 1))); \
__addr; \
#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \
({ \
u64 __base = (u64)arr; \
u64 __addr = (u64)&(arr[i]) - __base; \
asm volatile ( \
"if %0 <= %[max] goto +2\n" \
"%0 = 0\n" \
"goto +1\n" \
"%0 += %1\n" \
: "+r"(__addr) \
: "r"(__base), \
[max]"r"(sizeof(arr[0]) * ((n) - 1))); \
__addr; \
})
/*
@ -227,7 +250,7 @@ int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym;
extern void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym;
void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym;
#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL)
/* task */

View File

@ -18,13 +18,15 @@
/*
* %SCX_KICK_IDLE is a later addition. To support both before and after, use
* %__COMPAT_SCX_KICK_IDLE which becomes 0 on kernels which don't support it.
* Users can use %SCX_KICK_IDLE directly in the future.
*/
#define __COMPAT_SCX_KICK_IDLE \
__COMPAT_ENUM_OR_ZERO(enum scx_kick_flags, SCX_KICK_IDLE)
/*
* scx_switch_all() was replaced by %SCX_OPS_SWITCH_PARTIAL. See
* %__COMPAT_SCX_OPS_SWITCH_PARTIAL in compat.h.
* %__COMPAT_SCX_OPS_SWITCH_PARTIAL in compat.h. This can be dropped in the
* future.
*/
void scx_bpf_switch_all(void) __ksym __weak;
@ -34,6 +36,67 @@ static inline void __COMPAT_scx_bpf_switch_all(void)
scx_bpf_switch_all();
}
/*
* scx_bpf_exit() is a new addition. Fall back to scx_bpf_error() if
* unavailable. Users can use scx_bpf_exit() directly in the future.
*/
#define __COMPAT_scx_bpf_exit(code, fmt, args...) \
({ \
if (bpf_ksym_exists(scx_bpf_exit_bstr)) \
scx_bpf_exit((code), fmt, args); \
else \
scx_bpf_error(fmt, args); \
})
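For illustration (not part of the file above), a scheduler's exit path might use this wrapper roughly as in the sketch below, assuming the kernel exposes enum scx_exit_code (SCX_ECODE_ACT_RESTART, SCX_ECODE_RSN_HOTPLUG) through vmlinux.h:

	/*
	 * Hypothetical usage sketch: ask userspace to reload the scheduler when
	 * a CPU goes away unexpectedly. On kernels without scx_bpf_exit_bstr()
	 * the macro falls back to scx_bpf_error() and the exit code is dropped.
	 */
	__COMPAT_scx_bpf_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
			      "cpu %d went offline", cpu);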
/*
* scx_bpf_nr_cpu_ids(), scx_bpf_get_possible/online_cpumask() are new. No good
* way to noop these kfuncs. Provide a test macro. Users can assume existence in
* the future.
*/
#define __COMPAT_HAS_CPUMASKS \
bpf_ksym_exists(scx_bpf_nr_cpu_ids)
/*
* cpuperf is new. The following become no-ops on older kernels. Callers can be
* updated to call cpuperf kfuncs directly in the future.
*/
static inline u32 __COMPAT_scx_bpf_cpuperf_cap(s32 cpu)
{
if (bpf_ksym_exists(scx_bpf_cpuperf_cap))
return scx_bpf_cpuperf_cap(cpu);
else
return 1024;
}
static inline u32 __COMPAT_scx_bpf_cpuperf_cur(s32 cpu)
{
if (bpf_ksym_exists(scx_bpf_cpuperf_cur))
return scx_bpf_cpuperf_cur(cpu);
else
return 1024;
}
static inline void __COMPAT_scx_bpf_cpuperf_set(s32 cpu, u32 perf)
{
if (bpf_ksym_exists(scx_bpf_cpuperf_set))
return scx_bpf_cpuperf_set(cpu, perf);
}
/*
* Iteration and scx_bpf_consume_task() are new. The following become no-ops on
* older kernels. Users can switch to bpf_for_each(scx_dsq) and directly
* call scx_bpf_consume_task() in the future.
*/
#define __COMPAT_DSQ_FOR_EACH(p, dsq_id, flags) \
if (bpf_ksym_exists(bpf_iter_scx_dsq_new)) \
bpf_for_each(scx_dsq, (p), (dsq_id), (flags))
static inline bool __COMPAT_scx_bpf_consume_task(struct bpf_iter_scx_dsq *it,
struct task_struct *p)
{
return false;
}
/*
* Define sched_ext_ops. This may be expanded to define multiple variants for
* backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().

View File

@ -8,6 +8,9 @@
#define __SCX_COMPAT_H
#include <bpf/btf.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
struct btf *__COMPAT_vmlinux_btf __attribute__((weak));
@ -69,6 +72,12 @@ static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v
__val; \
})
static inline bool __COMPAT_has_ksym(const char *ksym)
{
__COMPAT_load_vmlinux_btf();
return btf__find_by_name(__COMPAT_vmlinux_btf, ksym) >= 0;
}
static inline bool __COMPAT_struct_has_field(const char *type, const char *field)
{
const struct btf_type *t;
@ -101,27 +110,79 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field
* An ops flag, %SCX_OPS_SWITCH_PARTIAL, replaced scx_bpf_switch_all() which had
* to be called from ops.init(). To support both before and after, use both
* %__COMPAT_SCX_OPS_SWITCH_PARTIAL and %__COMPAT_scx_bpf_switch_all() defined
* in compat.bpf.h.
* in compat.bpf.h. Users can switch to directly using %SCX_OPS_SWITCH_PARTIAL
* in the future.
*/
#define __COMPAT_SCX_OPS_SWITCH_PARTIAL \
__COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL")
/*
* scx_bpf_nr_cpu_ids(), scx_bpf_get_possible/online_cpumask() are new. Users
* will be able to assume existence in the future.
*/
#define __COMPAT_HAS_CPUMASKS \
__COMPAT_has_ksym("scx_bpf_nr_cpu_ids")
/*
* DSQ iterator is new. Users will be able to assume existence in the future.
*/
#define __COMPAT_HAS_DSQ_ITER \
__COMPAT_has_ksym("bpf_iter_scx_dsq_new")
static inline long scx_hotplug_seq(void)
{
int fd;
char buf[32];
ssize_t len;
long val;
fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY);
if (fd < 0)
return -ENOENT;
len = read(fd, buf, sizeof(buf) - 1);
SCX_BUG_ON(len <= 0, "read failed (%ld)", len);
buf[len] = 0;
close(fd);
val = strtoul(buf, NULL, 10);
SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val);
return val;
}
/*
* struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE()
* is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load
* and attach it, backward compatibility is automatically maintained where
* reasonable.
*
* - sched_ext_ops.exit_dump_len was added later. On kernels which don't support
* it, the value is ignored and a warning is triggered if the value is
* requested to be non-zero.
* - ops.tick(): Ignored on older kernels with a warning.
* - ops.exit_dump_len: Cleared to zero on older kernels with a warning.
* - ops.hotplug_seq: Ignored on older kernels.
*/
#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \
struct __scx_name *__skel; \
\
__skel = __scx_name##__open(); \
SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \
\
if (__COMPAT_struct_has_field("sched_ext_ops", "hotplug_seq")) \
__skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \
__skel; \
})
#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \
UEI_SET_SIZE(__skel, __ops_name, __uei_name); \
if (__COMPAT_struct_has_field("sched_ext_ops", "exit_dump_len") && \
if (!__COMPAT_struct_has_field("sched_ext_ops", "exit_dump_len") && \
(__skel)->struct_ops.__ops_name->exit_dump_len) { \
fprintf(stderr, "WARNING: kernel doesn't support setting exit dump len\n"); \
(__skel)->struct_ops.__ops_name->exit_dump_len = 0; \
(__skel)->struct_ops.__ops_name->exit_dump_len = 0; \
} \
if (!__COMPAT_struct_has_field("sched_ext_ops", "tick") && \
(__skel)->struct_ops.__ops_name->tick) { \
fprintf(stderr, "WARNING: kernel doesn't support ops.tick()\n"); \
(__skel)->struct_ops.__ops_name->tick = NULL; \
} \
SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \
})
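Taken together with the scheduler changes above, the loader flow these helpers support looks roughly like the following condensed sketch (scx_central names used for illustration; option parsing, the run loop, and error handling of the real main() functions are elided):

	struct scx_central *skel;
	struct bpf_link *link;
	__u64 ecode;

restart:
	/* open the skeleton; hotplug_seq is filled in when the kernel supports it */
	skel = SCX_OPS_OPEN(central_ops, scx_central);
	/* ... set skel->rodata fields, parse options ... */

	/* sets the uei dump len and clears unsupported ops fields with a warning */
	SCX_OPS_LOAD(skel, central_ops, scx_central, uei);
	link = SCX_OPS_ATTACH(skel, central_ops);

	/* ... run until interrupted or the BPF scheduler exits ... */

	bpf_link__destroy(link);
	ecode = UEI_REPORT(skel, uei);
	scx_central__destroy(skel);

	/* reload if the kernel exited the scheduler only to request a restart */
	if (UEI_ECODE_RESTART(ecode))
		goto restart;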

View File

@ -77,7 +77,35 @@ struct user_exit_info {
if (__uei->msg[0] != '\0') \
fprintf(stderr, " (%s)", __uei->msg); \
fputs("\n", stderr); \
__uei->exit_code; \
})
/*
* We can't import vmlinux.h while compiling user C code. Let's duplicate the
* scx_exit_code definition.
*/
enum scx_exit_code {
/* Reasons */
SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
/* Actions */
SCX_ECODE_ACT_RESTART = 1LLU << 48,
};
enum uei_ecode_mask {
UEI_ECODE_USER_MASK = ((1LLU << 32) - 1),
UEI_ECODE_SYS_RSN_MASK = ((1LLU << 16) - 1) << 32,
UEI_ECODE_SYS_ACT_MASK = ((1LLU << 16) - 1) << 48,
};
/*
* These macros interpret the ecode returned from UEI_REPORT().
*/
#define UEI_ECODE_USER(__ecode) ((__ecode) & UEI_ECODE_USER_MASK)
#define UEI_ECODE_SYS_RSN(__ecode) ((__ecode) & UEI_ECODE_SYS_RSN_MASK)
#define UEI_ECODE_SYS_ACT(__ecode) ((__ecode) & UEI_ECODE_SYS_ACT_MASK)
#define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART)
#endif /* __bpf__ */
#endif /* __USER_EXIT_INFO_H */
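As a usage note rather than part of the diff, the masks above let a loader split the value that UEI_REPORT() now returns into its user, reason, and action components; a minimal sketch, assuming ecode was captured as in the schedulers above:

	__u64 ecode = UEI_REPORT(skel, uei);

	if (UEI_ECODE_USER(ecode))
		fprintf(stderr, "user exit code: %llu\n",
			(unsigned long long)UEI_ECODE_USER(ecode));
	if (UEI_ECODE_SYS_RSN(ecode) & SCX_ECODE_RSN_HOTPLUG)
		fprintf(stderr, "scheduler exited due to CPU hotplug\n");
	/* UEI_ECODE_RESTART() checks the action bits for SCX_ECODE_ACT_RESTART */
	if (UEI_ECODE_RESTART(ecode))
		goto restart;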

View File

@ -1 +1 @@
vmlinux-v6.9-g5dc95302301f.h
vmlinux-v6.9-g73f4013eb1eb.h