Mirror of https://github.com/sched-ext/scx.git, synced 2024-12-04 16:27:12 +00:00

Commit dac0142be5: Merge b692c415f1 into 7d14df8ca2
.gitignore (vendored, 1 line changed)
@@ -5,3 +5,4 @@ target
 *.swp
 .cache/
 .vscode/
+**/tags
scheds/c/meson.build

@@ -1,5 +1,5 @@
 c_scheds = ['scx_simple', 'scx_qmap', 'scx_central', 'scx_userland', 'scx_nest',
-            'scx_flatcg', 'scx_pair']
+            'scx_flatcg', 'scx_pair', 'scx_sdt']

 foreach sched: c_scheds
     thread_dep = dependency('threads')
scheds/c/scx_sdt.bpf.c (new file, 143 lines)
@@ -0,0 +1,143 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <scx/common.bpf.h>
#include <scx/sdt_task_impl.bpf.h>

#include "scx_sdt.h"

char _license[] SEC("license") = "GPL";

UEI_DEFINE(uei);

#define SHARED_DSQ 0

#define DEFINE_SDT_STAT(metric)                                 \
static SDT_TASK_FN_ATTRS void                                   \
stat_inc_##metric(struct sdt_stats __arena *stats)              \
{                                                               \
        cast_kern(stats);                                       \
        stats->metric += 1;                                     \
}                                                               \
__u64 stat_##metric;

DEFINE_SDT_STAT(enqueue);
DEFINE_SDT_STAT(init);
DEFINE_SDT_STAT(exit);
DEFINE_SDT_STAT(select_idle_cpu);
DEFINE_SDT_STAT(select_busy_cpu);

static SDT_TASK_FN_ATTRS void
stat_global_update(struct sdt_stats __arena *stats)
{
        cast_kern(stats);
        __sync_fetch_and_add(&stat_enqueue, stats->enqueue);
        __sync_fetch_and_add(&stat_init, stats->init);
        __sync_fetch_and_add(&stat_exit, stats->exit);
        __sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu);
        __sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu);
}

s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
        struct sdt_stats __arena *stats;
        bool is_idle = false;
        s32 cpu;

        stats = sdt_task_retrieve(p);
        if (!stats) {
                bpf_printk("%s: no stats for pid %d", __func__, p->pid);
                return prev_cpu;
        }

        cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
        if (is_idle) {
                stat_inc_select_idle_cpu(stats);
                scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
        } else {
                stat_inc_select_busy_cpu(stats);
        }

        return cpu;
}

void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags)
{
        struct sdt_stats __arena *stats;

        stats = sdt_task_retrieve(p);
        if (!stats) {
                bpf_printk("%s: no stats for pid %d", __func__, p->pid);
                return;
        }

        stat_inc_enqueue(stats);

        scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
}

void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev)
{
        scx_bpf_consume(SHARED_DSQ);
}

s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p,
                             struct scx_init_task_args *args)
{
        struct sdt_stats __arena *stats;

        stats = sdt_task_alloc(p);
        if (!stats) {
                bpf_printk("arena allocator out of memory");
                return -ENOMEM;
        }

        stats->pid = p->pid;

        stat_inc_init(stats);

        return 0;
}

void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p,
                    struct scx_exit_task_args *args)
{
        struct sdt_stats __arena *stats;

        stats = sdt_task_retrieve(p);
        if (!stats) {
                bpf_printk("%s: no stats for pid %d", __func__, p->pid);
                return;
        }

        stat_inc_exit(stats);
        stat_global_update(stats);

        sdt_task_free(p);
}

s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init)
{
        int ret;

        ret = sdt_task_init(sizeof(struct sdt_stats));
        if (ret < 0) {
                bpf_printk("sdt_init failed with %d", ret);
                return ret;
        }

        return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei)
{
        UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(sdt_ops,
               .select_cpu      = (void *)sdt_select_cpu,
               .enqueue         = (void *)sdt_enqueue,
               .dispatch        = (void *)sdt_dispatch,
               .init_task       = (void *)sdt_init_task,
               .exit_task       = (void *)sdt_exit_task,
               .init            = (void *)sdt_init,
               .exit            = (void *)sdt_exit,
               .name            = "sdt");
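For reference, each DEFINE_SDT_STAT() invocation above generates a per-task increment helper plus a matching global counter in .bss that the userspace loader reads through the skeleton. DEFINE_SDT_STAT(enqueue), for example, expands to the equivalent of:

/* Expansion of DEFINE_SDT_STAT(enqueue), shown for illustration. */
static SDT_TASK_FN_ATTRS void
stat_inc_enqueue(struct sdt_stats __arena *stats)
{
        cast_kern(stats);       /* arena pointer -> kernel address-space view */
        stats->enqueue += 1;
}
__u64 stat_enqueue;             /* folded in by stat_global_update(); read as skel->bss->stat_enqueue */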
scheds/c/scx_sdt.c (new file, 92 lines)
@@ -0,0 +1,92 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com>
 * Copyright (c) 2024 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <stdio.h>
#include <unistd.h>
#include <signal.h>
#include <libgen.h>
#include <bpf/bpf.h>
#include <scx/common.h>
#include <scx/sdt_task.h>
#include "scx_sdt.bpf.skel.h"

#include "scx_sdt.h"

const char help_fmt[] =
"A simple sched_ext scheduler.\n"
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-v]\n"
"\n"
"  -v            Print libbpf debug messages\n"
"  -h            Display this help and exit\n";

static bool verbose;
static volatile int exit_req;

static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
        if (level == LIBBPF_DEBUG && !verbose)
                return 0;
        return vfprintf(stderr, format, args);
}

static void sigint_handler(int sig)
{
        exit_req = 1;
}

int main(int argc, char **argv)
{
        struct scx_sdt *skel;
        struct bpf_link *link;
        __u32 opt;
        __u64 ecode;

        libbpf_set_print(libbpf_print_fn);
        signal(SIGINT, sigint_handler);
        signal(SIGTERM, sigint_handler);
restart:
        skel = SCX_OPS_OPEN(sdt_ops, scx_sdt);

        while ((opt = getopt(argc, argv, "vh")) != -1) {
                switch (opt) {
                case 'v':
                        verbose = true;
                        break;
                default:
                        fprintf(stderr, help_fmt, basename(argv[0]));
                        return opt != 'h';
                }
        }

        SCX_OPS_LOAD(skel, sdt_ops, scx_sdt, uei);
        link = SCX_OPS_ATTACH(skel, sdt_ops, scx_sdt);

        while (!exit_req && !UEI_EXITED(skel, uei)) {
                printf("enqueues=%llu\t", skel->bss->stat_enqueue);
                printf("inits=%llu\t", skel->bss->stat_init);
                printf("exits=%llu\t", skel->bss->stat_exit);
                printf("\n");

                printf("select_idle_cpu=%llu\t", skel->bss->stat_select_idle_cpu);
                printf("select_busy_cpu=%llu\t", skel->bss->stat_select_busy_cpu);
                printf("\n");

                fflush(stdout);
                sleep(1);
        }

        bpf_link__destroy(link);
        ecode = UEI_REPORT(skel, uei);
        scx_sdt__destroy(skel);

        if (UEI_ECODE_RESTART(ecode))
                goto restart;
        return 0;
}
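One thing to note when reading the output: the global stat_* counters are only folded in by stat_global_update(), which runs from sdt_exit_task in the .bpf.c file, so the totals printed here reflect tasks that have already exited rather than live per-task counts.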
scheds/c/scx_sdt.h (new file, 10 lines)
@@ -0,0 +1,10 @@
#pragma once
struct sdt_stats {
        int     seq;
        pid_t   pid;
        __u64   init;
        __u64   enqueue;
        __u64   exit;
        __u64   select_idle_cpu;
        __u64   select_busy_cpu;
};
scheds/include/scx/bpf_arena_common.h (new file, 124 lines)
@@ -0,0 +1,124 @@
/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#pragma once

#ifndef WRITE_ONCE
#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val))
#endif

#ifndef NUMA_NO_NODE
#define NUMA_NO_NODE (-1)
#endif

#ifndef arena_container_of
#define arena_container_of(ptr, type, member)                   \
        ({                                                      \
                void __arena *__mptr = (void __arena *)(ptr);   \
                ((type *)(__mptr - offsetof(type, member)));    \
        })
#endif

#ifdef __BPF__ /* when compiled as bpf program */

#ifndef PAGE_SIZE
#define PAGE_SIZE __PAGE_SIZE
/*
 * for older kernels try sizeof(struct genradix_node)
 * or flexible:
 * static inline long __bpf_page_size(void) {
 *   return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node);
 * }
 * but generated code is not great.
 */
#endif

#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM)
#define __arena __attribute__((address_space(1)))
#define __arena_global __attribute__((address_space(1)))
#define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */
#define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */
#else

/* emit instruction:
 * rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as
 *
 * This is a workaround for LLVM compiler versions without
 * __BPF_FEATURE_ADDR_SPACE_CAST that do not automatically cast between arena
 * pointers and native kernel/userspace ones. In this case we explicitly do so
 * with cast_kern() and cast_user(). E.g., in the Linux kernel tree,
 * tools/testing/selftests/bpf includes tests that use these macros to implement
 * linked lists and hashtables backed by arena memory. In sched_ext, we use
 * cast_kern() and cast_user() for compatibility with older LLVM toolchains.
 */
#ifndef bpf_addr_space_cast
#define bpf_addr_space_cast(var, dst_as, src_as)\
        asm volatile(".byte 0xBF;               \
                     .ifc %[reg], r0;           \
                     .byte 0x00;                \
                     .endif;                    \
                     .ifc %[reg], r1;           \
                     .byte 0x11;                \
                     .endif;                    \
                     .ifc %[reg], r2;           \
                     .byte 0x22;                \
                     .endif;                    \
                     .ifc %[reg], r3;           \
                     .byte 0x33;                \
                     .endif;                    \
                     .ifc %[reg], r4;           \
                     .byte 0x44;                \
                     .endif;                    \
                     .ifc %[reg], r5;           \
                     .byte 0x55;                \
                     .endif;                    \
                     .ifc %[reg], r6;           \
                     .byte 0x66;                \
                     .endif;                    \
                     .ifc %[reg], r7;           \
                     .byte 0x77;                \
                     .endif;                    \
                     .ifc %[reg], r8;           \
                     .byte 0x88;                \
                     .endif;                    \
                     .ifc %[reg], r9;           \
                     .byte 0x99;                \
                     .endif;                    \
                     .short %[off];             \
                     .long %[as]"               \
                     : [reg]"+r"(var)           \
                     : [off]"i"(BPF_ADDR_SPACE_CAST) \
                     , [as]"i"((dst_as << 16) | src_as));
#endif

#define __arena
#define __arena_global SEC(".addr_space.1")
#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1)
#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0)
#endif

void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt,
                                    int node_id, __u64 flags) __ksym __weak;
void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak;

#else /* when compiled as user space code */

#define __arena
#define __arg_arena
#define cast_kern(ptr) /* nop for user space */
#define cast_user(ptr) /* nop for user space */
char __attribute__((weak)) arena[1];

#ifndef offsetof
#define offsetof(type, member) ((unsigned long)&((type *)0)->member)
#endif

static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt,
                                                  int node_id, __u64 flags)
{
        return NULL;
}
static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt)
{
}

#endif
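A minimal sketch of how these macros fit together in a BPF program, assuming an arena map named arena like the one declared in sdt_task_impl.bpf.h below; struct counters and touch_counters() are hypothetical:

struct counters { __u64 hits; };

static inline void touch_counters(void)
{
        struct counters __arena *c;

        /* Allocate one arena page; returns an arena-side pointer. */
        c = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
        if (!c)
                return;

        /*
         * Cast to the kernel view before dereferencing: a nop on LLVM
         * with __BPF_FEATURE_ADDR_SPACE_CAST, an explicit address-space
         * cast instruction otherwise.
         */
        cast_kern(c);
        c->hits += 1;
}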
scheds/include/scx/sdt_list.h (new file, 63 lines)
@@ -0,0 +1,63 @@
/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/*
 * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2024 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com>
 */
#pragma once
#include "bpf_arena_common.h"

struct arena_list_node;

typedef struct arena_list_node __arena arena_list_node_t;

struct arena_list_node {
        arena_list_node_t       *next;
        u64                     padding[2];
        u64 __arena             data[];
};

struct arena_list_head {
        struct arena_list_node __arena *first;
};
typedef struct arena_list_head __arena arena_list_head_t;

#ifndef __BPF__
static inline void *bpf_iter_num_new(struct bpf_iter_num *it, int i, int j) { return NULL; }
static inline void bpf_iter_num_destroy(struct bpf_iter_num *it) {}
static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; }
#define cond_break ({})
#define can_loop true
#endif

static inline void list_add_head(arena_list_node_t *n, arena_list_head_t *h)
{
        arena_list_node_t *first = h->first;
        arena_list_node_t * __arena *tmp;

        cast_kern(n);
        WRITE_ONCE(n->next, first);

        tmp = &h->first;
        cast_kern(tmp);
        WRITE_ONCE(*tmp, n);
}

static inline arena_list_node_t *list_pop(arena_list_head_t *h)
{
        arena_list_node_t *first = h->first;
        arena_list_node_t *tmp;
        arena_list_node_t *next;

        if (!first)
                return NULL;

        tmp = first;
        cast_kern(tmp);
        next = tmp->next;

        cast_kern(h);
        WRITE_ONCE(h->first, next);

        return first;
}
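A short usage sketch for the two primitives above (example_push_pop is a hypothetical helper; it assumes the node points into arena memory, e.g. obtained from the pool allocator in sdt_task_impl.bpf.h):

static inline void example_push_pop(arena_list_head_t *head,
                                    arena_list_node_t *node)
{
        arena_list_node_t *popped;

        /* O(1) push at the head; node->next is set to the old first. */
        list_add_head(node, head);

        /* Pops the most recently pushed node; returns NULL when empty. */
        popped = list_pop(head);
}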
scheds/include/scx/sdt_task.h (new file, 75 lines)
@@ -0,0 +1,75 @@
/*
 * SPDX-License-Identifier: GPL-2.0
 * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2024 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com>
 */
#pragma once

#include "sdt_list.h"

#ifndef div_round_up
#define div_round_up(a, b) (((a) + (b) - 1) / (b))
#endif

enum sdt_task_consts {
        SDT_TASK_ALLOC_RESERVE          = 0xbeefcafe,
        SDT_TASK_ENT_PAGE_SHIFT         = 0,
        SDT_TASK_ENT_PAGES              = 1 << SDT_TASK_ENT_PAGE_SHIFT,
        SDT_TASK_ENTS_PER_PAGE_SHIFT    = 9,
        SDT_TASK_ALLOCATION_ATTEMPTS    = 8192,
        SDT_TASK_LEVELS                 = 3,
        SDT_TASK_ENTS_PER_CHUNK_SHIFT   = SDT_TASK_ENT_PAGE_SHIFT + SDT_TASK_ENTS_PER_PAGE_SHIFT,
        /*
         * Skim space off the chunk so that both the chunk and the
         * allocator linked list are included in the same arena page.
         */
        SDT_TASK_ENTS_PER_CHUNK         = (1 << SDT_TASK_ENTS_PER_CHUNK_SHIFT) - (16 * sizeof(struct arena_list_node)),
        SDT_TASK_CHUNK_BITMAP_U64S      = div_round_up(SDT_TASK_ENTS_PER_CHUNK, 64),
};

union sdt_task_id {
        __s64           val;
        struct {
                __s32   idx;    /* index in the radix tree */
                __s32   gen;    /* ++'d on recycle so that it forms unique'ish 64bit ID */
        };
};

struct sdt_task_chunk;

/*
 * Each index page is described by the following descriptor which carries the
 * bitmap. This way the actual index can host power-of-two numbers of entries
 * which makes indexing cheaper.
 */
struct sdt_task_desc {
        __u64                           allocated[SDT_TASK_CHUNK_BITMAP_U64S];
        __u64                           nr_free;
        struct sdt_task_chunk __arena   *chunk;
};

/*
 * Leaf node containing per-task data.
 */
struct sdt_task_data {
        union sdt_task_id       tid;
        __u64                   tptr;
        __u64 __arena           payload[];
};

/*
 * Intermediate node pointing to another intermediate node or leaf node.
 */
struct sdt_task_chunk {
        union {
                struct sdt_task_desc __arena    *descs[SDT_TASK_ENTS_PER_CHUNK];
                struct sdt_task_data __arena    *data[SDT_TASK_ENTS_PER_CHUNK];
        };
};

struct sdt_task_pool {
        arena_list_head_t       head;
        __u64                   elem_size;
        __u64                   free_size;
};
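To make the geometry concrete: with SDT_TASK_ENTS_PER_CHUNK_SHIFT == 9 and SDT_TASK_LEVELS == 3, a task index carries three 9-bit chunk positions (up to 2^27 slots in principle, slightly fewer per chunk because of the skimmed entries). A sketch of the decomposition, mirroring the shift/mask logic in sdt_task_impl.bpf.h below (sdt_idx_decompose is a hypothetical name):

/* Split idx into one position per tree level; pos[0] indexes the root. */
static inline void sdt_idx_decompose(__u64 idx, __u64 pos[SDT_TASK_LEVELS])
{
        const __u64 mask = (1 << SDT_TASK_ENTS_PER_CHUNK_SHIFT) - 1;
        __u64 level, shift;

        for (level = 0; level < SDT_TASK_LEVELS; level++) {
                shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_CHUNK_SHIFT;
                pos[level] = (idx >> shift) & mask;
        }
}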
scheds/include/scx/sdt_task_impl.bpf.h (new file, 579 lines)
@@ -0,0 +1,579 @@
/*
 * SPDX-License-Identifier: GPL-2.0
 * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2024 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com>
 */
#pragma once

#include "sdt_task.h"

#define SDT_TASK_FN_ATTRS inline __attribute__((unused, always_inline))

struct {
        __uint(type, BPF_MAP_TYPE_ARENA);
        __uint(map_flags, BPF_F_MMAPABLE);
        __uint(max_entries, 1 << 20); /* number of pages */
#ifdef __TARGET_ARCH_arm64
        __ulong(map_extra, (1ull << 32)); /* start of mmap() region */
#else
        __ulong(map_extra, (1ull << 44)); /* start of mmap() region */
#endif
} arena __weak SEC(".maps");

/*
 * task BPF map entry recording the task's assigned ID and pointing to the data
 * area allocated in arena.
 */
struct sdt_task_map_val {
        union sdt_task_id               tid;
        struct sdt_task_data __arena    *data;
};

struct {
        __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
        __uint(map_flags, BPF_F_NO_PREALLOC);
        __type(key, int);
        __type(value, struct sdt_task_map_val);
} sdt_task_map SEC(".maps");

/*
 * XXX Hack to get the verifier to find the arena for sdt_exit_task.
 * As of 6.12-rc5, the verifier associates arenas with programs by
 * checking LD.IMM instruction operands for an arena and populating
 * the program state with the first instance it finds. This requires
 * accessing our global arena variable, but scx methods do not necessarily
 * do so while still using pointers from that arena. Insert a bpf_printk
 * statement that triggers at most once to generate an LD.IMM instruction
 * to access the arena and help the verifier.
 */
static bool sdt_verify_once;

static SDT_TASK_FN_ATTRS void sdt_arena_verify(void)
{
        if (sdt_verify_once)
                return;

        bpf_printk("%s: arena pointer %p", __func__, &arena);
        sdt_verify_once = true;
}

static struct sdt_task_desc __arena *sdt_task_desc_root; /* radix tree root */
static struct sdt_task_desc __arena *sdt_task_new_chunk; /* new chunk cache */

private(LOCK) struct bpf_spin_lock sdt_task_lock;
private(POOL_LOCK) struct bpf_spin_lock sdt_task_pool_alloc_lock;

/* allocation pools */
struct sdt_task_pool __arena sdt_task_desc_pool;
struct sdt_task_pool __arena sdt_task_chunk_pool;
struct sdt_task_pool __arena sdt_task_data_pool;

static SDT_TASK_FN_ATTRS int sdt_ffs(__u64 word)
{
        unsigned int num = 0;

        if ((word & 0xffffffff) == 0) {
                num += 32;
                word >>= 32;
        }

        if ((word & 0xffff) == 0) {
                num += 16;
                word >>= 16;
        }

        if ((word & 0xff) == 0) {
                num += 8;
                word >>= 8;
        }

        if ((word & 0xf) == 0) {
                num += 4;
                word >>= 4;
        }

        if ((word & 0x3) == 0) {
                num += 2;
                word >>= 2;
        }

        if ((word & 0x1) == 0) {
                num += 1;
                word >>= 1;
        }

        return num;
}
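/*
 * Example walk-through of sdt_ffs() above: sdt_ffs(0x10) == 4. The low
 * 32/16/8-bit windows are nonzero so nothing is skipped; the low 4 bits
 * are zero, so num += 4 and word >>= 4, leaving bit 0 set with no further
 * adjustment. Each step halves the search window, so the answer falls out
 * in six comparisons. Callers (sdt_chunk_find_empty() below) only pass
 * nonzero words.
 */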
/* find the first empty slot */
static SDT_TASK_FN_ATTRS __u64 sdt_chunk_find_empty(struct sdt_task_desc __arena *desc)
{
        __u64 freelist;
        __u64 i;

        cast_kern(desc);

        for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) {
                freelist = ~desc->allocated[i];
                if (freelist == (__u64)0)
                        continue;

                return (i * 64) + sdt_ffs(freelist);
        }

        return SDT_TASK_ENTS_PER_CHUNK;
}

/* simple memory allocator */
static SDT_TASK_FN_ATTRS
void __arena *sdt_task_alloc_from_pool(struct sdt_task_pool __arena *pool)
{
        arena_list_node_t *elem = NULL;
        void __arena *new_page = NULL;
        arena_list_node_t *new_elem;
        __u32 u, numelems;

        /* If the pool has a free element, pop it while holding the lock. */
        bpf_spin_lock(&sdt_task_pool_alloc_lock);

        if (pool->head.first) {
                elem = list_pop(&pool->head);
                bpf_spin_unlock(&sdt_task_pool_alloc_lock);
                return (void __arena *)&elem->data;
        }

        bpf_spin_unlock(&sdt_task_pool_alloc_lock);

        /* The pool is empty; get a new page. */
        new_page = bpf_arena_alloc_pages(&arena, NULL, SDT_TASK_ENT_PAGES, NUMA_NO_NODE, 0);
        if (!new_page)
                return NULL;

        /*
         * Push all allocated elements except for the last one, which we use
         * to satisfy the current allocation.
         */

        numelems = (SDT_TASK_ENT_PAGES * PAGE_SIZE) / pool->elem_size;

        bpf_for(u, 0, numelems - 1) {
                new_elem = new_page + u * pool->elem_size;

                bpf_spin_lock(&sdt_task_pool_alloc_lock);
                list_add_head(new_elem, &pool->head);
                bpf_spin_unlock(&sdt_task_pool_alloc_lock);
        }

        elem = new_page + (numelems - 1) * pool->elem_size;

        return (void __arena *)&elem->data;
}
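/*
 * Sizing example (hypothetical numbers, assuming 4 KiB pages): with
 * SDT_TASK_ENT_PAGES == 1 and an elem_size of 512 bytes, one arena page
 * yields 8 elements; 7 are pushed onto the pool free list above and the
 * 8th satisfies the current request.
 */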
static SDT_TASK_FN_ATTRS
void sdt_task_free_to_pool(void __arena *ptr, struct sdt_task_pool __arena *pool)
{
        arena_list_node_t *elem;
        __u64 __arena *data;
        int i;

        elem = arena_container_of(ptr, struct arena_list_node, data);

        /* Zero out one word at a time since we cannot use memset. */
        data = (__u64 __arena *)&elem->data;
        cast_kern(data);

        bpf_for(i, 0, pool->elem_size / 8) {
                data[i] = (__u64)0;
        }

        bpf_spin_lock(&sdt_task_pool_alloc_lock);
        list_add_head(elem, &pool->head);
        bpf_spin_unlock(&sdt_task_pool_alloc_lock);
}

/* alloc desc and chunk and link chunk to desc and return desc */
static SDT_TASK_FN_ATTRS struct sdt_task_desc __arena *sdt_alloc_chunk(void)
{
        struct sdt_task_chunk __arena *chunk;
        struct sdt_task_desc __arena *desc;
        struct sdt_task_desc __arena *out;

        chunk = sdt_task_alloc_from_pool(&sdt_task_chunk_pool);
        if (!chunk) {
                bpf_printk("%s: failed to allocate chunk", __func__);
                return NULL;
        }

        desc = sdt_task_alloc_from_pool(&sdt_task_desc_pool);
        if (!desc) {
                sdt_task_free_to_pool(chunk, &sdt_task_chunk_pool);
                bpf_printk("%s: failed to allocate desc", __func__);
                return NULL;
        }

        out = desc;

        cast_kern(desc);

        desc->nr_free = SDT_TASK_ENTS_PER_CHUNK;
        desc->chunk = chunk;

        return out;
}

static SDT_TASK_FN_ATTRS int sdt_pool_set_size(struct sdt_task_pool __arena *pool, __u64 data_size)
{
        /* All allocations are wrapped in a linked list node. */
        data_size += sizeof(struct arena_list_node);

        if (data_size > (SDT_TASK_ENT_PAGES * PAGE_SIZE)) {
                bpf_printk("allocation size %llu too large", data_size);
                return -E2BIG;
        }

        cast_kern(pool);
        pool->elem_size = data_size;

        return 0;
}

static SDT_TASK_FN_ATTRS int sdt_pool_set_size_data(struct sdt_task_pool __arena *pool, __u64 data_size, __u64 free_size)
{
        int ret;

        ret = sdt_pool_set_size(pool, data_size);
        if (ret)
                return ret;

        pool->free_size = free_size;

        return 0;
}

/* initialize the allocator pools and the radix tree root */
static SDT_TASK_FN_ATTRS int sdt_task_init(__u64 data_size)
{
        __u64 free_size;
        int ret;

        ret = sdt_pool_set_size(&sdt_task_chunk_pool, sizeof(struct sdt_task_chunk));
        if (ret != 0)
                return ret;

        ret = sdt_pool_set_size(&sdt_task_desc_pool, sizeof(struct sdt_task_desc));
        if (ret != 0)
                return ret;

        /* Word-align the payload and wrap it into a data descriptor. */
        data_size = div_round_up(data_size, 8) * 8;
        free_size = data_size;
        data_size += sizeof(struct sdt_task_data);

        ret = sdt_pool_set_size_data(&sdt_task_data_pool, data_size, free_size);
        if (ret != 0)
                return ret;

        sdt_task_desc_root = sdt_alloc_chunk();
        if (sdt_task_desc_root == NULL)
                return -ENOMEM;

        return 0;
}
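/*
 * Sizing walk-through for scx_sdt (assuming 4-byte int/pid_t and no struct
 * padding): sizeof(struct sdt_stats) is 48 bytes (int + pid_t + five __u64
 * counters), already 8-byte aligned, so free_size == 48 and data_size
 * becomes 48 + sizeof(struct sdt_task_data) (16 bytes of tid + tptr) == 64
 * for the data pool.
 */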
static SDT_TASK_FN_ATTRS
int sdt_set_idx_state(struct sdt_task_desc __arena *desc, __u64 pos, bool state)
{
        __u64 __arena *allocated = desc->allocated;
        __u64 bit;

        cast_kern(allocated);

        if (pos >= SDT_TASK_ENTS_PER_CHUNK) {
                bpf_spin_unlock(&sdt_task_lock);
                bpf_printk("invalid access (0x%llx, %s)\n", pos, state ? "set" : "unset");

                bpf_spin_lock(&sdt_task_lock);
                return -EINVAL;
        }

        bit = (__u64)1 << (pos % 64);

        if (state)
                allocated[pos / 64] |= bit;
        else
                allocated[pos / 64] &= ~bit;

        return 0;
}

static SDT_TASK_FN_ATTRS void sdt_task_free_idx(__u64 idx)
{
        const __u64 mask = (1 << SDT_TASK_ENTS_PER_CHUNK_SHIFT) - 1;
        struct sdt_task_desc __arena *lv_desc[SDT_TASK_LEVELS];
        struct sdt_task_desc * __arena *desc_children;
        struct sdt_task_chunk __arena *chunk;
        struct sdt_task_desc __arena *desc;
        struct sdt_task_data __arena *data;
        __u64 u, level, shift, pos;
        __u64 lv_pos[SDT_TASK_LEVELS];
        int i;

        bpf_spin_lock(&sdt_task_lock);

        desc = sdt_task_desc_root;
        if (!desc) {
                bpf_spin_unlock(&sdt_task_lock);
                bpf_printk("%s: root not allocated", __func__);
                return;
        }

        bpf_for(level, 0, SDT_TASK_LEVELS) {
                shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_CHUNK_SHIFT;
                pos = (idx >> shift) & mask;

                lv_desc[level] = desc;
                lv_pos[level] = pos;

                if (level == SDT_TASK_LEVELS - 1)
                        break;

                cast_kern(desc);

                chunk = desc->chunk;
                cast_kern(chunk);

                desc_children = (struct sdt_task_desc * __arena *)chunk->descs;
                desc = desc_children[pos];

                if (!desc) {
                        bpf_spin_unlock(&sdt_task_lock);
                        bpf_printk("freeing nonexistent idx [0x%lx] (level %llu)", idx, level);
                        return;
                }
        }

        cast_kern(desc);

        chunk = desc->chunk;
        cast_kern(chunk);

        pos = idx & mask;
        data = chunk->data[pos];
        if (!data) {
                bpf_spin_unlock(&sdt_task_lock);
                bpf_printk("freeing idx [0x%lx] (%p) without data", idx, &chunk->data[pos]);
                return;
        }

        cast_kern(data);

        *data = (struct sdt_task_data) {
                .tid.gen = data->tid.gen + 1,
                .tptr = 0,
        };

        /* Zero out one word at a time. */
        bpf_for(i, 0, sdt_task_data_pool.free_size / 8) {
                data->payload[i] = 0;
        }

        bpf_for(u, 0, SDT_TASK_LEVELS) {
                level = SDT_TASK_LEVELS - 1 - u;

                /*
                 * Clear the bit and bump the free count; keep propagating
                 * upwards only while the chunk just went from full to
                 * having a single free entry.
                 */
                desc = lv_desc[level];

                sdt_set_idx_state(desc, lv_pos[level], false);

                cast_kern(desc);

                desc->nr_free += 1;
                if (desc->nr_free > 1)
                        break;
        }

        bpf_spin_unlock(&sdt_task_lock);

        return;
}

static SDT_TASK_FN_ATTRS
void __arena *sdt_task_retrieve(struct task_struct *p)
{
        struct sdt_task_data __arena *data;
        struct sdt_task_map_val *mval;

        sdt_arena_verify();

        mval = bpf_task_storage_get(&sdt_task_map, p, 0, 0);
        if (!mval || !mval->data)
                return NULL;

        data = mval->data;

        return (void __arena *)data->payload;
}

static SDT_TASK_FN_ATTRS void sdt_task_free(struct task_struct *p)
{
        struct sdt_task_map_val *mval;

        sdt_arena_verify();

        mval = bpf_task_storage_get(&sdt_task_map, p, 0, 0);
        if (!mval)
                return;

        sdt_task_free_idx(mval->tid.idx);
        mval->data = NULL;
}

static SDT_TASK_FN_ATTRS
int sdt_task_find_empty(struct sdt_task_desc __arena *desc, struct sdt_task_desc * __arena *descp, __u64 *idxp)
{
        struct sdt_task_desc * __arena *desc_children;
        struct sdt_task_desc __arena *new_chunk;
        struct sdt_task_desc __arena *lv_desc[SDT_TASK_LEVELS];
        struct sdt_task_chunk __arena *chunk;
        struct sdt_task_desc __arena *tmp;
        __u64 lv_pos[SDT_TASK_LEVELS];
        __u64 u, pos, level;
        __u64 idx = 0;

        bpf_for(level, 0, SDT_TASK_LEVELS) {
                pos = sdt_chunk_find_empty(desc);

                /* Something has gone terribly wrong. */
                if (pos > SDT_TASK_ENTS_PER_CHUNK)
                        return -EINVAL;

                if (pos == SDT_TASK_ENTS_PER_CHUNK)
                        return -ENOMEM;

                idx <<= SDT_TASK_ENTS_PER_CHUNK_SHIFT;
                idx |= pos;

                /* Log the levels to complete allocation. */
                lv_desc[level] = desc;
                lv_pos[level] = pos;

                /* The rest of the loop is for internal node traversal. */
                if (level == SDT_TASK_LEVELS - 1)
                        break;

                cast_kern(desc);

                chunk = desc->chunk;
                cast_kern(chunk);

                desc_children = (struct sdt_task_desc * __arena *)chunk->descs;
                desc = desc_children[pos];

                /* Someone else is populating the subtree. */
                if (desc == (void *)SDT_TASK_ALLOC_RESERVE)
                        return -EAGAIN;

                if (!desc) {
                        /* Reserve our spot and drop the lock to allocate. */
                        desc_children[pos] = (void *)SDT_TASK_ALLOC_RESERVE;

                        bpf_spin_unlock(&sdt_task_lock);
                        new_chunk = sdt_alloc_chunk();
                        if (!new_chunk) {
                                bpf_printk("%s: allocating new chunk failed", __func__);
                                bpf_spin_lock(&sdt_task_lock);
                                /* Clear the reservation so that others can retry. */
                                desc_children[pos] = NULL;
                                return -ENOMEM;
                        }

                        bpf_spin_lock(&sdt_task_lock);

                        desc_children[pos] = new_chunk;
                        desc = new_chunk;
                }
        }

        bpf_for(u, 0, SDT_TASK_LEVELS) {
                level = SDT_TASK_LEVELS - 1 - u;
                tmp = lv_desc[level];

                cast_kern(tmp);
                sdt_set_idx_state(tmp, lv_pos[level], true);

                tmp->nr_free -= 1;
                if (tmp->nr_free > 0)
                        break;

        }

        *descp = desc;
        *idxp = idx;

        return 0;
}
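/*
 * Note on the reservation protocol above: SDT_TASK_ALLOC_RESERVE is a
 * sentinel stored in a child slot while sdt_task_lock is dropped for the
 * chunk allocation. Concurrent allocators that observe it back off with
 * -EAGAIN and retry; see the bpf_repeat() loop in sdt_task_alloc() below.
 */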
static SDT_TASK_FN_ATTRS
void __arena *sdt_task_alloc(struct task_struct *p)
{
        struct sdt_task_data __arena *data = NULL;
        struct sdt_task_desc __arena *desc;
        struct sdt_task_chunk __arena *chunk;
        struct sdt_task_map_val *mval;
        __u64 idx, pos;
        int ret;

        mval = bpf_task_storage_get(&sdt_task_map, p, 0,
                                    BPF_LOCAL_STORAGE_GET_F_CREATE);
        if (!mval)
                return NULL;

        bpf_spin_lock(&sdt_task_lock);

        bpf_repeat(SDT_TASK_ALLOCATION_ATTEMPTS) {
                ret = sdt_task_find_empty(sdt_task_desc_root, &desc, &idx);
                if (ret != -EAGAIN)
                        break;
        }

        if (ret != 0) {
                bpf_spin_unlock(&sdt_task_lock);
                bpf_printk("%s: error %d on allocation", __func__, ret);
                return NULL;
        }

        cast_kern(desc);

        chunk = desc->chunk;
        cast_kern(chunk);

        /* populate leaf node if necessary */
        pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1);
        data = chunk->data[pos];
        if (!data) {
                bpf_spin_unlock(&sdt_task_lock);

                data = sdt_task_alloc_from_pool(&sdt_task_data_pool);
                if (!data) {
                        sdt_task_free_idx(idx);
                        bpf_printk("%s: failed to allocate data from pool", __func__);
                        return NULL;
                }

                bpf_spin_lock(&sdt_task_lock);
                chunk->data[pos] = data;
        }

        /* init and return */
        cast_kern(data);

        data->tid.idx = idx;
        data->tptr = (__u64)p;

        mval->tid = data->tid;
        mval->data = data;

        bpf_spin_unlock(&sdt_task_lock);

        return (void __arena *)data->payload;
}