Compare commits

...

66 Commits

Author SHA1 Message Date
Emil Tsalapatis
dac0142be5
Merge b692c415f1 into 7d14df8ca2 2024-11-30 13:19:29 +02:00
Changwoo Min
7d14df8ca2
Merge pull request #1000 from multics69/lavd-load-balancing
scx_lavd: Load balancing across compute domains
2024-11-30 12:10:04 +09:00
Changwoo Min
047e8c81e9 scx_lavd: Perform load balancing at consume_task()
Upon ops.dispatch, perform load balancing based on the previously set-up
plan, stealing a task from a stealee domain into a stealer domain. To avoid
the thundering herd problem of concurrent stealers, a stealer steals
a task probabilistically. Also, to minimize the task migration distance,
the stealing probability decreases exponentially with each hop of
distance. Finally, for every stat cycle (50 ms), a stealer migrates
only one task from a stealee for gradual load balancing.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
2024-11-30 12:09:43 +09:00
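The probabilistic gating described above relies on the prob_x_out_of_y() helper introduced later in this compare; a condensed sketch of the decision points in try_to_steal_task() (not the complete function) looks like this:

/* Return true with probability x/y (helper added in the util code below). */
static bool prob_x_out_of_y(u32 x, u32 y)
{
	return (bpf_get_prandom_u32() % y) < x;
}

/* Gate the stealer: roughly one attempt out of nr_cpus * LAVD_CPDOM_X_PROB_FT. */
if (!prob_x_out_of_y(1, cpdomc->nr_cpus * LAVD_CPDOM_X_PROB_FT))
	return false;

/* After scanning one distance ring, move on to the next, farther ring only
 * with probability 1/LAVD_CPDOM_X_PROB_FT, so reaching hop d scales as (1/F)^d. */
if (!prob_x_out_of_y(1, LAVD_CPDOM_X_PROB_FT))
	break;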
Changwoo Min
4f1ffc1bc6 scx_lavd: Refactor consume_task()
Remove unnecessary variables and arguments and
factor out force_to_steal_task().

Signed-off-by: Changwoo Min <changwoo@igalia.com>
2024-11-30 12:09:43 +09:00
Changwoo Min
7991266773 scx_lavd: Decide load balancing plan across compute domains
The goal of load balancing is to maintain an almost equal number of
queued tasks per CPU in each compute domain. To this end, we first
decide which compute domains are under-utilized (i.e., their queue
length per CPU is below average) and which are over-utilized (i.e.,
their queue length per CPU is above average). We call an under-utilized
domain a stealer domain and an over-utilized domain a stealee domain.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
2024-11-30 12:09:43 +09:00
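As a worked example of this plan (numbers are illustrative only): suppose three compute domains have per-CPU queue lengths of 2000, 5000, and 8000 (scaled x1000 as in the code below). The average is 5000 and, with LAVD_CPDOM_MIGRATION_SHIFT = 3, the margin is 5000 >> 3 = 625, giving a stealer threshold of 4375 and a stealee threshold of 5625. The first domain (2000 < 4375) is marked a stealer, the third (8000 > 5625) a stealee, and the middle domain, sitting inside the +/- 12.5% band, is left alone.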
Changwoo Min
ed14a4ca91 scx_lavd: Log out the number of cross-domain task migrations
Collect and log the number of task migrations across compute domains.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
2024-11-30 12:09:43 +09:00
Emil Tsalapatis
b692c415f1 select_cpu metrics 2024-11-25 11:13:51 -08:00
Emil Tsalapatis
7a87c24c7d adjust data alignment, fix typecasting bug in scx_sdt and add more metrics 2024-11-25 10:59:21 -08:00
Emil Tsalapatis
44907b317c sdt: stats template 2024-11-25 09:45:34 -08:00
Emil Tsalapatis
d4e4ffd996 use arena-allocated structures for statistics 2024-11-25 09:32:21 -08:00
Emil Tsalapatis
2044301cf8 share the sdt_stats struct between userspace and kernel 2024-11-25 08:41:09 -08:00
Emil Tsalapatis
b8c30308e1 incorporate data size to the pool struct to sidestep verifier issue 2024-11-25 08:18:02 -08:00
Emil Tsalapatis
173b3d0e06 update allocation code to directly return data 2024-11-22 12:43:55 -08:00
Emil Tsalapatis
ce22e7f622 style: break function attributes into their own line when overly long 2024-11-22 11:04:44 -08:00
Emil Tsalapatis
c48e3af43a update copyright 2024-11-22 10:55:36 -08:00
Emil Tsalapatis
eac323f8ae bugfixes and overhaul list allocation/adjust casting arena allocation 2024-11-22 10:38:43 -08:00
Emil Tsalapatis
6e618d1236 multipage arena allocations and bitmap position find bugfix 2024-11-21 10:58:52 -08:00
Emil Tsalapatis
ee9c69b82b fix descriptor array indexing on the free path 2024-11-20 14:51:15 -08:00
Emil Tsalapatis
60d901c068 remove unnecessary header import 2024-11-20 14:31:49 -08:00
Emil Tsalapatis
740d44723b clarify how we use cast_kern() and cast_user() in the headers where we import them 2024-11-20 14:12:04 -08:00
Emil Tsalapatis
52743e0a51 add back mips vmlinux.h header 2024-11-20 14:05:22 -08:00
Emil Tsalapatis
6022031956 verification passing 2024-11-20 14:03:36 -08:00
Emil Tsalapatis
73c44ddb70 [wip] verification almost passing 2024-11-20 13:51:16 -08:00
Emil Tsalapatis
0a78c2124e [wip] expand allocator into a tree 2024-11-20 13:01:17 -08:00
Emil Tsalapatis
6236ecba40 add tree traversal function stub 2024-11-20 09:47:23 -08:00
Emil Tsalapatis
0357571e98 remove unnecessarily imported header 2024-11-20 09:46:57 -08:00
Emil Tsalapatis
c6cdafe6fd style nit: use __u64 typecast instead of ULL suffix 2024-11-20 09:34:02 -08:00
Emil Tsalapatis
0cc0159e77 bump arena size back up to 4GiB 2024-11-20 07:53:35 -08:00
Emil Tsalapatis
03c8e21717 add back ARM arena vaddr offset 2024-11-20 07:48:35 -08:00
Emil Tsalapatis
a7f0423472 add bitmap back in for allocation tracking 2024-11-20 07:39:33 -08:00
Emil Tsalapatis
bce4be65ab expand SDT_TASK_ENTS_PER_CHUNK_SHIFT to fill in the entire chunk with chunk pointers 2024-11-20 07:26:00 -08:00
Emil Tsalapatis
bc3ef152a6 Merge branch 'main' into etsal/arena-alloc-simple 2024-11-20 07:07:37 -08:00
Emil Tsalapatis
b6d2e99641 turn C schedulers back on 2024-11-19 18:37:27 -08:00
Emil Tsalapatis
87db033d70 print diagnostic on impossible condition 2024-11-19 18:10:01 -08:00
Emil Tsalapatis
f8e22e2d9b fix minor inconsistencies 2024-11-19 17:44:22 -08:00
Emil Tsalapatis
5cd75b810c add locking back in 2024-11-19 16:48:08 -08:00
Emil Tsalapatis
7daf3d4906 remove descriptor head struct now that verification is passing 2024-11-19 10:34:04 -08:00
Emil Tsalapatis
8070eadb34 wip full verifier passing 2024-11-19 10:29:59 -08:00
Emil Tsalapatis
4db4791b49 sdt_task_init working 2024-11-18 15:03:29 -08:00
Emil Tsalapatis
055447960e turning on more parts of the code and adding list_pop() for allocation 2024-11-18 14:27:49 -08:00
Emil Tsalapatis
74cae35225 code passing the verifier 2024-11-18 12:44:35 -08:00
Emil Tsalapatis
4eb81403ca use a single level for the tree for now 2024-11-18 11:36:40 -08:00
Emil Tsalapatis
9d202db786 use one byte per entry 2024-11-18 11:29:59 -08:00
Emil Tsalapatis
36a512c864 [HACK] turn off locking 2024-11-18 11:24:56 -08:00
Emil Tsalapatis
f2dbeba2b5 properly mark sdt_task_exit as non-sleepable 2024-11-18 07:22:57 -08:00
Emil Tsalapatis
24997a3a03 fixing verifier errors along the tree allocation path 2024-11-18 07:21:52 -08:00
Emil Tsalapatis
b3e9b11792 verification for SDT passing 2024-11-15 12:44:50 -08:00
Emil Tsalapatis
9e7a2393f0 do not put the scheduler-local core structs in the __arena_global section 2024-11-15 10:42:19 -08:00
Emil Tsalapatis
e05a7e5989 more aggressively use selftest examples in our own code 2024-11-15 10:38:01 -08:00
Emil Tsalapatis
633c7658c4 get bpf_arena_common.h back compiling with Clang-friendly attribute definition 2024-11-15 10:17:42 -08:00
Emil Tsalapatis
d0e8b63239 import 6.12.0-rc5 BPF headers from selftests 2024-11-15 10:16:27 -08:00
Emil Tsalapatis
94b89ca51b [WIP] build hackery 2024-11-15 10:14:59 -08:00
Emil Tsalapatis
def8489029 turn off enough code paths to get SDT loading 2024-11-08 13:25:29 -08:00
Emil Tsalapatis
692cf44579 use Clang-style attribute for userspace weak symbol 2024-11-08 12:46:58 -08:00
Emil Tsalapatis
5bfe446a0b [WIP] remove __builtin_memset from arena-backed allocator 2024-11-08 12:31:53 -08:00
Emil Tsalapatis
14095789ae add all ctags files to .gitignore 2024-11-08 12:30:58 -08:00
Emil Tsalapatis
88d5a550f5 import more recent bpf_arena_*.h headers for good measure 2024-11-08 11:58:12 -08:00
Emil Tsalapatis
c7e40e01d8 Merge branch 'main' into etsal/bpf-arena-rusty 2024-11-08 11:21:20 -08:00
Emil Tsalapatis
1645e51034 Merge branch 'htejun/task-stor' 2024-11-08 11:11:50 -08:00
Tejun Heo
b7298d3d48 some comments 2024-10-22 08:45:02 -10:00
Tejun Heo
ebb8d319c0 xxx 2024-10-21 15:50:23 -10:00
Tejun Heo
704213b68e builds 2024-10-21 15:01:19 -10:00
Tejun Heo
f289b4d28d xxx 2024-10-21 14:37:28 -10:00
Tejun Heo
eca627ccfe wip 2024-10-21 14:12:11 -10:00
Tejun Heo
f2b9dc60c5 wip 2024-10-21 14:09:51 -10:00
Tejun Heo
a546ff2510 WIP 2024-10-18 15:55:20 -10:00
16 changed files with 1332 additions and 85 deletions

.gitignore

@ -5,3 +5,4 @@ target
*.swp
.cache/
.vscode/
**/tags


@ -1,5 +1,5 @@
c_scheds = ['scx_simple', 'scx_qmap', 'scx_central', 'scx_userland', 'scx_nest',
'scx_flatcg', 'scx_pair']
'scx_flatcg', 'scx_pair', 'scx_sdt']
foreach sched: c_scheds
thread_dep = dependency('threads')

scheds/c/scx_sdt.bpf.c

@ -0,0 +1,143 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <scx/common.bpf.h>
#include <scx/sdt_task_impl.bpf.h>
#include "scx_sdt.h"
char _license[] SEC("license") = "GPL";
UEI_DEFINE(uei);
#define SHARED_DSQ 0
#define DEFINE_SDT_STAT(metric) \
static SDT_TASK_FN_ATTRS void \
stat_inc_##metric(struct sdt_stats __arena *stats) \
{ \
cast_kern(stats); \
stats->metric += 1; \
} \
__u64 stat_##metric;
DEFINE_SDT_STAT(enqueue);
DEFINE_SDT_STAT(init);
DEFINE_SDT_STAT(exit);
DEFINE_SDT_STAT(select_idle_cpu);
DEFINE_SDT_STAT(select_busy_cpu);
static SDT_TASK_FN_ATTRS void
stat_global_update(struct sdt_stats __arena *stats)
{
cast_kern(stats);
__sync_fetch_and_add(&stat_enqueue, stats->enqueue);
__sync_fetch_and_add(&stat_init, stats->init);
__sync_fetch_and_add(&stat_exit, stats->exit);
__sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu);
__sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu);
}
s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
struct sdt_stats __arena *stats;
bool is_idle = false;
s32 cpu;
stats = sdt_task_retrieve(p);
if (!stats) {
bpf_printk("%s: no stats for pid %d", p->pid);
return 0;
}
cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
if (is_idle) {
stat_inc_select_idle_cpu(stats);
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
} else {
stat_inc_select_busy_cpu(stats);
}
return cpu;
}
void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags)
{
struct sdt_stats __arena *stats;
stats = sdt_task_retrieve(p);
if (!stats) {
bpf_printk("%s: no stats for pid %d", p->pid);
return;
}
stat_inc_enqueue(stats);
scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
}
void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev)
{
scx_bpf_consume(SHARED_DSQ);
}
s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p,
struct scx_init_task_args *args)
{
struct sdt_stats __arena *stats;
stats = sdt_task_alloc(p);
if (!stats) {
bpf_printk("arena allocator out of memory");
return -ENOMEM;
}
stats->pid = p->pid;
stat_inc_init(stats);
return 0;
}
void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p,
struct scx_exit_task_args *args)
{
struct sdt_stats __arena *stats;
stats = sdt_task_retrieve(p);
if (!stats) {
bpf_printk("%s: no stats for pid %d", p->pid);
return;
}
stat_inc_exit(stats);
stat_global_update(stats);
sdt_task_free(p);
}
s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init)
{
int ret;
ret = sdt_task_init(sizeof(struct sdt_stats));
if (ret < 0) {
bpf_printk("sdt_init failed with %d", ret);
return ret;
}
return scx_bpf_create_dsq(SHARED_DSQ, -1);
}
void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei)
{
UEI_RECORD(uei, ei);
}
SCX_OPS_DEFINE(sdt_ops,
.select_cpu = (void *)sdt_select_cpu,
.enqueue = (void *)sdt_enqueue,
.dispatch = (void *)sdt_dispatch,
.init_task = (void *)sdt_init_task,
.exit_task = (void *)sdt_exit_task,
.init = (void *)sdt_init,
.exit = (void *)sdt_exit,
.name = "sdt");
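For reference, each DEFINE_SDT_STAT(metric) invocation above produces a per-task increment helper plus a global counter; DEFINE_SDT_STAT(enqueue), for example, expands to roughly:

static SDT_TASK_FN_ATTRS void
stat_inc_enqueue(struct sdt_stats __arena *stats)
{
	cast_kern(stats);
	stats->enqueue += 1;
}
__u64 stat_enqueue;

The per-task counters live in arena memory and are folded into the global stat_* variables by stat_global_update() when a task exits, which is what the userspace loop in scx_sdt.c below prints.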

scheds/c/scx_sdt.c

@ -0,0 +1,92 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com>
* Copyright (c) 2024 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
#include <stdio.h>
#include <unistd.h>
#include <signal.h>
#include <libgen.h>
#include <bpf/bpf.h>
#include <scx/common.h>
#include <scx/sdt_task.h>
#include "scx_sdt.bpf.skel.h"
#include "scx_sdt.h"
const char help_fmt[] =
"A simple sched_ext scheduler.\n"
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-f] [-v]\n"
"\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req;
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
if (level == LIBBPF_DEBUG && !verbose)
return 0;
return vfprintf(stderr, format, args);
}
static void sigint_handler(int sig)
{
exit_req = 1;
}
int main(int argc, char **argv)
{
struct scx_sdt *skel;
struct bpf_link *link;
__u32 opt;
__u64 ecode;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
restart:
skel = SCX_OPS_OPEN(sdt_ops, scx_sdt);
while ((opt = getopt(argc, argv, "fvh")) != -1) {
switch (opt) {
case 'v':
verbose = true;
break;
default:
fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h';
}
}
SCX_OPS_LOAD(skel, sdt_ops, scx_sdt, uei);
link = SCX_OPS_ATTACH(skel, sdt_ops, scx_sdt);
while (!exit_req && !UEI_EXITED(skel, uei)) {
printf("enqueues=%llu\t", skel->bss->stat_enqueue);
printf("inits=%llu\t", skel->bss->stat_init);
printf("exits=%llu\t", skel->bss->stat_exit);
printf("\n");
printf("select_idle_cpu=%llu\t", skel->bss->stat_select_idle_cpu);
printf("select_busy_cpu=%llu\t", skel->bss->stat_select_busy_cpu);
printf("\n");
fflush(stdout);
sleep(1);
}
bpf_link__destroy(link);
ecode = UEI_REPORT(skel, uei);
scx_sdt__destroy(skel);
if (UEI_ECODE_RESTART(ecode))
goto restart;
return 0;
}
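Usage follows the other C example schedulers: built as scx_sdt via the meson list above, something like ./scx_sdt -v attaches the BPF scheduler with libbpf debug output enabled, prints the aggregated counters once per second, detaches on SIGINT/SIGTERM, and restarts itself if the recorded exit code requests a restart.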

scheds/c/scx_sdt.h

@ -0,0 +1,10 @@
#pragma once
struct sdt_stats {
int seq;
pid_t pid;
__u64 init;
__u64 enqueue;
__u64 exit;
__u64 select_idle_cpu;
__u64 select_busy_cpu;
};


@ -0,0 +1,124 @@
/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#pragma once
#ifndef WRITE_ONCE
#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val))
#endif
#ifndef NUMA_NO_NODE
#define NUMA_NO_NODE (-1)
#endif
#ifndef arena_container_of
#define arena_container_of(ptr, type, member) \
({ \
void __arena *__mptr = (void __arena *)(ptr); \
((type *)(__mptr - offsetof(type, member))); \
})
#endif
#ifdef __BPF__ /* when compiled as bpf program */
#ifndef PAGE_SIZE
#define PAGE_SIZE __PAGE_SIZE
/*
* for older kernels try sizeof(struct genradix_node)
* or flexible:
* static inline long __bpf_page_size(void) {
* return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node);
* }
* but generated code is not great.
*/
#endif
#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM)
#define __arena __attribute__((address_space(1)))
#define __arena_global __attribute__((address_space(1)))
#define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */
#define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */
#else
/* emit instruction:
* rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as
*
* This is a workaround for LLVM compiler versions without
* __BPF_FEATURE_ADDR_SPACE_CAST that do not automatically cast between arena
* pointers and native kernel/userspace ones. In this case we explicitly do so
* with cast_kern() and cast_user(). E.g., in the Linux kernel tree,
* tools/testing/selftests/bpf includes tests that use these macros to implement
* linked lists and hashtables backed by arena memory. In sched_ext, we use
* cast_kern() and cast_user() for compatibility with older LLVM toolchains.
*/
#ifndef bpf_addr_space_cast
#define bpf_addr_space_cast(var, dst_as, src_as)\
asm volatile(".byte 0xBF; \
.ifc %[reg], r0; \
.byte 0x00; \
.endif; \
.ifc %[reg], r1; \
.byte 0x11; \
.endif; \
.ifc %[reg], r2; \
.byte 0x22; \
.endif; \
.ifc %[reg], r3; \
.byte 0x33; \
.endif; \
.ifc %[reg], r4; \
.byte 0x44; \
.endif; \
.ifc %[reg], r5; \
.byte 0x55; \
.endif; \
.ifc %[reg], r6; \
.byte 0x66; \
.endif; \
.ifc %[reg], r7; \
.byte 0x77; \
.endif; \
.ifc %[reg], r8; \
.byte 0x88; \
.endif; \
.ifc %[reg], r9; \
.byte 0x99; \
.endif; \
.short %[off]; \
.long %[as]" \
: [reg]"+r"(var) \
: [off]"i"(BPF_ADDR_SPACE_CAST) \
, [as]"i"((dst_as << 16) | src_as));
#endif
#define __arena
#define __arena_global SEC(".addr_space.1")
#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1)
#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0)
#endif
void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt,
int node_id, __u64 flags) __ksym __weak;
void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak;
#else /* when compiled as user space code */
#define __arena
#define __arg_arena
#define cast_kern(ptr) /* nop for user space */
#define cast_user(ptr) /* nop for user space */
char __attribute__((weak)) arena[1];
#ifndef offsetof
#define offsetof(type, member) ((unsigned long)&((type *)0)->member)
#endif
static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt,
int node_id, __u64 flags)
{
return NULL;
}
static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt)
{
}
#endif
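A minimal sketch of how these macros are meant to be used from BPF code (the struct and function names here are illustrative, not part of the header):

struct counter { __u64 hits; };

static void counter_bump(struct counter __arena *c)
{
	cast_kern(c);	/* no-op when LLVM emits address-space casts itself */
	c->hits += 1;	/* now safe to dereference as a kernel-side pointer */
}

Userspace builds of the same header define __arena and the casts away, so shared structures can be declared once and accessed from both sides.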


@ -0,0 +1,63 @@
/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/*
* Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2024 Tejun Heo <tj@kernel.org>
* Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com>
*/
#pragma once
#include "bpf_arena_common.h"
struct arena_list_node;
typedef struct arena_list_node __arena arena_list_node_t;
struct arena_list_node {
arena_list_node_t *next;
u64 padding[2];
u64 __arena data[];
};
struct arena_list_head {
struct arena_list_node __arena *first;
};
typedef struct arena_list_head __arena arena_list_head_t;
#ifndef __BPF__
static inline void *bpf_iter_num_new(struct bpf_iter_num *it, int i, int j) { return NULL; }
static inline void bpf_iter_num_destroy(struct bpf_iter_num *it) {}
static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; }
#define cond_break ({})
#define can_loop true
#endif
static inline void list_add_head(arena_list_node_t *n, arena_list_head_t *h)
{
arena_list_node_t *first = h->first;
arena_list_node_t * __arena *tmp;
cast_kern(n);
WRITE_ONCE(n->next, first);
tmp = &h->first;
cast_kern(tmp);
WRITE_ONCE(*tmp, n);
}
static inline arena_list_node_t *list_pop(arena_list_head_t *h)
{
arena_list_node_t *first = h->first;
arena_list_node_t *tmp;
arena_list_node_t *next;
if (!first)
return NULL;
tmp = first;
cast_kern(tmp);
next = tmp->next;
cast_kern(h);
WRITE_ONCE(h->first, next);
return first;
}
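The pool allocator later in this compare uses these helpers as a simple free list; condensed (locking and error handling omitted; see sdt_task_alloc_from_pool() and sdt_task_free_to_pool() for the real code):

/* Allocate: pop a node and hand out its payload area. */
arena_list_node_t *elem = list_pop(&pool->head);
void __arena *data = (void __arena *)&elem->data;

/* Free: recover the node from the payload pointer and push it back. */
arena_list_node_t *node = arena_container_of(data, struct arena_list_node, data);
list_add_head(node, &pool->head);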


@ -0,0 +1,75 @@
/*
* SPDX-License-Identifier: GPL-2.0
* Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2024 Tejun Heo <tj@kernel.org>
* Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com>
*/
#pragma once
#include "sdt_list.h"
#ifndef div_round_up
#define div_round_up(a, b) (((a) + (b) - 1) / (b))
#endif
enum sdt_task_consts {
SDT_TASK_ALLOC_RESERVE = 0xbeefcafe,
SDT_TASK_ENT_PAGE_SHIFT = 0,
SDT_TASK_ENT_PAGES = 1 << SDT_TASK_ENT_PAGE_SHIFT,
SDT_TASK_ENTS_PER_PAGE_SHIFT = 9,
SDT_TASK_ALLOCATION_ATTEMPTS = 8192,
SDT_TASK_LEVELS = 3,
SDT_TASK_ENTS_PER_CHUNK_SHIFT = SDT_TASK_ENT_PAGE_SHIFT + SDT_TASK_ENTS_PER_PAGE_SHIFT,
/*
* Skim space off the chunk so that both the chunk and the
* allocator linked list are included in the same arena page.
*/
SDT_TASK_ENTS_PER_CHUNK = (1 << SDT_TASK_ENTS_PER_CHUNK_SHIFT) - (16 * sizeof(struct arena_list_node)),
SDT_TASK_CHUNK_BITMAP_U64S = div_round_up(SDT_TASK_ENTS_PER_CHUNK, 64),
};
union sdt_task_id {
__s64 val;
struct {
__s32 idx; /* index in the radix tree */
__s32 gen; /* ++'d on recycle so that it forms unique'ish 64bit ID */
};
};
struct sdt_task_chunk;
/*
* Each index page is described by the following descriptor which carries the
* bitmap. This way the actual index can host power-of-two numbers of entries
* which makes indexing cheaper.
*/
struct sdt_task_desc {
__u64 allocated[SDT_TASK_CHUNK_BITMAP_U64S];
__u64 nr_free;
struct sdt_task_chunk __arena *chunk;
};
/*
* Leaf node containing per-task data.
*/
struct sdt_task_data {
union sdt_task_id tid;
__u64 tptr;
__u64 __arena payload[];
};
/*
* Intermediate node pointing to another intermediate node or leaf node.
*/
struct sdt_task_chunk {
union {
struct sdt_task_desc __arena *descs[SDT_TASK_ENTS_PER_CHUNK];
struct sdt_task_data __arena *data[SDT_TASK_ENTS_PER_CHUNK];
};
};
struct sdt_task_pool {
arena_list_head_t head;
__u64 elem_size;
__u64 free_size;
};
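To put these constants in perspective (assuming a 4 KiB arena page and 8-byte arena pointers): a chunk page holds 1 << SDT_TASK_ENTS_PER_PAGE_SHIFT = 512 pointer slots, of which SDT_TASK_ENTS_PER_CHUNK keeps 512 - 16 * sizeof(struct arena_list_node) = 512 - 384 = 128 usable entries, so SDT_TASK_CHUNK_BITMAP_U64S works out to div_round_up(128, 64) = 2. With SDT_TASK_LEVELS = 3, the radix tree can therefore track on the order of 128^3, roughly two million, live per-task entries.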


@ -0,0 +1,579 @@
/*
* SPDX-License-Identifier: GPL-2.0
* Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2024 Tejun Heo <tj@kernel.org>
* Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com>
*/
#pragma once
#include "sdt_task.h"
#define SDT_TASK_FN_ATTRS inline __attribute__((unused, always_inline))
struct {
__uint(type, BPF_MAP_TYPE_ARENA);
__uint(map_flags, BPF_F_MMAPABLE);
__uint(max_entries, 1 << 20); /* number of pages */
#ifdef __TARGET_ARCH_arm64
__ulong(map_extra, (1ull << 32)); /* start of mmap() region */
#else
__ulong(map_extra, (1ull << 44)); /* start of mmap() region */
#endif
} arena __weak SEC(".maps");
/*
* task BPF map entry recording the task's assigned ID and pointing to the data
* area allocated in arena.
*/
struct sdt_task_map_val {
union sdt_task_id tid;
struct sdt_task_data __arena *data;
};
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct sdt_task_map_val);
} sdt_task_map SEC(".maps");
/*
* XXX Hack to get the verifier to find the arena for sdt_exit_task.
* As of 6.12-rc5, the verifier associates arenas with programs by
* checking LD.IMM instruction operands for an arena and populating
* the program state with the first instance it finds. This requires
* accessing our global arena variable, but scx methods do not necessarily
* do so while still using pointers from that arena. Insert a bpf_printk
* statement that triggers at most once to generate an LD.IMM instruction
* to access the arena and help the verifier.
*/
static bool sdt_verify_once;
static SDT_TASK_FN_ATTRS void sdt_arena_verify(void)
{
if (sdt_verify_once)
return;
bpf_printk("%s: arena pointer %p", __func__, &arena);
sdt_verify_once = true;
}
static struct sdt_task_desc __arena *sdt_task_desc_root; /* radix tree root */
static struct sdt_task_desc __arena *sdt_task_new_chunk; /* new chunk cache */
private(LOCK) struct bpf_spin_lock sdt_task_lock;
private(POOL_LOCK) struct bpf_spin_lock sdt_task_pool_alloc_lock;
/* allocation pools */
struct sdt_task_pool __arena sdt_task_desc_pool;
struct sdt_task_pool __arena sdt_task_chunk_pool;
struct sdt_task_pool __arena sdt_task_data_pool;
static SDT_TASK_FN_ATTRS int sdt_ffs(__u64 word)
{
unsigned int num = 0;
if ((word & 0xffffffff) == 0) {
num += 32;
word >>= 32;
}
if ((word & 0xffff) == 0) {
num += 16;
word >>= 16;
}
if ((word & 0xff) == 0) {
num += 8;
word >>= 8;
}
if ((word & 0xf) == 0) {
num += 4;
word >>= 4;
}
if ((word & 0x3) == 0) {
num += 2;
word >>= 2;
}
if ((word & 0x1) == 0) {
num += 1;
word >>= 1;
}
return num;
}
/* find the first empty slot */
static SDT_TASK_FN_ATTRS __u64 sdt_chunk_find_empty(struct sdt_task_desc __arena *desc)
{
__u64 freelist;
__u64 i;
cast_kern(desc);
for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) {
freelist = ~desc->allocated[i];
if (freelist == (__u64)0)
continue;
return (i * 64) + sdt_ffs(freelist);
}
return SDT_TASK_ENTS_PER_CHUNK;
}
/* simple memory allocator */
static SDT_TASK_FN_ATTRS
void __arena *sdt_task_alloc_from_pool(struct sdt_task_pool __arena *pool)
{
arena_list_node_t *elem = NULL;
void __arena *new_page = NULL;
arena_list_node_t *new_elem;
__u32 u, numelems;
/* if pool is empty, get new page */
bpf_spin_lock(&sdt_task_pool_alloc_lock);
if (pool->head.first) {
bpf_spin_unlock(&sdt_task_pool_alloc_lock);
elem = list_pop(&pool->head);
return (void __arena *)&elem->data;
}
bpf_spin_unlock(&sdt_task_pool_alloc_lock);
new_page = bpf_arena_alloc_pages(&arena, NULL, SDT_TASK_ENT_PAGES, NUMA_NO_NODE, 0);
if (!new_page)
return NULL;
/*
* Push all allocated elements except for last one that we use to
* satisfy the allocation.
*/
numelems = (SDT_TASK_ENT_PAGES * PAGE_SIZE) / pool->elem_size;
bpf_for(u, 0, numelems - 1) {
new_elem = new_page + u * pool->elem_size;
bpf_spin_lock(&sdt_task_pool_alloc_lock);
list_add_head(new_elem, &pool->head);
bpf_spin_unlock(&sdt_task_pool_alloc_lock);
}
elem = new_page + (numelems - 1) * pool->elem_size;
return (void __arena *)&elem->data;
}
static SDT_TASK_FN_ATTRS
void sdt_task_free_to_pool(void __arena *ptr, struct sdt_task_pool __arena *pool)
{
arena_list_node_t *elem;
__u64 __arena *data;
int i;
elem = arena_container_of(ptr, struct arena_list_node, data);
/* Zero out one word at a time since we cannot use memset. */
data = (__u64 __arena *)&elem->data;
cast_kern(data);
bpf_for(i, 0, pool->elem_size / 8) {
data[i] = (__u64)0;
}
bpf_spin_lock(&sdt_task_pool_alloc_lock);
list_add_head(elem, &pool->head);
bpf_spin_unlock(&sdt_task_pool_alloc_lock);
}
/* alloc desc and chunk and link chunk to desc and return desc */
static SDT_TASK_FN_ATTRS struct sdt_task_desc __arena *sdt_alloc_chunk(void)
{
struct sdt_task_chunk __arena *chunk;
struct sdt_task_desc __arena *desc;
struct sdt_task_desc __arena *out;
chunk = sdt_task_alloc_from_pool(&sdt_task_chunk_pool);
if (!chunk) {
bpf_printk("%s: failed to allocated chunk", __func__);
return NULL;
}
desc = sdt_task_alloc_from_pool(&sdt_task_desc_pool);
if (!desc) {
sdt_task_free_to_pool(chunk, &sdt_task_chunk_pool);
bpf_printk("%s: failed to allocated desc", __func__);
return NULL;
}
out = desc;
cast_kern(desc);
desc->nr_free = SDT_TASK_ENTS_PER_CHUNK;
desc->chunk = chunk;
return out;
}
static SDT_TASK_FN_ATTRS int sdt_pool_set_size(struct sdt_task_pool __arena *pool, __u64 data_size)
{
/* All allocations are wrapped in a linked list node. */
data_size += sizeof(struct arena_list_node);
if (data_size > (SDT_TASK_ENT_PAGES * PAGE_SIZE)) {
bpf_printk("allocation size %ld too large", data_size);
return -E2BIG;
}
cast_kern(pool);
pool->elem_size = data_size;
return 0;
}
static SDT_TASK_FN_ATTRS int sdt_pool_set_size_data(struct sdt_task_pool __arena *pool, __u64 data_size, __u64 free_size)
{
int ret;
ret = sdt_pool_set_size(pool, data_size);
if (ret)
return ret;
pool->free_size = free_size;
return 0;
}
/* Initialize the allocator: set the pool element sizes and allocate the root chunk. */
static SDT_TASK_FN_ATTRS int sdt_task_init(__u64 data_size)
{
__u64 free_size;
int ret;
ret = sdt_pool_set_size(&sdt_task_chunk_pool, sizeof(struct sdt_task_chunk));
if (ret != 0)
return ret;
ret = sdt_pool_set_size(&sdt_task_desc_pool, sizeof(struct sdt_task_desc));
if (ret != 0)
return ret;
/* Page align and wrap data into a descriptor. */
data_size = div_round_up(data_size, 8) * 8;
free_size = data_size;
data_size += sizeof(struct sdt_task_data);
ret = sdt_pool_set_size_data(&sdt_task_data_pool, data_size, free_size);
if (ret != 0)
return ret;
sdt_task_desc_root = sdt_alloc_chunk();
if (sdt_task_desc_root == NULL)
return -ENOMEM;
return 0;
}
static SDT_TASK_FN_ATTRS
int sdt_set_idx_state(struct sdt_task_desc __arena *desc, __u64 pos, bool state)
{
__u64 __arena *allocated = desc->allocated;
__u64 bit;
cast_kern(allocated);
if (pos >= SDT_TASK_ENTS_PER_CHUNK) {
bpf_spin_unlock(&sdt_task_lock);
bpf_printk("invalid access (0x%d, %s)\n", pos, state ? "set" : "unset");
bpf_spin_lock(&sdt_task_lock);
return -EINVAL;
}
bit = (__u64)1 << (pos % 64);
if (state)
allocated[pos / 64] |= bit;
else
allocated[pos / 64] &= ~bit;
return 0;
}
static SDT_TASK_FN_ATTRS void sdt_task_free_idx(__u64 idx)
{
const __u64 mask = (1 << SDT_TASK_ENTS_PER_CHUNK_SHIFT) - 1;
struct sdt_task_desc __arena *lv_desc[SDT_TASK_LEVELS];
struct sdt_task_desc * __arena *desc_children;
struct sdt_task_chunk __arena *chunk;
struct sdt_task_desc __arena *desc;
struct sdt_task_data __arena *data;
__u64 u, level, shift, pos;
__u64 lv_pos[SDT_TASK_LEVELS];
int i;
bpf_spin_lock(&sdt_task_lock);
desc = sdt_task_desc_root;
if (!desc) {
bpf_spin_unlock(&sdt_task_lock);
bpf_printk("%s: root not allocated", __func__);
return;
}
bpf_for(level, 0, SDT_TASK_LEVELS) {
shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_CHUNK_SHIFT;
pos = (idx >> shift) & mask;
lv_desc[level] = desc;
lv_pos[level] = pos;
if (level == SDT_TASK_LEVELS - 1)
break;
cast_kern(desc);
chunk = desc->chunk;
cast_kern(chunk);
desc_children = (struct sdt_task_desc * __arena *)chunk->descs;
desc = desc_children[pos];
if (!desc) {
bpf_spin_unlock(&sdt_task_lock);
bpf_printk("freeing nonexistent idx [0x%lx] (level %d)", idx, level);
return;
}
}
cast_kern(desc);
chunk = desc->chunk;
cast_kern(chunk);
pos = idx & mask;
data = chunk->data[pos];
if (!data) {
bpf_spin_unlock(&sdt_task_lock);
bpf_printk("freeing idx [0x%lx] (%p) without data", idx, &chunk->data[pos]);
return;
}
cast_kern(data);
data[0] = (struct sdt_task_data) {
.tid.gen = data->tid.gen + 1,
.tptr = 0,
};
/* Zero out one word at a time. */
bpf_for(i, 0, sdt_task_data_pool.free_size / 8) {
data->payload[i] = 0;
}
bpf_for(u, 0, SDT_TASK_LEVELS) {
level = SDT_TASK_LEVELS - 1 - u;
/* Only propagate upwards if we are the parent's only free chunk. */
desc = lv_desc[level];
sdt_set_idx_state(desc, lv_pos[level], false);
cast_kern(desc);
desc->nr_free += 1;
if (desc->nr_free > 1)
break;
}
bpf_spin_unlock(&sdt_task_lock);
return;
}
static SDT_TASK_FN_ATTRS
void __arena *sdt_task_retrieve(struct task_struct *p)
{
struct sdt_task_data __arena *data;
struct sdt_task_map_val *mval;
sdt_arena_verify();
mval = bpf_task_storage_get(&sdt_task_map, p, 0, 0);
if (!mval)
return NULL;
data = mval->data;
return (void __arena *)data->payload;
}
static SDT_TASK_FN_ATTRS void sdt_task_free(struct task_struct *p)
{
struct sdt_task_map_val *mval;
sdt_arena_verify();
mval = bpf_task_storage_get(&sdt_task_map, p, 0, 0);
if (!mval)
return;
sdt_task_free_idx(mval->tid.idx);
mval->data = NULL;
}
static SDT_TASK_FN_ATTRS
int sdt_task_find_empty(struct sdt_task_desc __arena *desc, struct sdt_task_desc * __arena *descp, __u64 *idxp)
{
struct sdt_task_desc * __arena *desc_children, __arena *new_chunk;
struct sdt_task_desc __arena *lv_desc[SDT_TASK_LEVELS];
struct sdt_task_chunk __arena *chunk;
struct sdt_task_desc __arena *tmp;
__u64 lv_pos[SDT_TASK_LEVELS];
__u64 u, pos, level;
__u64 idx = 0;
bpf_for(level, 0, SDT_TASK_LEVELS) {
pos = sdt_chunk_find_empty(desc);
/* Something has gone terribly wrong. */
if (pos > SDT_TASK_ENTS_PER_CHUNK)
return -EINVAL;
if (pos == SDT_TASK_ENTS_PER_CHUNK)
return -ENOMEM;
idx <<= SDT_TASK_ENTS_PER_CHUNK_SHIFT;
idx |= pos;
/* Log the levels to complete allocation. */
lv_desc[level] = desc;
lv_pos[level] = pos;
/* The rest of the loop is for internal node traversal. */
if (level == SDT_TASK_LEVELS - 1)
break;
cast_kern(desc);
chunk = desc->chunk;
cast_kern(chunk);
desc_children = (struct sdt_task_desc * __arena *)chunk->descs;
desc = desc_children[pos];
/* Someone else is populating the subtree. */
if (desc == (void *)SDT_TASK_ALLOC_RESERVE)
return -EAGAIN;
if (!desc) {
/* Reserve our spot and go allocate. */
desc_children[pos] = (void *)SDT_TASK_ALLOC_RESERVE;
bpf_spin_unlock(&sdt_task_lock);
new_chunk = sdt_alloc_chunk();
if (!new_chunk) {
bpf_printk("%s: allocating new chunk failed", __func__);
bpf_spin_lock(&sdt_task_lock);
return -ENOMEM;
}
bpf_spin_lock(&sdt_task_lock);
desc_children[pos] = new_chunk;
desc = new_chunk;
}
}
bpf_for(u, 0, SDT_TASK_LEVELS) {
level = SDT_TASK_LEVELS - 1 - u;
tmp = lv_desc[level];
cast_kern(tmp);
sdt_set_idx_state(tmp, lv_pos[level], true);
tmp->nr_free -= 1;
if (tmp->nr_free > 0)
break;
}
*descp = desc;
*idxp = idx;
return 0;
}
static SDT_TASK_FN_ATTRS
void __arena *sdt_task_alloc(struct task_struct *p)
{
struct sdt_task_data __arena *data = NULL;
struct sdt_task_desc __arena *desc;
struct sdt_task_chunk __arena *chunk;
struct sdt_task_map_val *mval;
__u64 idx, pos;
int ret;
mval = bpf_task_storage_get(&sdt_task_map, p, 0,
BPF_LOCAL_STORAGE_GET_F_CREATE);
if (!mval)
return NULL;
bpf_spin_lock(&sdt_task_lock);
bpf_repeat(SDT_TASK_ALLOCATION_ATTEMPTS) {
ret = sdt_task_find_empty(sdt_task_desc_root, &desc, &idx);
if (ret != -EAGAIN)
break;
}
if (ret != 0) {
bpf_spin_unlock(&sdt_task_lock);
bpf_printk("%s: error %d on allocation", __func__, ret);
return NULL;
}
cast_kern(desc);
chunk = desc->chunk;
cast_kern(chunk);
/* populate leaf node if necessary */
pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1);
data = chunk->data[pos];
if (!data) {
bpf_spin_unlock(&sdt_task_lock);
data = sdt_task_alloc_from_pool(&sdt_task_data_pool);
if (!data) {
sdt_task_free_idx(idx);
bpf_printk("%s: failed to allocate data from pool", __func__);
return NULL;
}
bpf_spin_lock(&sdt_task_lock);
chunk->data[pos] = data;
}
/* init and return */
cast_kern(data);
data->tid.idx = idx;
data->tptr = (__u64)p;
mval->tid = data->tid;
mval->data = data;
bpf_spin_unlock(&sdt_task_lock);
return (void __arena *)data->payload;
}
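The index handling in sdt_task_find_empty() and sdt_task_free_idx() above amounts to a fixed-radix encoding of the per-level slot positions; writing SHIFT for SDT_TASK_ENTS_PER_CHUNK_SHIFT and pos[] for the positions chosen at each level (notation only, not variables in the code):

/* Encode while walking root -> leaf: append each level's slot position. */
idx = (pos[0] << (2 * SHIFT)) | (pos[1] << SHIFT) | pos[2];

/* Decode at level lvl (0 == root), as sdt_task_free_idx() does: */
pos[lvl] = (idx >> ((SDT_TASK_LEVELS - 1 - lvl) * SHIFT)) & ((1 << SHIFT) - 1);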


@ -78,12 +78,14 @@ struct sys_stat {
volatile u32 max_perf_cri; /* maximum performance criticality */
volatile u32 thr_perf_cri; /* performance criticality threshold */
volatile u32 nr_stealee; /* number of stealee compute domains */
volatile u32 nr_violation; /* number of utilization violation */
volatile u32 nr_active; /* number of active cores */
volatile u64 nr_sched; /* total scheduling so far */
volatile u64 nr_perf_cri; /* number of performance-critical tasks scheduled */
volatile u64 nr_lat_cri; /* number of latency-critical tasks scheduled */
volatile u64 nr_x_migration; /* number of cross-domain task migrations */
volatile u64 nr_big; /* scheduled on big core */
volatile u64 nr_pc_on_big; /* performance-critical tasks scheduled on big core */
volatile u64 nr_lc_on_big; /* latency-critical tasks scheduled on big core */


@ -51,6 +51,9 @@ enum consts_internal {
performance mode when cpu util > 40% */
LAVD_CPDOM_STARV_NS = (2 * LAVD_SLICE_MAX_NS_DFL),
LAVD_CPDOM_MIGRATION_SHIFT = 3, /* 1/2**3 = +/- 12.5% */
LAVD_CPDOM_X_PROB_FT = (LAVD_SYS_STAT_INTERVAL_NS /
(2 * LAVD_SLICE_MAX_NS_DFL)), /* roughly twice per interval */
};
/*
@ -58,12 +61,15 @@ enum consts_internal {
* - system > numa node > llc domain > compute domain per core type (P or E)
*/
struct cpdom_ctx {
u64 last_consume_clk; /* when the associated DSQ was consumed */
u64 id; /* id of this compute domain (== dsq_id) */
u64 alt_id; /* id of the closest compute domain of alternative type (== dsq id) */
u8 node_id; /* numa domain id */
u8 is_big; /* is it a big core or little core? */
u8 is_active; /* if this compute domain is active */
u8 is_stealer; /* this domain should steal tasks from others */
u8 is_stealee; /* a stealer domain should steal tasks from this domain */
u16 nr_cpus; /* the number of CPUs in this compute domain */
u32 nr_q_tasks_per_cpu; /* the number of queued tasks per CPU in this domain (x1000) */
u8 nr_neighbors[LAVD_CPDOM_MAX_DIST]; /* number of neighbors per distance */
u64 neighbor_bits[LAVD_CPDOM_MAX_DIST]; /* bitmask of neighbors per distance */
u64 __cpumask[LAVD_CPU_ID_MAX/64]; /* cpumasks belongs to this compute domain */
@ -129,6 +135,7 @@ struct cpu_ctx {
/*
* Information for statistics.
*/
volatile u32 nr_x_migration;
volatile u32 nr_perf_cri;
volatile u32 nr_lat_cri;


@ -1108,7 +1108,7 @@ void BPF_STRUCT_OPS(lavd_enqueue, struct task_struct *p, u64 enq_flags)
}
}
static bool consume_dsq(s32 cpu, u64 dsq_id, u64 now)
static bool consume_dsq(u64 dsq_id)
{
struct cpdom_ctx *cpdomc;
@ -1120,7 +1120,6 @@ static bool consume_dsq(s32 cpu, u64 dsq_id, u64 now)
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
return false;
}
WRITE_ONCE(cpdomc->last_consume_clk, now);
/*
* Try to consume a task on the associated DSQ.
@ -1130,81 +1129,110 @@ static bool consume_dsq(s32 cpu, u64 dsq_id, u64 now)
return false;
}
static bool consume_starving_task(s32 cpu, struct cpu_ctx *cpuc, u64 now)
static bool try_to_steal_task(struct cpdom_ctx *cpdomc)
{
struct cpdom_ctx *cpdomc;
u64 dsq_id = cpuc->cpdom_poll_pos;
u64 dl;
bool ret = false;
int i;
if (nr_cpdoms == 1)
return false;
bpf_for(i, 0, nr_cpdoms) {
if (i >= LAVD_CPDOM_MAX_NR)
break;
dsq_id = (dsq_id + i) % LAVD_CPDOM_MAX_NR;
if (dsq_id == cpuc->cpdom_id)
continue;
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (!cpdomc) {
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
goto out;
}
if (cpdomc->is_active) {
dl = READ_ONCE(cpdomc->last_consume_clk) + LAVD_CPDOM_STARV_NS;
if (dl < now) {
ret = consume_dsq(cpu, dsq_id, now);
}
goto out;
}
}
out:
cpuc->cpdom_poll_pos = (dsq_id + 1) % LAVD_CPDOM_MAX_NR;
return ret;
}
static bool consume_task(s32 cpu, struct cpu_ctx *cpuc, u64 now)
{
struct cpdom_ctx *cpdomc, *cpdomc_pick;
u64 dsq_id, nr_nbr;
struct cpdom_ctx *cpdomc_pick;
u64 nr_nbr, dsq_id;
s64 nuance;
/*
* If there is a starving DSQ, try to consume it first.
* If not all CPUs are in use -- i.e., the system is under-utilized --
* there is no point in load balancing. It is better to make an
* effort to increase system utilization.
*/
if (consume_starving_task(cpu, cpuc, now))
return true;
/*
* Try to consume from CPU's associated DSQ.
*/
dsq_id = cpuc->cpdom_id;
if (consume_dsq(cpu, dsq_id, now))
return true;
/*
* If there is no task in the associated DSQ, traverse neighbor
* compute domains in distance order -- task stealing.
*/
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (!cpdomc) {
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
if (!use_full_cpus())
return false;
}
/*
* Probabilistically make a go or no go decision to avoid the
* thundering herd problem. In other words, only one out of nr_cpus
* CPUs will try to steal a task at any given moment.
*/
if (!prob_x_out_of_y(1, cpdomc->nr_cpus * LAVD_CPDOM_X_PROB_FT))
return false;
/*
* Traverse neighbor compute domains in distance order.
*/
nuance = bpf_get_prandom_u32();
for (int i = 0; i < LAVD_CPDOM_MAX_DIST; i++) {
nr_nbr = min(cpdomc->nr_neighbors[i], LAVD_CPDOM_MAX_NR);
if (nr_nbr == 0)
break;
nuance = bpf_get_prandom_u32();
for (int j = 0; j < LAVD_CPDOM_MAX_NR; j++, nuance = dsq_id + 1) {
/*
* Traverse neighbor in the same distance in arbitrary order.
*/
for (int j = 0; j < LAVD_CPDOM_MAX_NR; j++, nuance++) {
if (j >= nr_nbr)
break;
dsq_id = pick_any_bit(cpdomc->neighbor_bits[i], nuance);
if (dsq_id == -ENOENT)
continue;
cpdomc_pick = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (!cpdomc_pick) {
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
return false;
}
if (!cpdomc_pick->is_stealee || !cpdomc_pick->is_active)
continue;
/*
* If task stealing is successful, mark the stealer
* and the stealee's job done. By marking done,
* those compute domains would not be involved in
* load balancing until the end of this round,
* so this helps gradual migration. Note that multiple
* stealers can steal tasks from the same stealee.
* However, we don't coordinate concurrent stealing
* because the chance is low and there is no harm
* in slight over-stealing.
*/
if (consume_dsq(dsq_id)) {
WRITE_ONCE(cpdomc_pick->is_stealee, false);
WRITE_ONCE(cpdomc->is_stealer, false);
return true;
}
}
/*
* Now, try to steal a task from a farther neighbor for load
* balancing. Since task migration from a farther neighbor is
* more expensive (e.g., crossing a NUMA boundary), do so with
* a lot of hesitation. The chance of stealing from a farther
* neighbor decreases exponentially with distance, which in
* turn favors closer migrations.
*/
if (!prob_x_out_of_y(1, LAVD_CPDOM_X_PROB_FT))
break;
}
return false;
}
static bool force_to_steal_task(struct cpdom_ctx *cpdomc)
{
struct cpdom_ctx *cpdomc_pick;
u64 nr_nbr, dsq_id;
s64 nuance;
/*
* Traverse neighbor compute domains in distance order.
*/
nuance = bpf_get_prandom_u32();
for (int i = 0; i < LAVD_CPDOM_MAX_DIST; i++) {
nr_nbr = min(cpdomc->nr_neighbors[i], LAVD_CPDOM_MAX_NR);
if (nr_nbr == 0)
break;
/*
* Traverse neighbor in the same distance in arbitrary order.
*/
for (int j = 0; j < LAVD_CPDOM_MAX_NR; j++, nuance++) {
if (j >= nr_nbr)
break;
@ -1221,7 +1249,7 @@ static bool consume_task(s32 cpu, struct cpu_ctx *cpuc, u64 now)
if (!cpdomc_pick->is_active)
continue;
if (consume_dsq(cpu, dsq_id, now))
if (consume_dsq(dsq_id))
return true;
}
}
@ -1229,9 +1257,51 @@ static bool consume_task(s32 cpu, struct cpu_ctx *cpuc, u64 now)
return false;
}
static bool consume_task(struct cpu_ctx *cpuc)
{
struct cpdom_ctx *cpdomc;
u64 dsq_id;
dsq_id = cpuc->cpdom_id;
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (!cpdomc) {
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
return false;
}
/*
* If the current compute domain is a stealer, try to steal
* a task from any of stealee domains probabilistically.
*/
if (cpdomc->is_stealer && try_to_steal_task(cpdomc))
goto x_domain_migration_out;
/*
* Try to consume a task from CPU's associated DSQ.
*/
if (consume_dsq(dsq_id))
return true;
/*
* If there is no task in the associated DSQ, traverse neighbor
* compute domains in distance order -- task stealing.
*/
if (force_to_steal_task(cpdomc))
goto x_domain_migration_out;
return false;
/*
* A task migration across compute domains has happened,
* so update the statistics.
*/
x_domain_migration_out:
cpuc->nr_x_migration++;
return true;
}
void BPF_STRUCT_OPS(lavd_dispatch, s32 cpu, struct task_struct *prev)
{
u64 now = bpf_ktime_get_ns();
struct cpu_ctx *cpuc;
struct task_ctx *taskc;
struct bpf_cpumask *active, *ovrflw;
@ -1365,10 +1435,7 @@ consume_out:
/*
* Consume a task if requested.
*/
if (!try_consume)
return;
if (consume_task(cpu, cpuc, now))
if (try_consume && consume_task(cpuc))
return;
/*
@ -1805,8 +1872,6 @@ static s32 init_cpdoms(u64 now)
if (!cpdomc->is_active)
continue;
WRITE_ONCE(cpdomc->last_consume_clk, now);
/*
* Create an associated DSQ on its associated NUMA domain.
*/
@ -2024,6 +2089,7 @@ static s32 init_per_cpu_ctx(u64 now)
}
cpuc->cpdom_id = cpdomc->id;
cpuc->cpdom_alt_id = cpdomc->alt_id;
cpdomc->nr_cpus++;
}
}
}


@ -38,6 +38,8 @@ struct sys_stat_ctx {
u32 nr_sched;
u32 nr_perf_cri;
u32 nr_lat_cri;
u32 nr_x_migration;
u32 nr_stealee;
u32 nr_big;
u32 nr_pc_on_big;
u32 nr_lc_on_big;
@ -62,10 +64,66 @@ static void init_sys_stat_ctx(struct sys_stat_ctx *c)
c->stat_next->last_update_clk = c->now;
}
static void plan_x_cpdom_migration(struct sys_stat_ctx *c)
{
struct cpdom_ctx *cpdomc;
u64 dsq_id;
u32 avg_nr_q_tasks_per_cpu = 0, nr_q_tasks, x_mig_delta;
u32 stealer_threshold, stealee_threshold;
/*
* Calculate the average number of queued tasks per CPU per compute domain.
*/
bpf_for(dsq_id, 0, nr_cpdoms) {
if (dsq_id >= LAVD_CPDOM_MAX_NR)
break;
nr_q_tasks = scx_bpf_dsq_nr_queued(dsq_id);
c->nr_queued_task += nr_q_tasks;
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
cpdomc->nr_q_tasks_per_cpu = (nr_q_tasks * 1000) / cpdomc->nr_cpus;
avg_nr_q_tasks_per_cpu += cpdomc->nr_q_tasks_per_cpu;
}
avg_nr_q_tasks_per_cpu /= nr_cpdoms;
/*
* Determine stealer and stealee domains.
*
* A stealer domain, whose per-CPU queue length is shorter than
* the average, will steal a task from any of the stealee domains,
* whose per-CPU queue lengths are longer than the average.
* Compute domains around the average do nothing.
*/
x_mig_delta = avg_nr_q_tasks_per_cpu >> LAVD_CPDOM_MIGRATION_SHIFT;
stealer_threshold = avg_nr_q_tasks_per_cpu - x_mig_delta;
stealee_threshold = avg_nr_q_tasks_per_cpu + x_mig_delta;
bpf_for(dsq_id, 0, nr_cpdoms) {
if (dsq_id >= LAVD_CPDOM_MAX_NR)
break;
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (cpdomc->nr_q_tasks_per_cpu < stealer_threshold) {
WRITE_ONCE(cpdomc->is_stealer, true);
WRITE_ONCE(cpdomc->is_stealee, false);
}
else if (cpdomc->nr_q_tasks_per_cpu > stealee_threshold) {
WRITE_ONCE(cpdomc->is_stealer, false);
WRITE_ONCE(cpdomc->is_stealee, true);
c->nr_stealee++;
}
else {
WRITE_ONCE(cpdomc->is_stealer, false);
WRITE_ONCE(cpdomc->is_stealee, false);
}
}
}
static void collect_sys_stat(struct sys_stat_ctx *c)
{
u64 dsq_id;
int cpu, nr;
int cpu;
bpf_for(cpu, 0, nr_cpu_ids) {
struct cpu_ctx *cpuc = get_cpu_ctx_id(cpu);
@ -94,6 +152,9 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
c->nr_lat_cri += cpuc->nr_lat_cri;
cpuc->nr_lat_cri = 0;
c->nr_x_migration += cpuc->nr_x_migration;
cpuc->nr_x_migration = 0;
/*
* Accumulate the task's latency criticality information.
*
@ -169,12 +230,6 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
c->idle_total += cpuc->idle_total;
cpuc->idle_total = 0;
}
bpf_for(dsq_id, 0, LAVD_CPDOM_MAX_NR) {
nr = scx_bpf_dsq_nr_queued(dsq_id);
if (nr > 0)
c->nr_queued_task += nr;
}
}
static void calc_sys_stat(struct sys_stat_ctx *c)
@ -239,6 +294,8 @@ static void update_sys_stat_next(struct sys_stat_ctx *c)
c->stat_cur->thr_perf_cri; /* will be updated later */
}
stat_next->nr_stealee = c->nr_stealee;
stat_next->nr_violation =
calc_avg32(stat_cur->nr_violation, c->nr_violation);
@ -260,6 +317,7 @@ static void update_sys_stat_next(struct sys_stat_ctx *c)
stat_next->nr_sched >>= 1;
stat_next->nr_perf_cri >>= 1;
stat_next->nr_lat_cri >>= 1;
stat_next->nr_x_migration >>= 1;
stat_next->nr_big >>= 1;
stat_next->nr_pc_on_big >>= 1;
stat_next->nr_lc_on_big >>= 1;
@ -272,6 +330,7 @@ static void update_sys_stat_next(struct sys_stat_ctx *c)
stat_next->nr_sched += c->nr_sched;
stat_next->nr_perf_cri += c->nr_perf_cri;
stat_next->nr_lat_cri += c->nr_lat_cri;
stat_next->nr_x_migration += c->nr_x_migration;
stat_next->nr_big += c->nr_big;
stat_next->nr_pc_on_big += c->nr_pc_on_big;
stat_next->nr_lc_on_big += c->nr_lc_on_big;
@ -287,6 +346,7 @@ static void do_update_sys_stat(void)
* Collect and prepare the next version of stat.
*/
init_sys_stat_ctx(&c);
plan_x_cpdom_migration(&c);
collect_sys_stat(&c);
calc_sys_stat(&c);
update_sys_stat_next(&c);


@ -299,3 +299,14 @@ static void set_on_core_type(struct task_ctx *taskc,
WRITE_ONCE(taskc->on_big, on_big);
WRITE_ONCE(taskc->on_little, on_little);
}
static bool prob_x_out_of_y(u32 x, u32 y)
{
/*
* Return true with probability x/y: draw r uniformly from [0, y)
* and return whether it falls among the first x values.
*/
u32 r = bpf_get_prandom_u32() % y;
return r < x;
}
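For example, prob_x_out_of_y(1, 8) returns true roughly one time in eight. try_to_steal_task() above uses this both to thin out concurrent stealers (one chance in nr_cpus * LAVD_CPDOM_X_PROB_FT per dispatch) and to decay the stealing probability at each additional hop of distance.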


@ -711,6 +711,8 @@ impl<'a> Scheduler<'a> {
let nr_sched = st.nr_sched;
let pc_pc = Self::get_pc(st.nr_perf_cri, nr_sched);
let pc_lc = Self::get_pc(st.nr_lat_cri, nr_sched);
let pc_x_migration = Self::get_pc(st.nr_x_migration, nr_sched);
let nr_stealee = st.nr_stealee;
let nr_big = st.nr_big;
let pc_big = Self::get_pc(nr_big, nr_sched);
let pc_pc_on_big = Self::get_pc(st.nr_pc_on_big, nr_big);
@ -730,6 +732,8 @@ impl<'a> Scheduler<'a> {
nr_sched,
pc_pc,
pc_lc,
pc_x_migration,
nr_stealee,
pc_big,
pc_pc_on_big,
pc_lc_on_big,


@ -37,6 +37,12 @@ pub struct SysStats {
#[stat(desc = "% of latency-critical tasks")]
pub pc_lc: f64,
#[stat(desc = "% of cross domain task migration")]
pub pc_x_migration: f64,
#[stat(desc = "Number of stealee domains")]
pub nr_stealee: u32,
#[stat(desc = "% of tasks scheduled on big cores")]
pub pc_big: f64,
@ -63,13 +69,15 @@ impl SysStats {
pub fn format_header<W: Write>(w: &mut W) -> Result<()> {
writeln!(
w,
"\x1b[93m| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |\x1b[0m",
"\x1b[93m| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |\x1b[0m",
"MSEQ",
"# Q TASK",
"# ACT CPU",
"# SCHED",
"PERF-CR%",
"LAT-CR%",
"X-MIG%",
"# STLEE",
"BIG%",
"PC/BIG%",
"LC/BIG%",
@ -88,13 +96,15 @@ impl SysStats {
writeln!(
w,
"| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |",
"| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |",
self.mseq,
self.nr_queued_task,
self.nr_active,
self.nr_sched,
GPoint(self.pc_pc),
GPoint(self.pc_lc),
GPoint(self.pc_x_migration),
self.nr_stealee,
GPoint(self.pc_big),
GPoint(self.pc_pc_on_big),
GPoint(self.pc_lc_on_big),