Compare commits

...

66 Commits

Author SHA1 Message Date
Emil Tsalapatis
dac0142be5
Merge b692c415f1 into 7d14df8ca2 2024-11-30 13:19:29 +02:00
Changwoo Min
7d14df8ca2
Merge pull request #1000 from multics69/lavd-load-balancing
scx_lavd: Load balancing across compute domains
2024-11-30 12:10:04 +09:00
Changwoo Min
047e8c81e9 scx_lavd: Perform load balancing at consume_task()
Upon ops.dispatch, perform load balancing based on the previously set-up
plan, stealing a task from a stealee domain into a stealer domain. To avoid
the thundering herd problem of concurrent stealers, a stealer steals
a task probabilistically. Also, to minimize the task migration distance,
the stealing probability decreases exponentially with each hop of
distance. Finally, for every stat cycle (50 ms), a stealer migrates
only one task from a stealee for gradual load balancing.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
2024-11-30 12:09:43 +09:00
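The probabilistic gating described above relies on the prob_x_out_of_y() helper introduced later in this compare; a condensed sketch of the decision points in try_to_steal_task() (not the complete function) looks like this:

/* Return true with probability x/y (helper added in the util code below). */
static bool prob_x_out_of_y(u32 x, u32 y)
{
	return (bpf_get_prandom_u32() % y) < x;
}

/* Gate the stealer: roughly one attempt out of nr_cpus * LAVD_CPDOM_X_PROB_FT. */
if (!prob_x_out_of_y(1, cpdomc->nr_cpus * LAVD_CPDOM_X_PROB_FT))
	return false;

/* After scanning one distance ring, move on to the next, farther ring only
 * with probability 1/LAVD_CPDOM_X_PROB_FT, so reaching hop d scales as (1/F)^d. */
if (!prob_x_out_of_y(1, LAVD_CPDOM_X_PROB_FT))
	break;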
Changwoo Min
4f1ffc1bc6 scx_lavd: Refactor consume_task()
Remove unnecessary variables and arguments and
factor out force_to_steal_task().

Signed-off-by: Changwoo Min <changwoo@igalia.com>
2024-11-30 12:09:43 +09:00
Changwoo Min
7991266773 scx_lavd: Decide load balancing plan across compute domains
The goal of load balancing is to maintain an almost equal number of
queued tasks per CPU in each compute domain. To this end, we first
decide which compute domains are under-utilized (i.e., their queue
length per CPU is below average) and which are over-utilized (i.e.,
their queue length per CPU is above average). We call an under-utilized
domain a stealer domain and an over-utilized domain a stealee domain.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
2024-11-30 12:09:43 +09:00
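As a worked example of this plan (numbers are illustrative only): suppose three compute domains have per-CPU queue lengths of 2000, 5000, and 8000 (scaled x1000 as in the code below). The average is 5000 and, with LAVD_CPDOM_MIGRATION_SHIFT = 3, the margin is 5000 >> 3 = 625, giving a stealer threshold of 4375 and a stealee threshold of 5625. The first domain (2000 < 4375) is marked a stealer, the third (8000 > 5625) a stealee, and the middle domain, sitting inside the +/- 12.5% band, is left alone.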
Changwoo Min
ed14a4ca91 scx_lavd: Log out the number of cross-domain task migrations
Collect and log the number of task migrations across compute domains.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
2024-11-30 12:09:43 +09:00
Emil Tsalapatis
b692c415f1 select_cpu metrics 2024-11-25 11:13:51 -08:00
Emil Tsalapatis
7a87c24c7d adjust data alignment, fix typecasting bug in scx_sdt and add more metrics 2024-11-25 10:59:21 -08:00
Emil Tsalapatis
44907b317c sdt: stats template 2024-11-25 09:45:34 -08:00
Emil Tsalapatis
d4e4ffd996 use arena-allocated structures for statistics 2024-11-25 09:32:21 -08:00
Emil Tsalapatis
2044301cf8 share the sdt_stats struct between userspace and kernel 2024-11-25 08:41:09 -08:00
Emil Tsalapatis
b8c30308e1 incorporate data size to the pool struct to sidestep verifier issue 2024-11-25 08:18:02 -08:00
Emil Tsalapatis
173b3d0e06 update allocation code to directly return data 2024-11-22 12:43:55 -08:00
Emil Tsalapatis
ce22e7f622 style: break function attributes into their own line when overly long 2024-11-22 11:04:44 -08:00
Emil Tsalapatis
c48e3af43a update copyright 2024-11-22 10:55:36 -08:00
Emil Tsalapatis
eac323f8ae bugfixes and overhaul list allocation/adjust casting arena allocation 2024-11-22 10:38:43 -08:00
Emil Tsalapatis
6e618d1236 multipage arena allocations and bitmap position find bugfix 2024-11-21 10:58:52 -08:00
Emil Tsalapatis
ee9c69b82b fix descriptor array indexing on the free path 2024-11-20 14:51:15 -08:00
Emil Tsalapatis
60d901c068 remove unnecessary header import 2024-11-20 14:31:49 -08:00
Emil Tsalapatis
740d44723b clarify how we use cast_kern() and cast_user() in the headers where we import them 2024-11-20 14:12:04 -08:00
Emil Tsalapatis
52743e0a51 add back mips vmlinux.h header 2024-11-20 14:05:22 -08:00
Emil Tsalapatis
6022031956 verification passing 2024-11-20 14:03:36 -08:00
Emil Tsalapatis
73c44ddb70 [wip] verification almost passing 2024-11-20 13:51:16 -08:00
Emil Tsalapatis
0a78c2124e [wip] expand allocator into a tree 2024-11-20 13:01:17 -08:00
Emil Tsalapatis
6236ecba40 add tree traversal function stub 2024-11-20 09:47:23 -08:00
Emil Tsalapatis
0357571e98 remove unnecessarily imported header 2024-11-20 09:46:57 -08:00
Emil Tsalapatis
c6cdafe6fd style nit: use __u64 typecast instead of ULL suffix 2024-11-20 09:34:02 -08:00
Emil Tsalapatis
0cc0159e77 bump arena size back up to 4GiB 2024-11-20 07:53:35 -08:00
Emil Tsalapatis
03c8e21717 add back ARM arena vaddr offset 2024-11-20 07:48:35 -08:00
Emil Tsalapatis
a7f0423472 add bitmap back in for allocation tracking 2024-11-20 07:39:33 -08:00
Emil Tsalapatis
bce4be65ab expand SDT_TASK_ENTS_PER_CHUNK_SHIFT to fill in the entire chunk with chunk pointers 2024-11-20 07:26:00 -08:00
Emil Tsalapatis
bc3ef152a6 Merge branch 'main' into etsal/arena-alloc-simple 2024-11-20 07:07:37 -08:00
Emil Tsalapatis
b6d2e99641 turn C schedulers back on 2024-11-19 18:37:27 -08:00
Emil Tsalapatis
87db033d70 print diagnostic on impossible condition 2024-11-19 18:10:01 -08:00
Emil Tsalapatis
f8e22e2d9b fix minor inconsistencies 2024-11-19 17:44:22 -08:00
Emil Tsalapatis
5cd75b810c add locking back in 2024-11-19 16:48:08 -08:00
Emil Tsalapatis
7daf3d4906 remove descriptor head struct now that verification is passing 2024-11-19 10:34:04 -08:00
Emil Tsalapatis
8070eadb34 wip full verifier passing 2024-11-19 10:29:59 -08:00
Emil Tsalapatis
4db4791b49 sdt_task_init working 2024-11-18 15:03:29 -08:00
Emil Tsalapatis
055447960e turning on more parts of the code and adding list_pop() for allocation 2024-11-18 14:27:49 -08:00
Emil Tsalapatis
74cae35225 code passing the verifier 2024-11-18 12:44:35 -08:00
Emil Tsalapatis
4eb81403ca use a single level for the tree for now 2024-11-18 11:36:40 -08:00
Emil Tsalapatis
9d202db786 use one byte per entry 2024-11-18 11:29:59 -08:00
Emil Tsalapatis
36a512c864 [HACK] turn off locking 2024-11-18 11:24:56 -08:00
Emil Tsalapatis
f2dbeba2b5 properly mark sdt_task_exit as non-sleepable 2024-11-18 07:22:57 -08:00
Emil Tsalapatis
24997a3a03 fixing verifier errors along the tree allocation path 2024-11-18 07:21:52 -08:00
Emil Tsalapatis
b3e9b11792 verification for SDT passing 2024-11-15 12:44:50 -08:00
Emil Tsalapatis
9e7a2393f0 do not put the scheduler-local core structs in the __arena_global section 2024-11-15 10:42:19 -08:00
Emil Tsalapatis
e05a7e5989 more aggressively use selftest examples in our own code 2024-11-15 10:38:01 -08:00
Emil Tsalapatis
633c7658c4 get bpf_arena_common.h back compiling with Clang-friendly attribute definition 2024-11-15 10:17:42 -08:00
Emil Tsalapatis
d0e8b63239 import 6.12.0-rc5 BPF headers from selftests 2024-11-15 10:16:27 -08:00
Emil Tsalapatis
94b89ca51b [WIP] build hackery 2024-11-15 10:14:59 -08:00
Emil Tsalapatis
def8489029 turn off enough code paths to get SDT loading 2024-11-08 13:25:29 -08:00
Emil Tsalapatis
692cf44579 use Clang-style attribute for userspace weak symbol 2024-11-08 12:46:58 -08:00
Emil Tsalapatis
5bfe446a0b [WIP] remove __builtin_memset from arena-backed allocator 2024-11-08 12:31:53 -08:00
Emil Tsalapatis
14095789ae add all ctags files to .gitignore 2024-11-08 12:30:58 -08:00
Emil Tsalapatis
88d5a550f5 import more recent bpf_arena_*.h headers for good measure 2024-11-08 11:58:12 -08:00
Emil Tsalapatis
c7e40e01d8 Merge branch 'main' into etsal/bpf-arena-rusty 2024-11-08 11:21:20 -08:00
Emil Tsalapatis
1645e51034 Merge branch 'htejun/task-stor' 2024-11-08 11:11:50 -08:00
Tejun Heo
b7298d3d48 some comments 2024-10-22 08:45:02 -10:00
Tejun Heo
ebb8d319c0 xxx 2024-10-21 15:50:23 -10:00
Tejun Heo
704213b68e builds 2024-10-21 15:01:19 -10:00
Tejun Heo
f289b4d28d xxx 2024-10-21 14:37:28 -10:00
Tejun Heo
eca627ccfe wip 2024-10-21 14:12:11 -10:00
Tejun Heo
f2b9dc60c5 wip 2024-10-21 14:09:51 -10:00
Tejun Heo
a546ff2510 WIP 2024-10-18 15:55:20 -10:00
16 changed files with 1332 additions and 85 deletions

.gitignore

@ -5,3 +5,4 @@ target
*.swp
.cache/
.vscode/
**/tags


@ -1,5 +1,5 @@
c_scheds = ['scx_simple', 'scx_qmap', 'scx_central', 'scx_userland', 'scx_nest',
'scx_flatcg', 'scx_pair']
'scx_flatcg', 'scx_pair', 'scx_sdt']
foreach sched: c_scheds
thread_dep = dependency('threads')

scheds/c/scx_sdt.bpf.c

@ -0,0 +1,143 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <scx/common.bpf.h>
#include <scx/sdt_task_impl.bpf.h>
#include "scx_sdt.h"
char _license[] SEC("license") = "GPL";
UEI_DEFINE(uei);
#define SHARED_DSQ 0
#define DEFINE_SDT_STAT(metric) \
static SDT_TASK_FN_ATTRS void \
stat_inc_##metric(struct sdt_stats __arena *stats) \
{ \
cast_kern(stats); \
stats->metric += 1; \
} \
__u64 stat_##metric;
DEFINE_SDT_STAT(enqueue);
DEFINE_SDT_STAT(init);
DEFINE_SDT_STAT(exit);
DEFINE_SDT_STAT(select_idle_cpu);
DEFINE_SDT_STAT(select_busy_cpu);
static SDT_TASK_FN_ATTRS void
stat_global_update(struct sdt_stats __arena *stats)
{
cast_kern(stats);
__sync_fetch_and_add(&stat_enqueue, stats->enqueue);
__sync_fetch_and_add(&stat_init, stats->init);
__sync_fetch_and_add(&stat_exit, stats->exit);
__sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu);
__sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu);
}
s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
struct sdt_stats __arena *stats;
bool is_idle = false;
s32 cpu;
stats = sdt_task_retrieve(p);
if (!stats) {
bpf_printk("%s: no stats for pid %d", p->pid);
return 0;
}
cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
if (is_idle) {
stat_inc_select_idle_cpu(stats);
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
} else {
stat_inc_select_busy_cpu(stats);
}
return cpu;
}
void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags)
{
struct sdt_stats __arena *stats;
stats = sdt_task_retrieve(p);
if (!stats) {
bpf_printk("%s: no stats for pid %d", p->pid);
return;
}
stat_inc_enqueue(stats);
scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
}
void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev)
{
scx_bpf_consume(SHARED_DSQ);
}
s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p,
struct scx_init_task_args *args)
{
struct sdt_stats __arena *stats;
stats = sdt_task_alloc(p);
if (!stats) {
bpf_printk("arena allocator out of memory");
return -ENOMEM;
}
stats->pid = p->pid;
stat_inc_init(stats);
return 0;
}
void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p,
struct scx_exit_task_args *args)
{
struct sdt_stats __arena *stats;
stats = sdt_task_retrieve(p);
if (!stats) {
bpf_printk("%s: no stats for pid %d", p->pid);
return;
}
stat_inc_exit(stats);
stat_global_update(stats);
sdt_task_free(p);
}
s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init)
{
int ret;
ret = sdt_task_init(sizeof(struct sdt_stats));
if (ret < 0) {
bpf_printk("sdt_init failed with %d", ret);
return ret;
}
return scx_bpf_create_dsq(SHARED_DSQ, -1);
}
void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei)
{
UEI_RECORD(uei, ei);
}
SCX_OPS_DEFINE(sdt_ops,
.select_cpu = (void *)sdt_select_cpu,
.enqueue = (void *)sdt_enqueue,
.dispatch = (void *)sdt_dispatch,
.init_task = (void *)sdt_init_task,
.exit_task = (void *)sdt_exit_task,
.init = (void *)sdt_init,
.exit = (void *)sdt_exit,
.name = "sdt");
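For reference, each DEFINE_SDT_STAT(metric) invocation above produces a per-task increment helper plus a global counter; DEFINE_SDT_STAT(enqueue), for example, expands to roughly:

static SDT_TASK_FN_ATTRS void
stat_inc_enqueue(struct sdt_stats __arena *stats)
{
	cast_kern(stats);
	stats->enqueue += 1;
}
__u64 stat_enqueue;

The per-task counters live in arena memory and are folded into the global stat_* variables by stat_global_update() when a task exits, which is what the userspace loop in scx_sdt.c below prints.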

scheds/c/scx_sdt.c

@ -0,0 +1,92 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com>
* Copyright (c) 2024 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
#include <stdio.h>
#include <unistd.h>
#include <signal.h>
#include <libgen.h>
#include <bpf/bpf.h>
#include <scx/common.h>
#include <scx/sdt_task.h>
#include "scx_sdt.bpf.skel.h"
#include "scx_sdt.h"
const char help_fmt[] =
"A simple sched_ext scheduler.\n"
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-f] [-v]\n"
"\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req;
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
if (level == LIBBPF_DEBUG && !verbose)
return 0;
return vfprintf(stderr, format, args);
}
static void sigint_handler(int sig)
{
exit_req = 1;
}
int main(int argc, char **argv)
{
struct scx_sdt *skel;
struct bpf_link *link;
__u32 opt;
__u64 ecode;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
restart:
skel = SCX_OPS_OPEN(sdt_ops, scx_sdt);
while ((opt = getopt(argc, argv, "fvh")) != -1) {
switch (opt) {
case 'v':
verbose = true;
break;
default:
fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h';
}
}
SCX_OPS_LOAD(skel, sdt_ops, scx_sdt, uei);
link = SCX_OPS_ATTACH(skel, sdt_ops, scx_sdt);
while (!exit_req && !UEI_EXITED(skel, uei)) {
printf("enqueues=%llu\t", skel->bss->stat_enqueue);
printf("inits=%llu\t", skel->bss->stat_init);
printf("exits=%llu\t", skel->bss->stat_exit);
printf("\n");
printf("select_idle_cpu=%llu\t", skel->bss->stat_select_idle_cpu);
printf("select_busy_cpu=%llu\t", skel->bss->stat_select_busy_cpu);
printf("\n");
fflush(stdout);
sleep(1);
}
bpf_link__destroy(link);
ecode = UEI_REPORT(skel, uei);
scx_sdt__destroy(skel);
if (UEI_ECODE_RESTART(ecode))
goto restart;
return 0;
}
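Usage follows the other C example schedulers: built as scx_sdt via the meson list above, something like ./scx_sdt -v attaches the BPF scheduler with libbpf debug output enabled, prints the aggregated counters once per second, detaches on SIGINT/SIGTERM, and restarts itself if the recorded exit code requests a restart.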

scheds/c/scx_sdt.h

@ -0,0 +1,10 @@
#pragma once
struct sdt_stats {
int seq;
pid_t pid;
__u64 init;
__u64 enqueue;
__u64 exit;
__u64 select_idle_cpu;
__u64 select_busy_cpu;
};


@ -0,0 +1,124 @@
/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#pragma once
#ifndef WRITE_ONCE
#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val))
#endif
#ifndef NUMA_NO_NODE
#define NUMA_NO_NODE (-1)
#endif
#ifndef arena_container_of
#define arena_container_of(ptr, type, member) \
({ \
void __arena *__mptr = (void __arena *)(ptr); \
((type *)(__mptr - offsetof(type, member))); \
})
#endif
#ifdef __BPF__ /* when compiled as bpf program */
#ifndef PAGE_SIZE
#define PAGE_SIZE __PAGE_SIZE
/*
* for older kernels try sizeof(struct genradix_node)
* or flexible:
* static inline long __bpf_page_size(void) {
* return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node);
* }
* but generated code is not great.
*/
#endif
#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM)
#define __arena __attribute__((address_space(1)))
#define __arena_global __attribute__((address_space(1)))
#define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */
#define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */
#else
/* emit instruction:
* rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as
*
* This is a workaround for LLVM compiler versions without
* __BPF_FEATURE_ADDR_SPACE_CAST that do not automatically cast between arena
* pointers and native kernel/userspace ones. In this case we explicitly do so
* with cast_kern() and cast_user(). E.g., in the Linux kernel tree,
* tools/testing/selftests/bpf includes tests that use these macros to implement
* linked lists and hashtables backed by arena memory. In sched_ext, we use
* cast_kern() and cast_user() for compatibility with older LLVM toolchains.
*/
#ifndef bpf_addr_space_cast
#define bpf_addr_space_cast(var, dst_as, src_as)\
asm volatile(".byte 0xBF; \
.ifc %[reg], r0; \
.byte 0x00; \
.endif; \
.ifc %[reg], r1; \
.byte 0x11; \
.endif; \
.ifc %[reg], r2; \
.byte 0x22; \
.endif; \
.ifc %[reg], r3; \
.byte 0x33; \
.endif; \
.ifc %[reg], r4; \
.byte 0x44; \
.endif; \
.ifc %[reg], r5; \
.byte 0x55; \
.endif; \
.ifc %[reg], r6; \
.byte 0x66; \
.endif; \
.ifc %[reg], r7; \
.byte 0x77; \
.endif; \
.ifc %[reg], r8; \
.byte 0x88; \
.endif; \
.ifc %[reg], r9; \
.byte 0x99; \
.endif; \
.short %[off]; \
.long %[as]" \
: [reg]"+r"(var) \
: [off]"i"(BPF_ADDR_SPACE_CAST) \
, [as]"i"((dst_as << 16) | src_as));
#endif
#define __arena
#define __arena_global SEC(".addr_space.1")
#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1)
#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0)
#endif
void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt,
int node_id, __u64 flags) __ksym __weak;
void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak;
#else /* when compiled as user space code */
#define __arena
#define __arg_arena
#define cast_kern(ptr) /* nop for user space */
#define cast_user(ptr) /* nop for user space */
char __attribute__((weak)) arena[1];
#ifndef offsetof
#define offsetof(type, member) ((unsigned long)&((type *)0)->member)
#endif
static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt,
int node_id, __u64 flags)
{
return NULL;
}
static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt)
{
}
#endif
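A minimal sketch of how these macros are meant to be used from BPF code (the struct and function names here are illustrative, not part of the header):

struct counter { __u64 hits; };

static void counter_bump(struct counter __arena *c)
{
	cast_kern(c);	/* no-op when LLVM emits address-space casts itself */
	c->hits += 1;	/* now safe to dereference as a kernel-side pointer */
}

Userspace builds of the same header define __arena and the casts away, so shared structures can be declared once and accessed from both sides.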


@ -0,0 +1,63 @@
/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/*
* Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2024 Tejun Heo <tj@kernel.org>
* Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com>
*/
#pragma once
#include "bpf_arena_common.h"
struct arena_list_node;
typedef struct arena_list_node __arena arena_list_node_t;
struct arena_list_node {
arena_list_node_t *next;
u64 padding[2];
u64 __arena data[];
};
struct arena_list_head {
struct arena_list_node __arena *first;
};
typedef struct arena_list_head __arena arena_list_head_t;
#ifndef __BPF__
static inline void *bpf_iter_num_new(struct bpf_iter_num *it, int i, int j) { return NULL; }
static inline void bpf_iter_num_destroy(struct bpf_iter_num *it) {}
static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; }
#define cond_break ({})
#define can_loop true
#endif
static inline void list_add_head(arena_list_node_t *n, arena_list_head_t *h)
{
arena_list_node_t *first = h->first;
arena_list_node_t * __arena *tmp;
cast_kern(n);
WRITE_ONCE(n->next, first);
tmp = &h->first;
cast_kern(tmp);
WRITE_ONCE(*tmp, n);
}
static inline arena_list_node_t *list_pop(arena_list_head_t *h)
{
arena_list_node_t *first = h->first;
arena_list_node_t *tmp;
arena_list_node_t *next;
if (!first)
return NULL;
tmp = first;
cast_kern(tmp);
next = tmp->next;
cast_kern(h);
WRITE_ONCE(h->first, next);
return first;
}
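The pool allocator later in this compare uses these helpers as a simple free list; condensed (locking and error handling omitted; see sdt_task_alloc_from_pool() and sdt_task_free_to_pool() for the real code):

/* Allocate: pop a node and hand out its payload area. */
arena_list_node_t *elem = list_pop(&pool->head);
void __arena *data = (void __arena *)&elem->data;

/* Free: recover the node from the payload pointer and push it back. */
arena_list_node_t *node = arena_container_of(data, struct arena_list_node, data);
list_add_head(node, &pool->head);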


@ -0,0 +1,75 @@
/*
* SPDX-License-Identifier: GPL-2.0
* Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2024 Tejun Heo <tj@kernel.org>
* Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com>
*/
#pragma once
#include "sdt_list.h"
#ifndef div_round_up
#define div_round_up(a, b) (((a) + (b) - 1) / (b))
#endif
enum sdt_task_consts {
SDT_TASK_ALLOC_RESERVE = 0xbeefcafe,
SDT_TASK_ENT_PAGE_SHIFT = 0,
SDT_TASK_ENT_PAGES = 1 << SDT_TASK_ENT_PAGE_SHIFT,
SDT_TASK_ENTS_PER_PAGE_SHIFT = 9,
SDT_TASK_ALLOCATION_ATTEMPTS = 8192,
SDT_TASK_LEVELS = 3,
SDT_TASK_ENTS_PER_CHUNK_SHIFT = SDT_TASK_ENT_PAGE_SHIFT + SDT_TASK_ENTS_PER_PAGE_SHIFT,
/*
* Skim space off the chunk so that both the chunk and the
* allocator linked list are included in the same arena page.
*/
SDT_TASK_ENTS_PER_CHUNK = (1 << SDT_TASK_ENTS_PER_CHUNK_SHIFT) - (16 * sizeof(struct arena_list_node)),
SDT_TASK_CHUNK_BITMAP_U64S = div_round_up(SDT_TASK_ENTS_PER_CHUNK, 64),
};
union sdt_task_id {
__s64 val;
struct {
__s32 idx; /* index in the radix tree */
__s32 gen; /* ++'d on recycle so that it forms unique'ish 64bit ID */
};
};
struct sdt_task_chunk;
/*
* Each index page is described by the following descriptor which carries the
* bitmap. This way the actual index can host power-of-two numbers of entries
* which makes indexing cheaper.
*/
struct sdt_task_desc {
__u64 allocated[SDT_TASK_CHUNK_BITMAP_U64S];
__u64 nr_free;
struct sdt_task_chunk __arena *chunk;
};
/*
* Leaf node containing per-task data.
*/
struct sdt_task_data {
union sdt_task_id tid;
__u64 tptr;
__u64 __arena payload[];
};
/*
* Intermediate node pointing to another intermediate node or leaf node.
*/
struct sdt_task_chunk {
union {
struct sdt_task_desc __arena *descs[SDT_TASK_ENTS_PER_CHUNK];
struct sdt_task_data __arena *data[SDT_TASK_ENTS_PER_CHUNK];
};
};
struct sdt_task_pool {
arena_list_head_t head;
__u64 elem_size;
__u64 free_size;
};
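To put these constants in perspective (assuming a 4 KiB arena page and 8-byte arena pointers): a chunk page holds 1 << SDT_TASK_ENTS_PER_PAGE_SHIFT = 512 pointer slots, of which SDT_TASK_ENTS_PER_CHUNK keeps 512 - 16 * sizeof(struct arena_list_node) = 512 - 384 = 128 usable entries, so SDT_TASK_CHUNK_BITMAP_U64S works out to div_round_up(128, 64) = 2. With SDT_TASK_LEVELS = 3, the radix tree can therefore track on the order of 128^3, roughly two million, live per-task entries.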


@ -0,0 +1,579 @@
/*
* SPDX-License-Identifier: GPL-2.0
* Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2024 Tejun Heo <tj@kernel.org>
* Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com>
*/
#pragma once
#include "sdt_task.h"
#define SDT_TASK_FN_ATTRS inline __attribute__((unused, always_inline))
struct {
__uint(type, BPF_MAP_TYPE_ARENA);
__uint(map_flags, BPF_F_MMAPABLE);
__uint(max_entries, 1 << 20); /* number of pages */
#ifdef __TARGET_ARCH_arm64
__ulong(map_extra, (1ull << 32)); /* start of mmap() region */
#else
__ulong(map_extra, (1ull << 44)); /* start of mmap() region */
#endif
} arena __weak SEC(".maps");
/*
* task BPF map entry recording the task's assigned ID and pointing to the data
* area allocated in arena.
*/
struct sdt_task_map_val {
union sdt_task_id tid;
struct sdt_task_data __arena *data;
};
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct sdt_task_map_val);
} sdt_task_map SEC(".maps");
/*
* XXX Hack to get the verifier to find the arena for sdt_exit_task.
* As of 6.12-rc5, the verifier associates arenas with programs by
* checking LD.IMM instruction operands for an arena and populating
* the program state with the first instance it finds. This requires
* accessing our global arena variable, but scx methods do not necessarily
* do so while still using pointers from that arena. Insert a bpf_printk
* statement that triggers at most once to generate an LD.IMM instruction
* to access the arena and help the verifier.
*/
static bool sdt_verify_once;
static SDT_TASK_FN_ATTRS void sdt_arena_verify(void)
{
if (sdt_verify_once)
return;
bpf_printk("%s: arena pointer %p", __func__, &arena);
sdt_verify_once = true;
}
static struct sdt_task_desc __arena *sdt_task_desc_root; /* radix tree root */
static struct sdt_task_desc __arena *sdt_task_new_chunk; /* new chunk cache */
private(LOCK) struct bpf_spin_lock sdt_task_lock;
private(POOL_LOCK) struct bpf_spin_lock sdt_task_pool_alloc_lock;
/* allocation pools */
struct sdt_task_pool __arena sdt_task_desc_pool;
struct sdt_task_pool __arena sdt_task_chunk_pool;
struct sdt_task_pool __arena sdt_task_data_pool;
static SDT_TASK_FN_ATTRS int sdt_ffs(__u64 word)
{
unsigned int num = 0;
if ((word & 0xffffffff) == 0) {
num += 32;
word >>= 32;
}
if ((word & 0xffff) == 0) {
num += 16;
word >>= 16;
}
if ((word & 0xff) == 0) {
num += 8;
word >>= 8;
}
if ((word & 0xf) == 0) {
num += 4;
word >>= 4;
}
if ((word & 0x3) == 0) {
num += 2;
word >>= 2;
}
if ((word & 0x1) == 0) {
num += 1;
word >>= 1;
}
return num;
}
/* find the first empty slot */
static SDT_TASK_FN_ATTRS __u64 sdt_chunk_find_empty(struct sdt_task_desc __arena *desc)
{
__u64 freelist;
__u64 i;
cast_kern(desc);
for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) {
freelist = ~desc->allocated[i];
if (freelist == (__u64)0)
continue;
return (i * 64) + sdt_ffs(freelist);
}
return SDT_TASK_ENTS_PER_CHUNK;
}
/* simple memory allocator */
static SDT_TASK_FN_ATTRS
void __arena *sdt_task_alloc_from_pool(struct sdt_task_pool __arena *pool)
{
arena_list_node_t *elem = NULL;
void __arena *new_page = NULL;
arena_list_node_t *new_elem;
__u32 u, numelems;
/* if pool is empty, get new page */
bpf_spin_lock(&sdt_task_pool_alloc_lock);
if (pool->head.first) {
bpf_spin_unlock(&sdt_task_pool_alloc_lock);
elem = list_pop(&pool->head);
return (void __arena *)&elem->data;
}
bpf_spin_unlock(&sdt_task_pool_alloc_lock);
new_page = bpf_arena_alloc_pages(&arena, NULL, SDT_TASK_ENT_PAGES, NUMA_NO_NODE, 0);
if (!new_page)
return NULL;
/*
* Push all allocated elements except for last one that we use to
* satisfy the allocation.
*/
numelems = (SDT_TASK_ENT_PAGES * PAGE_SIZE) / pool->elem_size;
bpf_for(u, 0, numelems - 1) {
new_elem = new_page + u * pool->elem_size;
bpf_spin_lock(&sdt_task_pool_alloc_lock);
list_add_head(new_elem, &pool->head);
bpf_spin_unlock(&sdt_task_pool_alloc_lock);
}
elem = new_page + (numelems - 1) * pool->elem_size;
return (void __arena *)&elem->data;
}
static SDT_TASK_FN_ATTRS
void sdt_task_free_to_pool(void __arena *ptr, struct sdt_task_pool __arena *pool)
{
arena_list_node_t *elem;
__u64 __arena *data;
int i;
elem = arena_container_of(ptr, struct arena_list_node, data);
/* Zero out one word at a time since we cannot use memset. */
data = (__u64 __arena *)&elem->data;
cast_kern(data);
bpf_for(i, 0, pool->elem_size / 8) {
data[i] = (__u64)0;
}
bpf_spin_lock(&sdt_task_pool_alloc_lock);
list_add_head(elem, &pool->head);
bpf_spin_unlock(&sdt_task_pool_alloc_lock);
}
/* alloc desc and chunk and link chunk to desc and return desc */
static SDT_TASK_FN_ATTRS struct sdt_task_desc __arena *sdt_alloc_chunk(void)
{
struct sdt_task_chunk __arena *chunk;
struct sdt_task_desc __arena *desc;
struct sdt_task_desc __arena *out;
chunk = sdt_task_alloc_from_pool(&sdt_task_chunk_pool);
if (!chunk) {
bpf_printk("%s: failed to allocated chunk", __func__);
return NULL;
}
desc = sdt_task_alloc_from_pool(&sdt_task_desc_pool);
if (!desc) {
sdt_task_free_to_pool(chunk, &sdt_task_chunk_pool);
bpf_printk("%s: failed to allocated desc", __func__);
return NULL;
}
out = desc;
cast_kern(desc);
desc->nr_free = SDT_TASK_ENTS_PER_CHUNK;
desc->chunk = chunk;
return out;
}
static SDT_TASK_FN_ATTRS int sdt_pool_set_size(struct sdt_task_pool __arena *pool, __u64 data_size)
{
/* All allocations are wrapped in a linked list node. */
data_size += sizeof(struct arena_list_node);
if (data_size > (SDT_TASK_ENT_PAGES * PAGE_SIZE)) {
bpf_printk("allocation size %ld too large", data_size);
return -E2BIG;
}
cast_kern(pool);
pool->elem_size = data_size;
return 0;
}
static SDT_TASK_FN_ATTRS int sdt_pool_set_size_data(struct sdt_task_pool __arena *pool, __u64 data_size, __u64 free_size)
{
int ret;
ret = sdt_pool_set_size(pool, data_size);
if (ret)
return ret;
pool->free_size = free_size;
return 0;
}
/* Initialize the allocator: set the pool element sizes and allocate the root chunk. */
static SDT_TASK_FN_ATTRS int sdt_task_init(__u64 data_size)
{
__u64 free_size;
int ret;
ret = sdt_pool_set_size(&sdt_task_chunk_pool, sizeof(struct sdt_task_chunk));
if (ret != 0)
return ret;
ret = sdt_pool_set_size(&sdt_task_desc_pool, sizeof(struct sdt_task_desc));
if (ret != 0)
return ret;
/* Page align and wrap data into a descriptor. */
data_size = div_round_up(data_size, 8) * 8;
free_size = data_size;
data_size += sizeof(struct sdt_task_data);
ret = sdt_pool_set_size_data(&sdt_task_data_pool, data_size, free_size);
if (ret != 0)
return ret;
sdt_task_desc_root = sdt_alloc_chunk();
if (sdt_task_desc_root == NULL)
return -ENOMEM;
return 0;
}
static SDT_TASK_FN_ATTRS
int sdt_set_idx_state(struct sdt_task_desc __arena *desc, __u64 pos, bool state)
{
__u64 __arena *allocated = desc->allocated;
__u64 bit;
cast_kern(allocated);
if (pos >= SDT_TASK_ENTS_PER_CHUNK) {
bpf_spin_unlock(&sdt_task_lock);
bpf_printk("invalid access (0x%d, %s)\n", pos, state ? "set" : "unset");
bpf_spin_lock(&sdt_task_lock);
return -EINVAL;
}
bit = (__u64)1 << (pos % 64);
if (state)
allocated[pos / 64] |= bit;
else
allocated[pos / 64] &= ~bit;
return 0;
}
static SDT_TASK_FN_ATTRS void sdt_task_free_idx(__u64 idx)
{
const __u64 mask = (1 << SDT_TASK_ENTS_PER_CHUNK_SHIFT) - 1;
struct sdt_task_desc __arena *lv_desc[SDT_TASK_LEVELS];
struct sdt_task_desc * __arena *desc_children;
struct sdt_task_chunk __arena *chunk;
struct sdt_task_desc __arena *desc;
struct sdt_task_data __arena *data;
__u64 u, level, shift, pos;
__u64 lv_pos[SDT_TASK_LEVELS];
int i;
bpf_spin_lock(&sdt_task_lock);
desc = sdt_task_desc_root;
if (!desc) {
bpf_spin_unlock(&sdt_task_lock);
bpf_printk("%s: root not allocated", __func__);
return;
}
bpf_for(level, 0, SDT_TASK_LEVELS) {
shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_CHUNK_SHIFT;
pos = (idx >> shift) & mask;
lv_desc[level] = desc;
lv_pos[level] = pos;
if (level == SDT_TASK_LEVELS - 1)
break;
cast_kern(desc);
chunk = desc->chunk;
cast_kern(chunk);
desc_children = (struct sdt_task_desc * __arena *)chunk->descs;
desc = desc_children[pos];
if (!desc) {
bpf_spin_unlock(&sdt_task_lock);
bpf_printk("freeing nonexistent idx [0x%lx] (level %d)", idx, level);
return;
}
}
cast_kern(desc);
chunk = desc->chunk;
cast_kern(chunk);
pos = idx & mask;
data = chunk->data[pos];
if (!data) {
bpf_spin_unlock(&sdt_task_lock);
bpf_printk("freeing idx [0x%lx] (%p) without data", idx, &chunk->data[pos]);
return;
}
cast_kern(data);
data[0] = (struct sdt_task_data) {
.tid.gen = data->tid.gen + 1,
.tptr = 0,
};
/* Zero out one word at a time. */
bpf_for(i, 0, sdt_task_data_pool.free_size / 8) {
data->payload[i] = 0;
}
bpf_for(u, 0, SDT_TASK_LEVELS) {
level = SDT_TASK_LEVELS - 1 - u;
/* Only propagate upwards if we are the parent's only free chunk. */
desc = lv_desc[level];
sdt_set_idx_state(desc, lv_pos[level], false);
cast_kern(desc);
desc->nr_free += 1;
if (desc->nr_free > 1)
break;
}
bpf_spin_unlock(&sdt_task_lock);
return;
}
static SDT_TASK_FN_ATTRS
void __arena *sdt_task_retrieve(struct task_struct *p)
{
struct sdt_task_data __arena *data;
struct sdt_task_map_val *mval;
sdt_arena_verify();
mval = bpf_task_storage_get(&sdt_task_map, p, 0, 0);
if (!mval)
return NULL;
data = mval->data;
return (void __arena *)data->payload;
}
static SDT_TASK_FN_ATTRS void sdt_task_free(struct task_struct *p)
{
struct sdt_task_map_val *mval;
sdt_arena_verify();
mval = bpf_task_storage_get(&sdt_task_map, p, 0, 0);
if (!mval)
return;
sdt_task_free_idx(mval->tid.idx);
mval->data = NULL;
}
static SDT_TASK_FN_ATTRS
int sdt_task_find_empty(struct sdt_task_desc __arena *desc, struct sdt_task_desc * __arena *descp, __u64 *idxp)
{
struct sdt_task_desc * __arena *desc_children, __arena *new_chunk;
struct sdt_task_desc __arena *lv_desc[SDT_TASK_LEVELS];
struct sdt_task_chunk __arena *chunk;
struct sdt_task_desc __arena *tmp;
__u64 lv_pos[SDT_TASK_LEVELS];
__u64 u, pos, level;
__u64 idx = 0;
bpf_for(level, 0, SDT_TASK_LEVELS) {
pos = sdt_chunk_find_empty(desc);
/* Something has gone terribly wrong. */
if (pos > SDT_TASK_ENTS_PER_CHUNK)
return -EINVAL;
if (pos == SDT_TASK_ENTS_PER_CHUNK)
return -ENOMEM;
idx <<= SDT_TASK_ENTS_PER_CHUNK_SHIFT;
idx |= pos;
/* Log the levels to complete allocation. */
lv_desc[level] = desc;
lv_pos[level] = pos;
/* The rest of the loop is for internal node traversal. */
if (level == SDT_TASK_LEVELS - 1)
break;
cast_kern(desc);
chunk = desc->chunk;
cast_kern(chunk);
desc_children = (struct sdt_task_desc * __arena *)chunk->descs;
desc = desc_children[pos];
/* Someone else is populating the subtree. */
if (desc == (void *)SDT_TASK_ALLOC_RESERVE)
return -EAGAIN;
if (!desc) {
/* Reserve our spot and go allocate. */
desc_children[pos] = (void *)SDT_TASK_ALLOC_RESERVE;
bpf_spin_unlock(&sdt_task_lock);
new_chunk = sdt_alloc_chunk();
if (!new_chunk) {
bpf_printk("%s: allocating new chunk failed", __func__);
bpf_spin_lock(&sdt_task_lock);
return -ENOMEM;
}
bpf_spin_lock(&sdt_task_lock);
desc_children[pos] = new_chunk;
desc = new_chunk;
}
}
bpf_for(u, 0, SDT_TASK_LEVELS) {
level = SDT_TASK_LEVELS - 1 - u;
tmp = lv_desc[level];
cast_kern(tmp);
sdt_set_idx_state(tmp, lv_pos[level], true);
tmp->nr_free -= 1;
if (tmp->nr_free > 0)
break;
}
*descp = desc;
*idxp = idx;
return 0;
}
static SDT_TASK_FN_ATTRS
void __arena *sdt_task_alloc(struct task_struct *p)
{
struct sdt_task_data __arena *data = NULL;
struct sdt_task_desc __arena *desc;
struct sdt_task_chunk __arena *chunk;
struct sdt_task_map_val *mval;
__u64 idx, pos;
int ret;
mval = bpf_task_storage_get(&sdt_task_map, p, 0,
BPF_LOCAL_STORAGE_GET_F_CREATE);
if (!mval)
return NULL;
bpf_spin_lock(&sdt_task_lock);
bpf_repeat(SDT_TASK_ALLOCATION_ATTEMPTS) {
ret = sdt_task_find_empty(sdt_task_desc_root, &desc, &idx);
if (ret != -EAGAIN)
break;
}
if (ret != 0) {
bpf_spin_unlock(&sdt_task_lock);
bpf_printk("%s: error %d on allocation", __func__, ret);
return NULL;
}
cast_kern(desc);
chunk = desc->chunk;
cast_kern(chunk);
/* populate leaf node if necessary */
pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1);
data = chunk->data[pos];
if (!data) {
bpf_spin_unlock(&sdt_task_lock);
data = sdt_task_alloc_from_pool(&sdt_task_data_pool);
if (!data) {
sdt_task_free_idx(idx);
bpf_printk("%s: failed to allocate data from pool", __func__);
return NULL;
}
bpf_spin_lock(&sdt_task_lock);
chunk->data[pos] = data;
}
/* init and return */
cast_kern(data);
data->tid.idx = idx;
data->tptr = (__u64)p;
mval->tid = data->tid;
mval->data = data;
bpf_spin_unlock(&sdt_task_lock);
return (void __arena *)data->payload;
}
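The index handling in sdt_task_find_empty() and sdt_task_free_idx() above amounts to a fixed-radix encoding of the per-level slot positions; writing SHIFT for SDT_TASK_ENTS_PER_CHUNK_SHIFT and pos[] for the positions chosen at each level (notation only, not variables in the code):

/* Encode while walking root -> leaf: append each level's slot position. */
idx = (pos[0] << (2 * SHIFT)) | (pos[1] << SHIFT) | pos[2];

/* Decode at level lvl (0 == root), as sdt_task_free_idx() does: */
pos[lvl] = (idx >> ((SDT_TASK_LEVELS - 1 - lvl) * SHIFT)) & ((1 << SHIFT) - 1);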


@ -78,12 +78,14 @@ struct sys_stat {
volatile u32 max_perf_cri; /* maximum performance criticality */
volatile u32 thr_perf_cri; /* performance criticality threshold */
volatile u32 nr_stealee; /* number of stealee compute domains */
volatile u32 nr_violation; /* number of utilization violation */
volatile u32 nr_active; /* number of active cores */
volatile u64 nr_sched; /* total scheduling so far */
volatile u64 nr_perf_cri; /* number of performance-critical tasks scheduled */
volatile u64 nr_lat_cri; /* number of latency-critical tasks scheduled */
volatile u64 nr_x_migration; /* number of cross-domain task migrations */
volatile u64 nr_big; /* scheduled on big core */
volatile u64 nr_pc_on_big; /* performance-critical tasks scheduled on big core */
volatile u64 nr_lc_on_big; /* latency-critical tasks scheduled on big core */


@ -51,6 +51,9 @@ enum consts_internal {
performance mode when cpu util > 40% */
LAVD_CPDOM_STARV_NS = (2 * LAVD_SLICE_MAX_NS_DFL),
LAVD_CPDOM_MIGRATION_SHIFT = 3, /* 1/2**3 = +/- 12.5% */
LAVD_CPDOM_X_PROB_FT = (LAVD_SYS_STAT_INTERVAL_NS /
(2 * LAVD_SLICE_MAX_NS_DFL)), /* roughly twice per interval */
};
/*
@ -58,12 +61,15 @@ enum consts_internal {
* - system > numa node > llc domain > compute domain per core type (P or E)
*/
struct cpdom_ctx {
u64 last_consume_clk; /* when the associated DSQ was consumed */
u64 id; /* id of this compute domain (== dsq_id) */
u64 alt_id; /* id of the closest compute domain of alternative type (== dsq id) */
u8 node_id; /* numa domain id */
u8 is_big; /* is it a big core or little core? */
u8 is_active; /* if this compute domain is active */
u8 is_stealer; /* this domain should steal tasks from others */
u8 is_stealee; /* a stealer domain should steal tasks from this domain */
u16 nr_cpus; /* the number of CPUs in this compute domain */
u32 nr_q_tasks_per_cpu; /* the number of queued tasks per CPU in this domain (x1000) */
u8 nr_neighbors[LAVD_CPDOM_MAX_DIST]; /* number of neighbors per distance */
u64 neighbor_bits[LAVD_CPDOM_MAX_DIST]; /* bitmask of neighbors per distance */
u64 __cpumask[LAVD_CPU_ID_MAX/64]; /* cpumasks belongs to this compute domain */
@ -129,6 +135,7 @@ struct cpu_ctx {
/*
* Information for statistics.
*/
volatile u32 nr_x_migration;
volatile u32 nr_perf_cri;
volatile u32 nr_lat_cri;


@ -1108,7 +1108,7 @@ void BPF_STRUCT_OPS(lavd_enqueue, struct task_struct *p, u64 enq_flags)
}
}
static bool consume_dsq(s32 cpu, u64 dsq_id, u64 now)
static bool consume_dsq(u64 dsq_id)
{
struct cpdom_ctx *cpdomc;
@ -1120,7 +1120,6 @@ static bool consume_dsq(s32 cpu, u64 dsq_id, u64 now)
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
return false;
}
WRITE_ONCE(cpdomc->last_consume_clk, now);
/*
* Try to consume a task on the associated DSQ.
@ -1130,81 +1129,110 @@ static bool consume_dsq(s32 cpu, u64 dsq_id, u64 now)
return false;
}
static bool consume_starving_task(s32 cpu, struct cpu_ctx *cpuc, u64 now)
static bool try_to_steal_task(struct cpdom_ctx *cpdomc)
{
struct cpdom_ctx *cpdomc;
u64 dsq_id = cpuc->cpdom_poll_pos;
u64 dl;
bool ret = false;
int i;
if (nr_cpdoms == 1)
return false;
bpf_for(i, 0, nr_cpdoms) {
if (i >= LAVD_CPDOM_MAX_NR)
break;
dsq_id = (dsq_id + i) % LAVD_CPDOM_MAX_NR;
if (dsq_id == cpuc->cpdom_id)
continue;
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (!cpdomc) {
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
goto out;
}
if (cpdomc->is_active) {
dl = READ_ONCE(cpdomc->last_consume_clk) + LAVD_CPDOM_STARV_NS;
if (dl < now) {
ret = consume_dsq(cpu, dsq_id, now);
}
goto out;
}
}
out:
cpuc->cpdom_poll_pos = (dsq_id + 1) % LAVD_CPDOM_MAX_NR;
return ret;
}
static bool consume_task(s32 cpu, struct cpu_ctx *cpuc, u64 now)
{
struct cpdom_ctx *cpdomc, *cpdomc_pick;
u64 dsq_id, nr_nbr;
struct cpdom_ctx *cpdomc_pick;
u64 nr_nbr, dsq_id;
s64 nuance;
/*
* If there is a starving DSQ, try to consume it first.
* If not all CPUs are in use -- i.e., the system is under-utilized --
* there is no point in load balancing. It is better to make an
* effort to increase system utilization.
*/
if (consume_starving_task(cpu, cpuc, now))
return true;
/*
* Try to consume from CPU's associated DSQ.
*/
dsq_id = cpuc->cpdom_id;
if (consume_dsq(cpu, dsq_id, now))
return true;
/*
* If there is no task in the associated DSQ, traverse neighbor
* compute domains in distance order -- task stealing.
*/
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (!cpdomc) {
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
if (!use_full_cpus())
return false;
}
/*
* Probabilistically make a go or no go decision to avoid the
* thundering herd problem. In other words, only one out of nr_cpus
* CPUs will try to steal a task at any given moment.
*/
if (!prob_x_out_of_y(1, cpdomc->nr_cpus * LAVD_CPDOM_X_PROB_FT))
return false;
/*
* Traverse neighbor compute domains in distance order.
*/
nuance = bpf_get_prandom_u32();
for (int i = 0; i < LAVD_CPDOM_MAX_DIST; i++) {
nr_nbr = min(cpdomc->nr_neighbors[i], LAVD_CPDOM_MAX_NR);
if (nr_nbr == 0)
break;
nuance = bpf_get_prandom_u32();
for (int j = 0; j < LAVD_CPDOM_MAX_NR; j++, nuance = dsq_id + 1) {
/*
* Traverse neighbor in the same distance in arbitrary order.
*/
for (int j = 0; j < LAVD_CPDOM_MAX_NR; j++, nuance++) {
if (j >= nr_nbr)
break;
dsq_id = pick_any_bit(cpdomc->neighbor_bits[i], nuance);
if (dsq_id == -ENOENT)
continue;
cpdomc_pick = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (!cpdomc_pick) {
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
return false;
}
if (!cpdomc_pick->is_stealee || !cpdomc_pick->is_active)
continue;
/*
* If task stealing is successful, mark the stealer
* and the stealee's job done. By marking done,
* those compute domains would not be involved in
* load balancing until the end of this round,
* so this helps gradual migration. Note that multiple
* stealers can steal tasks from the same stealee.
* However, we don't coordinate concurrent stealing
* because the chance is low and there is no harm
* in slight over-stealing.
*/
if (consume_dsq(dsq_id)) {
WRITE_ONCE(cpdomc_pick->is_stealee, false);
WRITE_ONCE(cpdomc->is_stealer, false);
return true;
}
}
/*
* Now, try to steal a task from a farther neighbor for load
* balancing. Since task migration from a farther neighbor is
* more expensive (e.g., crossing a NUMA boundary), do so with
* a lot of hesitation. The chance of stealing from a farther
* neighbor decreases exponentially with distance, which in
* turn favors closer migrations.
*/
if (!prob_x_out_of_y(1, LAVD_CPDOM_X_PROB_FT))
break;
}
return false;
}
static bool force_to_steal_task(struct cpdom_ctx *cpdomc)
{
struct cpdom_ctx *cpdomc_pick;
u64 nr_nbr, dsq_id;
s64 nuance;
/*
* Traverse neighbor compute domains in distance order.
*/
nuance = bpf_get_prandom_u32();
for (int i = 0; i < LAVD_CPDOM_MAX_DIST; i++) {
nr_nbr = min(cpdomc->nr_neighbors[i], LAVD_CPDOM_MAX_NR);
if (nr_nbr == 0)
break;
/*
* Traverse neighbor in the same distance in arbitrary order.
*/
for (int j = 0; j < LAVD_CPDOM_MAX_NR; j++, nuance++) {
if (j >= nr_nbr)
break;
@ -1221,7 +1249,7 @@ static bool consume_task(s32 cpu, struct cpu_ctx *cpuc, u64 now)
if (!cpdomc_pick->is_active)
continue;
if (consume_dsq(cpu, dsq_id, now))
if (consume_dsq(dsq_id))
return true;
}
}
@ -1229,9 +1257,51 @@ static bool consume_task(s32 cpu, struct cpu_ctx *cpuc, u64 now)
return false;
}
static bool consume_task(struct cpu_ctx *cpuc)
{
struct cpdom_ctx *cpdomc;
u64 dsq_id;
dsq_id = cpuc->cpdom_id;
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (!cpdomc) {
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
return false;
}
/*
* If the current compute domain is a stealer, try to steal
* a task from any of stealee domains probabilistically.
*/
if (cpdomc->is_stealer && try_to_steal_task(cpdomc))
goto x_domain_migration_out;
/*
* Try to consume a task from CPU's associated DSQ.
*/
if (consume_dsq(dsq_id))
return true;
/*
* If there is no task in the associated DSQ, traverse neighbor
* compute domains in distance order -- task stealing.
*/
if (force_to_steal_task(cpdomc))
goto x_domain_migration_out;
return false;
/*
* A task migration across compute domains has happened,
* so update the statistics.
*/
x_domain_migration_out:
cpuc->nr_x_migration++;
return true;
}
void BPF_STRUCT_OPS(lavd_dispatch, s32 cpu, struct task_struct *prev)
{
u64 now = bpf_ktime_get_ns();
struct cpu_ctx *cpuc;
struct task_ctx *taskc;
struct bpf_cpumask *active, *ovrflw;
@ -1365,10 +1435,7 @@ consume_out:
/*
* Consume a task if requested.
*/
if (!try_consume)
return;
if (consume_task(cpu, cpuc, now))
if (try_consume && consume_task(cpuc))
return;
/*
@ -1805,8 +1872,6 @@ static s32 init_cpdoms(u64 now)
if (!cpdomc->is_active)
continue;
WRITE_ONCE(cpdomc->last_consume_clk, now);
/*
* Create an associated DSQ on its associated NUMA domain.
*/
@ -2024,6 +2089,7 @@ static s32 init_per_cpu_ctx(u64 now)
}
cpuc->cpdom_id = cpdomc->id;
cpuc->cpdom_alt_id = cpdomc->alt_id;
cpdomc->nr_cpus++;
}
}
}


@ -38,6 +38,8 @@ struct sys_stat_ctx {
u32 nr_sched;
u32 nr_perf_cri;
u32 nr_lat_cri;
u32 nr_x_migration;
u32 nr_stealee;
u32 nr_big;
u32 nr_pc_on_big;
u32 nr_lc_on_big;
@ -62,10 +64,66 @@ static void init_sys_stat_ctx(struct sys_stat_ctx *c)
c->stat_next->last_update_clk = c->now;
}
static void plan_x_cpdom_migration(struct sys_stat_ctx *c)
{
struct cpdom_ctx *cpdomc;
u64 dsq_id;
u32 avg_nr_q_tasks_per_cpu = 0, nr_q_tasks, x_mig_delta;
u32 stealer_threshold, stealee_threshold;
/*
* Calculate the average number of queued tasks per CPU per compute domain.
*/
bpf_for(dsq_id, 0, nr_cpdoms) {
if (dsq_id >= LAVD_CPDOM_MAX_NR)
break;
nr_q_tasks = scx_bpf_dsq_nr_queued(dsq_id);
c->nr_queued_task += nr_q_tasks;
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
cpdomc->nr_q_tasks_per_cpu = (nr_q_tasks * 1000) / cpdomc->nr_cpus;
avg_nr_q_tasks_per_cpu += cpdomc->nr_q_tasks_per_cpu;
}
avg_nr_q_tasks_per_cpu /= nr_cpdoms;
/*
* Determine stealer and stealee domains.
*
* A stealer domain, whose per-CPU queue length is shorter than
* the average, will steal a task from any of the stealee domains,
* whose per-CPU queue lengths are longer than the average.
* Compute domains around the average do nothing.
*/
x_mig_delta = avg_nr_q_tasks_per_cpu >> LAVD_CPDOM_MIGRATION_SHIFT;
stealer_threshold = avg_nr_q_tasks_per_cpu - x_mig_delta;
stealee_threshold = avg_nr_q_tasks_per_cpu + x_mig_delta;
bpf_for(dsq_id, 0, nr_cpdoms) {
if (dsq_id >= LAVD_CPDOM_MAX_NR)
break;
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (cpdomc->nr_q_tasks_per_cpu < stealer_threshold) {
WRITE_ONCE(cpdomc->is_stealer, true);
WRITE_ONCE(cpdomc->is_stealee, false);
}
else if (cpdomc->nr_q_tasks_per_cpu > stealee_threshold) {
WRITE_ONCE(cpdomc->is_stealer, false);
WRITE_ONCE(cpdomc->is_stealee, true);
c->nr_stealee++;
}
else {
WRITE_ONCE(cpdomc->is_stealer, false);
WRITE_ONCE(cpdomc->is_stealee, false);
}
}
}
static void collect_sys_stat(struct sys_stat_ctx *c)
{
u64 dsq_id;
int cpu, nr;
int cpu;
bpf_for(cpu, 0, nr_cpu_ids) {
struct cpu_ctx *cpuc = get_cpu_ctx_id(cpu);
@ -94,6 +152,9 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
c->nr_lat_cri += cpuc->nr_lat_cri;
cpuc->nr_lat_cri = 0;
c->nr_x_migration += cpuc->nr_x_migration;
cpuc->nr_x_migration = 0;
/*
* Accumulate the task's latency criticality information.
*
@ -169,12 +230,6 @@ static void collect_sys_stat(struct sys_stat_ctx *c)
c->idle_total += cpuc->idle_total;
cpuc->idle_total = 0;
}
bpf_for(dsq_id, 0, LAVD_CPDOM_MAX_NR) {
nr = scx_bpf_dsq_nr_queued(dsq_id);
if (nr > 0)
c->nr_queued_task += nr;
}
}
static void calc_sys_stat(struct sys_stat_ctx *c)
@ -239,6 +294,8 @@ static void update_sys_stat_next(struct sys_stat_ctx *c)
c->stat_cur->thr_perf_cri; /* will be updated later */
}
stat_next->nr_stealee = c->nr_stealee;
stat_next->nr_violation =
calc_avg32(stat_cur->nr_violation, c->nr_violation);
@ -260,6 +317,7 @@ static void update_sys_stat_next(struct sys_stat_ctx *c)
stat_next->nr_sched >>= 1;
stat_next->nr_perf_cri >>= 1;
stat_next->nr_lat_cri >>= 1;
stat_next->nr_x_migration >>= 1;
stat_next->nr_big >>= 1;
stat_next->nr_pc_on_big >>= 1;
stat_next->nr_lc_on_big >>= 1;
@ -272,6 +330,7 @@ static void update_sys_stat_next(struct sys_stat_ctx *c)
stat_next->nr_sched += c->nr_sched;
stat_next->nr_perf_cri += c->nr_perf_cri;
stat_next->nr_lat_cri += c->nr_lat_cri;
stat_next->nr_x_migration += c->nr_x_migration;
stat_next->nr_big += c->nr_big;
stat_next->nr_pc_on_big += c->nr_pc_on_big;
stat_next->nr_lc_on_big += c->nr_lc_on_big;
@ -287,6 +346,7 @@ static void do_update_sys_stat(void)
* Collect and prepare the next version of stat.
*/
init_sys_stat_ctx(&c);
plan_x_cpdom_migration(&c);
collect_sys_stat(&c);
calc_sys_stat(&c);
update_sys_stat_next(&c);


@ -299,3 +299,14 @@ static void set_on_core_type(struct task_ctx *taskc,
WRITE_ONCE(taskc->on_big, on_big);
WRITE_ONCE(taskc->on_little, on_little);
}
static bool prob_x_out_of_y(u32 x, u32 y)
{
/*
* Return true with probability x/y: draw r uniformly from [0, y)
* and return whether it falls among the first x values.
*/
u32 r = bpf_get_prandom_u32() % y;
return r < x;
}
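For example, prob_x_out_of_y(1, 8) returns true roughly one time in eight. try_to_steal_task() above uses this both to thin out concurrent stealers (one chance in nr_cpus * LAVD_CPDOM_X_PROB_FT per dispatch) and to decay the stealing probability at each additional hop of distance.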


@ -711,6 +711,8 @@ impl<'a> Scheduler<'a> {
let nr_sched = st.nr_sched;
let pc_pc = Self::get_pc(st.nr_perf_cri, nr_sched);
let pc_lc = Self::get_pc(st.nr_lat_cri, nr_sched);
let pc_x_migration = Self::get_pc(st.nr_x_migration, nr_sched);
let nr_stealee = st.nr_stealee;
let nr_big = st.nr_big;
let pc_big = Self::get_pc(nr_big, nr_sched);
let pc_pc_on_big = Self::get_pc(st.nr_pc_on_big, nr_big);
@ -730,6 +732,8 @@ impl<'a> Scheduler<'a> {
nr_sched,
pc_pc,
pc_lc,
pc_x_migration,
nr_stealee,
pc_big,
pc_pc_on_big,
pc_lc_on_big,


@ -37,6 +37,12 @@ pub struct SysStats {
#[stat(desc = "% of latency-critical tasks")]
pub pc_lc: f64,
#[stat(desc = "% of cross domain task migration")]
pub pc_x_migration: f64,
#[stat(desc = "Number of stealee domains")]
pub nr_stealee: u32,
#[stat(desc = "% of tasks scheduled on big cores")]
pub pc_big: f64,
@ -63,13 +69,15 @@ impl SysStats {
pub fn format_header<W: Write>(w: &mut W) -> Result<()> {
writeln!(
w,
"\x1b[93m| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |\x1b[0m",
"\x1b[93m| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |\x1b[0m",
"MSEQ",
"# Q TASK",
"# ACT CPU",
"# SCHED",
"PERF-CR%",
"LAT-CR%",
"X-MIG%",
"# STLEE",
"BIG%",
"PC/BIG%",
"LC/BIG%",
@ -88,13 +96,15 @@ impl SysStats {
writeln!(
w,
"| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |",
"| {:8} | {:9} | {:9} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:8} | {:11} | {:12} | {:12} | {:12} |",
self.mseq,
self.nr_queued_task,
self.nr_active,
self.nr_sched,
GPoint(self.pc_pc),
GPoint(self.pc_lc),
GPoint(self.pc_x_migration),
self.nr_stealee,
GPoint(self.pc_big),
GPoint(self.pc_pc_on_big),
GPoint(self.pc_lc_on_big),