Mirror of https://github.com/sched-ext/scx.git (synced 2024-11-25 04:00:24 +00:00)

rustu: Import scx_rusty and scx_layered from kernel tree

commit 01d8351616 (parent 30b40ac4d3)

rust/scx_utils/.gitignore (new file, vendored, 1 line)
@@ -0,0 +1 @@
Cargo.lock

rust/scx_utils/Cargo.toml (modified)
@@ -1,6 +1,6 @@
 [package]
 name = "scx_utils"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 authors = ["Tejun Heo <tj@kernel.org>"]
 license = "GPL-2.0"

scheds/rust-user/scx_layered/.gitignore (new file, vendored, 3 lines)
@@ -0,0 +1,3 @@
src/bpf/.output
Cargo.lock
target

scheds/rust-user/scx_layered/Cargo.toml (new file, 28 lines)
@@ -0,0 +1,28 @@
[package]
name = "scx_layered"
version = "0.0.1"
authors = ["Tejun Heo <htejun@meta.com>", "Meta"]
edition = "2021"
description = "Userspace scheduling with BPF for Ads"
license = "GPL-2.0-only"

[dependencies]
anyhow = "1.0"
bitvec = "1.0"
clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] }
ctrlc = { version = "3.1", features = ["termination"] }
fb_procfs = "0.7"
lazy_static = "1.4"
libbpf-rs = "0.21"
libc = "0.2"
log = "0.4"
scx_utils = { path = "../../../rust/scx_utils", version = "0.2" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
simplelog = "0.12"

[build-dependencies]
scx_utils = { path = "../../../rust/scx_utils", version = "0.2" }

[features]
enable_backtrace = []

scheds/rust-user/scx_layered/build.rs (new file, 13 lines)
@@ -0,0 +1,13 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.

fn main() {
    scx_utils::BpfBuilder::new()
        .unwrap()
        .enable_intf("src/bpf/intf.h", "bpf_intf.rs")
        .enable_skel("src/bpf/main.bpf.c", "bpf")
        .build()
        .unwrap();
}

scheds/rust-user/scx_layered/rustfmt.toml (new file, 8 lines)
@@ -0,0 +1,8 @@
# Get help on options with `rustfmt --help=config`
# Please keep these in alphabetical order.
edition = "2021"
group_imports = "StdExternalCrate"
imports_granularity = "Item"
merge_derives = false
use_field_init_shorthand = true
version = "Two"

scheds/rust-user/scx_layered/src/bpf/intf.h (new file, 100 lines)
@@ -0,0 +1,100 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
#ifndef __INTF_H
#define __INTF_H

#include <stdbool.h>
#ifndef __kptr
#ifdef __KERNEL__
#error "__kptr_ref not defined in the kernel"
#endif
#define __kptr
#endif

#ifndef __KERNEL__
typedef unsigned long long u64;
typedef long long s64;
#endif

#include "ravg.bpf.h"

enum consts {
	MAX_CPUS_SHIFT = 9,
	MAX_CPUS = 1 << MAX_CPUS_SHIFT,
	MAX_CPUS_U8 = MAX_CPUS / 8,
	MAX_TASKS = 131072,
	MAX_PATH = 4096,
	MAX_COMM = 16,
	MAX_LAYER_MATCH_ORS = 32,
	MAX_LAYERS = 16,
	USAGE_HALF_LIFE = 100000000, /* 100ms */

	/* XXX remove */
	MAX_CGRP_PREFIXES = 32
};

/* Statistics */
enum global_stat_idx {
	GSTAT_TASK_CTX_FREE_FAILED,
	NR_GSTATS,
};

enum layer_stat_idx {
	LSTAT_LOCAL,
	LSTAT_GLOBAL,
	LSTAT_OPEN_IDLE,
	LSTAT_AFFN_VIOL,
	LSTAT_PREEMPT,
	NR_LSTATS,
};

struct cpu_ctx {
	bool current_preempt;
	u64 layer_cycles[MAX_LAYERS];
	u64 gstats[NR_GSTATS];
	u64 lstats[MAX_LAYERS][NR_LSTATS];
};

enum layer_match_kind {
	MATCH_CGROUP_PREFIX,
	MATCH_COMM_PREFIX,
	MATCH_NICE_ABOVE,
	MATCH_NICE_BELOW,

	NR_LAYER_MATCH_KINDS,
};

struct layer_match {
	int kind;
	char cgroup_prefix[MAX_PATH];
	char comm_prefix[MAX_COMM];
	int nice_above_or_below;
};

struct layer_match_ands {
	struct layer_match matches[NR_LAYER_MATCH_KINDS];
	int nr_match_ands;
};

struct layer {
	struct layer_match_ands matches[MAX_LAYER_MATCH_ORS];
	unsigned int nr_match_ors;
	unsigned int idx;
	bool open;
	bool preempt;

	u64 vtime_now;
	u64 nr_tasks;

	u64 load;
	struct ravg_data load_rd;

	u64 cpus_seq;
	unsigned int refresh_cpus;
	unsigned char cpus[MAX_CPUS_U8];
	unsigned int nr_cpus; // managed from BPF side
};

#endif /* __INTF_H */
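
Note: the `unsigned char cpus[MAX_CPUS_U8]` field above packs one bit per CPU, eight CPUs per byte; main.bpf.c later indexes it as `cpus[cpu / 8]` and tests bit `1 << (cpu % 8)`. A minimal standalone userspace sketch of that layout (illustrative only, not part of this commit; constants copied from the header above):

#include <stdio.h>

#define MAX_CPUS_SHIFT 9
#define MAX_CPUS (1 << MAX_CPUS_SHIFT)
#define MAX_CPUS_U8 (MAX_CPUS / 8)

/* Set and test bits using the same cpu/8, cpu%8 layout as layer->cpus. */
static void cpus_set(unsigned char *cpus, int cpu)
{
	cpus[cpu / 8] |= 1 << (cpu % 8);
}

static int cpus_test(const unsigned char *cpus, int cpu)
{
	return cpus[cpu / 8] & (1 << (cpu % 8));
}

int main(void)
{
	unsigned char cpus[MAX_CPUS_U8] = {0};

	cpus_set(cpus, 0);
	cpus_set(cpus, 10);
	printf("cpu0=%d cpu1=%d cpu10=%d\n",
	       !!cpus_test(cpus, 0), !!cpus_test(cpus, 1), !!cpus_test(cpus, 10));
	return 0;
}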

scheds/rust-user/scx_layered/src/bpf/main.bpf.c (new file, 974 lines)
@@ -0,0 +1,974 @@
|
||||
/* Copyright (c) Meta Platforms, Inc. and affiliates. */
|
||||
#include "scx_common.bpf.h"
|
||||
#include "intf.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
|
||||
const volatile u32 debug = 0;
|
||||
const volatile u64 slice_ns = SCX_SLICE_DFL;
|
||||
const volatile u32 nr_possible_cpus = 1;
|
||||
const volatile u32 nr_layers = 1;
|
||||
const volatile bool smt_enabled = true;
|
||||
const volatile unsigned char all_cpus[MAX_CPUS_U8];
|
||||
|
||||
private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask;
|
||||
struct layer layers[MAX_LAYERS];
|
||||
u32 fallback_cpu;
|
||||
static u32 preempt_cursor;
|
||||
|
||||
#define dbg(fmt, args...) do { if (debug) bpf_printk(fmt, ##args); } while (0)
|
||||
#define trace(fmt, args...) do { if (debug > 1) bpf_printk(fmt, ##args); } while (0)
|
||||
|
||||
#include "util.bpf.c"
|
||||
#include "ravg_impl.bpf.h"
|
||||
|
||||
struct user_exit_info uei;
|
||||
|
||||
static inline bool vtime_before(u64 a, u64 b)
|
||||
{
|
||||
return (s64)(a - b) < 0;
|
||||
}
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
|
||||
__type(key, u32);
|
||||
__type(value, struct cpu_ctx);
|
||||
__uint(max_entries, 1);
|
||||
} cpu_ctxs SEC(".maps");
|
||||
|
||||
static struct cpu_ctx *lookup_cpu_ctx(int cpu)
|
||||
{
|
||||
struct cpu_ctx *cctx;
|
||||
u32 zero = 0;
|
||||
|
||||
if (cpu < 0)
|
||||
cctx = bpf_map_lookup_elem(&cpu_ctxs, &zero);
|
||||
else
|
||||
cctx = bpf_map_lookup_percpu_elem(&cpu_ctxs, &zero, cpu);
|
||||
|
||||
if (!cctx) {
|
||||
scx_bpf_error("no cpu_ctx for cpu %d", cpu);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return cctx;
|
||||
}
|
||||
|
||||
static void gstat_inc(enum global_stat_idx idx, struct cpu_ctx *cctx)
|
||||
{
|
||||
if (idx < 0 || idx >= NR_GSTATS) {
|
||||
scx_bpf_error("invalid global stat idx %d", idx);
|
||||
return;
|
||||
}
|
||||
|
||||
cctx->gstats[idx]++;
|
||||
}
|
||||
|
||||
static void lstat_inc(enum layer_stat_idx idx, struct layer *layer, struct cpu_ctx *cctx)
|
||||
{
|
||||
u64 *vptr;
|
||||
|
||||
if ((vptr = MEMBER_VPTR(*cctx, .lstats[layer->idx][idx])))
|
||||
(*vptr)++;
|
||||
else
|
||||
scx_bpf_error("invalid layer or stat idxs: %d, %d", idx, layer->idx);
|
||||
}
|
||||
|
||||
struct lock_wrapper {
|
||||
struct bpf_spin_lock lock;
|
||||
};
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||
__type(key, u32);
|
||||
__type(value, struct lock_wrapper);
|
||||
__uint(max_entries, MAX_LAYERS);
|
||||
__uint(map_flags, 0);
|
||||
} layer_load_locks SEC(".maps");
|
||||
|
||||
static void adj_load(u32 layer_idx, s64 adj, u64 now)
|
||||
{
|
||||
struct layer *layer;
|
||||
struct lock_wrapper *lockw;
|
||||
|
||||
layer = MEMBER_VPTR(layers, [layer_idx]);
|
||||
lockw = bpf_map_lookup_elem(&layer_load_locks, &layer_idx);
|
||||
|
||||
if (!layer || !lockw) {
|
||||
scx_bpf_error("Can't access layer%d or its load_lock", layer_idx);
|
||||
return;
|
||||
}
|
||||
|
||||
bpf_spin_lock(&lockw->lock);
|
||||
layer->load += adj;
|
||||
ravg_accumulate(&layer->load_rd, layer->load, now, USAGE_HALF_LIFE);
|
||||
bpf_spin_unlock(&lockw->lock);
|
||||
|
||||
if (debug && adj < 0 && (s64)layer->load < 0)
|
||||
scx_bpf_error("cpu%d layer%d load underflow (load=%lld adj=%lld)",
|
||||
bpf_get_smp_processor_id(), layer_idx, layer->load, adj);
|
||||
}
|
||||
|
||||
struct layer_cpumask_wrapper {
|
||||
struct bpf_cpumask __kptr *cpumask;
|
||||
};
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||
__type(key, u32);
|
||||
__type(value, struct layer_cpumask_wrapper);
|
||||
__uint(max_entries, MAX_LAYERS);
|
||||
__uint(map_flags, 0);
|
||||
} layer_cpumasks SEC(".maps");
|
||||
|
||||
static struct cpumask *lookup_layer_cpumask(int idx)
|
||||
{
|
||||
struct layer_cpumask_wrapper *cpumaskw;
|
||||
|
||||
if ((cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &idx))) {
|
||||
return (struct cpumask *)cpumaskw->cpumask;
|
||||
} else {
|
||||
scx_bpf_error("no layer_cpumask");
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static void refresh_cpumasks(int idx)
|
||||
{
|
||||
struct layer_cpumask_wrapper *cpumaskw;
|
||||
struct layer *layer;
|
||||
int cpu, total = 0;
|
||||
|
||||
if (!__sync_val_compare_and_swap(&layers[idx].refresh_cpus, 1, 0))
|
||||
return;
|
||||
|
||||
cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &idx);
|
||||
|
||||
bpf_for(cpu, 0, nr_possible_cpus) {
|
||||
u8 *u8_ptr;
|
||||
|
||||
if ((u8_ptr = MEMBER_VPTR(layers, [idx].cpus[cpu / 8]))) {
|
||||
/*
|
||||
* XXX - The following test should be outside the loop
|
||||
* but that makes the verifier think that
|
||||
* cpumaskw->cpumask might be NULL in the loop.
|
||||
*/
|
||||
barrier_var(cpumaskw);
|
||||
if (!cpumaskw || !cpumaskw->cpumask) {
|
||||
scx_bpf_error("can't happen");
|
||||
return;
|
||||
}
|
||||
|
||||
if (*u8_ptr & (1 << (cpu % 8))) {
|
||||
bpf_cpumask_set_cpu(cpu, cpumaskw->cpumask);
|
||||
total++;
|
||||
} else {
|
||||
bpf_cpumask_clear_cpu(cpu, cpumaskw->cpumask);
|
||||
}
|
||||
} else {
|
||||
scx_bpf_error("can't happen");
|
||||
}
|
||||
}
|
||||
|
||||
// XXX - shouldn't be necessary
|
||||
layer = MEMBER_VPTR(layers, [idx]);
|
||||
if (!layer) {
|
||||
scx_bpf_error("can't happen");
|
||||
return;
|
||||
}
|
||||
|
||||
layer->nr_cpus = total;
|
||||
__sync_fetch_and_add(&layer->cpus_seq, 1);
|
||||
trace("LAYER[%d] now has %d cpus, seq=%llu", idx, layer->nr_cpus, layer->cpus_seq);
|
||||
}
|
||||
|
||||
SEC("fentry/scheduler_tick")
|
||||
int scheduler_tick_fentry(const void *ctx)
|
||||
{
|
||||
int idx;
|
||||
|
||||
if (bpf_get_smp_processor_id() == 0)
|
||||
bpf_for(idx, 0, nr_layers)
|
||||
refresh_cpumasks(idx);
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct task_ctx {
|
||||
int pid;
|
||||
|
||||
int layer;
|
||||
bool refresh_layer;
|
||||
u64 layer_cpus_seq;
|
||||
struct bpf_cpumask __kptr *layered_cpumask;
|
||||
|
||||
bool all_cpus_allowed;
|
||||
bool dispatch_local;
|
||||
u64 started_running_at;
|
||||
};
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__type(key, pid_t);
|
||||
__type(value, struct task_ctx);
|
||||
__uint(max_entries, MAX_TASKS);
|
||||
__uint(map_flags, 0);
|
||||
} task_ctxs SEC(".maps");
|
||||
|
||||
struct task_ctx *lookup_task_ctx_may_fail(struct task_struct *p)
|
||||
{
|
||||
s32 pid = p->pid;
|
||||
|
||||
return bpf_map_lookup_elem(&task_ctxs, &pid);
|
||||
}
|
||||
|
||||
struct task_ctx *lookup_task_ctx(struct task_struct *p)
|
||||
{
|
||||
struct task_ctx *tctx;
|
||||
s32 pid = p->pid;
|
||||
|
||||
if ((tctx = bpf_map_lookup_elem(&task_ctxs, &pid))) {
|
||||
return tctx;
|
||||
} else {
|
||||
scx_bpf_error("task_ctx lookup failed");
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
struct layer *lookup_layer(int idx)
|
||||
{
|
||||
if (idx < 0 || idx >= nr_layers) {
|
||||
scx_bpf_error("invalid layer %d", idx);
|
||||
return NULL;
|
||||
}
|
||||
return &layers[idx];
|
||||
}
|
||||
|
||||
SEC("tp_btf/cgroup_attach_task")
|
||||
int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path,
|
||||
struct task_struct *leader, bool threadgroup)
|
||||
{
|
||||
struct task_struct *next;
|
||||
struct task_ctx *tctx;
|
||||
int leader_pid = leader->pid;
|
||||
|
||||
if (!(tctx = lookup_task_ctx_may_fail(leader)))
|
||||
return 0;
|
||||
tctx->refresh_layer = true;
|
||||
|
||||
if (!threadgroup)
|
||||
return 0;
|
||||
|
||||
if (!(next = bpf_task_acquire(leader))) {
|
||||
scx_bpf_error("failed to acquire leader");
|
||||
return 0;
|
||||
}
|
||||
|
||||
bpf_repeat(MAX_TASKS) {
|
||||
struct task_struct *p;
|
||||
int pid;
|
||||
|
||||
p = container_of(next->thread_group.next, struct task_struct, thread_group);
|
||||
bpf_task_release(next);
|
||||
|
||||
pid = BPF_CORE_READ(p, pid);
|
||||
if (pid == leader_pid) {
|
||||
next = NULL;
|
||||
break;
|
||||
}
|
||||
|
||||
next = bpf_task_from_pid(pid);
|
||||
if (!next) {
|
||||
scx_bpf_error("thread iteration failed");
|
||||
break;
|
||||
}
|
||||
|
||||
if ((tctx = lookup_task_ctx(next)))
|
||||
tctx->refresh_layer = true;
|
||||
}
|
||||
|
||||
if (next)
|
||||
bpf_task_release(next);
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("tp_btf/task_rename")
|
||||
int BPF_PROG(tp_task_rename, struct task_struct *p, const char *buf)
|
||||
{
|
||||
struct task_ctx *tctx;
|
||||
|
||||
if ((tctx = lookup_task_ctx_may_fail(p)))
|
||||
tctx->refresh_layer = true;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void maybe_refresh_layered_cpumask(struct cpumask *layered_cpumask,
|
||||
struct task_struct *p, struct task_ctx *tctx,
|
||||
const struct cpumask *layer_cpumask)
|
||||
{
|
||||
u64 layer_seq = layers->cpus_seq;
|
||||
|
||||
if (tctx->layer_cpus_seq == layer_seq)
|
||||
return;
|
||||
|
||||
/*
|
||||
* XXX - We're assuming that the updated @layer_cpumask matching the new
|
||||
* @layer_seq is visible which may not be true. For now, leave it as-is.
|
||||
* Let's update once BPF grows enough memory ordering constructs.
|
||||
*/
|
||||
bpf_cpumask_and((struct bpf_cpumask *)layered_cpumask, layer_cpumask, p->cpus_ptr);
|
||||
tctx->layer_cpus_seq = layer_seq;
|
||||
trace("%s[%d] cpumask refreshed to seq %llu", p->comm, p->pid, layer_seq);
|
||||
}
|
||||
|
||||
static s32 pick_idle_cpu_from(const struct cpumask *cand_cpumask, s32 prev_cpu,
|
||||
const struct cpumask *idle_cpumask,
|
||||
const struct cpumask *idle_smtmask)
|
||||
{
|
||||
bool prev_in_cand = bpf_cpumask_test_cpu(prev_cpu, cand_cpumask);
|
||||
s32 cpu;
|
||||
|
||||
/*
|
||||
* If CPU has SMT, any wholly idle CPU is likely a better pick than
|
||||
* partially idle @prev_cpu.
|
||||
*/
|
||||
if (smt_enabled) {
|
||||
if (prev_in_cand &&
|
||||
bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) &&
|
||||
scx_bpf_test_and_clear_cpu_idle(prev_cpu))
|
||||
return prev_cpu;
|
||||
|
||||
cpu = scx_bpf_pick_idle_cpu(cand_cpumask, SCX_PICK_IDLE_CORE);
|
||||
if (cpu >= 0)
|
||||
return cpu;
|
||||
}
|
||||
|
||||
if (prev_in_cand && scx_bpf_test_and_clear_cpu_idle(prev_cpu))
|
||||
return prev_cpu;
|
||||
|
||||
return scx_bpf_pick_idle_cpu(cand_cpumask, 0);
|
||||
}
|
||||
|
||||
s32 BPF_STRUCT_OPS(layered_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
|
||||
{
|
||||
const struct cpumask *idle_cpumask, *idle_smtmask;
|
||||
struct cpumask *layer_cpumask, *layered_cpumask;
|
||||
struct cpu_ctx *cctx;
|
||||
struct task_ctx *tctx;
|
||||
struct layer *layer;
|
||||
s32 cpu;
|
||||
|
||||
/* look up everything we need */
|
||||
if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) ||
|
||||
!(layered_cpumask = (struct cpumask *)tctx->layered_cpumask))
|
||||
return prev_cpu;
|
||||
|
||||
/*
|
||||
* We usually update the layer in layered_runnable() to avoid confusion.
|
||||
* As layered_select_cpu() takes place before runnable, new tasks would
|
||||
* still have -1 layer. Just return @prev_cpu.
|
||||
*/
|
||||
if (tctx->layer < 0)
|
||||
return prev_cpu;
|
||||
|
||||
if (!(layer = lookup_layer(tctx->layer)) ||
|
||||
!(layer_cpumask = lookup_layer_cpumask(tctx->layer)))
|
||||
return prev_cpu;
|
||||
|
||||
if (!(idle_cpumask = scx_bpf_get_idle_cpumask()))
|
||||
return prev_cpu;
|
||||
|
||||
if (!(idle_smtmask = scx_bpf_get_idle_smtmask())) {
|
||||
cpu = prev_cpu;
|
||||
goto out_put_idle_cpumask;
|
||||
}
|
||||
|
||||
/* not much to do if bound to a single CPU */
|
||||
if (p->nr_cpus_allowed == 1) {
|
||||
cpu = prev_cpu;
|
||||
if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
|
||||
if (!bpf_cpumask_test_cpu(cpu, layer_cpumask))
|
||||
lstat_inc(LSTAT_AFFN_VIOL, layer, cctx);
|
||||
goto dispatch_local;
|
||||
} else {
|
||||
goto out_put_cpumasks;
|
||||
}
|
||||
}
|
||||
|
||||
maybe_refresh_layered_cpumask(layered_cpumask, p, tctx, layer_cpumask);
|
||||
|
||||
/*
|
||||
* If CPU has SMT, any wholly idle CPU is likely a better pick than
|
||||
* partially idle @prev_cpu.
|
||||
*/
|
||||
if ((cpu = pick_idle_cpu_from(layered_cpumask, prev_cpu,
|
||||
idle_cpumask, idle_smtmask)) >= 0)
|
||||
goto dispatch_local;
|
||||
|
||||
/*
|
||||
* If the layer is an open one, we can try the whole machine.
|
||||
*/
|
||||
if (layer->open &&
|
||||
((cpu = pick_idle_cpu_from(p->cpus_ptr, prev_cpu,
|
||||
idle_cpumask, idle_smtmask)) >= 0)) {
|
||||
lstat_inc(LSTAT_OPEN_IDLE, layer, cctx);
|
||||
goto dispatch_local;
|
||||
}
|
||||
|
||||
cpu = prev_cpu;
|
||||
goto out_put_cpumasks;
|
||||
|
||||
dispatch_local:
|
||||
tctx->dispatch_local = true;
|
||||
out_put_cpumasks:
|
||||
scx_bpf_put_idle_cpumask(idle_smtmask);
|
||||
out_put_idle_cpumask:
|
||||
scx_bpf_put_idle_cpumask(idle_cpumask);
|
||||
return cpu;
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
|
||||
{
|
||||
struct cpu_ctx *cctx;
|
||||
struct task_ctx *tctx;
|
||||
struct layer *layer;
|
||||
u64 vtime = p->scx.dsq_vtime;
|
||||
u32 idx;
|
||||
|
||||
if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) ||
|
||||
!(layer = lookup_layer(tctx->layer)))
|
||||
return;
|
||||
|
||||
if (tctx->dispatch_local) {
|
||||
tctx->dispatch_local = false;
|
||||
lstat_inc(LSTAT_LOCAL, layer, cctx);
|
||||
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
|
||||
return;
|
||||
}
|
||||
|
||||
lstat_inc(LSTAT_GLOBAL, layer, cctx);
|
||||
|
||||
/*
|
||||
* Limit the amount of budget that an idling task can accumulate
|
||||
* to one slice.
|
||||
*/
|
||||
if (vtime_before(vtime, layer->vtime_now - slice_ns))
|
||||
vtime = layer->vtime_now - slice_ns;
|
||||
|
||||
if (!tctx->all_cpus_allowed) {
|
||||
lstat_inc(LSTAT_AFFN_VIOL, layer, cctx);
|
||||
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, enq_flags);
|
||||
return;
|
||||
}
|
||||
|
||||
scx_bpf_dispatch_vtime(p, tctx->layer, slice_ns, vtime, enq_flags);
|
||||
|
||||
if (!layer->preempt)
|
||||
return;
|
||||
|
||||
bpf_for(idx, 0, nr_possible_cpus) {
|
||||
struct cpu_ctx *cand_cctx;
|
||||
u32 cpu = (preempt_cursor + idx) % nr_possible_cpus;
|
||||
|
||||
if (!all_cpumask ||
|
||||
!bpf_cpumask_test_cpu(cpu, (const struct cpumask *)all_cpumask))
|
||||
continue;
|
||||
if (!(cand_cctx = lookup_cpu_ctx(cpu)) || cand_cctx->current_preempt)
|
||||
continue;
|
||||
|
||||
scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT);
|
||||
|
||||
/*
|
||||
* Round-robining doesn't have to be strict. Let's not bother
|
||||
* with atomic ops on $preempt_cursor.
|
||||
*/
|
||||
preempt_cursor = (cpu + 1) % nr_possible_cpus;
|
||||
|
||||
lstat_inc(LSTAT_PREEMPT, layer, cctx);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
|
||||
{
|
||||
int idx;
|
||||
|
||||
/* consume preempting layers first */
|
||||
bpf_for(idx, 0, nr_layers)
|
||||
if (layers[idx].preempt && scx_bpf_consume(idx))
|
||||
return;
|
||||
|
||||
/* consume !open layers second */
|
||||
bpf_for(idx, 0, nr_layers) {
|
||||
struct layer *layer = &layers[idx];
|
||||
struct cpumask *layer_cpumask;
|
||||
|
||||
if (layer->open)
|
||||
continue;
|
||||
|
||||
/* consume matching layers */
|
||||
if (!(layer_cpumask = lookup_layer_cpumask(idx)))
|
||||
return;
|
||||
|
||||
if (bpf_cpumask_test_cpu(cpu, layer_cpumask) ||
|
||||
(cpu == fallback_cpu && layer->nr_cpus == 0)) {
|
||||
if (scx_bpf_consume(idx))
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* consume !preempting open layers */
|
||||
bpf_for(idx, 0, nr_layers) {
|
||||
if (!layers[idx].preempt && layers[idx].open &&
|
||||
scx_bpf_consume(idx))
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
static bool match_one(struct layer_match *match, struct task_struct *p, const char *cgrp_path)
|
||||
{
|
||||
switch (match->kind) {
|
||||
case MATCH_CGROUP_PREFIX: {
|
||||
return match_prefix(match->cgroup_prefix, cgrp_path, MAX_PATH);
|
||||
}
|
||||
case MATCH_COMM_PREFIX: {
|
||||
char comm[MAX_COMM];
|
||||
memcpy(comm, p->comm, MAX_COMM);
|
||||
return match_prefix(match->comm_prefix, comm, MAX_COMM);
|
||||
}
|
||||
case MATCH_NICE_ABOVE:
|
||||
return (s32)p->static_prio - 120 > match->nice_above_or_below;
|
||||
case MATCH_NICE_BELOW:
|
||||
return (s32)p->static_prio - 120 < match->nice_above_or_below;
|
||||
default:
|
||||
scx_bpf_error("invalid match kind %d", match->kind);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static bool match_layer(struct layer *layer, struct task_struct *p, const char *cgrp_path)
|
||||
{
|
||||
u32 nr_match_ors = layer->nr_match_ors;
|
||||
u64 or_idx, and_idx;
|
||||
|
||||
if (nr_match_ors > MAX_LAYER_MATCH_ORS) {
|
||||
scx_bpf_error("too many ORs");
|
||||
return false;
|
||||
}
|
||||
|
||||
bpf_for(or_idx, 0, nr_match_ors) {
|
||||
struct layer_match_ands *ands;
|
||||
bool matched = true;
|
||||
|
||||
barrier_var(or_idx);
|
||||
if (or_idx >= MAX_LAYER_MATCH_ORS)
|
||||
return false; /* can't happen */
|
||||
ands = &layer->matches[or_idx];
|
||||
|
||||
if (ands->nr_match_ands > NR_LAYER_MATCH_KINDS) {
|
||||
scx_bpf_error("too many ANDs");
|
||||
return false;
|
||||
}
|
||||
|
||||
bpf_for(and_idx, 0, ands->nr_match_ands) {
|
||||
struct layer_match *match;
|
||||
|
||||
barrier_var(and_idx);
|
||||
if (and_idx >= NR_LAYER_MATCH_KINDS)
|
||||
return false; /* can't happen */
|
||||
match = &ands->matches[and_idx];
|
||||
|
||||
if (!match_one(match, p, cgrp_path)) {
|
||||
matched = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (matched)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static void maybe_refresh_layer(struct task_struct *p, struct task_ctx *tctx)
|
||||
{
|
||||
const char *cgrp_path;
|
||||
bool matched = false;
|
||||
u64 idx; // XXX - int makes verifier unhappy
|
||||
|
||||
if (!tctx->refresh_layer)
|
||||
return;
|
||||
tctx->refresh_layer = false;
|
||||
|
||||
if (!(cgrp_path = format_cgrp_path(p->cgroups->dfl_cgrp)))
|
||||
return;
|
||||
|
||||
if (tctx->layer >= 0 && tctx->layer < nr_layers)
|
||||
__sync_fetch_and_add(&layers[tctx->layer].nr_tasks, -1);
|
||||
|
||||
bpf_for(idx, 0, nr_layers) {
|
||||
if (match_layer(&layers[idx], p, cgrp_path)) {
|
||||
matched = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (matched) {
|
||||
struct layer *layer = &layers[idx];
|
||||
|
||||
tctx->layer = idx;
|
||||
tctx->layer_cpus_seq = layer->cpus_seq - 1;
|
||||
__sync_fetch_and_add(&layer->nr_tasks, 1);
|
||||
/*
|
||||
* XXX - To be correct, we'd need to calculate the vtime
|
||||
* delta in the previous layer, scale it by the load
|
||||
* fraction difference and then offset from the new
|
||||
* layer's vtime_now. For now, just do the simple thing
|
||||
* and assume the offset to be zero.
|
||||
*
|
||||
* Revisit if high frequency dynamic layer switching
|
||||
* needs to be supported.
|
||||
*/
|
||||
p->scx.dsq_vtime = layer->vtime_now;
|
||||
} else {
|
||||
scx_bpf_error("[%s]%d didn't match any layer", p->comm, p->pid);
|
||||
}
|
||||
|
||||
if (tctx->layer < nr_layers - 1)
|
||||
trace("LAYER=%d %s[%d] cgrp=\"%s\"",
|
||||
tctx->layer, p->comm, p->pid, cgrp_path);
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(layered_runnable, struct task_struct *p, u64 enq_flags)
|
||||
{
|
||||
u64 now = bpf_ktime_get_ns();
|
||||
struct task_ctx *tctx;
|
||||
|
||||
if (!(tctx = lookup_task_ctx(p)))
|
||||
return;
|
||||
|
||||
maybe_refresh_layer(p, tctx);
|
||||
|
||||
adj_load(tctx->layer, p->scx.weight, now);
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(layered_running, struct task_struct *p)
|
||||
{
|
||||
struct cpu_ctx *cctx;
|
||||
struct task_ctx *tctx;
|
||||
struct layer *layer;
|
||||
|
||||
if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) ||
|
||||
!(layer = lookup_layer(tctx->layer)))
|
||||
return;
|
||||
|
||||
if (vtime_before(layer->vtime_now, p->scx.dsq_vtime))
|
||||
layer->vtime_now = p->scx.dsq_vtime;
|
||||
|
||||
cctx->current_preempt = layer->preempt;
|
||||
tctx->started_running_at = bpf_ktime_get_ns();
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
|
||||
{
|
||||
struct cpu_ctx *cctx;
|
||||
struct task_ctx *tctx;
|
||||
u64 used;
|
||||
u32 layer;
|
||||
|
||||
if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)))
|
||||
return;
|
||||
|
||||
layer = tctx->layer;
|
||||
if (layer >= nr_layers) {
|
||||
scx_bpf_error("invalid layer %u", layer);
|
||||
return;
|
||||
}
|
||||
|
||||
used = bpf_ktime_get_ns() - tctx->started_running_at;
|
||||
cctx->layer_cycles[layer] += used;
|
||||
cctx->current_preempt = false;
|
||||
|
||||
/* scale the execution time by the inverse of the weight and charge */
|
||||
p->scx.dsq_vtime += used * 100 / p->scx.weight;
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(layered_quiescent, struct task_struct *p, u64 deq_flags)
|
||||
{
|
||||
struct task_ctx *tctx;
|
||||
|
||||
if ((tctx = lookup_task_ctx(p)))
|
||||
adj_load(tctx->layer, -(s64)p->scx.weight, bpf_ktime_get_ns());
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(layered_set_weight, struct task_struct *p, u32 weight)
|
||||
{
|
||||
struct task_ctx *tctx;
|
||||
|
||||
if ((tctx = lookup_task_ctx(p)))
|
||||
tctx->refresh_layer = true;
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(layered_set_cpumask, struct task_struct *p,
|
||||
const struct cpumask *cpumask)
|
||||
{
|
||||
struct task_ctx *tctx;
|
||||
|
||||
if (!(tctx = lookup_task_ctx(p)))
|
||||
return;
|
||||
|
||||
if (!all_cpumask) {
|
||||
scx_bpf_error("NULL all_cpumask");
|
||||
return;
|
||||
}
|
||||
|
||||
tctx->all_cpus_allowed =
|
||||
bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask);
|
||||
}
|
||||
|
||||
s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p,
|
||||
struct scx_enable_args *args)
|
||||
{
|
||||
struct task_ctx tctx_init = {
|
||||
.pid = p->pid,
|
||||
.layer = -1,
|
||||
.refresh_layer = true,
|
||||
};
|
||||
struct task_ctx *tctx;
|
||||
struct bpf_cpumask *cpumask;
|
||||
s32 pid = p->pid;
|
||||
s32 ret;
|
||||
|
||||
if (all_cpumask)
|
||||
tctx_init.all_cpus_allowed =
|
||||
bpf_cpumask_subset((const struct cpumask *)all_cpumask, p->cpus_ptr);
|
||||
else
|
||||
scx_bpf_error("missing all_cpumask");
|
||||
|
||||
/*
|
||||
* XXX - We want BPF_NOEXIST but bpf_map_delete_elem() in .disable() may
|
||||
* fail spuriously due to BPF recursion protection triggering
|
||||
* unnecessarily.
|
||||
*/
|
||||
if ((ret = bpf_map_update_elem(&task_ctxs, &pid, &tctx_init, 0 /*BPF_NOEXIST*/))) {
|
||||
scx_bpf_error("task_ctx allocation failure, ret=%d", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Read the entry from the map immediately so we can add the cpumask
|
||||
* with bpf_kptr_xchg().
|
||||
*/
|
||||
if (!(tctx = lookup_task_ctx(p)))
|
||||
return -ENOENT;
|
||||
|
||||
cpumask = bpf_cpumask_create();
|
||||
if (!cpumask) {
|
||||
bpf_map_delete_elem(&task_ctxs, &pid);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
cpumask = bpf_kptr_xchg(&tctx->layered_cpumask, cpumask);
|
||||
if (cpumask) {
|
||||
/* Should never happen as we just inserted it above. */
|
||||
bpf_cpumask_release(cpumask);
|
||||
bpf_map_delete_elem(&task_ctxs, &pid);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* We are matching cgroup hierarchy path directly rather than the CPU
|
||||
* controller path. As the former isn't available during the scheduler
|
||||
* fork path, let's delay the layer selection until the first
|
||||
* runnable().
|
||||
*/
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(layered_cancel_enable, struct task_struct *p)
|
||||
{
|
||||
s32 pid = p->pid;
|
||||
|
||||
bpf_map_delete_elem(&task_ctxs, &pid);
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(layered_disable, struct task_struct *p)
|
||||
{
|
||||
struct cpu_ctx *cctx;
|
||||
struct task_ctx *tctx;
|
||||
s32 pid = p->pid;
|
||||
int ret;
|
||||
|
||||
if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)))
|
||||
return;
|
||||
|
||||
if (tctx->layer >= 0 && tctx->layer < nr_layers)
|
||||
__sync_fetch_and_add(&layers[tctx->layer].nr_tasks, -1);
|
||||
|
||||
/*
|
||||
* XXX - There's no reason delete should fail here but BPF's recursion
|
||||
* protection can unnecessarily fail the operation. The fact that
|
||||
* deletions aren't reliable means that we sometimes leak task_ctx and
|
||||
* can't use BPF_NOEXIST on allocation in .prep_enable().
|
||||
*/
|
||||
ret = bpf_map_delete_elem(&task_ctxs, &pid);
|
||||
if (ret)
|
||||
gstat_inc(GSTAT_TASK_CTX_FREE_FAILED, cctx);
|
||||
}
|
||||
|
||||
s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
|
||||
{
|
||||
struct bpf_cpumask *cpumask;
|
||||
int i, j, k, nr_online_cpus, ret;
|
||||
|
||||
scx_bpf_switch_all();
|
||||
|
||||
cpumask = bpf_cpumask_create();
|
||||
if (!cpumask)
|
||||
return -ENOMEM;
|
||||
|
||||
nr_online_cpus = 0;
|
||||
bpf_for(i, 0, nr_possible_cpus) {
|
||||
const volatile u8 *u8_ptr;
|
||||
|
||||
if ((u8_ptr = MEMBER_VPTR(all_cpus, [i / 8]))) {
|
||||
if (*u8_ptr & (1 << (i % 8))) {
|
||||
bpf_cpumask_set_cpu(i, cpumask);
|
||||
nr_online_cpus++;
|
||||
}
|
||||
} else {
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
cpumask = bpf_kptr_xchg(&all_cpumask, cpumask);
|
||||
if (cpumask)
|
||||
bpf_cpumask_release(cpumask);
|
||||
|
||||
dbg("CFG: Dumping configuration, nr_online_cpus=%d smt_enabled=%d",
|
||||
nr_online_cpus, smt_enabled);
|
||||
|
||||
bpf_for(i, 0, nr_layers) {
|
||||
struct layer *layer = &layers[i];
|
||||
|
||||
dbg("CFG LAYER[%d] open=%d preempt=%d",
|
||||
i, layer->open, layer->preempt);
|
||||
|
||||
if (layer->nr_match_ors > MAX_LAYER_MATCH_ORS) {
|
||||
scx_bpf_error("too many ORs");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
bpf_for(j, 0, layer->nr_match_ors) {
|
||||
struct layer_match_ands *ands = MEMBER_VPTR(layers, [i].matches[j]);
|
||||
if (!ands) {
|
||||
scx_bpf_error("shouldn't happen");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (ands->nr_match_ands > NR_LAYER_MATCH_KINDS) {
|
||||
scx_bpf_error("too many ANDs");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
dbg("CFG OR[%02d]", j);
|
||||
|
||||
bpf_for(k, 0, ands->nr_match_ands) {
|
||||
char header[32];
|
||||
u64 header_data[1] = { k };
|
||||
struct layer_match *match;
|
||||
|
||||
bpf_snprintf(header, sizeof(header), "CFG AND[%02d]:",
|
||||
header_data, sizeof(header_data));
|
||||
|
||||
match = MEMBER_VPTR(layers, [i].matches[j].matches[k]);
|
||||
if (!match) {
|
||||
scx_bpf_error("shouldn't happen");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
switch (match->kind) {
|
||||
case MATCH_CGROUP_PREFIX:
|
||||
dbg("%s CGROUP_PREFIX \"%s\"", header, match->cgroup_prefix);
|
||||
break;
|
||||
case MATCH_COMM_PREFIX:
|
||||
dbg("%s COMM_PREFIX \"%s\"", header, match->comm_prefix);
|
||||
break;
|
||||
case MATCH_NICE_ABOVE:
|
||||
dbg("%s NICE_ABOVE %d", header, match->nice_above_or_below);
|
||||
break;
|
||||
case MATCH_NICE_BELOW:
|
||||
dbg("%s NICE_BELOW %d", header, match->nice_above_or_below);
|
||||
break;
|
||||
default:
|
||||
scx_bpf_error("%s Invalid kind", header);
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
if (ands->nr_match_ands == 0)
|
||||
dbg("CFG DEFAULT");
|
||||
}
|
||||
}
|
||||
|
||||
bpf_for(i, 0, nr_layers) {
|
||||
struct layer_cpumask_wrapper *cpumaskw;
|
||||
|
||||
layers[i].idx = i;
|
||||
|
||||
ret = scx_bpf_create_dsq(i, -1);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (!(cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &i)))
|
||||
return -ENONET;
|
||||
|
||||
cpumask = bpf_cpumask_create();
|
||||
if (!cpumask)
|
||||
return -ENOMEM;
|
||||
|
||||
/*
|
||||
* Start all layers with full cpumask so that everything runs
|
||||
* everywhere. This will soon be updated by refresh_cpumasks()
|
||||
* once the scheduler starts running.
|
||||
*/
|
||||
bpf_cpumask_setall(cpumask);
|
||||
|
||||
cpumask = bpf_kptr_xchg(&cpumaskw->cpumask, cpumask);
|
||||
if (cpumask)
|
||||
bpf_cpumask_release(cpumask);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(layered_exit, struct scx_exit_info *ei)
|
||||
{
|
||||
uei_record(&uei, ei);
|
||||
}
|
||||
|
||||
SEC(".struct_ops.link")
|
||||
struct sched_ext_ops layered = {
|
||||
.select_cpu = (void *)layered_select_cpu,
|
||||
.enqueue = (void *)layered_enqueue,
|
||||
.dispatch = (void *)layered_dispatch,
|
||||
.runnable = (void *)layered_runnable,
|
||||
.running = (void *)layered_running,
|
||||
.stopping = (void *)layered_stopping,
|
||||
.quiescent = (void *)layered_quiescent,
|
||||
.set_weight = (void *)layered_set_weight,
|
||||
.set_cpumask = (void *)layered_set_cpumask,
|
||||
.prep_enable = (void *)layered_prep_enable,
|
||||
.cancel_enable = (void *)layered_cancel_enable,
|
||||
.disable = (void *)layered_disable,
|
||||
.init = (void *)layered_init,
|
||||
.exit = (void *)layered_exit,
|
||||
.name = "layered",
|
||||
};
|
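
Note: the vtime comparisons in main.bpf.c (vtime_before()) rely on signed wrap-around arithmetic, and the stopping path charges `p->scx.dsq_vtime += used * 100 / p->scx.weight`, so heavier-weighted tasks accrue vtime more slowly (100 appears to be the baseline weight). A small standalone sketch of both, with made-up numbers, not taken from the commit:

#include <stdio.h>
#include <stdint.h>

/* Same comparison as vtime_before() in main.bpf.c: the signed difference
 * stays correct across u64 wrap-around. */
static int vtime_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t near_max = UINT64_MAX - 5, wrapped = 10;

	/* near_max is "before" wrapped even though it is numerically larger. */
	printf("before=%d\n", vtime_before(near_max, wrapped));

	/* Charging 3ms of CPU time at weight 200 advances vtime by 1.5ms:
	 * used * 100 / weight, i.e. the inverse-weight scaling above. */
	uint64_t used = 3000000, weight = 200;
	printf("vtime delta=%llu ns\n",
	       (unsigned long long)(used * 100 / weight));
	return 0;
}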

scheds/rust-user/scx_layered/src/bpf/util.bpf.c (new file, 68 lines)
@@ -0,0 +1,68 @@
/* to be included in the main bpf.c file */

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(u32));
	/* double size because verifier can't follow length calculation */
	__uint(value_size, 2 * MAX_PATH);
	__uint(max_entries, 1);
} cgrp_path_bufs SEC(".maps");

static char *format_cgrp_path(struct cgroup *cgrp)
{
	u32 zero = 0;
	char *path = bpf_map_lookup_elem(&cgrp_path_bufs, &zero);
	u32 len = 0, level, max_level;

	if (!path) {
		scx_bpf_error("cgrp_path_buf lookup failed");
		return NULL;
	}

	max_level = cgrp->level;
	if (max_level > 127)
		max_level = 127;

	bpf_for(level, 1, max_level + 1) {
		int ret;

		if (level > 1 && len < MAX_PATH - 1)
			path[len++] = '/';

		if (len >= MAX_PATH - 1) {
			scx_bpf_error("cgrp_path_buf overflow");
			return NULL;
		}

		ret = bpf_probe_read_kernel_str(path + len, MAX_PATH - len - 1,
						BPF_CORE_READ(cgrp, ancestors[level], kn, name));
		if (ret < 0) {
			scx_bpf_error("bpf_probe_read_kernel_str failed");
			return NULL;
		}

		len += ret - 1;
	}

	if (len >= MAX_PATH - 2) {
		scx_bpf_error("cgrp_path_buf overflow");
		return NULL;
	}
	path[len] = '/';
	path[len + 1] = '\0';

	return path;
}

static inline bool match_prefix(const char *prefix, const char *str, u32 max_len)
{
	int c;

	bpf_for(c, 0, max_len) {
		if (prefix[c] == '\0')
			return true;
		if (str[c] != prefix[c])
			return false;
	}
	return false;
}
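
Note: match_prefix() above returns true only when the whole prefix is consumed within max_len, and format_cgrp_path() always appends a trailing '/', so a configured cgroup prefix ending in '/' anchors a full path component. A standalone userspace sketch with hypothetical prefixes (the loop mirrors the helper above, with bpf_for() replaced by a plain for loop so it compiles outside BPF):

#include <stdbool.h>
#include <stdio.h>

/* Userspace copy of match_prefix() above, bpf_for() replaced by for(). */
static bool match_prefix(const char *prefix, const char *str, unsigned int max_len)
{
	unsigned int c;

	for (c = 0; c < max_len; c++) {
		if (prefix[c] == '\0')
			return true;
		if (str[c] != prefix[c])
			return false;
	}
	return false;
}

int main(void)
{
	/* format_cgrp_path() produces paths like "workload.slice/app/"
	 * (hypothetical cgroup names, for illustration only). */
	const char *path = "workload.slice/app/";

	printf("%d %d\n",
	       match_prefix("workload.slice/", path, 4096),	/* 1 */
	       match_prefix("system.slice/", path, 4096));	/* 0 */
	return 0;
}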

scheds/rust-user/scx_layered/src/bpf_intf.rs (new file, 10 lines)
@@ -0,0 +1,10 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
#![allow(dead_code)]

include!(concat!(env!("OUT_DIR"), "/bpf_intf.rs"));

scheds/rust-user/scx_layered/src/bpf_skel.rs (new file, 12 lines)
@@ -0,0 +1,12 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.

// We can't directly include the generated skeleton in main.rs as it may
// contain compiler attributes that can't be `include!()`ed via macro and we
// can't use the `#[path = "..."]` because `concat!(env!("OUT_DIR"),
// "/bpf.skel.rs")` does not work inside the path attribute yet (see
// https://github.com/rust-lang/rust/pull/83366).

include!(concat!(env!("OUT_DIR"), "/bpf_skel.rs"));

scheds/rust-user/scx_layered/src/main.rs (new file, 1639 lines)
(file diff suppressed because it is too large)

scheds/rust-user/scx_rusty/.gitignore (new file, vendored, 3 lines)
@@ -0,0 +1,3 @@
src/bpf/.output
Cargo.lock
target

scheds/rust-user/scx_rusty/Cargo.toml (new file, 27 lines)
@@ -0,0 +1,27 @@
[package]
name = "scx_rusty"
version = "0.5.0"
authors = ["Dan Schatzberg <dschatzberg@meta.com>", "Meta"]
edition = "2021"
description = "Userspace scheduling with BPF"
license = "GPL-2.0-only"

[dependencies]
anyhow = "1.0.65"
bitvec = { version = "1.0", features = ["serde"] }
clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] }
ctrlc = { version = "3.1", features = ["termination"] }
fb_procfs = "0.7.0"
hex = "0.4.3"
libbpf-rs = "0.21.0"
libc = "0.2.137"
log = "0.4.17"
ordered-float = "3.4.0"
scx_utils = { path = "../../../rust/scx_utils", version = "0.2" }
simplelog = "0.12.0"

[build-dependencies]
scx_utils = { path = "../../../rust/scx_utils", version = "0.2" }

[features]
enable_backtrace = []

scheds/rust-user/scx_rusty/build.rs (new file, 13 lines)
@@ -0,0 +1,13 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.

fn main() {
    scx_utils::BpfBuilder::new()
        .unwrap()
        .enable_intf("src/bpf/intf.h", "bpf_intf.rs")
        .enable_skel("src/bpf/main.bpf.c", "bpf")
        .build()
        .unwrap();
}

scheds/rust-user/scx_rusty/rustfmt.toml (new file, 8 lines)
@@ -0,0 +1,8 @@
# Get help on options with `rustfmt --help=config`
# Please keep these in alphabetical order.
edition = "2021"
group_imports = "StdExternalCrate"
imports_granularity = "Item"
merge_derives = false
use_field_init_shorthand = true
version = "Two"

scheds/rust-user/scx_rusty/src/bpf/intf.h (new file, 97 lines)
@@ -0,0 +1,97 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
#ifndef __INTF_H
#define __INTF_H

#include <stdbool.h>
#ifndef __kptr
#ifdef __KERNEL__
#error "__kptr_ref not defined in the kernel"
#endif
#define __kptr
#endif

#ifndef __KERNEL__
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
#endif

#include "ravg.bpf.h"

enum consts {
	MAX_CPUS = 512,
	MAX_DOMS = 64, /* limited to avoid complex bitmask ops */
	CACHELINE_SIZE = 64,

	/*
	 * When userspace load balancer is trying to determine the tasks to push
	 * out from an overloaded domain, it looks at the following number
	 * of recently active tasks of the domain. While this may lead to
	 * spurious migration victim selection failures in pathological cases,
	 * this isn't a practical problem as the LB rounds are best-effort
	 * anyway and will be retried until loads are balanced.
	 */
	MAX_DOM_ACTIVE_PIDS = 1024,
};

/* Statistics */
enum stat_idx {
	/* The following fields add up to all dispatched tasks */
	RUSTY_STAT_WAKE_SYNC,
	RUSTY_STAT_PREV_IDLE,
	RUSTY_STAT_GREEDY_IDLE,
	RUSTY_STAT_PINNED,
	RUSTY_STAT_DIRECT_DISPATCH,
	RUSTY_STAT_DIRECT_GREEDY,
	RUSTY_STAT_DIRECT_GREEDY_FAR,
	RUSTY_STAT_DSQ_DISPATCH,
	RUSTY_STAT_GREEDY,

	/* Extra stats that don't contribute to total */
	RUSTY_STAT_REPATRIATE,
	RUSTY_STAT_KICK_GREEDY,
	RUSTY_STAT_LOAD_BALANCE,

	/* Errors */
	RUSTY_STAT_TASK_GET_ERR,

	RUSTY_NR_STATS,
};

struct task_ctx {
	/* The domains this task can run on */
	u64 dom_mask;

	struct bpf_cpumask __kptr *cpumask;
	u32 dom_id;
	u32 weight;
	bool runnable;
	u64 dom_active_pids_gen;
	u64 running_at;

	/* The task is a workqueue worker thread */
	bool is_kworker;

	/* Allowed on all CPUs and eligible for DIRECT_GREEDY optimization */
	bool all_cpus;

	/* select_cpu() telling enqueue() to queue directly on the DSQ */
	bool dispatch_local;

	struct ravg_data dcyc_rd;
};

struct dom_ctx {
	u64 vtime_now;
	struct bpf_cpumask __kptr *cpumask;
	struct bpf_cpumask __kptr *direct_greedy_cpumask;

	u64 load;
	struct ravg_data load_rd;
	u64 dbg_load_printed_at;
};

#endif /* __INTF_H */
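
Note: dom_mask above is a plain u64 bitmap, one bit per domain, which is why MAX_DOMS is capped at 64 ("limited to avoid complex bitmask ops"). A minimal sketch of how a domain id would be tested against it (illustrative only; the helper name is made up, not from scx_rusty):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_DOMS 64	/* limited to avoid complex bitmask ops */

/* Hypothetical helper: does the task's dom_mask allow domain dom_id? */
static bool dom_allowed(uint64_t dom_mask, uint32_t dom_id)
{
	if (dom_id >= MAX_DOMS)
		return false;
	return dom_mask & (1LLU << dom_id);
}

int main(void)
{
	uint64_t dom_mask = (1LLU << 0) | (1LLU << 3);	/* domains 0 and 3 */

	printf("dom0=%d dom1=%d dom3=%d\n",
	       dom_allowed(dom_mask, 0), dom_allowed(dom_mask, 1),
	       dom_allowed(dom_mask, 3));
	return 0;
}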

scheds/rust-user/scx_rusty/src/bpf/main.bpf.c (new file, 1153 lines)
(file diff suppressed because it is too large)

scheds/rust-user/scx_rusty/src/bpf_intf.rs (new file, 10 lines)
@@ -0,0 +1,10 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
#![allow(dead_code)]

include!(concat!(env!("OUT_DIR"), "/bpf_intf.rs"));

scheds/rust-user/scx_rusty/src/bpf_skel.rs (new file, 12 lines)
@@ -0,0 +1,12 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.

// We can't directly include the generated skeleton in main.rs as it may
// contain compiler attributes that can't be `include!()`ed via macro and we
// can't use the `#[path = "..."]` because `concat!(env!("OUT_DIR"),
// "/bpf.skel.rs")` does not work inside the path attribute yet (see
// https://github.com/rust-lang/rust/pull/83366).

include!(concat!(env!("OUT_DIR"), "/bpf_skel.rs"));

scheds/rust-user/scx_rusty/src/main.rs (new file, 1262 lines)
(file diff suppressed because it is too large)