Merge branch 'main' into core_enums

Emil Tsalapatis 2024-11-06 11:07:42 -05:00 committed by GitHub
commit 479d515a45
8 changed files with 36 additions and 364 deletions

View File

@@ -2427,7 +2427,7 @@ struct sched_statistics {
};
struct cpumask {
unsigned long bits[4];
unsigned long bits[128];
};
typedef struct cpumask cpumask_t;
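The dummy cpumask above is widened from 4 to 128 unsigned longs. Assuming 64-bit longs, that raises the number of representable CPUs from 256 to 8192. A minimal, hypothetical compile-time check of that sizing (not part of the commit):

/* Hypothetical sanity check -- not part of this commit. */
struct cpumask_sketch {
	unsigned long bits[128];
};

_Static_assert(sizeof(unsigned long) == 8, "sketch assumes 64-bit longs");
_Static_assert(sizeof(struct cpumask_sketch) * 8 == 8192,
	       "128 x 64-bit words cover up to 8192 CPUs");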

View File

@@ -8,52 +8,6 @@
* To be included into main.bpf.c
*/
#ifndef LAVD_DISABLE_TRACE_LOCKS
#define LAVD_TRACE_SEM
#define LAVD_TRACE_MUTEX
#define LAVD_TRACE_WW_MUTEX
#define LAVD_TRACE_RW_SEM
#define LAVD_TRACE_PERCPU_RW_SEM
#define LAVD_TRACE_FUTEX
#endif /* LAVD_DISABLE_TRACE_LOCKS */
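These LAVD_TRACE_* switches are purely compile-time. A build that wants to drop all of the lock-tracing fexit probes below would define the guard before this file is pulled into main.bpf.c; a hypothetical sketch (the include name is a placeholder):

/* Hypothetical usage sketch -- compiles out every LAVD_TRACE_* probe below. */
#define LAVD_DISABLE_TRACE_LOCKS
#include "lock_trace.bpf.c"	/* placeholder name for this included file */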
static void try_inc_lock_boost(struct task_ctx *taskc, struct cpu_ctx *cpuc)
{
if (taskc && cpuc) {
taskc->lock_boost++;
cpuc->lock_holder = is_lock_holder(taskc);
}
/*
* If taskc is null, the task is not under sched_ext so ignore the error.
*/
}
static void try_dec_lock_boost(struct task_ctx *taskc, struct cpu_ctx *cpuc)
{
if (taskc && cpuc && taskc->lock_boost > 0) {
taskc->lock_boost--;
cpuc->lock_holder = is_lock_holder(taskc);
}
/*
* If taskc is null, the task is not under sched_ext so ignore the error.
*/
}
static void inc_lock_boost(void)
{
struct task_ctx *taskc = try_get_current_task_ctx();
struct cpu_ctx *cpuc = get_cpu_ctx();
try_inc_lock_boost(taskc, cpuc);
}
static void dec_lock_boost(void)
{
struct task_ctx *taskc = try_get_current_task_ctx();
struct cpu_ctx *cpuc = get_cpu_ctx();
try_dec_lock_boost(taskc, cpuc);
}
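The helpers above rely on is_lock_holder(), which is defined elsewhere in scx_lavd and not shown in this hunk. A rough, hypothetical sketch of the idea (field names beyond lock_boost are assumptions):

/* Hypothetical sketch -- the real helper lives elsewhere in scx_lavd. */
static bool is_lock_holder(struct task_ctx *taskc)
{
	/* futex_boost is an assumed field mirroring lock_boost */
	return taskc->lock_boost > 0 || taskc->futex_boost > 0;
}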
static void try_inc_futex_boost(struct task_ctx *taskc, struct cpu_ctx *cpuc, u32 *uaddr)
{
if (taskc && cpuc && (taskc->futex_uaddr != uaddr)) {
@@ -105,286 +59,6 @@ static void reset_lock_futex_boost(struct task_ctx *taskc, struct cpu_ctx *cpuc)
cpuc->lock_holder = false;
}
/**
* semaphore in kernel (kernel/locking/semaphore.c)
* - void __sched down(struct semaphore *sem)
* - int __sched down_interruptible(struct semaphore *sem)
* - int __sched down_killable(struct semaphore *sem)
* - int __sched down_trylock(struct semaphore *sem)
* - int __sched down_timeout(struct semaphore *sem, long timeout)
* - void __sched up(struct semaphore *sem)
*/
#ifdef LAVD_TRACE_SEM
struct semaphore;
SEC("fexit/down")
int BPF_PROG(fexit_down, struct semaphore *sem)
{
/*
* A semaphore is successfully acquired.
*/
inc_lock_boost();
return 0;
}
SEC("fexit/down_interruptible")
int BPF_PROG(fexit_down_interruptible, struct semaphore *sem, int ret)
{
if (ret == 0) {
/*
* A semaphore is successfully acquired.
*/
inc_lock_boost();
}
return 0;
}
SEC("fexit/down_killable")
int BPF_PROG(fexit_down_killable, struct semaphore *sem, int ret)
{
if (ret == 0) {
/*
* A semaphore is successfully acquired.
*/
inc_lock_boost();
}
return 0;
}
SEC("fexit/down_trylock")
int BPF_PROG(fexit_down_trylock, struct semaphore *sem, int ret)
{
if (ret == 0) {
/*
* A semaphore is successfully acquired.
*/
inc_lock_boost();
}
return 0;
}
SEC("fexit/up")
int BPF_PROG(fexit_up, struct semaphore *sem)
{
/*
* A semaphore is successfully released.
*/
dec_lock_boost();
return 0;
}
#endif /* LAVD_TRACE_SEM */
/**
* mutex in kernel (kernel/locking/mutex.c)
* - void __sched mutex_lock(struct mutex *lock)
* - int __sched mutex_lock_interruptible(struct mutex *lock)
* - int __sched mutex_lock_killable(struct mutex *lock)
* => They all call `__mutex_lock()` in the slow path. However, tracing only
* the slow path is not accurate since, in the unlock path, there is no way
* to distinguish whether a lock was taken via the fast path or the slow path.
* While this is inaccurate, let's live with it for now.
* int __sched __mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip)
*
* - int __sched mutex_trylock(struct mutex *lock)
* - void __sched mutex_unlock(struct mutex *lock)
*
* - int __sched ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
* - int __sched ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
* - int ww_mutex_trylock(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
* - void __sched ww_mutex_unlock(struct ww_mutex *lock)
*/
#ifdef LAVD_TRACE_MUTEX
struct mutex;
struct lockdep_map;
SEC("fexit/__mutex_lock")
int BPF_PROG(fexit__mutex_lock, struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip, int ret)
{
if (ret == 0) {
/*
* A mutex is successfully acquired.
*/
inc_lock_boost();
}
return 0;
}
SEC("fexit/mutex_trylock")
int BPF_PROG(fexit_mutex_trylock, struct mutex *mutex, int ret)
{
if (ret == 1) {
/*
* A mutex is successfully acquired.
*/
inc_lock_boost();
}
return 0;
}
SEC("fexit/mutex_unlock")
int BPF_PROG(fexit_mutex_unlock, struct mutex *mutex)
{
/*
* A mutex is successfully released.
*/
dec_lock_boost();
return 0;
}
#endif /* LAVD_TRACE_MUTEX */
#ifdef LAVD_TRACE_WW_MUTEX
struct ww_mutex;
struct ww_acquire_ctx;
SEC("fexit/ww_mutex_lock")
int BPF_PROG(fexit_ww_mutex_lock, struct ww_mutex *lock, struct ww_acquire_ctx *x, int ret)
{
if (ret == 0) {
/*
* A ww_mutex is successfully acquired.
*/
inc_lock_boost();
}
return 0;
}
SEC("fexit/ww_mutex_lock_interruptible")
int BPF_PROG(fexit_ww_mutex_lock_interruptible, struct ww_mutex *lock, struct ww_acquire_ctx *x, int ret)
{
if (ret == 0) {
/*
* A ww_mutex is successfully acquired.
*/
inc_lock_boost();
}
return 0;
}
SEC("fexit/ww_mutex_trylock")
int BPF_PROG(fexit_ww_mutex_trylock, struct ww_mutex *lock, struct ww_acquire_ctx *x, int ret)
{
if (ret == 1) {
/*
* A ww_mutex is successfully acquired.
*/
inc_lock_boost();
}
return 0;
}
SEC("fexit/ww_mutex_unlock")
int BPF_PROG(fexit_ww_mutex_unlock, struct ww_mutex *lock)
{
/*
* A ww_mutex is successfully released.
*/
dec_lock_boost();
return 0;
}
#endif /* LAVD_TRACE_WW_MUTEX */
/**
* Reader-writer semaphore in kernel (kernel/locking/rwsem.c)
* The kernel rwsem prioritizes readers, so here we prioritize writers only.
* - void __sched down_write(struct rw_semaphore *sem)
* - int __sched down_write_killable(struct rw_semaphore *sem)
* - int down_write_trylock(struct rw_semaphore *sem)
* - void up_write(struct rw_semaphore *sem)
* - void downgrade_write(struct rw_semaphore *sem)
*/
#ifdef LAVD_TRACE_RW_SEM
struct rw_semaphore;
SEC("fexit/down_write")
int BPF_PROG(fexit_down_write, struct rw_semaphore *sem)
{
/*
* An rw_semaphore is successfully acquired.
*/
inc_lock_boost();
return 0;
}
SEC("fexit/down_write_killable")
int BPF_PROG(fexit_down_write_killable, struct rw_semaphore *sem, int ret)
{
if (ret == 0) {
/*
* An rw_semaphore is successfully acquired.
*/
inc_lock_boost();
}
return 0;
}
SEC("fexit/down_write_trylock")
int BPF_PROG(fexit_down_write_trylock, struct rw_semaphore *sem, int ret)
{
if (ret == 1) {
/*
* An rw_semaphore is successfully acquired.
*/
inc_lock_boost();
}
return 0;
}
SEC("fexit/up_write")
int BPF_PROG(fexit_up_write, struct rw_semaphore *sem)
{
/*
* An rw_semaphore is successfully released.
*/
dec_lock_boost();
return 0;
}
SEC("fexit/downgrade_write")
int BPF_PROG(fexit_downgrade_write, struct rw_semaphore *sem)
{
/*
* An rw_semaphore is successfully downgraded to a read lock.
*/
dec_lock_boost();
return 0;
}
#endif /* LAVD_TRACE_RW_SEM */
/**
* Per-CPU reader-writer semaphore in kernel (kernel/locking/percpu-rwsem.c)
* The kernel rwsem prioritizes readers, so here we prioritize writers only.
* - void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
* - void percpu_up_write(struct percpu_rw_semaphore *sem)
*/
#ifdef LAVD_TRACE_PERCPU_RW_SEM
struct percpu_rw_semaphore;
SEC("fexit/percpu_down_write")
int BPF_PROG(fexit_percpu_down_write, struct percpu_rw_semaphore *sem)
{
/*
* A percpu_rw_semaphore is successfully acquired.
*/
inc_lock_boost();
return 0;
}
SEC("fexit/percpu_up_write")
int BPF_PROG(fexit_percpu_up_write, struct percpu_rw_semaphore *sem)
{
/*
* A percpu_rw_semaphore is successfully released.
*/
dec_lock_boost();
return 0;
}
#endif /* LAVD_TRACE_PERCPU_RW_SEM */
/**
* Futex for userspace synchronization primitives (kernel/futex/)
*

View File

@@ -707,6 +707,7 @@ static s32 pick_idle_cpu(struct task_struct *p, struct task_ctx *taskc,
struct bpf_cpumask *active, *ovrflw, *big, *little;
struct bpf_cpumask *cpdom_mask_prev, *cpdom_mask_waker;
s32 cpu_id, waker_cpu;
int cpdom_id;
/*
* If a task can run only on a single CPU (e.g., per-CPU kworker), we
@@ -747,7 +748,8 @@ static s32 pick_idle_cpu(struct task_struct *p, struct task_ctx *taskc,
goto unlock_out;
}
cpdom_mask_prev = MEMBER_VPTR(cpdom_cpumask, [cpuc_prev->cpdom_id]);
cpdom_id = cpuc_prev->cpdom_id;
cpdom_mask_prev = MEMBER_VPTR(cpdom_cpumask, [cpdom_id]);
if (!cpdom_mask_prev) {
scx_bpf_error("Failed to lookup cpdom_cpumask for %d",
cpuc_prev->cpdom_id);
@@ -763,7 +765,8 @@ static s32 pick_idle_cpu(struct task_struct *p, struct task_ctx *taskc,
goto unlock_out;
}
cpdom_mask_waker = MEMBER_VPTR(cpdom_cpumask, [cpuc_waker->cpdom_id]);
cpdom_id = cpuc_waker->cpdom_id;
cpdom_mask_waker = MEMBER_VPTR(cpdom_cpumask, [cpdom_id]);
if (!cpdom_mask_waker) {
scx_bpf_error("Failed to lookup cpdom_cpumask for %d",
cpuc_waker->cpdom_id);
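Both hunks above copy cpdom_id into a local before handing it to MEMBER_VPTR, presumably so the array index is a value the BPF verifier can bound rather than a struct field re-read at the use site. A generic, hypothetical illustration of the pattern (names and the bound are made up):

/* Hypothetical illustration -- not part of this commit. */
#define NR_CPDOMS 64			/* made-up bound for the sketch */

struct cpu_ctx_sketch { int cpdom_id; };

static int *lookup_cpdom_entry(struct cpu_ctx_sketch *cpuc, int table[NR_CPDOMS])
{
	int id = cpuc->cpdom_id;	/* read the field once into a local */

	if (id < 0 || id >= NR_CPDOMS)	/* bound the local, not the field */
		return 0;
	return &table[id];		/* index with the bounded local */
}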

View File

@@ -188,7 +188,6 @@ struct layer {
u32 weight;
int kind;
bool open;
bool preempt;
bool preempt_first;
bool exclusive;
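With the open flag gone from struct layer, the BPF-side hunks that follow test layer->kind instead. A hypothetical sketch of that enum (the authoritative definition lives in scx_layered's intf.h, and the exact names/order may differ):

/* Hypothetical sketch -- see scx_layered's intf.h for the real definition. */
enum layer_kind {
	LAYER_KIND_OPEN,	/* may run on any CPU */
	LAYER_KIND_GROUPED,	/* has its own cpumask but may spill over */
	LAYER_KIND_CONFINED,	/* strictly limited to the layer's cpumask */
};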

View File

@@ -532,7 +532,7 @@ bool should_try_preempt_first(s32 cand, struct layer *layer,
if (!layer->preempt || !layer->preempt_first)
return false;
if (!layer->open && !bpf_cpumask_test_cpu(cand, layered_cpumask))
if (layer->kind == LAYER_KIND_CONFINED && !bpf_cpumask_test_cpu(cand, layered_cpumask))
return false;
if (!(cand_cctx = lookup_cpu_ctx(cand)) || cand_cctx->current_preempt)
@@ -561,7 +561,7 @@ s32 pick_idle_no_topo(struct task_struct *p, s32 prev_cpu,
/* not much to do if bound to a single CPU */
if (p->nr_cpus_allowed == 1 && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
if (!layer->open && !bpf_cpumask_test_cpu(prev_cpu, layer_cpumask))
if (layer->kind == LAYER_KIND_CONFINED && !bpf_cpumask_test_cpu(prev_cpu, layer_cpumask))
lstat_inc(LSTAT_AFFN_VIOL, layer, cctx);
return prev_cpu;
}
@@ -621,7 +621,7 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
/* not much to do if bound to a single CPU */
if (p->nr_cpus_allowed == 1 && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
if (!layer->open && !bpf_cpumask_test_cpu(prev_cpu, layer_cpumask))
if (layer->kind == LAYER_KIND_CONFINED && !bpf_cpumask_test_cpu(prev_cpu, layer_cpumask))
lstat_inc(LSTAT_AFFN_VIOL, layer, cctx);
return prev_cpu;
}
@@ -709,7 +709,7 @@ s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
/*
* If the layer is an open one, we can try the whole machine.
*/
if (layer->open &&
if (layer->kind != LAYER_KIND_CONFINED &&
((cpu = pick_idle_cpu_from(p->cpus_ptr, prev_cpu,
idle_cpumask)) >= 0)) {
lstat_inc(LSTAT_OPEN_IDLE, layer, cctx);
@@ -778,7 +778,7 @@ bool try_preempt_cpu(s32 cand, struct task_struct *p, struct cpu_ctx *cctx,
struct task_ctx *tctx, struct layer *layer,
bool preempt_first)
{
struct cost *cost;
struct cost *costc;
struct cpu_ctx *cand_cctx, *sib_cctx = NULL;
s32 sib;
@@ -788,7 +788,7 @@ bool try_preempt_cpu(s32 cand, struct task_struct *p, struct cpu_ctx *cctx,
if (!(cand_cctx = lookup_cpu_ctx(cand)) || cand_cctx->current_preempt)
return false;
if (!(cost = lookup_cpu_cost(cand)) || has_budget(cost, layer) == 0)
if (!(costc = lookup_cpu_cost(cand)) || has_budget(costc, layer) == 0)
return false;
/*
@@ -1070,7 +1070,7 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
(p->flags & PF_KTHREAD) && p->nr_cpus_allowed < nr_possible_cpus) {
struct cpumask *layer_cpumask;
if (!layer->open &&
if (layer->kind == LAYER_KIND_CONFINED &&
(layer_cpumask = lookup_layer_cpumask(tctx->layer)) &&
!bpf_cpumask_test_cpu(task_cpu, layer_cpumask))
lstat_inc(LSTAT_AFFN_VIOL, layer, cctx);
@@ -1087,7 +1087,7 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
* confined layer may fail to be consumed for an indefinite amount of
* time. Queue them to the fallback DSQ.
*/
if (!layer->open && !tctx->all_cpus_allowed) {
if (layer->kind == LAYER_KIND_CONFINED && !tctx->all_cpus_allowed) {
lstat_inc(LSTAT_AFFN_VIOL, layer, cctx);
/*
* We were previously dispatching to LO_FALLBACK_DSQ for any
@@ -1183,7 +1183,7 @@ static bool keep_running(struct cpu_ctx *cctx, struct task_struct *p)
* CPU. If confined, keep running if and only if the layer has
* idle CPUs.
*/
if (layer->open) {
if (layer->kind != LAYER_KIND_CONFINED) {
has_idle = !bpf_cpumask_empty(idle_cpumask);
} else {
struct cpumask *layer_cpumask;
@@ -1213,12 +1213,12 @@ void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
{
struct cpu_ctx *cctx, *sib_cctx;
struct layer *layer;
struct cost *cost;
struct cost *costc;
u64 dsq_id;
u32 idx, layer_idx;
s32 sib = sibling_cpu(cpu);
if (!(cctx = lookup_cpu_ctx(-1)) || !(cost = lookup_cpu_cost(cpu)))
if (!(cctx = lookup_cpu_ctx(-1)) || !(costc = lookup_cpu_cost(cpu)))
return;
/*
@@ -1249,13 +1249,13 @@ void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
/* consume preempting layers first */
bpf_for(idx, 0, nr_layers) {
layer_idx = rotate_layer_id(cost->pref_layer, idx);
layer_idx = rotate_layer_id(costc->pref_layer, idx);
if (layer_idx >= nr_layers) {
scx_bpf_error("can't happen");
return;
}
layer = MEMBER_VPTR(layers, [layer_idx]);
if (has_budget(cost, layer) == 0)
if (has_budget(costc, layer) == 0)
continue;
if (layer->preempt && scx_bpf_consume(layer_idx))
return;
@@ -1267,13 +1267,13 @@ void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
/* consume !open layers second */
bpf_for(idx, 0, nr_layers) {
layer_idx = rotate_layer_id(cost->pref_layer, idx);
layer_idx = rotate_layer_id(costc->pref_layer, idx);
if (layer_idx >= nr_layers) {
scx_bpf_error("can't happen");
return;
}
layer = MEMBER_VPTR(layers, [layer_idx]);
if (has_budget(cost, layer) == 0)
if (has_budget(costc, layer) == 0)
continue;
struct cpumask *layer_cpumask;
@@ -1290,15 +1290,15 @@ void layered_dispatch_no_topo(s32 cpu, struct task_struct *prev)
/* consume !preempting open layers */
bpf_for(idx, 0, nr_layers) {
layer_idx = rotate_layer_id(cost->pref_layer, idx);
layer_idx = rotate_layer_id(costc->pref_layer, idx);
if (layer_idx >= nr_layers) {
scx_bpf_error("can't happen");
return;
}
layer = MEMBER_VPTR(layers, [layer_idx]);
if (has_budget(cost, layer) == 0)
if (has_budget(costc, layer) == 0)
continue;
if (!layer->preempt && layers->open &&
if (!layer->preempt && layer->kind != LAYER_KIND_CONFINED &&
scx_bpf_consume(layer_idx))
return;
}
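Each consumption pass above walks the layers starting from the CPU's preferred layer and wraps around, which is also why the layer_idx >= nr_layers case is treated as impossible. A hypothetical sketch of what rotate_layer_id() amounts to (the real helper may handle an unset preference differently):

/* Hypothetical sketch of the rotation assumed above; nr_layers is the
 * global layer count already used throughout this file. */
static u32 rotate_layer_id(u32 base, u32 idx)
{
	if (base >= nr_layers)
		return idx;
	return (base + idx) % nr_layers;
}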
@@ -1396,7 +1396,7 @@ int consume_open_no_preempt(struct cost *costc, u32 my_llc_id)
u32 llc_id = rotate_llc_id(my_llc_id, llc_idx);
dsq_id = layer_dsq_id(layer_idx, llc_id);
if (!layer->preempt && layer->open && scx_bpf_consume(dsq_id))
if (!layer->preempt && layer->kind != LAYER_KIND_CONFINED && scx_bpf_consume(dsq_id))
return 0;
}
}
@@ -1877,7 +1877,7 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
struct cpu_ctx *cctx;
struct task_ctx *tctx;
struct layer *layer;
struct cost *cost;
struct cost *costc;
s32 lidx;
u64 used;
@@ -1885,7 +1885,7 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
return;
lidx = tctx->layer;
if (!(layer = lookup_layer(lidx)) || !(cost = lookup_cpu_cost(-1)))
if (!(layer = lookup_layer(lidx)) || !(costc = lookup_cpu_cost(-1)))
return;
used = bpf_ktime_get_ns() - tctx->running_at;
@@ -1895,7 +1895,7 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
used = layer->min_exec_ns;
}
record_cpu_cost(cost, layer->idx, (s64)used);
record_cpu_cost(costc, layer->idx, (s64)used);
cctx->layer_cycles[lidx] += used;
cctx->current_preempt = false;
cctx->prev_exclusive = cctx->current_exclusive;
@@ -2285,7 +2285,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
struct layer *layer = &layers[i];
dbg("CFG LAYER[%d][%s] min_exec_ns=%lu open=%d preempt=%d exclusive=%d",
i, layer->name, layer->min_exec_ns, layer->open, layer->preempt,
i, layer->name, layer->min_exec_ns, layer->kind != LAYER_KIND_CONFINED, layer->preempt,
layer->exclusive);
if (layer->nr_match_ors > MAX_LAYER_MATCH_ORS) {

View File

@@ -1356,13 +1356,6 @@ impl<'a> Scheduler<'a> {
}
}
match &spec.kind {
LayerKind::Open { .. } | LayerKind::Grouped { .. } => {
layer.open.write(true);
}
_ => {}
}
perf_set |= layer.perf > 0;
}

View File

@@ -759,8 +759,8 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
.into_iter()
.filter(|task| {
task.dom_mask & (1 << pull_dom_id) != 0
|| !(self.skip_kworkers && task.is_kworker)
|| !task.migrated.get()
&& !(self.skip_kworkers && task.is_kworker)
&& !task.migrated.get()
})
.collect();
@@ -779,7 +779,10 @@ impl<'a, 'b> LoadBalancer<'a, 'b> {
.filter(|x| x.load >= OrderedFloat(to_xfer) && task_filter(x, pull_dom_id)),
),
) {
(None, None) => return Ok(None),
(None, None) => {
std::mem::swap(&mut push_dom.tasks, &mut SortedVec::from_unsorted(tasks));
return Ok(None);
}
(Some(task), None) | (None, Some(task)) => (task, calc_new_imbal(*task.load)),
(Some(task0), Some(task1)) => {
let (new_imbal0, new_imbal1) =

View File

@@ -2,4 +2,4 @@
SCX_SCHEDULER=scx_bpfland
# Set custom flags for each scheduler, below is an example of how to use
#SCX_FLAGS='-s 20000 --lowlatency --primary-domain all'
#SCX_FLAGS='-k -m performance'