mirror of
https://github.com/sched-ext/scx.git
synced 2024-12-13 12:07:17 +00:00
7c9aedaefe
In preparation for upstreaming, let's set the minimum version requirement to the released v6.9 kernels. Drop __COMPAT_scx_bpf_switch_call(). The open helper macros now check for the existence of SCX_OPS_SWITCH_PARTIAL and abort if it is missing.
157 lines
4.5 KiB
C
157 lines
4.5 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple scheduler.
 *
 * By default, it operates as a simple global weighted vtime scheduler and can
 * be switched to FIFO scheduling. It also demonstrates the following niceties.
 *
 * - Statistics tracking how many tasks are queued to local and global dsq's.
 * - Termination notification for userspace.
 *
 * While very simple, this scheduler should work reasonably well on CPUs with a
 * uniform L3 cache topology. While preemption is not implemented, the fact that
 * the scheduling queue is shared across all CPUs means that whatever is at the
 * front of the queue is likely to be executed fairly quickly given a sufficient
 * number of CPUs. The FIFO scheduling mode may be beneficial to some workloads
 * but comes with the usual problems with FIFO scheduling where saturating
 * threads can easily drown out interactive ones.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
|
|
#include <scx/common.bpf.h>
|
|
|
|
char _license[] SEC("license") = "GPL";
|
|
|
|
const volatile bool fifo_sched;
|
|
|
|
static u64 vtime_now;
|
|
UEI_DEFINE(uei);
|
|
|
|
/*
 * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues
 * (meaning, cannot be dispatched to with scx_bpf_dispatch_vtime()). We
 * therefore create a separate DSQ with ID 0 that we dispatch to and consume
 * from. If scx_simple only supported global FIFO scheduling, then we could
 * just use SCX_DSQ_GLOBAL.
 */
#define SHARED_DSQ 0
|
|
|
|
struct {
|
|
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
|
|
__uint(key_size, sizeof(u32));
|
|
__uint(value_size, sizeof(u64));
|
|
__uint(max_entries, 2); /* [local, global] */
|
|
} stats SEC(".maps");
|
|
|
|
/*
 * Increment the counter at index @idx of the "stats" map.
 *
 * @idx: slot to bump (0 = local queueing, 1 = global queueing).
 *
 * If the lookup returns NULL the call is a silent no-op; nothing is counted.
 */
static void stat_inc(u32 idx)
{
	u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx);

	if (cnt_p)
		(*cnt_p)++;
}
|
|
|
|
/*
 * Return true if virtual time @a is strictly earlier than @b.
 *
 * The comparison is done on the signed difference of the two unsigned values,
 * so it stays correct when the 64-bit counters wrap around, as long as the two
 * timestamps are less than half the counter range apart.
 */
static inline bool vtime_before(u64 a, u64 b)
{
	return (s64)(a - b) < 0;
}
|
|
|
|
/*
 * select_cpu callback: pick a CPU for @p using the default selection helper.
 *
 * @p:          task being placed
 * @prev_cpu:   passed through to scx_bpf_select_cpu_dfl()
 * @wake_flags: passed through to scx_bpf_select_cpu_dfl()
 *
 * When the helper reports that the chosen CPU is idle, the task is counted as
 * locally queued and dispatched straight to SCX_DSQ_LOCAL with the default
 * slice. Returns the CPU chosen by the helper in either case.
 */
s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle) {
		stat_inc(0);	/* count local queueing */
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	}

	return cpu;
}
|
|
|
|
/*
 * enqueue callback: queue @p on the shared DSQ.
 *
 * @p:         task being enqueued
 * @enq_flags: forwarded unchanged to the dispatch helper
 *
 * Every call is counted as a global queueing. In FIFO mode the task is simply
 * appended to SHARED_DSQ; otherwise it is inserted ordered by its dsq_vtime,
 * after clamping that vtime so it is at most one default slice behind the
 * global vtime.
 */
void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
{
	stat_inc(1);	/* count global queueing */

	if (fifo_sched) {
		scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
	} else {
		u64 vtime = p->scx.dsq_vtime;

		/*
		 * Limit the amount of budget that an idling task can accumulate
		 * to one slice.
		 */
		if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL))
			vtime = vtime_now - SCX_SLICE_DFL;

		scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime,
				       enq_flags);
	}
}
|
|
|
|
/*
 * dispatch callback: consume from the shared DSQ.
 *
 * @cpu and @prev are not used; the only action is scx_bpf_consume() on
 * SHARED_DSQ, whose return value is ignored.
 */
void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev)
{
	scx_bpf_consume(SHARED_DSQ);
}
|
|
|
|
/*
 * running callback: advance the global vtime when @p starts executing.
 *
 * Does nothing in FIFO mode. Otherwise vtime_now is raised to @p's dsq_vtime
 * if that value is ahead of it.
 */
void BPF_STRUCT_OPS(simple_running, struct task_struct *p)
{
	if (fifo_sched)
		return;

	/*
	 * Global vtime always progresses forward as tasks start executing. The
	 * test and update can be performed concurrently from multiple CPUs and
	 * thus racy. Any error should be contained and temporary. Let's just
	 * live with it.
	 */
	if (vtime_before(vtime_now, p->scx.dsq_vtime))
		vtime_now = p->scx.dsq_vtime;
}
|
|
|
|
/*
 * stopping callback: charge @p for the slice it consumed.
 *
 * @p:        task that is stopping
 * @runnable: not used here
 *
 * Does nothing in FIFO mode. Otherwise the consumed part of the slice
 * (SCX_SLICE_DFL minus what is left in @p->scx.slice) is scaled by
 * 100 / weight and added to @p's dsq_vtime. The constant 100 is presumably
 * the default task weight, so that a default-weight task is charged 1:1
 * (confirm against the sched_ext weight definition).
 */
void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable)
{
	if (fifo_sched)
		return;

	/*
	 * Scale the execution time by the inverse of the weight and charge.
	 *
	 * Note that the default yield implementation yields by setting
	 * @p->scx.slice to zero and the following would treat the yielding task
	 * as if it has consumed all its slice. If this penalizes yielding tasks
	 * too much, determine the execution time by taking explicit timestamps
	 * instead of depending on @p->scx.slice.
	 */
	p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
}
|
|
|
|
/*
 * enable callback: seed @p's dsq_vtime with the current global vtime.
 */
void BPF_STRUCT_OPS(simple_enable, struct task_struct *p)
{
	p->scx.dsq_vtime = vtime_now;
}
|
|
|
|
s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init)
|
|
{
|
|
return scx_bpf_create_dsq(SHARED_DSQ, -1);
|
|
}
|
|
|
|
/*
 * exit callback: record the exit information @ei into the "uei" storage
 * declared with UEI_DEFINE().
 */
void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}
|
|
|
|
SCX_OPS_DEFINE(simple_ops,
|
|
.select_cpu = (void *)simple_select_cpu,
|
|
.enqueue = (void *)simple_enqueue,
|
|
.dispatch = (void *)simple_dispatch,
|
|
.running = (void *)simple_running,
|
|
.stopping = (void *)simple_stopping,
|
|
.enable = (void *)simple_enable,
|
|
.init = (void *)simple_init,
|
|
.exit = (void *)simple_exit,
|
|
.name = "simple");
|