scx_lavd: split main.bpf.c into multiple files

As the main.bpf.c file grows, it becomes harder to maintain,
so split it into multiple logical files. There is no
functional change.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
Author: Changwoo Min
Date:   2024-10-05 00:25:40 +09:00
Commit: 7c5c83a3a2 (parent db0f83ce89)
9 changed files with 1867 additions and 1800 deletions


@@ -47,56 +47,11 @@ extern void bpf_iter_task_destroy(struct bpf_iter_task *it) __weak __ksym;
 /*
 * common constants
 */
-enum consts {
-CLOCK_BOOTTIME = 7,
-CACHELINE_SIZE = 64,
-NSEC_PER_USEC = 1000ULL,
-NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC),
-LAVD_TIME_ONE_SEC = (1000ULL * NSEC_PER_MSEC),
-LAVD_TIME_INFINITY_NS = SCX_SLICE_INF,
-LAVD_MAX_RETRY = 4,
-LAVD_TARGETED_LATENCY_NS = (20ULL * NSEC_PER_MSEC),
-LAVD_SLICE_MIN_NS = (300ULL * NSEC_PER_USEC), /* min time slice */
-LAVD_SLICE_MAX_NS = (3ULL * NSEC_PER_MSEC), /* max time slice */
-LAVD_SLICE_UNDECIDED = SCX_SLICE_INF,
-LAVD_LC_FREQ_MAX = 1000000,
-LAVD_LC_RUNTIME_MAX = LAVD_TARGETED_LATENCY_NS,
-LAVD_LC_RUNTIME_SHIFT = 15,
-LAVD_LC_WAKEUP_FT = 30,
-LAVD_LC_KTHREAD_FT = 30,
-LAVD_SLICE_BOOST_MAX_FT = 3, /* maximum additional 3x of slice */
-LAVD_SLICE_BOOST_MAX_STEP = 6, /* 6 slice exhausitions in a row */
-LAVD_NEW_PROC_PENALITY = 5,
-LAVD_GREEDY_RATIO_NEW = (1000 * LAVD_NEW_PROC_PENALITY),
-LAVD_CPU_UTIL_MAX = 1000, /* 100.0% */
-LAVD_CPU_UTIL_MAX_FOR_CPUPERF = 850, /* 85.0% */
-LAVD_CPU_ID_HERE = ((u32)-2),
-LAVD_CPU_ID_NONE = ((u32)-1),
+enum {
 LAVD_CPU_ID_MAX = 512,
-LAVD_PREEMPT_KICK_MARGIN = (1ULL * NSEC_PER_MSEC),
-LAVD_PREEMPT_TICK_MARGIN = (100ULL * NSEC_PER_USEC),
-LAVD_SYS_STAT_INTERVAL_NS = (50ULL * NSEC_PER_MSEC),
-LAVD_SYS_STAT_DECAY_TIMES = (2ULL * LAVD_TIME_ONE_SEC) / LAVD_SYS_STAT_INTERVAL_NS,
-LAVD_CC_PER_CORE_MAX_CTUIL = 500, /* maximum per-core CPU utilization */
-LAVD_CC_PER_TURBO_CORE_MAX_CTUIL = 750, /* maximum per-core CPU utilization for a turbo core */
-LAVD_CC_NR_ACTIVE_MIN = 1, /* num of mininum active cores */
-LAVD_CC_NR_OVRFLW = 1, /* num of overflow cores */
-LAVD_CC_CPU_PIN_INTERVAL = (1ULL * LAVD_TIME_ONE_SEC),
-LAVD_CC_CPU_PIN_INTERVAL_DIV = (LAVD_CC_CPU_PIN_INTERVAL /
-LAVD_SYS_STAT_INTERVAL_NS),
-LAVD_AP_HIGH_UTIL = 700, /* balanced mode when 10% < cpu util <= 40%,
-performance mode when cpu util > 40% */
 LAVD_CPDOM_MAX_NR = 32, /* maximum number of compute domain */
 LAVD_CPDOM_MAX_DIST = 4, /* maximum distance from one compute domain to another */
-LAVD_CPDOM_STARV_NS = (5ULL * NSEC_PER_MSEC),
 LAVD_STATUS_STR_LEN = 5, /* {LR: Latency-critical, Regular}
 {HI: performance-Hungry, performance-Insensitive}
@@ -139,100 +94,6 @@ struct sys_stat {
 volatile u64 nr_lc_on_big; /* latency-critical tasks scheduled on big core */
 };
-/*
-* Compute domain context
-* - system > numa node > llc domain > compute domain per core type (P or E)
-*/
-struct cpdom_ctx {
-u64 id; /* id of this compute domain (== dsq_id) */
-u64 alt_id; /* id of the closest compute domain of alternative type (== dsq id) */
-u64 last_consume_clk; /* when the associated DSQ was consumed */
-u8 is_big; /* is it a big core or little core? */
-u8 is_active; /* if this compute domain is active */
-u8 nr_neighbors[LAVD_CPDOM_MAX_DIST]; /* number of neighbors per distance */
-u64 neighbor_bits[LAVD_CPDOM_MAX_DIST]; /* bitmask of neighbor bitmask per distance */
-u64 __cpumask[LAVD_CPU_ID_MAX/64]; /* cpumasks belongs to this compute domain */
-} __attribute__((aligned(CACHELINE_SIZE)));
-/*
-* CPU context
-*/
-struct cpu_ctx {
-/*
-* Information used to keep track of CPU utilization
-*/
-volatile u64 util; /* average of the CPU utilization */
-volatile u64 idle_total; /* total idle time so far */
-volatile u64 idle_start_clk; /* when the CPU becomes idle */
-/*
-* Information used to keep track of load
-*/
-volatile u64 load_actual; /* actual load of runnable tasks */
-volatile u64 load_run_time_ns; /* total runtime of runnable tasks */
-volatile u64 tot_svc_time; /* total service time on a CPU */
-volatile u64 last_kick_clk; /* when the CPU was kicked */
-/*
-* Information for cpu hotplug
-*/
-u64 online_clk; /* when a CPU becomes online */
-u64 offline_clk; /* when a CPU becomes offline */
-/*
-* Information used to keep track of latency criticality
-*/
-volatile u32 max_lat_cri; /* maximum latency criticality */
-volatile u32 sum_lat_cri; /* sum of latency criticality */
-volatile u32 nr_sched; /* number of schedules */
-/*
-* Information used to keep track of performance criticality
-*/
-volatile u64 sum_perf_cri; /* sum of performance criticality */
-volatile u64 min_perf_cri; /* mininum performance criticality */
-volatile u64 max_perf_cri; /* maximum performance criticality */
-/*
-* Information of a current running task for preemption
-*/
-volatile u64 stopping_tm_est_ns; /* estimated stopping time */
-volatile u16 lat_cri; /* latency criticality */
-volatile u8 is_online; /* is this CPU online? */
-s32 cpu_id; /* cpu id */
-/*
-* Information for CPU frequency scaling
-*/
-u32 cpuperf_cur; /* CPU's current performance target */
-u32 cpuperf_task; /* task's CPU performance target */
-u32 cpuperf_avg; /* EWMA of task's CPU performance target */
-/*
-* Fields for core compaction
-*
-*/
-u16 capacity; /* CPU capacity based on 1000 */
-u8 big_core; /* is it a big core? */
-u8 turbo_core; /* is it a turbo core? */
-u8 cpdom_id; /* compute domain id (== dsq_id) */
-u8 cpdom_alt_id; /* compute domain id of anternative type (== dsq_id) */
-u8 cpdom_poll_pos; /* index to check if a DSQ of a compute domain is starving */
-struct bpf_cpumask __kptr *tmp_a_mask; /* temporary cpu mask */
-struct bpf_cpumask __kptr *tmp_o_mask; /* temporary cpu mask */
-struct bpf_cpumask __kptr *tmp_t_mask; /* temporary cpu mask */
-struct bpf_cpumask __kptr *tmp_t2_mask; /* temporary cpu mask */
-/*
-* Information for statistics.
-*/
-volatile u32 nr_migration; /* number of migrations */
-volatile u32 nr_preemption; /* number of migrations */
-volatile u32 nr_greedy; /* number of greedy tasks scheduled */
-volatile u32 nr_perf_cri;
-volatile u32 nr_lat_cri;
-} __attribute__((aligned(CACHELINE_SIZE)));
 /*
 * Task context
 */


@@ -0,0 +1,116 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2023, 2024 Valve Corporation.
* Author: Changwoo Min <changwoo@igalia.com>
*/
/*
* To be included in main.bpf.c
*/
/*
* Introspection commands
*/
struct introspec intrspc;
struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 16 * 1024 /* 16 KB */);
} introspec_msg SEC(".maps");
static __always_inline
int submit_task_ctx(struct task_struct *p, struct task_ctx *taskc, u32 cpu_id)
{
struct sys_stat *stat_cur = get_sys_stat_cur();
struct cpu_ctx *cpuc;
struct msg_task_ctx *m;
cpuc = get_cpu_ctx_id(cpu_id);
if (!cpuc)
return -EINVAL;
m = bpf_ringbuf_reserve(&introspec_msg, sizeof(*m), 0);
if (!m)
return -ENOMEM;
m->hdr.kind = LAVD_MSG_TASKC;
m->taskc_x.pid = p->pid;
memcpy(m->taskc_x.comm, p->comm, TASK_COMM_LEN);
m->taskc_x.static_prio = get_nice_prio(p);
m->taskc_x.cpu_util = cpuc->util / 10;
m->taskc_x.cpu_id = cpu_id;
m->taskc_x.avg_lat_cri = stat_cur->avg_lat_cri;
m->taskc_x.thr_perf_cri = stat_cur->thr_perf_cri;
m->taskc_x.nr_active = stat_cur->nr_active;
m->taskc_x.cpuperf_cur = cpuc->cpuperf_cur;
m->taskc_x.stat[0] = is_lat_cri(taskc, stat_cur) ? 'L' : 'R';
m->taskc_x.stat[1] = is_perf_cri(taskc, stat_cur) ? 'H' : 'I';
m->taskc_x.stat[2] = cpuc->big_core ? 'B' : 'T';
m->taskc_x.stat[3] = is_greedy(taskc) ? 'G' : 'E';
m->taskc_x.stat[4] = taskc->victim_cpu >= 0 ? 'P' : 'N';
m->taskc_x.stat[5] = '\0';
memcpy(&m->taskc, taskc, sizeof(m->taskc));
bpf_ringbuf_submit(m, 0);
return 0;
}
static void proc_introspec_sched_n(struct task_struct *p,
struct task_ctx *taskc, u32 cpu_id)
{
u64 cur_nr, prev_nr;
int i;
/* introspec_arg is the number of schedules remaining */
cur_nr = intrspc.arg;
/*
* Note that the bounded retry (@LAVD_MAX_RETRY) does *not* guarantee
* that introspec_arg is decremented. However, that is unlikely to happen,
* and even if it does, it merely delays a message delivery, since other
* threads will eventually succeed in the CAS operation. So this is good
* enough. ;-)
*/
for (i = 0; cur_nr > 0 && i < LAVD_MAX_RETRY; i++) {
prev_nr = __sync_val_compare_and_swap(
&intrspc.arg, cur_nr, cur_nr - 1);
/* CAS success: submit a message and done */
if (prev_nr == cur_nr) {
submit_task_ctx(p, taskc, cpu_id);
break;
}
/* CAS failure: retry */
cur_nr = prev_nr;
}
}
static void proc_introspec_pid(struct task_struct *p, struct task_ctx *taskc,
u32 cpu_id)
{
if (p->pid == intrspc.arg)
submit_task_ctx(p, taskc, cpu_id);
}
static void try_proc_introspec_cmd(struct task_struct *p,
struct task_ctx *taskc, u32 cpu_id)
{
if (LAVD_CPU_ID_HERE == cpu_id)
cpu_id = bpf_get_smp_processor_id();
switch(intrspc.cmd) {
case LAVD_CMD_SCHED_N:
proc_introspec_sched_n(p, taskc, cpu_id);
break;
case LAVD_CMD_NOP:
/* do nothing */
break;
default:
scx_bpf_error("Unknown introspec command: %d", intrspc.cmd);
break;
}
}
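Purely as an illustration and not part of this commit: a minimal hypothetical userspace consumer of the introspec_msg ring buffer, written against libbpf's ring buffer API. The simplified message struct below is an assumption for the sketch; the real struct msg_task_ctx layout lives in the scheduler's interface header, and the in-tree consumer is the Rust side of scx_lavd.

#include <bpf/libbpf.h>
#include <stdio.h>

/* Hypothetical, simplified message layout for this sketch only;
 * the real struct msg_task_ctx is defined in the scheduler's intf.h. */
struct sample_msg {
	unsigned int kind;	/* e.g. LAVD_MSG_TASKC */
	int pid;
	char comm[16];
	char stat[6];		/* e.g. "LHBGP", as built by submit_task_ctx() */
};

static int handle_msg(void *ctx, void *data, size_t size)
{
	const struct sample_msg *m = data;

	if (size >= sizeof(*m))
		printf("pid=%d comm=%.16s stat=%.5s\n", m->pid, m->comm, m->stat);
	return 0;
}

/* Poll the ring buffer given the fd of the "introspec_msg" map. */
int consume_introspec(int map_fd)
{
	struct ring_buffer *rb = ring_buffer__new(map_fd, handle_msg, NULL, NULL);
	int err;

	if (!rb)
		return -1;
	while ((err = ring_buffer__poll(rb, 100 /* ms */)) >= 0)
		;	/* handle_msg() runs once per submitted record */
	ring_buffer__free(rb);
	return err;
}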


@@ -0,0 +1,158 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2023, 2024 Valve Corporation.
* Author: Changwoo Min <changwoo@igalia.com>
*/
#ifndef __LAVD_H
#define __LAVD_H
/*
* common constants
*/
enum consts_internal {
CLOCK_BOOTTIME = 7,
CACHELINE_SIZE = 64,
NSEC_PER_USEC = 1000ULL,
NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC),
LAVD_TIME_ONE_SEC = (1000ULL * NSEC_PER_MSEC),
LAVD_TIME_INFINITY_NS = SCX_SLICE_INF,
LAVD_MAX_RETRY = 4,
LAVD_TARGETED_LATENCY_NS = (20ULL * NSEC_PER_MSEC),
LAVD_SLICE_MIN_NS = (300ULL * NSEC_PER_USEC), /* min time slice */
LAVD_SLICE_MAX_NS = (3ULL * NSEC_PER_MSEC), /* max time slice */
LAVD_SLICE_UNDECIDED = SCX_SLICE_INF,
LAVD_LC_FREQ_MAX = 1000000,
LAVD_LC_RUNTIME_MAX = LAVD_TARGETED_LATENCY_NS,
LAVD_LC_RUNTIME_SHIFT = 15,
LAVD_LC_WAKEUP_FT = 30,
LAVD_LC_KTHREAD_FT = 30,
LAVD_SLICE_BOOST_MAX_FT = 3, /* maximum additional 3x of slice */
LAVD_SLICE_BOOST_MAX_STEP = 6, /* 6 slice exhaustions in a row */
LAVD_NEW_PROC_PENALITY = 5,
LAVD_GREEDY_RATIO_NEW = (1000 * LAVD_NEW_PROC_PENALITY),
LAVD_CPU_UTIL_MAX = 1000, /* 100.0% */
LAVD_CPU_UTIL_MAX_FOR_CPUPERF = 850, /* 85.0% */
LAVD_CPU_ID_HERE = ((u32)-2),
LAVD_CPU_ID_NONE = ((u32)-1),
LAVD_PREEMPT_KICK_MARGIN = (1ULL * NSEC_PER_MSEC),
LAVD_PREEMPT_TICK_MARGIN = (100ULL * NSEC_PER_USEC),
LAVD_SYS_STAT_INTERVAL_NS = (50ULL * NSEC_PER_MSEC),
LAVD_SYS_STAT_DECAY_TIMES = (2ULL * LAVD_TIME_ONE_SEC) / LAVD_SYS_STAT_INTERVAL_NS,
LAVD_CC_PER_CORE_MAX_CTUIL = 500, /* maximum per-core CPU utilization */
LAVD_CC_PER_TURBO_CORE_MAX_CTUIL = 750, /* maximum per-core CPU utilization for a turbo core */
LAVD_CC_NR_ACTIVE_MIN = 1, /* minimum number of active cores */
LAVD_CC_NR_OVRFLW = 1, /* number of overflow cores */
LAVD_CC_CPU_PIN_INTERVAL = (1ULL * LAVD_TIME_ONE_SEC),
LAVD_CC_CPU_PIN_INTERVAL_DIV = (LAVD_CC_CPU_PIN_INTERVAL /
LAVD_SYS_STAT_INTERVAL_NS),
LAVD_AP_HIGH_UTIL = 700, /* balanced mode when LAVD_AP_LOW_UTIL < cpu util <= 70%,
performance mode when cpu util > 70% */
LAVD_CPDOM_STARV_NS = (5ULL * NSEC_PER_MSEC),
};
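As a quick aside (not part of lavd.h): a standalone plain-C check of the derived constants above. With a 50 msec stat interval, LAVD_SYS_STAT_DECAY_TIMES works out to 40 intervals (2 seconds) and LAVD_CC_CPU_PIN_INTERVAL_DIV to 20 intervals (1 second).

#include <assert.h>
#include <stdio.h>

/* Reproduce the derived-constant arithmetic from enum consts_internal. */
#define NSEC_PER_USEC			1000ULL
#define NSEC_PER_MSEC			(1000ULL * NSEC_PER_USEC)
#define LAVD_TIME_ONE_SEC		(1000ULL * NSEC_PER_MSEC)
#define LAVD_SYS_STAT_INTERVAL_NS	(50ULL * NSEC_PER_MSEC)
#define LAVD_SYS_STAT_DECAY_TIMES	((2ULL * LAVD_TIME_ONE_SEC) / LAVD_SYS_STAT_INTERVAL_NS)
#define LAVD_CC_CPU_PIN_INTERVAL	(1ULL * LAVD_TIME_ONE_SEC)
#define LAVD_CC_CPU_PIN_INTERVAL_DIV	(LAVD_CC_CPU_PIN_INTERVAL / LAVD_SYS_STAT_INTERVAL_NS)

int main(void)
{
	assert(LAVD_SYS_STAT_DECAY_TIMES == 40);	/* decay every 2 sec of intervals */
	assert(LAVD_CC_CPU_PIN_INTERVAL_DIV == 20);	/* ~1 sec of stat intervals */
	printf("decay every %llu intervals, pin interval %llu intervals\n",
	       LAVD_SYS_STAT_DECAY_TIMES, LAVD_CC_CPU_PIN_INTERVAL_DIV);
	return 0;
}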
/*
* Compute domain context
* - system > numa node > llc domain > compute domain per core type (P or E)
*/
struct cpdom_ctx {
u64 id; /* id of this compute domain (== dsq_id) */
u64 alt_id; /* id of the closest compute domain of alternative type (== dsq id) */
u64 last_consume_clk; /* when the associated DSQ was consumed */
u8 is_big; /* is it a big core or little core? */
u8 is_active; /* if this compute domain is active */
u8 nr_neighbors[LAVD_CPDOM_MAX_DIST]; /* number of neighbors per distance */
u64 neighbor_bits[LAVD_CPDOM_MAX_DIST]; /* bitmask of neighbors per distance */
u64 __cpumask[LAVD_CPU_ID_MAX/64]; /* cpumask of CPUs belonging to this compute domain */
} __attribute__((aligned(CACHELINE_SIZE)));
/*
* CPU context
*/
struct cpu_ctx {
/*
* Information used to keep track of CPU utilization
*/
volatile u64 util; /* average of the CPU utilization */
volatile u64 idle_total; /* total idle time so far */
volatile u64 idle_start_clk; /* when the CPU becomes idle */
/*
* Information used to keep track of load
*/
volatile u64 load_actual; /* actual load of runnable tasks */
volatile u64 load_run_time_ns; /* total runtime of runnable tasks */
volatile u64 tot_svc_time; /* total service time on a CPU */
volatile u64 last_kick_clk; /* when the CPU was kicked */
/*
* Information for cpu hotplug
*/
u64 online_clk; /* when a CPU becomes online */
u64 offline_clk; /* when a CPU becomes offline */
/*
* Information used to keep track of latency criticality
*/
volatile u32 max_lat_cri; /* maximum latency criticality */
volatile u32 sum_lat_cri; /* sum of latency criticality */
volatile u32 nr_sched; /* number of schedules */
/*
* Information used to keep track of performance criticality
*/
volatile u64 sum_perf_cri; /* sum of performance criticality */
volatile u64 min_perf_cri; /* minimum performance criticality */
volatile u64 max_perf_cri; /* maximum performance criticality */
/*
* Information of a current running task for preemption
*/
volatile u64 stopping_tm_est_ns; /* estimated stopping time */
volatile u16 lat_cri; /* latency criticality */
volatile u8 is_online; /* is this CPU online? */
s32 cpu_id; /* cpu id */
/*
* Information for CPU frequency scaling
*/
u32 cpuperf_cur; /* CPU's current performance target */
u32 cpuperf_task; /* task's CPU performance target */
u32 cpuperf_avg; /* EWMA of task's CPU performance target */
/*
* Fields for core compaction
*
*/
u16 capacity; /* CPU capacity based on 1000 */
u8 big_core; /* is it a big core? */
u8 turbo_core; /* is it a turbo core? */
u8 cpdom_id; /* compute domain id (== dsq_id) */
u8 cpdom_alt_id; /* compute domain id of alternative type (== dsq_id) */
u8 cpdom_poll_pos; /* index to check if a DSQ of a compute domain is starving */
struct bpf_cpumask __kptr *tmp_a_mask; /* temporary cpu mask */
struct bpf_cpumask __kptr *tmp_o_mask; /* temporary cpu mask */
struct bpf_cpumask __kptr *tmp_t_mask; /* temporary cpu mask */
struct bpf_cpumask __kptr *tmp_t2_mask; /* temporary cpu mask */
/*
* Information for statistics.
*/
volatile u32 nr_migration; /* number of migrations */
volatile u32 nr_preemption; /* number of preemptions */
volatile u32 nr_greedy; /* number of greedy tasks scheduled */
volatile u32 nr_perf_cri;
volatile u32 nr_lat_cri;
} __attribute__((aligned(CACHELINE_SIZE)));
#endif /* __LAVD_H */

(File diff suppressed because it is too large.)


@@ -0,0 +1,565 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2023, 2024 Valve Corporation.
* Author: Changwoo Min <changwoo@igalia.com>
*/
/*
* To be included in main.bpf.c
*/
/*
* CPU topology
*/
static u64 LAVD_AP_LOW_UTIL;
static bool have_turbo_core;
static bool have_little_core;
const volatile u16 cpu_order_performance[LAVD_CPU_ID_MAX]; /* CPU preference order for performance and balanced mode */
const volatile u16 cpu_order_powersave[LAVD_CPU_ID_MAX]; /* CPU preference order for powersave mode */
const volatile u16 __cpu_capacity_hint[LAVD_CPU_ID_MAX]; /* CPU capacity based on 1000 */
struct cpdom_ctx cpdom_ctxs[LAVD_CPDOM_MAX_NR]; /* contexts for compute domains */
/*
* Big core's compute ratio among currently active cores
*/
static u32 cur_big_core_ratio;
/*
* Big core's compute ratio when all cores are active
*/
static u32 default_big_core_ratio;
/*
* Statistics
*/
volatile int power_mode;
volatile u64 last_power_mode_clk;
volatile u64 performance_mode_ns;
volatile u64 balanced_mode_ns;
volatile u64 powersave_mode_ns;
static u64 calc_nr_active_cpus(struct sys_stat *stat_cur)
{
u64 nr_active;
/*
* nr_active = ceil(nr_cpus_onln * cpu_util * per_core_max_util)
*/
nr_active = (nr_cpus_onln * stat_cur->util * 1000) + 500;
nr_active /= (LAVD_CC_PER_CORE_MAX_CTUIL * 1000);
/*
* If a few CPUs are particularly busy, boost the active CPUs more.
*/
nr_active += min(LAVD_CC_NR_OVRFLW, (stat_cur->nr_violation) / 1000);
nr_active = max(min(nr_active, nr_cpus_onln),
LAVD_CC_NR_ACTIVE_MIN);
return nr_active;
}
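As a standalone aside (not part of this file), the same fixed-point arithmetic with assumed inputs, to show how the rounding and clamping behave:

#include <stdio.h>

#define LAVD_CC_PER_CORE_MAX_CTUIL	500	/* 50.0% */
#define LAVD_CC_NR_OVRFLW		1
#define LAVD_CC_NR_ACTIVE_MIN		1

#define min(x, y)	((x) < (y) ? (x) : (y))
#define max(x, y)	((x) > (y) ? (x) : (y))

/* Same arithmetic as calc_nr_active_cpus(), with plain parameters. */
static unsigned long long nr_active_cpus(unsigned long long nr_cpus_onln,
					 unsigned long long util,	/* 0..1000 */
					 unsigned long long nr_violation)
{
	unsigned long long nr_active;

	nr_active = (nr_cpus_onln * util * 1000) + 500;
	nr_active /= (LAVD_CC_PER_CORE_MAX_CTUIL * 1000);
	nr_active += min(LAVD_CC_NR_OVRFLW, nr_violation / 1000);
	return max(min(nr_active, nr_cpus_onln), LAVD_CC_NR_ACTIVE_MIN);
}

int main(void)
{
	/* 8 CPUs at 60% utilization: (8 * 600) / 500 = 9, clamped to 8 online CPUs. */
	printf("%llu\n", nr_active_cpus(8, 600, 0));	/* prints 8 */
	/* 8 CPUs at 25% utilization: (8 * 250) / 500 = 4 active CPUs. */
	printf("%llu\n", nr_active_cpus(8, 250, 0));	/* prints 4 */
	return 0;
}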
static bool clear_cpu_periodically(u32 cpu, struct bpf_cpumask *cpumask)
{
u32 clear;
/*
* If the CPU is on, we clear the bit once every
* LAVD_CC_CPU_PIN_INTERVAL_DIV calls on average. Hence, the bit will be
* probabilistically cleared about once every LAVD_CC_CPU_PIN_INTERVAL.
*/
clear = !(bpf_get_prandom_u32() % LAVD_CC_CPU_PIN_INTERVAL_DIV);
if (clear)
bpf_cpumask_clear_cpu(cpu, cpumask);
return clear;
}
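And a tiny standalone simulation of the probabilistic clearing (assuming the divisor evaluates to 20, as the constants imply): roughly one call in twenty clears the bit, i.e. about once per LAVD_CC_CPU_PIN_INTERVAL.

#include <stdio.h>
#include <stdlib.h>

#define LAVD_CC_CPU_PIN_INTERVAL_DIV	20	/* 1 sec / 50 msec stat interval */

int main(void)
{
	int i, cleared = 0, calls = 100000;

	srand(1);
	for (i = 0; i < calls; i++)
		if (!(rand() % LAVD_CC_CPU_PIN_INTERVAL_DIV))
			cleared++;	/* same test as clear_cpu_periodically() */
	printf("cleared %d of %d calls (expected about %d)\n",
	       cleared, calls, calls / LAVD_CC_CPU_PIN_INTERVAL_DIV);
	return 0;
}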
static void do_core_compaction(void)
{
struct sys_stat *stat_cur = get_sys_stat_cur();
struct cpu_ctx *cpuc;
struct bpf_cpumask *active, *ovrflw;
int nr_cpus, nr_active, nr_active_old, cpu, i;
u32 sum_capacity = 0, big_capacity = 0;
bool clear;
const volatile u16 *cpu_order;
bpf_rcu_read_lock();
/*
* Prepare cpumasks.
*/
active = active_cpumask;
ovrflw = ovrflw_cpumask;
if (!active || !ovrflw) {
scx_bpf_error("Failed to prepare cpumasks.");
goto unlock_out;
}
/*
* Decide which CPU order to use according to the current power mode.
*/
if (is_powersave_mode)
cpu_order = cpu_order_powersave;
else
cpu_order = cpu_order_performance;
/*
* Assign active and overflow cores
*/
nr_active_old = stat_cur->nr_active;
nr_active = calc_nr_active_cpus(stat_cur);
nr_cpus = nr_active + LAVD_CC_NR_OVRFLW;
bpf_for(i, 0, nr_cpu_ids) {
if (i >= LAVD_CPU_ID_MAX)
break;
/*
* Skip offline cpu
*/
cpu = cpu_order[i];
cpuc = get_cpu_ctx_id(cpu);
if (!cpuc || !cpuc->is_online) {
bpf_cpumask_clear_cpu(cpu, active);
bpf_cpumask_clear_cpu(cpu, ovrflw);
continue;
}
/*
* Assign an online cpu to active and overflow cpumasks
*/
if (i < nr_cpus) {
if (i < nr_active) {
bpf_cpumask_set_cpu(cpu, active);
bpf_cpumask_clear_cpu(cpu, ovrflw);
}
else {
bpf_cpumask_set_cpu(cpu, ovrflw);
bpf_cpumask_clear_cpu(cpu, active);
}
scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
/*
* Calculate big capacity ratio among active cores.
*/
sum_capacity += cpuc->capacity;
if (cpuc->big_core)
big_capacity += cpuc->capacity;
}
else {
if (i < nr_active_old) {
bpf_cpumask_clear_cpu(cpu, active);
bpf_cpumask_clear_cpu(cpu, ovrflw);
}
else {
/*
* This is the case when a CPU belongs to the
* overflow set even though that CPU was not an
* overflow set initially. This can happen only
* when a pinned userspace task ran on this
* CPU. In this case, we keep the CPU in an
* overflow set since the CPU will be used
* anyway for the task. This will promote equal
* use of all used CPUs, lowering the energy
* consumption by avoiding a few CPUs being
* turbo-boosted. Hence, we do not clear the
* overflow cpumask here for a while,
* approximately for LAVD_CC_CPU_PIN_INTERVAL.
*/
bpf_cpumask_clear_cpu(cpu, active);
clear = clear_cpu_periodically(cpu, ovrflw);
if (!clear)
scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}
}
}
cur_big_core_ratio = (1000 * big_capacity) / sum_capacity;
stat_cur->nr_active = nr_active;
unlock_out:
bpf_rcu_read_unlock();
}
static void update_power_mode_time(void)
{
u64 now = bpf_ktime_get_ns();
u64 delta;
if (last_power_mode_clk == 0)
last_power_mode_clk = now;
delta = now - last_power_mode_clk;
last_power_mode_clk = now;
switch (power_mode) {
case LAVD_PM_PERFORMANCE:
__sync_fetch_and_add(&performance_mode_ns, delta);
break;
case LAVD_PM_BALANCED:
__sync_fetch_and_add(&balanced_mode_ns, delta);
break;
case LAVD_PM_POWERSAVE:
__sync_fetch_and_add(&powersave_mode_ns, delta);
break;
}
}
static int do_set_power_profile(s32 pm, int util)
{
/*
* Skip setting the mode if already in the same mode.
*/
if (power_mode == pm)
return 0;
/*
* Update power mode time
*/
update_power_mode_time();
power_mode = pm;
/*
* Change the power mode.
*/
switch (pm) {
case LAVD_PM_PERFORMANCE:
no_core_compaction = true;
no_freq_scaling = true;
no_prefer_turbo_core = false;
is_powersave_mode = false;
/*
* Since the core compaction becomes off, we need to
* reinitialize the active and overflow cpumask for performance
* mode.
*
* Note that a verifier in an old kernel does not allow calling
* bpf_cpumask_set_cpu(), so we defer the actual update to our
* timer handler, update_sys_stat().
*/
reinit_cpumask_for_performance = true;
debugln("Set the scheduler's power profile to performance mode: %d", util);
break;
case LAVD_PM_BALANCED:
no_core_compaction = false;
no_freq_scaling = false;
no_prefer_turbo_core = false;
is_powersave_mode = false;
reinit_cpumask_for_performance = false;
debugln("Set the scheduler's power profile to balanced mode: %d", util);
break;
case LAVD_PM_POWERSAVE:
no_core_compaction = false;
no_freq_scaling = false;
no_prefer_turbo_core = true;
is_powersave_mode = true;
reinit_cpumask_for_performance = false;
debugln("Set the scheduler's power profile to power-save mode: %d", util);
break;
default:
return -EINVAL;
}
return 0;
}
static int do_autopilot(void)
{
struct sys_stat *stat_cur = get_sys_stat_cur();
/*
* If the CPU utilization is very low (say <= 5%), it means high
* performance is not required. We run the scheduler in powersave mode
* to save energy consumption.
*/
if (stat_cur->util <= LAVD_AP_LOW_UTIL)
return do_set_power_profile(LAVD_PM_POWERSAVE, stat_cur->util);
/*
* If the CPU utilization is moderate (say > 5%, <= 30%), we run the
* scheduler in balanced mode. Actually, balanced mode can save energy
* consumption only under moderate CPU load.
*/
if (stat_cur->util <= LAVD_AP_HIGH_UTIL)
return do_set_power_profile(LAVD_PM_BALANCED, stat_cur->util);
/*
* If the CPU utilization is high enough (say > 30%), we run the
* scheduler in performance mode. The system indeed needs performance,
* and there is also little energy benefit under balanced mode anyway.
*/
return do_set_power_profile(LAVD_PM_PERFORMANCE, stat_cur->util);
}
static void update_thr_perf_cri(void)
{
struct sys_stat *stat_cur = get_sys_stat_cur();
u32 little_core_ratio, delta, diff, thr;
if (no_core_compaction || !have_little_core)
cur_big_core_ratio = default_big_core_ratio;
/*
* If all active cores are big, all tasks should run on the big cores.
*/
if (cur_big_core_ratio == 1000) {
stat_cur->thr_perf_cri = 0;
return;
}
/*
* We approximate the distribution of performance criticality of tasks
* using min, avg, and max performance criticality of a given period.
*
* min_perf_cri
* | avg_perf_cri
* | | max_perf_cri
* | | |
* <--------><----------------------->
*
* Half of the compute capacity should be assigned to the below-average
* tasks (< avg_perf_cri), and the other half should be assigned to the
* above-average tasks (>= avg_perf_cri).
*
* <------------><------------------->
* | | |
* | | 1000
* | 1000 - big_core_ratio (i.e., little_core_ratio)
* 0
*/
little_core_ratio = 1000 - cur_big_core_ratio;
if (little_core_ratio < 500) {
/*
* min_perf_cri
* | avg_perf_cri
* | | max_perf_cri
* | | |
* <--------><----------------------->
*
* <-///-><-------------------------->
* | | |
* | | 1000
* | little_core_ratio
* 0
*/
delta = stat_cur->avg_perf_cri - stat_cur->min_perf_cri;
diff = (delta * little_core_ratio) / 1000;
thr = diff + stat_cur->min_perf_cri;
}
else {
/*
* min_perf_cri
* | avg_perf_cri
* | | max_perf_cri
* | | |
* <--------><----------------------->
*
* <---------------------><-////////->
* | | |
* | | 1000
* | little_core_ratio
* 0
*/
delta = stat_cur->max_perf_cri - stat_cur->avg_perf_cri;
diff = (delta * cur_big_core_ratio) / 1000;
thr = stat_cur->max_perf_cri - diff;
}
stat_cur->thr_perf_cri = thr;
}
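A standalone worked example of the threshold split above, with assumed numbers (min/avg/max performance criticality of 100/400/900), not part of this file:

#include <stdio.h>

/* Same threshold computation as update_thr_perf_cri(), with plain inputs. */
static unsigned int thr_perf_cri(unsigned int min_pc, unsigned int avg_pc,
				 unsigned int max_pc, unsigned int big_core_ratio)
{
	unsigned int little_core_ratio = 1000 - big_core_ratio;
	unsigned int delta, diff;

	if (little_core_ratio < 500) {
		delta = avg_pc - min_pc;
		diff = (delta * little_core_ratio) / 1000;
		return min_pc + diff;
	}
	delta = max_pc - avg_pc;
	diff = (delta * big_core_ratio) / 1000;
	return max_pc - diff;
}

int main(void)
{
	/* Big cores hold 70% of the capacity (little_core_ratio = 300):
	 * thr = 100 + (400 - 100) * 300 / 1000 = 190. */
	printf("%u\n", thr_perf_cri(100, 400, 900, 700));	/* prints 190 */
	/* Big cores hold only 30% of the capacity (little_core_ratio = 700):
	 * thr = 900 - (900 - 400) * 300 / 1000 = 750. */
	printf("%u\n", thr_perf_cri(100, 400, 900, 300));	/* prints 750 */
	return 0;
}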
static int reinit_active_cpumask_for_performance(void)
{
struct cpu_ctx *cpuc;
struct bpf_cpumask *active, *ovrflw;
int cpu, err = 0;
barrier();
bpf_rcu_read_lock();
/*
* Prepare cpumasks.
*/
active = active_cpumask;
ovrflw = ovrflw_cpumask;
if (!active || !ovrflw) {
scx_bpf_error("Failed to prepare cpumasks.");
err = -ENOMEM;
goto unlock_out;
}
/*
* Once core compaction becomes off in performance mode,
* reinitialize active/overflow cpumasks to reflect the mode change.
*/
bpf_for(cpu, 0, nr_cpu_ids) {
cpuc = get_cpu_ctx_id(cpu);
if (!cpuc) {
scx_bpf_error("Failed to lookup cpu_ctx: %d", cpu);
err = -ESRCH;
goto unlock_out;
}
if (cpuc->big_core)
bpf_cpumask_set_cpu(cpu, active);
else
bpf_cpumask_set_cpu(cpu, ovrflw);
}
unlock_out:
bpf_rcu_read_unlock();
return err;
}
static int calc_cpuperf_target(struct sys_stat *stat_cur,
struct task_ctx *taskc, struct cpu_ctx *cpuc)
{
u64 max_load, cpu_load;
u32 cpuperf_target;
if (!stat_cur || !taskc || !cpuc)
return -EINVAL;
if (no_freq_scaling) {
cpuc->cpuperf_task = SCX_CPUPERF_ONE;
cpuc->cpuperf_avg = SCX_CPUPERF_ONE;
return 0;
}
/*
* We determine the clock frequency of a CPU using two factors: 1) the
* current CPU utilization (cpuc->util) and 2) the current task's
* performance criticality (taskc->perf_cri) compared to the
* system-wide average performance criticality
* (stat_cur->thr_perf_cri).
*
* When the current CPU utilization is 85% and the current task's
* performance criticality is the same as the system-wide average
* criticality, we set the target CPU frequency to the maximum.
*
* In other words, even if CPU utilization is not so high, the target
* CPU frequency could be high when the task's performance criticality
* is high enough (i.e., boosting CPU frequency). On the other hand,
* the target CPU frequency could be low even if CPU utilization is
* high when a non-performance-critical task is running (i.e.,
* deboosting CPU frequency).
*/
max_load = stat_cur->thr_perf_cri * LAVD_CPU_UTIL_MAX_FOR_CPUPERF;
cpu_load = taskc->perf_cri * cpuc->util;
cpuperf_target = (cpu_load * SCX_CPUPERF_ONE) / max_load;
cpuperf_target = min(cpuperf_target, SCX_CPUPERF_ONE);
cpuc->cpuperf_task = cpuperf_target;
cpuc->cpuperf_avg = calc_avg32(cpuc->cpuperf_avg, cpuperf_target);
return 0;
}
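To make the scaling rule concrete, a standalone arithmetic check with assumed inputs, and assuming SCX_CPUPERF_ONE is 1024 (sched_ext's full-performance value): a task right at the criticality threshold on a CPU at 85% utilization maps to the maximum target, while a less critical task on a half-utilized CPU lands proportionally lower.

#include <stdio.h>

#define SCX_CPUPERF_ONE			1024	/* assumed sched_ext full-perf value */
#define LAVD_CPU_UTIL_MAX_FOR_CPUPERF	850	/* 85.0% */

#define min(x, y)	((x) < (y) ? (x) : (y))

/* Same scaling as calc_cpuperf_target(): perf_cri * util vs. thr_perf_cri * 85%. */
static unsigned int cpuperf_target(unsigned long long thr_perf_cri,
				   unsigned long long perf_cri,
				   unsigned long long util /* 0..1000 */)
{
	unsigned long long max_load = thr_perf_cri * LAVD_CPU_UTIL_MAX_FOR_CPUPERF;
	unsigned long long cpu_load = perf_cri * util;
	unsigned long long target = (cpu_load * SCX_CPUPERF_ONE) / max_load;

	return min(target, SCX_CPUPERF_ONE);
}

int main(void)
{
	/* Task at the threshold criticality, CPU at 85% utilization -> maximum. */
	printf("%u\n", cpuperf_target(500, 500, 850));	/* prints 1024 */
	/* Half as critical on a 50%-utilized CPU -> roughly 30% of maximum. */
	printf("%u\n", cpuperf_target(500, 250, 500));	/* prints 301 */
	return 0;
}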
static bool try_increase_cpuperf_target(struct cpu_ctx *cpuc)
{
/*
* When a task becomes running, update the CPU's performance target only
* if the task's target performance is higher. This helps adapt quickly
* to workload changes by rapidly increasing the CPU's performance
* target.
*/
u32 target;
if (!cpuc)
return false;
target = max(cpuc->cpuperf_task, cpuc->cpuperf_avg);
if (cpuc->cpuperf_cur < target) {
cpuc->cpuperf_cur = target;
scx_bpf_cpuperf_set(cpuc->cpu_id, target);
return true;
}
return false;
}
static bool try_decrease_cpuperf_target(struct cpu_ctx *cpuc)
{
/*
* Upon every tick interval, we try to decrease the CPU's performance
* target if the current one is higher than both the current task's
* target and the EWMA of past targets. This helps adapt gradually to
* workload changes upon sudden load drops.
*/
u32 target;
if (!cpuc)
return false;
target = max(cpuc->cpuperf_task, cpuc->cpuperf_avg);
if (cpuc->cpuperf_cur != target) {
cpuc->cpuperf_cur = target;
scx_bpf_cpuperf_set(cpuc->cpu_id, target);
return true;
}
return false;
}
static u16 get_cpuperf_cap(s32 cpu)
{
if (cpu >= 0 && cpu < nr_cpu_ids && cpu < LAVD_CPU_ID_MAX)
return __cpu_capacity_hint[cpu];
debugln("Infeasible CPU id: %d", cpu);
return 0;
}
static u16 get_cputurbo_cap(void)
{
u16 turbo_cap = 0;
int nr_turbo = 0, cpu;
/*
* Find the maximum CPU frequency
*/
for (cpu = 0; cpu < nr_cpu_ids && cpu < LAVD_CPU_ID_MAX; cpu++) {
if (__cpu_capacity_hint[cpu] > turbo_cap) {
turbo_cap = __cpu_capacity_hint[cpu];
nr_turbo++;
}
}
/*
* If all CPU's frequencies are the same, ignore the turbo.
*/
if (nr_turbo <= 1)
turbo_cap = 0;
return turbo_cap;
}
static void init_autopilot_low_util(void)
{
if (nr_cpus_big < nr_cpus_onln) {
/*
* When there are little cores, we move up to the balanced mode
* if one little core is fully utilized.
*/
LAVD_AP_LOW_UTIL = 1000 / nr_cpus_onln;
}
else {
/*
* When there are only big cores, we move up to the balanced
* mode if two big cores are fully utilized.
*/
LAVD_AP_LOW_UTIL = (2 * 1000) / nr_cpus_onln;
}
}
SEC("syscall")
int set_power_profile(struct power_arg *input)
{
return do_set_power_profile(input->power_mode, 0);
}


@@ -0,0 +1,326 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* To be included in main.bpf.c
*/
/*
* Preemption related ones
*/
struct preemption_info {
u64 stopping_tm_est_ns;
u64 last_kick_clk;
u64 lat_cri;
struct cpu_ctx *cpuc;
};
static u64 get_est_stopping_time(struct task_ctx *taskc)
{
return bpf_ktime_get_ns() + taskc->run_time_ns;
}
static int comp_preemption_info(struct preemption_info *prm_a,
struct preemption_info *prm_b)
{
/*
* Check whether one's latency priority _or_ deadline is smaller than
* the other's.
*/
if ((prm_a->lat_cri < prm_b->lat_cri) ||
(prm_a->stopping_tm_est_ns < prm_b->stopping_tm_est_ns))
return -1;
if ((prm_a->lat_cri > prm_b->lat_cri) ||
(prm_a->stopping_tm_est_ns > prm_b->stopping_tm_est_ns))
return 1;
return 0;
}
static bool can_task1_kick_task2(struct preemption_info *prm_task1,
struct preemption_info *prm_task2)
{
return comp_preemption_info(prm_task1, prm_task2) < 0;
}
static bool can_cpu1_kick_cpu2(struct preemption_info *prm_cpu1,
struct preemption_info *prm_cpu2,
struct cpu_ctx *cpuc2)
{
/*
* Set a CPU information
*/
prm_cpu2->stopping_tm_est_ns = cpuc2->stopping_tm_est_ns;
prm_cpu2->lat_cri = cpuc2->lat_cri;
prm_cpu2->cpuc = cpuc2;
prm_cpu2->last_kick_clk = cpuc2->last_kick_clk;
/*
* If that CPU runs a lower priority task, that's a victim
* candidate.
*/
return can_task1_kick_task2(prm_cpu1, prm_cpu2);
}
static bool is_worth_kick_other_task(struct task_ctx *taskc)
{
/*
* The scx_bpf_kick_cpu() used for preemption is expensive as an IPI is
* involved. Hence, we first judiciously check whether it is worth
* trying to victimize another CPU as the current task is urgent
* enough.
*/
struct sys_stat *stat_cur = get_sys_stat_cur();
return (taskc->lat_cri >= stat_cur->thr_lat_cri);
}
static bool can_cpu_be_kicked(u64 now, struct cpu_ctx *cpuc)
{
return cpuc->is_online &&
(now - cpuc->last_kick_clk) >= LAVD_PREEMPT_KICK_MARGIN;
}
static struct cpu_ctx *find_victim_cpu(const struct cpumask *cpumask,
struct task_ctx *taskc,
u64 *p_old_last_kick_clk)
{
/*
* We see preemption as a load-balancing problem. In a system with N
* CPUs, ideally, the top N tasks with the highest latency priorities
* should run on the N CPUs all the time. This is the same as the
* load-balancing problem; the load-balancing problem finds a least
* loaded server, and the preemption problem finds a CPU running a
* least latency critical task. Hence, we use the 'power of two random
* choices' technique.
*/
u64 now = bpf_ktime_get_ns();
struct cpu_ctx *cpuc;
struct preemption_info prm_task, prm_cpus[2], *victim_cpu;
int cpu, nr_cpus;
int i, v = 0, cur_cpu = bpf_get_smp_processor_id();
int ret;
/*
* Get task's preemption information for comparison.
*/
prm_task.stopping_tm_est_ns = get_est_stopping_time(taskc) +
LAVD_PREEMPT_KICK_MARGIN;
prm_task.lat_cri = taskc->lat_cri;
prm_task.cpuc = cpuc = get_cpu_ctx();
if (!cpuc) {
scx_bpf_error("Failed to lookup the current cpu_ctx");
goto null_out;
}
prm_task.last_kick_clk = cpuc->last_kick_clk;
/*
* First, test the current CPU since it can skip the expensive IPI.
*/
if (can_cpu_be_kicked(now, cpuc) &&
bpf_cpumask_test_cpu(cur_cpu, cpumask) &&
can_cpu1_kick_cpu2(&prm_task, &prm_cpus[0], cpuc)) {
victim_cpu = &prm_task;
goto bingo_out;
}
/*
* If the current CPU cannot be a victim, let's check if it is worth to
* try to kick other CPU at the expense of IPI.
*/
if (!is_worth_kick_other_task(taskc))
goto null_out;
/*
* Randomly find _two_ CPUs that run lower-priority tasks than @p. To
* traverse CPUs in a random order, we start from a random CPU ID in a
* random direction (left or right). The random-order traversal helps
* to mitigate the thundering herd problem. Otherwise, all CPUs may end
* up finding the same victim CPU.
*
* In the worst case, the current logic traverses _all_ CPUs. It would
* be too expensive to perform on every task enqueue. We need to revisit
* this if the traversal cost becomes problematic.
*/
barrier();
nr_cpus = bpf_cpumask_weight(cpumask);
bpf_for(i, 0, nr_cpus) {
/*
* Decide a CPU ID to examine.
*/
cpu = bpf_cpumask_any_distribute(cpumask);
if (cpu >= nr_cpu_ids || cur_cpu == cpu)
continue;
/*
* Check whether that CPU is qualified to run @p.
*/
cpuc = get_cpu_ctx_id(cpu);
if (!cpuc) {
scx_bpf_error("Failed to lookup cpu_ctx: %d", cpu);
goto null_out;
}
if (!can_cpu_be_kicked(now, cpuc))
continue;
/*
* If that CPU runs a lower priority task, that's a victim
* candidate.
*/
ret = can_cpu1_kick_cpu2(&prm_task, &prm_cpus[v], cpuc);
if (ret == true && ++v >= 2)
break;
}
/*
* Choose a final victim CPU.
*/
switch(v) {
case 2: /* two candidates */
victim_cpu = can_task1_kick_task2(&prm_cpus[0], &prm_cpus[1]) ?
&prm_cpus[0] : &prm_cpus[1];
goto bingo_out;
case 1: /* one candidate */
victim_cpu = &prm_cpus[0];
goto bingo_out;
case 0: /* no candidate */
goto null_out;
default:/* something wrong */
goto null_out;
}
bingo_out:
taskc->victim_cpu = victim_cpu->cpuc->cpu_id;
*p_old_last_kick_clk = victim_cpu->last_kick_clk;
return victim_cpu->cpuc;
null_out:
taskc->victim_cpu = (s32)LAVD_CPU_ID_NONE;
return NULL;
}
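The "power of two random choices" idea referenced above, shown in isolation as a generic standalone sketch (not the scheduler's code): sampling two random candidates and keeping the better one already avoids the worst picks without scanning every CPU.

#include <stdio.h>
#include <stdlib.h>

/* Generic "power of two random choices": pick two random candidates and
 * keep the better (here: lower-loaded) one.  find_victim_cpu() applies the
 * same idea to CPUs, comparing latency criticality instead of a load value. */
static int pick_two_choices(const unsigned int *load, int n)
{
	int a = rand() % n;
	int b = rand() % n;

	return load[a] <= load[b] ? a : b;
}

int main(void)
{
	unsigned int load[8] = { 90, 10, 70, 30, 80, 20, 60, 40 };
	int i, idx;

	srand(42);
	for (i = 0; i < 4; i++) {
		idx = pick_two_choices(load, 8);
		printf("picked cpu %d (load %u)\n", idx, load[idx]);
	}
	return 0;
}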
static bool kick_cpu(struct cpu_ctx *victim_cpuc, u64 victim_last_kick_clk)
{
/*
* If the current CPU is a victim, we just reset the current task's
* time slice as an optimization. Otherwise, kick the remote CPU for
* preemption.
*
* Kicking the victim CPU does _not_ guarantee that task @p will run on
* that CPU. Enqueuing @p to the global queue is one operation, and
* kicking the victim is another asynchronous operation. However, it is
* okay because, anyway, the victim CPU will run a higher-priority task
* than @p.
*/
if (bpf_get_smp_processor_id() == victim_cpuc->cpu_id) {
struct task_struct *tsk = bpf_get_current_task_btf();
tsk->scx.slice = 0;
return true;
}
/*
* Kick the remote victim CPU if it is not victimized yet by another
* concurrent kick task.
*/
bool ret = __sync_bool_compare_and_swap(&victim_cpuc->last_kick_clk,
victim_last_kick_clk,
bpf_ktime_get_ns());
if (ret)
scx_bpf_kick_cpu(victim_cpuc->cpu_id, SCX_KICK_PREEMPT);
return ret;
}
static bool try_find_and_kick_victim_cpu(struct task_struct *p,
struct task_ctx *taskc,
struct cpu_ctx *cpuc_cur,
u64 dsq_id)
{
struct bpf_cpumask *cd_cpumask, *cpumask;
struct cpdom_ctx *cpdomc;
struct cpu_ctx *victim_cpuc;
u64 victim_last_kick_clk;
bool ret = false;
/*
* Prepare a cpumask so we find a victim within @p's compute domain.
*/
cpumask = cpuc_cur->tmp_t_mask;
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
cd_cpumask = MEMBER_VPTR(cpdom_cpumask, [dsq_id]);
if (!cpdomc || !cd_cpumask || !cpumask)
return false;
bpf_cpumask_and(cpumask, cast_mask(cd_cpumask), p->cpus_ptr);
/*
* Find a victim CPU among CPUs that run lower-priority tasks.
*/
victim_cpuc = find_victim_cpu(cast_mask(cpumask), taskc, &victim_last_kick_clk);
/*
* If a victim CPU is chosen, preempt the victim by kicking it.
*/
if (victim_cpuc)
ret = kick_cpu(victim_cpuc, victim_last_kick_clk);
if (!ret)
taskc->victim_cpu = (s32)LAVD_CPU_ID_NONE;
return ret;
}
static bool try_yield_current_cpu(struct task_struct *p_run,
struct cpu_ctx *cpuc_run,
struct task_ctx *taskc_run)
{
struct task_struct *p_wait;
struct task_ctx *taskc_wait;
struct preemption_info prm_run, prm_wait;
s32 cpu_id = scx_bpf_task_cpu(p_run), wait_vtm_cpu_id;
bool ret = false;
/*
* If there is a higher priority task waiting on the global rq, the
* currently running task yields the CPU by shrinking its time slice to
* zero.
*/
prm_run.stopping_tm_est_ns = taskc_run->last_running_clk +
taskc_run->run_time_ns -
LAVD_PREEMPT_TICK_MARGIN;
prm_run.lat_cri = taskc_run->lat_cri;
bpf_rcu_read_lock();
bpf_for_each(scx_dsq, p_wait, cpuc_run->cpdom_id, 0) {
taskc_wait = get_task_ctx(p_wait);
if (!taskc_wait)
break;
wait_vtm_cpu_id = taskc_wait->victim_cpu;
if (wait_vtm_cpu_id != (s32)LAVD_CPU_ID_NONE)
break;
prm_wait.stopping_tm_est_ns = get_est_stopping_time(taskc_wait);
prm_wait.lat_cri = taskc_wait->lat_cri;
if (can_task1_kick_task2(&prm_wait, &prm_run)) {
/*
* The atomic CAS guarantees that only one task yields its
* CPU for the waiting task.
*/
ret = __sync_bool_compare_and_swap(
&taskc_wait->victim_cpu,
(s32)LAVD_CPU_ID_NONE, cpu_id);
if (ret)
p_run->scx.slice = 0;
}
/*
* Test only the first entry on the DSQ.
*/
break;
}
bpf_rcu_read_unlock();
return ret;
}


@@ -0,0 +1,376 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2023, 2024 Valve Corporation.
* Author: Changwoo Min <changwoo@igalia.com>
*/
/*
* To be included in main.bpf.c
*/
/*
* Timer for updating the system-wide status periodically
*/
struct update_timer {
struct bpf_timer timer;
};
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 1);
__type(key, u32);
__type(value, struct update_timer);
} update_timer SEC(".maps");
struct sys_stat_ctx {
struct sys_stat *stat_cur;
struct sys_stat *stat_next;
u64 now;
u64 duration;
u64 duration_total;
u64 idle_total;
u64 compute_total;
u64 load_actual;
u64 tot_svc_time;
u64 nr_queued_task;
u64 load_run_time_ns;
s32 max_lat_cri;
s32 avg_lat_cri;
u64 sum_lat_cri;
u32 nr_sched;
u32 nr_migration;
u32 nr_preemption;
u32 nr_greedy;
u32 nr_perf_cri;
u32 nr_lat_cri;
u32 nr_big;
u32 nr_pc_on_big;
u32 nr_lc_on_big;
u64 min_perf_cri;
u64 avg_perf_cri;
u64 max_perf_cri;
u64 sum_perf_cri;
u32 thr_perf_cri;
u64 new_util;
u32 nr_violation;
};
static void init_sys_stat_ctx(struct sys_stat_ctx *c)
{
memset(c, 0, sizeof(*c));
c->stat_cur = get_sys_stat_cur();
c->stat_next = get_sys_stat_next();
c->min_perf_cri = 1000;
c->now = bpf_ktime_get_ns();
c->duration = c->now - c->stat_cur->last_update_clk;
c->stat_next->last_update_clk = c->now;
}
static void collect_sys_stat(struct sys_stat_ctx *c)
{
u64 dsq_id;
int cpu, nr;
bpf_for(cpu, 0, nr_cpu_ids) {
struct cpu_ctx *cpuc = get_cpu_ctx_id(cpu);
if (!cpuc) {
c->compute_total = 0;
break;
}
/*
* Accumulate cpus' loads.
*/
c->load_actual += cpuc->load_actual;
c->load_run_time_ns += cpuc->load_run_time_ns;
c->tot_svc_time += cpuc->tot_svc_time;
cpuc->tot_svc_time = 0;
/*
* Accumulate statistics.
*/
if (cpuc->big_core) {
c->nr_big += cpuc->nr_sched;
c->nr_pc_on_big += cpuc->nr_perf_cri;
c->nr_lc_on_big += cpuc->nr_lat_cri;
}
c->nr_perf_cri += cpuc->nr_perf_cri;
cpuc->nr_perf_cri = 0;
c->nr_lat_cri += cpuc->nr_lat_cri;
cpuc->nr_lat_cri = 0;
c->nr_migration += cpuc->nr_migration;
cpuc->nr_migration = 0;
c->nr_preemption += cpuc->nr_preemption;
cpuc->nr_preemption = 0;
c->nr_greedy += cpuc->nr_greedy;
cpuc->nr_greedy = 0;
/*
* Accumulate task's latency criticality information.
*
* While updating cpu->* is racy, the resulting impact on
* accuracy should be small and very rare and thus should be
* fine.
*/
c->sum_lat_cri += cpuc->sum_lat_cri;
cpuc->sum_lat_cri = 0;
c->nr_sched += cpuc->nr_sched;
cpuc->nr_sched = 0;
if (cpuc->max_lat_cri > c->max_lat_cri)
c->max_lat_cri = cpuc->max_lat_cri;
cpuc->max_lat_cri = 0;
/*
* Accumulate task's performance criticality information.
*/
if (cpuc->min_perf_cri < c->min_perf_cri)
c->min_perf_cri = cpuc->min_perf_cri;
cpuc->min_perf_cri = 1000;
if (cpuc->max_perf_cri > c->max_perf_cri)
c->max_perf_cri = cpuc->max_perf_cri;
cpuc->max_perf_cri = 0;
c->sum_perf_cri += cpuc->sum_perf_cri;
cpuc->sum_perf_cri = 0;
/*
* If the CPU is in an idle state (i.e., idle_start_clk is
* non-zero), accumulate the current idle period so far.
*/
for (int i = 0; i < LAVD_MAX_RETRY; i++) {
u64 old_clk = cpuc->idle_start_clk;
if (old_clk == 0)
break;
bool ret = __sync_bool_compare_and_swap(
&cpuc->idle_start_clk, old_clk, c->now);
if (ret) {
cpuc->idle_total += c->now - old_clk;
break;
}
}
/*
* Calculate per-CPU utilization
*/
u64 compute = 0;
if (c->duration > cpuc->idle_total)
compute = c->duration - cpuc->idle_total;
c->new_util = (compute * LAVD_CPU_UTIL_MAX) / c->duration;
cpuc->util = calc_avg(cpuc->util, c->new_util);
if (cpuc->turbo_core) {
if (cpuc->util > LAVD_CC_PER_TURBO_CORE_MAX_CTUIL)
c->nr_violation += 1000;
}
else {
if (cpuc->util > LAVD_CC_PER_CORE_MAX_CTUIL)
c->nr_violation += 1000;
}
/*
* Accumulate system-wide idle time
*/
c->idle_total += cpuc->idle_total;
cpuc->idle_total = 0;
}
bpf_for(dsq_id, 0, LAVD_CPDOM_MAX_NR) {
nr = scx_bpf_dsq_nr_queued(dsq_id);
if (nr > 0)
c->nr_queued_task += nr;
}
}
static void calc_sys_stat(struct sys_stat_ctx *c)
{
c->duration_total = c->duration * nr_cpus_onln;
if (c->duration_total > c->idle_total)
c->compute_total = c->duration_total - c->idle_total;
else
c->compute_total = 0;
c->new_util = (c->compute_total * LAVD_CPU_UTIL_MAX)/c->duration_total;
if (c->nr_sched == 0) {
/*
* When a system is completely idle, it is indeed possible that
* nothing is scheduled for an interval.
*/
c->max_lat_cri = c->stat_cur->max_lat_cri;
c->avg_lat_cri = c->stat_cur->avg_lat_cri;
c->min_perf_cri = c->stat_cur->min_perf_cri;
c->max_perf_cri = c->stat_cur->max_perf_cri;
c->avg_perf_cri = c->stat_cur->avg_perf_cri;
}
else {
c->avg_lat_cri = c->sum_lat_cri / c->nr_sched;
c->avg_perf_cri = c->sum_perf_cri / c->nr_sched;
}
}
static void update_sys_stat_next(struct sys_stat_ctx *c)
{
static int cnt = 0;
u64 avg_svc_time = 0;
/*
* Update the CPU utilization to the next version.
*/
struct sys_stat *stat_cur = c->stat_cur;
struct sys_stat *stat_next = c->stat_next;
stat_next->load_actual =
calc_avg(stat_cur->load_actual, c->load_actual);
stat_next->util =
calc_avg(stat_cur->util, c->new_util);
stat_next->max_lat_cri =
calc_avg32(stat_cur->max_lat_cri, c->max_lat_cri);
stat_next->avg_lat_cri =
calc_avg32(stat_cur->avg_lat_cri, c->avg_lat_cri);
stat_next->thr_lat_cri = stat_next->max_lat_cri -
((stat_next->max_lat_cri - stat_next->avg_lat_cri) >> 1);
stat_next->min_perf_cri =
calc_avg32(stat_cur->min_perf_cri, c->min_perf_cri);
stat_next->avg_perf_cri =
calc_avg32(stat_cur->avg_perf_cri, c->avg_perf_cri);
stat_next->max_perf_cri =
calc_avg32(stat_cur->max_perf_cri, c->max_perf_cri);
stat_next->thr_perf_cri =
c->stat_cur->thr_perf_cri; /* will be updated later */
stat_next->nr_violation =
calc_avg32(stat_cur->nr_violation, c->nr_violation);
if (c->nr_sched > 0)
avg_svc_time = c->tot_svc_time / c->nr_sched;
stat_next->avg_svc_time =
calc_avg(stat_cur->avg_svc_time, avg_svc_time);
stat_next->nr_queued_task =
calc_avg(stat_cur->nr_queued_task, c->nr_queued_task);
/*
* Periodically halve the statistics (every LAVD_SYS_STAT_DECAY_TIMES
* intervals) so that they mostly reflect recent history.
*/
if (cnt++ == LAVD_SYS_STAT_DECAY_TIMES) {
cnt = 0;
stat_next->nr_sched >>= 1;
stat_next->nr_migration >>= 1;
stat_next->nr_preemption >>= 1;
stat_next->nr_greedy >>= 1;
stat_next->nr_perf_cri >>= 1;
stat_next->nr_lat_cri >>= 1;
stat_next->nr_big >>= 1;
stat_next->nr_pc_on_big >>= 1;
stat_next->nr_lc_on_big >>= 1;
__sync_fetch_and_sub(&performance_mode_ns, performance_mode_ns/2);
__sync_fetch_and_sub(&balanced_mode_ns, balanced_mode_ns/2);
__sync_fetch_and_sub(&powersave_mode_ns, powersave_mode_ns/2);
}
stat_next->nr_sched += c->nr_sched;
stat_next->nr_migration += c->nr_migration;
stat_next->nr_preemption += c->nr_preemption;
stat_next->nr_greedy += c->nr_greedy;
stat_next->nr_perf_cri += c->nr_perf_cri;
stat_next->nr_lat_cri += c->nr_lat_cri;
stat_next->nr_big += c->nr_big;
stat_next->nr_pc_on_big += c->nr_pc_on_big;
stat_next->nr_lc_on_big += c->nr_lc_on_big;
update_power_mode_time();
}
static void do_update_sys_stat(void)
{
struct sys_stat_ctx c;
/*
* Collect and prepare the next version of stat.
*/
init_sys_stat_ctx(&c);
collect_sys_stat(&c);
calc_sys_stat(&c);
update_sys_stat_next(&c);
/*
* Make the next version atomically visible.
*/
flip_sys_stat();
}
static void update_sys_stat(void)
{
do_update_sys_stat();
if (is_autopilot_on)
do_autopilot();
if (!no_core_compaction)
do_core_compaction();
update_thr_perf_cri();
if (reinit_cpumask_for_performance) {
reinit_cpumask_for_performance = false;
reinit_active_cpumask_for_performance();
}
}
static int update_timer_cb(void *map, int *key, struct bpf_timer *timer)
{
int err;
update_sys_stat();
err = bpf_timer_start(timer, LAVD_SYS_STAT_INTERVAL_NS, 0);
if (err)
scx_bpf_error("Failed to arm update timer");
return 0;
}
static s32 init_sys_stat(u64 now)
{
struct bpf_timer *timer;
u32 key = 0;
int err;
memset(__sys_stats, 0, sizeof(__sys_stats));
__sys_stats[0].last_update_clk = now;
__sys_stats[1].last_update_clk = now;
__sys_stats[0].nr_active = nr_cpus_big;
__sys_stats[1].nr_active = nr_cpus_big;
timer = bpf_map_lookup_elem(&update_timer, &key);
if (!timer) {
scx_bpf_error("Failed to lookup update timer");
return -ESRCH;
}
bpf_timer_init(timer, &update_timer, CLOCK_BOOTTIME);
bpf_timer_set_callback(timer, update_timer_cb);
err = bpf_timer_start(timer, LAVD_SYS_STAT_INTERVAL_NS, 0);
if (err) {
scx_bpf_error("Failed to arm update timer");
return err;
}
return 0;
}


@@ -0,0 +1,298 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2023, 2024 Valve Corporation.
* Author: Changwoo Min <changwoo@igalia.com>
*/
/*
* To be included in main.bpf.c
*/
/*
* Sched related globals
*/
private(LAVD) struct bpf_cpumask __kptr *turbo_cpumask; /* CPU mask for turbo CPUs */
private(LAVD) struct bpf_cpumask __kptr *big_cpumask; /* CPU mask for big CPUs */
private(LAVD) struct bpf_cpumask __kptr *little_cpumask; /* CPU mask for little CPUs */
private(LAVD) struct bpf_cpumask __kptr *active_cpumask; /* CPU mask for active CPUs */
private(LAVD) struct bpf_cpumask __kptr *ovrflw_cpumask; /* CPU mask for overflow CPUs */
private(LAVD) struct bpf_cpumask cpdom_cpumask[LAVD_CPDOM_MAX_NR]; /* CPU mask for each compute domain */
const volatile u64 nr_cpu_ids; /* maximum CPU IDs */
static volatile u64 nr_cpus_onln; /* current number of online CPUs */
static volatile u64 nr_cpus_big;
struct sys_stat __sys_stats[2];
volatile int __sys_stat_idx;
/*
* Options
*/
volatile bool no_core_compaction;
volatile bool no_freq_scaling;
volatile bool no_prefer_turbo_core;
volatile bool is_powersave_mode;
volatile bool reinit_cpumask_for_performance;
const volatile bool is_autopilot_on;
const volatile u32 is_smt_active;
const volatile u8 verbose;
/*
* Exit information
*/
UEI_DEFINE(uei);
/*
* per-CPU globals
*/
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, u32);
__type(value, struct cpu_ctx);
__uint(max_entries, 1);
} cpu_ctx_stor SEC(".maps");
/*
* Per-task scheduling context
*/
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");
#define debugln(fmt, ...) \
({ \
if (verbose > 0) \
bpf_printk("[%s:%d] " fmt, __func__, __LINE__, \
##__VA_ARGS__); \
})
#define traceln(fmt, ...) \
({ \
if (verbose > 1) \
bpf_printk("[%s:%d] " fmt, __func__, __LINE__, \
##__VA_ARGS__); \
})
#ifndef min
#define min(X, Y) (((X) < (Y)) ? (X) : (Y))
#endif
#ifndef max
#define max(X, Y) (((X) < (Y)) ? (Y) : (X))
#endif
static struct sys_stat *get_sys_stat_cur(void)
{
if (READ_ONCE(__sys_stat_idx) == 0)
return &__sys_stats[0];
return &__sys_stats[1];
}
static struct sys_stat *get_sys_stat_next(void)
{
if (READ_ONCE(__sys_stat_idx) == 0)
return &__sys_stats[1];
return &__sys_stats[0];
}
static void flip_sys_stat(void)
{
WRITE_ONCE(__sys_stat_idx, __sys_stat_idx ^ 0x1);
}
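The three helpers above implement a simple double-buffer (ping-pong) scheme for the statistics; a minimal standalone sketch of the same pattern (using volatile here instead of READ_ONCE/WRITE_ONCE):

#include <stdio.h>

/* Readers use the "cur" slot; the updater fills the "next" slot and then
 * flips the index so the new snapshot becomes visible in one step. */
struct stat_snapshot { unsigned long long util; };

static struct stat_snapshot stats[2];
static volatile int stat_idx;

static struct stat_snapshot *stat_cur(void)  { return &stats[stat_idx]; }
static struct stat_snapshot *stat_next(void) { return &stats[stat_idx ^ 1]; }
static void stat_flip(void)                  { stat_idx ^= 1; }

int main(void)
{
	stat_next()->util = 42;	/* build the next snapshot off to the side */
	stat_flip();		/* publish it */
	printf("util = %llu\n", stat_cur()->util);	/* prints 42 */
	return 0;
}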
static u64 sigmoid_u64(u64 v, u64 max)
{
/*
* An integer approximation of the sigmoid function. It is convenient
* to use the sigmoid function since it has a known upper and lower
* bound, [0, max].
*
* |
* | +------ <= max
* | /
* | /
* |/
* +------------->
*/
return (v > max) ? max : v;
}
static u64 rsigmoid_u64(u64 v, u64 max)
{
/*
* A horizontally flipped version of the sigmoid function. Again, it is
* convenient since the upper and lower bound of the function is known,
* [0, max].
*
*
* |
* |\ <= max
* | \
* | \
* | \
* +----+-------->
*/
return (v >= max) ? 0 : max - v;
}
static struct task_ctx *try_get_task_ctx(struct task_struct *p)
{
return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
}
static struct task_ctx *get_task_ctx(struct task_struct *p)
{
struct task_ctx *taskc;
taskc = try_get_task_ctx(p);
if (!taskc)
scx_bpf_error("task_ctx lookup failed for %s[%d]",
p->comm, p->pid);
return taskc;
}
static struct cpu_ctx *get_cpu_ctx(void)
{
const u32 idx = 0;
struct cpu_ctx *cpuc;
cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &idx);
if (!cpuc)
scx_bpf_error("cpu_ctx lookup failed for current cpu");
return cpuc;
}
static struct cpu_ctx *get_cpu_ctx_id(s32 cpu_id)
{
const u32 idx = 0;
struct cpu_ctx *cpuc;
cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &idx, cpu_id);
if (!cpuc)
scx_bpf_error("cpu_ctx lookup failed for %d", cpu_id);
return cpuc;
}
static u32 calc_avg32(u32 old_val, u32 new_val)
{
/*
* Calculate the exponential weighted moving average (EWMA).
* - EWMA = (0.75 * old) + (0.25 * new)
*/
return (old_val - (old_val >> 2)) + (new_val >> 2);
}
static u64 calc_avg(u64 old_val, u64 new_val)
{
/*
* Calculate the exponential weighted moving average (EWMA).
* - EWMA = (0.75 * old) + (0.25 * new)
*/
return (old_val - (old_val >> 2)) + (new_val >> 2);
}
static u64 calc_avg_freq(u64 old_freq, u64 interval)
{
u64 new_freq, ewma_freq;
/*
* Calculate the exponential weighted moving average (EWMA) of a
* frequency with a new interval measured.
*/
new_freq = LAVD_TIME_ONE_SEC / interval;
ewma_freq = calc_avg(old_freq, new_freq);
return ewma_freq;
}
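A quick standalone check of the shift-based EWMA above: each update moves a quarter of the way toward the new sample, so a step change converges geometrically.

#include <stdio.h>

/* Same shift arithmetic as calc_avg(): EWMA = 0.75 * old + 0.25 * new. */
static unsigned long long ewma(unsigned long long old_val,
			       unsigned long long new_val)
{
	return (old_val - (old_val >> 2)) + (new_val >> 2);
}

int main(void)
{
	unsigned long long v = 0;
	int i;

	/* Feed a constant sample of 1000; the average closes ~25% of the
	 * remaining gap per step: 250, 438, 579, 685, ... */
	for (i = 0; i < 8; i++) {
		v = ewma(v, 1000);
		printf("step %d: %llu\n", i + 1, v);
	}
	return 0;
}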
static bool is_lat_cri(struct task_ctx *taskc, struct sys_stat *stat_cur)
{
return taskc->lat_cri >= stat_cur->avg_lat_cri;
}
static bool is_perf_cri(struct task_ctx *taskc, struct sys_stat *stat_cur)
{
if (READ_ONCE(taskc->on_big) && READ_ONCE(taskc->on_little))
return taskc->perf_cri >= stat_cur->thr_perf_cri;
return READ_ONCE(taskc->on_big);
}
static bool is_greedy(struct task_ctx *taskc)
{
return taskc->greedy_ratio > 1000;
}
static bool is_eligible(struct task_ctx *taskc)
{
return !is_greedy(taskc);
}
static bool have_scheduled(struct task_ctx *taskc)
{
/*
* If the task's time slice hasn't been updated yet, the task has not
* been scheduled by this scheduler.
*/
return taskc->slice_ns != 0;
}
static u16 get_nice_prio(struct task_struct *p)
{
u16 prio = p->static_prio - MAX_RT_PRIO; /* [0, 40) */
return prio;
}
static bool use_full_cpus(void)
{
struct sys_stat *stat_cur = get_sys_stat_cur();
return (stat_cur->nr_active + LAVD_CC_NR_OVRFLW) >= nr_cpus_onln;
}
static u64 pick_any_bit(u64 bitmap, u64 nuance)
{
u64 i, pos;
bpf_for(i, 0, 64) {
pos = (i + nuance) % 64;
if (bitmap & (1LLU << pos))
return pos;
}
return -ENOENT;
}
static void set_on_core_type(struct task_ctx *taskc,
const struct cpumask *cpumask)
{
bool on_big = false, on_little = false;
struct cpu_ctx *cpuc;
int cpu;
bpf_for(cpu, 0, nr_cpu_ids) {
if (!bpf_cpumask_test_cpu(cpu, cpumask))
continue;
cpuc = get_cpu_ctx_id(cpu);
if (!cpuc) {
scx_bpf_error("Failed to look up cpu_ctx: %d", cpu);
return;
}
if (cpuc->big_core)
on_big = true;
else
on_little = true;
if (on_big && on_little)
break;
}
WRITE_ONCE(taskc->on_big, on_big);
WRITE_ONCE(taskc->on_little, on_little);
}


@@ -65,8 +65,6 @@ use stats::StatsReq;
 use stats::StatsRes;
 use stats::SysStats;
-const LAVD_CPU_ID_MAX: usize = bpf_intf::consts_LAVD_CPU_ID_MAX as usize;
 /// scx_lavd: Latency-criticality Aware Virtual Deadline (LAVD) scheduler
 ///
 /// The rust part is minimal. It processes command line options and logs out
@@ -484,7 +482,7 @@ struct Scheduler<'a> {
 impl<'a> Scheduler<'a> {
 fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
-if *NR_CPU_IDS > LAVD_CPU_ID_MAX {
+if *NR_CPU_IDS > LAVD_CPU_ID_MAX as usize {
 panic!(
 "Num possible CPU IDs ({}) exceeds maximum of ({})",
 *NR_CPU_IDS, LAVD_CPU_ID_MAX
@@ -559,15 +557,13 @@ impl<'a> Scheduler<'a> {
 skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].__cpumask[i] |= 0x01 << j;
 }
-const LAVD_CPDOM_MAX_NR: u8 = 32;
-const LAVD_CPDOM_MAX_DIST: usize = 4;
-if v.neighbor_map.borrow().iter().len() > LAVD_CPDOM_MAX_DIST {
+if v.neighbor_map.borrow().iter().len() > LAVD_CPDOM_MAX_DIST as usize {
 panic!("The processor topology is too complex to handle in BPF.");
 }
 for (k, (_d, neighbors)) in v.neighbor_map.borrow().iter().enumerate() {
 let nr_neighbors = neighbors.borrow().len() as u8;
-if nr_neighbors > LAVD_CPDOM_MAX_NR {
+if nr_neighbors > LAVD_CPDOM_MAX_NR as u8 {
 panic!("The processor topology is too complex to handle in BPF.");
 }
 skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].nr_neighbors[k] = nr_neighbors;
@@ -666,12 +662,8 @@ impl<'a> Scheduler<'a> {
 return 100. * x as f64 / y as f64;
 }
-fn get_power_mode(power_mode: s32) -> &'static str {
-const LAVD_PM_PERFORMANCE: s32 = 0;
-const LAVD_PM_BALANCED: s32 = 1;
-const LAVD_PM_POWERSAVE: s32 = 2;
-match power_mode {
+fn get_power_mode(power_mode: i32) -> &'static str {
+match power_mode as u32 {
 LAVD_PM_PERFORMANCE => {
 return &"performance";
 }