mirror of https://github.com/sched-ext/scx.git
synced 2024-11-24 20:00:22 +00:00

scx_lavd: split main.bpf.c into multiple files

As the main.bpf.c file grows, it gets hard to maintain. So, split it into multiple logical files. There is no functional change.

Signed-off-by: Changwoo Min <changwoo@igalia.com>

This commit is contained in:
parent db0f83ce89
commit 7c5c83a3a2
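
Each of the new .bpf.c files below starts with the comment "To be included to the main.bpf.c", i.e. the split is purely textual: the pieces are pulled back into main.bpf.c with #include and still build as a single BPF object. The main.bpf.c diff itself is suppressed further down because of its size, so the following is only a sketch of what such an include block could look like; the file names are the ones added by this commit, while the ordering and exact placement are assumptions.

/* Hypothetical include block in main.bpf.c -- illustration only, not the actual hunk */
#include "lavd.bpf.h"       /* shared constants plus cpdom_ctx / cpu_ctx definitions */
#include "introspec.bpf.c"  /* introspection commands and the message ring buffer    */
#include "sys_stat.bpf.c"   /* periodic system-wide statistics collection            */
#include "power.bpf.c"      /* core compaction, autopilot, CPU frequency scaling     */
#include "preempt.bpf.c"    /* victim-CPU selection and preemption kicks             */
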
@ -47,56 +47,11 @@ extern void bpf_iter_task_destroy(struct bpf_iter_task *it) __weak __ksym;
 /*
  * common constants
  */
-enum consts {
+enum {
-        CLOCK_BOOTTIME = 7,
-        CACHELINE_SIZE = 64,
-
-        NSEC_PER_USEC = 1000ULL,
-        NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC),
-        LAVD_TIME_ONE_SEC = (1000ULL * NSEC_PER_MSEC),
-        LAVD_TIME_INFINITY_NS = SCX_SLICE_INF,
-        LAVD_MAX_RETRY = 4,
-
-        LAVD_TARGETED_LATENCY_NS = (20ULL * NSEC_PER_MSEC),
-        LAVD_SLICE_MIN_NS = (300ULL * NSEC_PER_USEC),   /* min time slice */
-        LAVD_SLICE_MAX_NS = (3ULL * NSEC_PER_MSEC),     /* max time slice */
-        LAVD_SLICE_UNDECIDED = SCX_SLICE_INF,
-
-        LAVD_LC_FREQ_MAX = 1000000,
-        LAVD_LC_RUNTIME_MAX = LAVD_TARGETED_LATENCY_NS,
-        LAVD_LC_RUNTIME_SHIFT = 15,
-        LAVD_LC_WAKEUP_FT = 30,
-        LAVD_LC_KTHREAD_FT = 30,
-
-        LAVD_SLICE_BOOST_MAX_FT = 3,    /* maximum additional 3x of slice */
-        LAVD_SLICE_BOOST_MAX_STEP = 6,  /* 6 slice exhausitions in a row */
-        LAVD_NEW_PROC_PENALITY = 5,
-        LAVD_GREEDY_RATIO_NEW = (1000 * LAVD_NEW_PROC_PENALITY),
-
-        LAVD_CPU_UTIL_MAX = 1000,               /* 100.0% */
-        LAVD_CPU_UTIL_MAX_FOR_CPUPERF = 850,    /* 85.0% */
-        LAVD_CPU_ID_HERE = ((u32)-2),
-        LAVD_CPU_ID_NONE = ((u32)-1),
         LAVD_CPU_ID_MAX = 512,
-
-        LAVD_PREEMPT_KICK_MARGIN = (1ULL * NSEC_PER_MSEC),
-        LAVD_PREEMPT_TICK_MARGIN = (100ULL * NSEC_PER_USEC),
-
-        LAVD_SYS_STAT_INTERVAL_NS = (50ULL * NSEC_PER_MSEC),
-        LAVD_SYS_STAT_DECAY_TIMES = (2ULL * LAVD_TIME_ONE_SEC) / LAVD_SYS_STAT_INTERVAL_NS,
-        LAVD_CC_PER_CORE_MAX_CTUIL = 500,       /* maximum per-core CPU utilization */
-        LAVD_CC_PER_TURBO_CORE_MAX_CTUIL = 750, /* maximum per-core CPU utilization for a turbo core */
-        LAVD_CC_NR_ACTIVE_MIN = 1,              /* num of mininum active cores */
-        LAVD_CC_NR_OVRFLW = 1,                  /* num of overflow cores */
-        LAVD_CC_CPU_PIN_INTERVAL = (1ULL * LAVD_TIME_ONE_SEC),
-        LAVD_CC_CPU_PIN_INTERVAL_DIV = (LAVD_CC_CPU_PIN_INTERVAL /
-                                        LAVD_SYS_STAT_INTERVAL_NS),
-
-        LAVD_AP_HIGH_UTIL = 700,        /* balanced mode when 10% < cpu util <= 40%,
-                                           performance mode when cpu util > 40% */
-
         LAVD_CPDOM_MAX_NR = 32,         /* maximum number of compute domain */
         LAVD_CPDOM_MAX_DIST = 4,        /* maximum distance from one compute domain to another */
-        LAVD_CPDOM_STARV_NS = (5ULL * NSEC_PER_MSEC),
-
         LAVD_STATUS_STR_LEN = 5,        /* {LR: Latency-critical, Regular}
                                            {HI: performance-Hungry, performance-Insensitive}
@ -139,100 +94,6 @@ struct sys_stat {
         volatile u64 nr_lc_on_big;      /* latency-critical tasks scheduled on big core */
 };
 
-/*
- * Compute domain context
- * - system > numa node > llc domain > compute domain per core type (P or E)
- */
-struct cpdom_ctx {
-        u64 id;                                 /* id of this compute domain (== dsq_id) */
-        u64 alt_id;                             /* id of the closest compute domain of alternative type (== dsq id) */
-        u64 last_consume_clk;                   /* when the associated DSQ was consumed */
-        u8 is_big;                              /* is it a big core or little core? */
-        u8 is_active;                           /* if this compute domain is active */
-        u8 nr_neighbors[LAVD_CPDOM_MAX_DIST];   /* number of neighbors per distance */
-        u64 neighbor_bits[LAVD_CPDOM_MAX_DIST]; /* bitmask of neighbor bitmask per distance */
-        u64 __cpumask[LAVD_CPU_ID_MAX/64];      /* cpumasks belongs to this compute domain */
-} __attribute__((aligned(CACHELINE_SIZE)));
-
-/*
- * CPU context
- */
-struct cpu_ctx {
-        /*
-         * Information used to keep track of CPU utilization
-         */
-        volatile u64 util;              /* average of the CPU utilization */
-        volatile u64 idle_total;        /* total idle time so far */
-        volatile u64 idle_start_clk;    /* when the CPU becomes idle */
-
-        /*
-         * Information used to keep track of load
-         */
-        volatile u64 load_actual;       /* actual load of runnable tasks */
-        volatile u64 load_run_time_ns;  /* total runtime of runnable tasks */
-        volatile u64 tot_svc_time;      /* total service time on a CPU */
-        volatile u64 last_kick_clk;     /* when the CPU was kicked */
-
-        /*
-         * Information for cpu hotplug
-         */
-        u64 online_clk;                 /* when a CPU becomes online */
-        u64 offline_clk;                /* when a CPU becomes offline */
-
-        /*
-         * Information used to keep track of latency criticality
-         */
-        volatile u32 max_lat_cri;       /* maximum latency criticality */
-        volatile u32 sum_lat_cri;       /* sum of latency criticality */
-        volatile u32 nr_sched;          /* number of schedules */
-
-        /*
-         * Information used to keep track of performance criticality
-         */
-        volatile u64 sum_perf_cri;      /* sum of performance criticality */
-        volatile u64 min_perf_cri;      /* mininum performance criticality */
-        volatile u64 max_perf_cri;      /* maximum performance criticality */
-
-        /*
-         * Information of a current running task for preemption
-         */
-        volatile u64 stopping_tm_est_ns;        /* estimated stopping time */
-        volatile u16 lat_cri;           /* latency criticality */
-        volatile u8 is_online;          /* is this CPU online? */
-        s32 cpu_id;                     /* cpu id */
-
-        /*
-         * Information for CPU frequency scaling
-         */
-        u32 cpuperf_cur;                /* CPU's current performance target */
-        u32 cpuperf_task;               /* task's CPU performance target */
-        u32 cpuperf_avg;                /* EWMA of task's CPU performance target */
-
-        /*
-         * Fields for core compaction
-         *
-         */
-        u16 capacity;                   /* CPU capacity based on 1000 */
-        u8 big_core;                    /* is it a big core? */
-        u8 turbo_core;                  /* is it a turbo core? */
-        u8 cpdom_id;                    /* compute domain id (== dsq_id) */
-        u8 cpdom_alt_id;                /* compute domain id of anternative type (== dsq_id) */
-        u8 cpdom_poll_pos;              /* index to check if a DSQ of a compute domain is starving */
-        struct bpf_cpumask __kptr *tmp_a_mask;  /* temporary cpu mask */
-        struct bpf_cpumask __kptr *tmp_o_mask;  /* temporary cpu mask */
-        struct bpf_cpumask __kptr *tmp_t_mask;  /* temporary cpu mask */
-        struct bpf_cpumask __kptr *tmp_t2_mask; /* temporary cpu mask */
-
-        /*
-         * Information for statistics.
-         */
-        volatile u32 nr_migration;      /* number of migrations */
-        volatile u32 nr_preemption;     /* number of migrations */
-        volatile u32 nr_greedy;         /* number of greedy tasks scheduled */
-        volatile u32 nr_perf_cri;
-        volatile u32 nr_lat_cri;
-} __attribute__((aligned(CACHELINE_SIZE)));
-
 /*
  * Task context
  */
scheds/rust/scx_lavd/src/bpf/introspec.bpf.c (new file, 116 lines)
@ -0,0 +1,116 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2023, 2024 Valve Corporation.
 * Author: Changwoo Min <changwoo@igalia.com>
 */

/*
 * To be included to the main.bpf.c
 */

/*
 * Introspection commands
 */
struct introspec intrspc;

struct {
        __uint(type, BPF_MAP_TYPE_RINGBUF);
        __uint(max_entries, 16 * 1024 /* 16 KB */);
} introspec_msg SEC(".maps");

static __always_inline
int submit_task_ctx(struct task_struct *p, struct task_ctx *taskc, u32 cpu_id)
{
        struct sys_stat *stat_cur = get_sys_stat_cur();
        struct cpu_ctx *cpuc;
        struct msg_task_ctx *m;

        cpuc = get_cpu_ctx_id(cpu_id);
        if (!cpuc)
                return -EINVAL;

        m = bpf_ringbuf_reserve(&introspec_msg, sizeof(*m), 0);
        if (!m)
                return -ENOMEM;

        m->hdr.kind = LAVD_MSG_TASKC;
        m->taskc_x.pid = p->pid;
        memcpy(m->taskc_x.comm, p->comm, TASK_COMM_LEN);
        m->taskc_x.static_prio = get_nice_prio(p);
        m->taskc_x.cpu_util = cpuc->util / 10;
        m->taskc_x.cpu_id = cpu_id;
        m->taskc_x.avg_lat_cri = stat_cur->avg_lat_cri;
        m->taskc_x.thr_perf_cri = stat_cur->thr_perf_cri;
        m->taskc_x.nr_active = stat_cur->nr_active;
        m->taskc_x.cpuperf_cur = cpuc->cpuperf_cur;

        m->taskc_x.stat[0] = is_lat_cri(taskc, stat_cur) ? 'L' : 'R';
        m->taskc_x.stat[1] = is_perf_cri(taskc, stat_cur) ? 'H' : 'I';
        m->taskc_x.stat[2] = cpuc->big_core ? 'B' : 'T';
        m->taskc_x.stat[3] = is_greedy(taskc) ? 'G' : 'E';
        m->taskc_x.stat[4] = taskc->victim_cpu >= 0 ? 'P' : 'N';
        m->taskc_x.stat[5] = '\0';

        memcpy(&m->taskc, taskc, sizeof(m->taskc));

        bpf_ringbuf_submit(m, 0);

        return 0;
}

static void proc_introspec_sched_n(struct task_struct *p,
                                   struct task_ctx *taskc, u32 cpu_id)
{
        u64 cur_nr, prev_nr;
        int i;

        /* introspec_arg is the number of schedules remaining */
        cur_nr = intrspc.arg;

        /*
         * Note that the bounded retry (@LAVD_MAX_RETRY) does *not *guarantee*
         * to decrement introspec_arg. However, it is unlikely to happen. Even
         * if it happens, it is nothing but a matter of delaying a message
         * delivery. That's because other threads will try and succeed the CAS
         * operation eventually. So this is good enough. ;-)
         */
        for (i = 0; cur_nr > 0 && i < LAVD_MAX_RETRY; i++) {
                prev_nr = __sync_val_compare_and_swap(
                                &intrspc.arg, cur_nr, cur_nr - 1);
                /* CAS success: submit a message and done */
                if (prev_nr == cur_nr) {
                        submit_task_ctx(p, taskc, cpu_id);
                        break;
                }
                /* CAS failure: retry */
                cur_nr = prev_nr;
        }
}

static void proc_introspec_pid(struct task_struct *p, struct task_ctx *taskc,
                               u32 cpu_id)
{
        if (p->pid == intrspc.arg)
                submit_task_ctx(p, taskc, cpu_id);
}

static void try_proc_introspec_cmd(struct task_struct *p,
                                   struct task_ctx *taskc, u32 cpu_id)
{
        if (LAVD_CPU_ID_HERE == cpu_id)
                cpu_id = bpf_get_smp_processor_id();

        switch(intrspc.cmd) {
        case LAVD_CMD_SCHED_N:
                proc_introspec_sched_n(p, taskc, cpu_id);
                break;
        case LAVD_CMD_NOP:
                /* do nothing */
                break;
        default:
                scx_bpf_error("Unknown introspec command: %d", intrspc.cmd);
                break;
        }
}
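
The introspec_msg ring buffer above is the channel through which these task snapshots reach userspace. The scx_lavd userspace side lives under scheds/rust/scx_lavd and is written in Rust, so the C/libbpf sketch below is only an illustration of how a BPF_MAP_TYPE_RINGBUF map like this one is typically drained; the map and message names come from this commit, while the function names and the printed fields are assumptions.

#include <stdio.h>
#include <bpf/libbpf.h>

/* Illustrative consumer only; struct msg_task_ctx is defined in the scheduler's interface header. */
static int on_introspec_msg(void *ctx, void *data, size_t size)
{
        /* A real consumer would cast data to struct msg_task_ctx * and print
         * taskc_x.comm, taskc_x.pid, and the "L/R H/I B/T G/E P/N" status string. */
        printf("received %zu-byte introspection message\n", size);
        return 0;
}

static int drain_introspec_msgs(int ringbuf_map_fd)
{
        struct ring_buffer *rb = ring_buffer__new(ringbuf_map_fd, on_introspec_msg, NULL, NULL);

        if (!rb)
                return -1;

        /* keep draining until no message arrives within 100 ms */
        while (ring_buffer__poll(rb, 100 /* ms */) > 0)
                ;

        ring_buffer__free(rb);
        return 0;
}
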
scheds/rust/scx_lavd/src/bpf/lavd.bpf.h (new file, 158 lines)
@ -0,0 +1,158 @@
|
|||||||
|
/* SPDX-License-Identifier: GPL-2.0 */
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2023, 2024 Valve Corporation.
|
||||||
|
* Author: Changwoo Min <changwoo@igalia.com>
|
||||||
|
*/
|
||||||
|
#ifndef __LAVD_H
|
||||||
|
#define __LAVD_H
|
||||||
|
|
||||||
|
/*
|
||||||
|
* common constants
|
||||||
|
*/
|
||||||
|
enum consts_internal {
|
||||||
|
CLOCK_BOOTTIME = 7,
|
||||||
|
CACHELINE_SIZE = 64,
|
||||||
|
|
||||||
|
NSEC_PER_USEC = 1000ULL,
|
||||||
|
NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC),
|
||||||
|
|
||||||
|
LAVD_TIME_ONE_SEC = (1000ULL * NSEC_PER_MSEC),
|
||||||
|
LAVD_TIME_INFINITY_NS = SCX_SLICE_INF,
|
||||||
|
LAVD_MAX_RETRY = 4,
|
||||||
|
|
||||||
|
LAVD_TARGETED_LATENCY_NS = (20ULL * NSEC_PER_MSEC),
|
||||||
|
LAVD_SLICE_MIN_NS = (300ULL * NSEC_PER_USEC), /* min time slice */
|
||||||
|
LAVD_SLICE_MAX_NS = (3ULL * NSEC_PER_MSEC), /* max time slice */
|
||||||
|
LAVD_SLICE_UNDECIDED = SCX_SLICE_INF,
|
||||||
|
|
||||||
|
LAVD_LC_FREQ_MAX = 1000000,
|
||||||
|
LAVD_LC_RUNTIME_MAX = LAVD_TARGETED_LATENCY_NS,
|
||||||
|
LAVD_LC_RUNTIME_SHIFT = 15,
|
||||||
|
LAVD_LC_WAKEUP_FT = 30,
|
||||||
|
LAVD_LC_KTHREAD_FT = 30,
|
||||||
|
|
||||||
|
LAVD_SLICE_BOOST_MAX_FT = 3, /* maximum additional 3x of slice */
|
||||||
|
LAVD_SLICE_BOOST_MAX_STEP = 6, /* 6 slice exhausitions in a row */
|
||||||
|
LAVD_NEW_PROC_PENALITY = 5,
|
||||||
|
LAVD_GREEDY_RATIO_NEW = (1000 * LAVD_NEW_PROC_PENALITY),
|
||||||
|
|
||||||
|
LAVD_CPU_UTIL_MAX = 1000, /* 100.0% */
|
||||||
|
LAVD_CPU_UTIL_MAX_FOR_CPUPERF = 850, /* 85.0% */
|
||||||
|
LAVD_CPU_ID_HERE = ((u32)-2),
|
||||||
|
LAVD_CPU_ID_NONE = ((u32)-1),
|
||||||
|
|
||||||
|
LAVD_PREEMPT_KICK_MARGIN = (1ULL * NSEC_PER_MSEC),
|
||||||
|
LAVD_PREEMPT_TICK_MARGIN = (100ULL * NSEC_PER_USEC),
|
||||||
|
|
||||||
|
LAVD_SYS_STAT_INTERVAL_NS = (50ULL * NSEC_PER_MSEC),
|
||||||
|
LAVD_SYS_STAT_DECAY_TIMES = (2ULL * LAVD_TIME_ONE_SEC) / LAVD_SYS_STAT_INTERVAL_NS,
|
||||||
|
LAVD_CC_PER_CORE_MAX_CTUIL = 500, /* maximum per-core CPU utilization */
|
||||||
|
LAVD_CC_PER_TURBO_CORE_MAX_CTUIL = 750, /* maximum per-core CPU utilization for a turbo core */
|
||||||
|
LAVD_CC_NR_ACTIVE_MIN = 1, /* num of mininum active cores */
|
||||||
|
LAVD_CC_NR_OVRFLW = 1, /* num of overflow cores */
|
||||||
|
LAVD_CC_CPU_PIN_INTERVAL = (1ULL * LAVD_TIME_ONE_SEC),
|
||||||
|
LAVD_CC_CPU_PIN_INTERVAL_DIV = (LAVD_CC_CPU_PIN_INTERVAL /
|
||||||
|
LAVD_SYS_STAT_INTERVAL_NS),
|
||||||
|
|
||||||
|
LAVD_AP_HIGH_UTIL = 700, /* balanced mode when 10% < cpu util <= 40%,
|
||||||
|
performance mode when cpu util > 40% */
|
||||||
|
|
||||||
|
LAVD_CPDOM_STARV_NS = (5ULL * NSEC_PER_MSEC),
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Compute domain context
|
||||||
|
* - system > numa node > llc domain > compute domain per core type (P or E)
|
||||||
|
*/
|
||||||
|
struct cpdom_ctx {
|
||||||
|
u64 id; /* id of this compute domain (== dsq_id) */
|
||||||
|
u64 alt_id; /* id of the closest compute domain of alternative type (== dsq id) */
|
||||||
|
u64 last_consume_clk; /* when the associated DSQ was consumed */
|
||||||
|
u8 is_big; /* is it a big core or little core? */
|
||||||
|
u8 is_active; /* if this compute domain is active */
|
||||||
|
u8 nr_neighbors[LAVD_CPDOM_MAX_DIST]; /* number of neighbors per distance */
|
||||||
|
u64 neighbor_bits[LAVD_CPDOM_MAX_DIST]; /* bitmask of neighbor bitmask per distance */
|
||||||
|
u64 __cpumask[LAVD_CPU_ID_MAX/64]; /* cpumasks belongs to this compute domain */
|
||||||
|
} __attribute__((aligned(CACHELINE_SIZE)));
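
The __cpumask[LAVD_CPU_ID_MAX/64] member above stores the domain's CPU membership as raw 64-bit words (8 words for up to 512 CPUs). The helpers that manipulate it are not part of this header; purely as an assumption, membership operations on such a layout would look like the sketch below (both function names are invented for illustration).

/* Hypothetical helpers, not part of this commit. */
static bool cpdom_test_cpu(const struct cpdom_ctx *cpdomc, s32 cpu)
{
        return cpdomc->__cpumask[cpu / 64] & (1ULL << (cpu % 64));
}

static void cpdom_set_cpu(struct cpdom_ctx *cpdomc, s32 cpu)
{
        cpdomc->__cpumask[cpu / 64] |= 1ULL << (cpu % 64);
}
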
|
||||||
|
|
||||||
|
/*
|
||||||
|
* CPU context
|
||||||
|
*/
|
||||||
|
struct cpu_ctx {
|
||||||
|
/*
|
||||||
|
* Information used to keep track of CPU utilization
|
||||||
|
*/
|
||||||
|
volatile u64 util; /* average of the CPU utilization */
|
||||||
|
volatile u64 idle_total; /* total idle time so far */
|
||||||
|
volatile u64 idle_start_clk; /* when the CPU becomes idle */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Information used to keep track of load
|
||||||
|
*/
|
||||||
|
volatile u64 load_actual; /* actual load of runnable tasks */
|
||||||
|
volatile u64 load_run_time_ns; /* total runtime of runnable tasks */
|
||||||
|
volatile u64 tot_svc_time; /* total service time on a CPU */
|
||||||
|
volatile u64 last_kick_clk; /* when the CPU was kicked */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Information for cpu hotplug
|
||||||
|
*/
|
||||||
|
u64 online_clk; /* when a CPU becomes online */
|
||||||
|
u64 offline_clk; /* when a CPU becomes offline */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Information used to keep track of latency criticality
|
||||||
|
*/
|
||||||
|
volatile u32 max_lat_cri; /* maximum latency criticality */
|
||||||
|
volatile u32 sum_lat_cri; /* sum of latency criticality */
|
||||||
|
volatile u32 nr_sched; /* number of schedules */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Information used to keep track of performance criticality
|
||||||
|
*/
|
||||||
|
volatile u64 sum_perf_cri; /* sum of performance criticality */
|
||||||
|
volatile u64 min_perf_cri; /* mininum performance criticality */
|
||||||
|
volatile u64 max_perf_cri; /* maximum performance criticality */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Information of a current running task for preemption
|
||||||
|
*/
|
||||||
|
volatile u64 stopping_tm_est_ns; /* estimated stopping time */
|
||||||
|
volatile u16 lat_cri; /* latency criticality */
|
||||||
|
volatile u8 is_online; /* is this CPU online? */
|
||||||
|
s32 cpu_id; /* cpu id */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Information for CPU frequency scaling
|
||||||
|
*/
|
||||||
|
u32 cpuperf_cur; /* CPU's current performance target */
|
||||||
|
u32 cpuperf_task; /* task's CPU performance target */
|
||||||
|
u32 cpuperf_avg; /* EWMA of task's CPU performance target */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Fields for core compaction
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
u16 capacity; /* CPU capacity based on 1000 */
|
||||||
|
u8 big_core; /* is it a big core? */
|
||||||
|
u8 turbo_core; /* is it a turbo core? */
|
||||||
|
u8 cpdom_id; /* compute domain id (== dsq_id) */
|
||||||
|
u8 cpdom_alt_id; /* compute domain id of anternative type (== dsq_id) */
|
||||||
|
u8 cpdom_poll_pos; /* index to check if a DSQ of a compute domain is starving */
|
||||||
|
struct bpf_cpumask __kptr *tmp_a_mask; /* temporary cpu mask */
|
||||||
|
struct bpf_cpumask __kptr *tmp_o_mask; /* temporary cpu mask */
|
||||||
|
struct bpf_cpumask __kptr *tmp_t_mask; /* temporary cpu mask */
|
||||||
|
struct bpf_cpumask __kptr *tmp_t2_mask; /* temporary cpu mask */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Information for statistics.
|
||||||
|
*/
|
||||||
|
volatile u32 nr_migration; /* number of migrations */
|
||||||
|
volatile u32 nr_preemption; /* number of migrations */
|
||||||
|
volatile u32 nr_greedy; /* number of greedy tasks scheduled */
|
||||||
|
volatile u32 nr_perf_cri;
|
||||||
|
volatile u32 nr_lat_cri;
|
||||||
|
} __attribute__((aligned(CACHELINE_SIZE)));
|
||||||
|
|
||||||
|
|
||||||
|
#endif /* __LAVD_H */
|
File diff suppressed because it is too large
scheds/rust/scx_lavd/src/bpf/power.bpf.c (new file, 565 lines)
@ -0,0 +1,565 @@
|
|||||||
|
/* SPDX-License-Identifier: GPL-2.0 */
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2023, 2024 Valve Corporation.
|
||||||
|
* Author: Changwoo Min <changwoo@igalia.com>
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* To be included to the main.bpf.c
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* CPU topology
|
||||||
|
*/
|
||||||
|
static u64 LAVD_AP_LOW_UTIL;
|
||||||
|
static bool have_turbo_core;
|
||||||
|
static bool have_little_core;
|
||||||
|
|
||||||
|
const volatile u16 cpu_order_performance[LAVD_CPU_ID_MAX]; /* CPU preference order for performance and balanced mode */
|
||||||
|
const volatile u16 cpu_order_powersave[LAVD_CPU_ID_MAX]; /* CPU preference order for powersave mode */
|
||||||
|
const volatile u16 __cpu_capacity_hint[LAVD_CPU_ID_MAX]; /* CPU capacity based on 1000 */
|
||||||
|
struct cpdom_ctx cpdom_ctxs[LAVD_CPDOM_MAX_NR]; /* contexts for compute domains */
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Big core's compute ratio among currently active cores
|
||||||
|
*/
|
||||||
|
static u32 cur_big_core_ratio;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Big core's compute ratio when all cores are active
|
||||||
|
*/
|
||||||
|
static u32 default_big_core_ratio;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Statistics
|
||||||
|
*/
|
||||||
|
volatile int power_mode;
|
||||||
|
volatile u64 last_power_mode_clk;
|
||||||
|
volatile u64 performance_mode_ns;
|
||||||
|
volatile u64 balanced_mode_ns;
|
||||||
|
volatile u64 powersave_mode_ns;
|
||||||
|
|
||||||
|
static u64 calc_nr_active_cpus(struct sys_stat *stat_cur)
|
||||||
|
{
|
||||||
|
u64 nr_active;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* nr_active = ceil(nr_cpus_onln * cpu_util * per_core_max_util)
|
||||||
|
*/
|
||||||
|
nr_active = (nr_cpus_onln * stat_cur->util * 1000) + 500;
|
||||||
|
nr_active /= (LAVD_CC_PER_CORE_MAX_CTUIL * 1000);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If a few CPUs are particularly busy, boost the active CPUs more.
|
||||||
|
*/
|
||||||
|
nr_active += min(LAVD_CC_NR_OVRFLW, (stat_cur->nr_violation) / 1000);
|
||||||
|
nr_active = max(min(nr_active, nr_cpus_onln),
|
||||||
|
LAVD_CC_NR_ACTIVE_MIN);
|
||||||
|
|
||||||
|
return nr_active;
|
||||||
|
}
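
Worked example with made-up numbers, using the constants from lavd.bpf.h above (LAVD_CC_PER_CORE_MAX_CTUIL = 500, LAVD_CC_NR_OVRFLW = 1): with nr_cpus_onln = 8 and stat_cur->util = 300 (30.0%), nr_active = (8 * 300 * 1000 + 500) / (500 * 1000) = 4; if nr_violation is 2000, the boost adds min(1, 2) = 1 for a total of 5, which the final max(min(...)) then clamps into the [LAVD_CC_NR_ACTIVE_MIN, nr_cpus_onln] range.
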
|
||||||
|
|
||||||
|
static bool clear_cpu_periodically(u32 cpu, struct bpf_cpumask *cpumask)
|
||||||
|
{
|
||||||
|
u32 clear;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the CPU is on, we clear the bit once every four times
|
||||||
|
* (LAVD_CC_CPU_PIN_INTERVAL_DIV). Hence, the bit will be
|
||||||
|
* probabilistically cleared once every 100 msec (4 * 25 msec).
|
||||||
|
*/
|
||||||
|
clear = !(bpf_get_prandom_u32() % LAVD_CC_CPU_PIN_INTERVAL_DIV);
|
||||||
|
if (clear)
|
||||||
|
bpf_cpumask_clear_cpu(cpu, cpumask);
|
||||||
|
|
||||||
|
return clear;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void do_core_compaction(void)
|
||||||
|
{
|
||||||
|
struct sys_stat *stat_cur = get_sys_stat_cur();
|
||||||
|
struct cpu_ctx *cpuc;
|
||||||
|
struct bpf_cpumask *active, *ovrflw;
|
||||||
|
int nr_cpus, nr_active, nr_active_old, cpu, i;
|
||||||
|
u32 sum_capacity = 0, big_capacity = 0;
|
||||||
|
bool clear;
|
||||||
|
const volatile u16 *cpu_order;
|
||||||
|
|
||||||
|
bpf_rcu_read_lock();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Prepare cpumasks.
|
||||||
|
*/
|
||||||
|
active = active_cpumask;
|
||||||
|
ovrflw = ovrflw_cpumask;
|
||||||
|
if (!active || !ovrflw) {
|
||||||
|
scx_bpf_error("Failed to prepare cpumasks.");
|
||||||
|
goto unlock_out;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Decide a cpuorder to use according to its power mode.
|
||||||
|
*/
|
||||||
|
if (is_powersave_mode)
|
||||||
|
cpu_order = cpu_order_powersave;
|
||||||
|
else
|
||||||
|
cpu_order = cpu_order_performance;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Assign active and overflow cores
|
||||||
|
*/
|
||||||
|
nr_active_old = stat_cur->nr_active;
|
||||||
|
nr_active = calc_nr_active_cpus(stat_cur);
|
||||||
|
nr_cpus = nr_active + LAVD_CC_NR_OVRFLW;
|
||||||
|
bpf_for(i, 0, nr_cpu_ids) {
|
||||||
|
if (i >= LAVD_CPU_ID_MAX)
|
||||||
|
break;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Skip offline cpu
|
||||||
|
*/
|
||||||
|
cpu = cpu_order[i];
|
||||||
|
cpuc = get_cpu_ctx_id(cpu);
|
||||||
|
if (!cpuc || !cpuc->is_online) {
|
||||||
|
bpf_cpumask_clear_cpu(cpu, active);
|
||||||
|
bpf_cpumask_clear_cpu(cpu, ovrflw);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Assign an online cpu to active and overflow cpumasks
|
||||||
|
*/
|
||||||
|
if (i < nr_cpus) {
|
||||||
|
if (i < nr_active) {
|
||||||
|
bpf_cpumask_set_cpu(cpu, active);
|
||||||
|
bpf_cpumask_clear_cpu(cpu, ovrflw);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
bpf_cpumask_set_cpu(cpu, ovrflw);
|
||||||
|
bpf_cpumask_clear_cpu(cpu, active);
|
||||||
|
}
|
||||||
|
scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Calculate big capacity ratio among active cores.
|
||||||
|
*/
|
||||||
|
sum_capacity += cpuc->capacity;
|
||||||
|
if (cpuc->big_core)
|
||||||
|
big_capacity += cpuc->capacity;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (i < nr_active_old) {
|
||||||
|
bpf_cpumask_clear_cpu(cpu, active);
|
||||||
|
bpf_cpumask_clear_cpu(cpu, ovrflw);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
/*
|
||||||
|
* This is the case when a CPU belongs to the
|
||||||
|
* overflow set even though that CPU was not an
|
||||||
|
* overflow set initially. This can happen only
|
||||||
|
* when a pinned userspace task ran on this
|
||||||
|
* CPU. In this case, we keep the CPU in an
|
||||||
|
* overflow set since the CPU will be used
|
||||||
|
* anyway for the task. This will promote equal
|
||||||
|
* use of all used CPUs, lowering the energy
|
||||||
|
* consumption by avoiding a few CPUs being
|
||||||
|
* turbo-boosted. Hence, we do not clear the
|
||||||
|
* overflow cpumask here for a while,
|
||||||
|
* approximately for LAVD_CC_CPU_PIN_INTERVAL.
|
||||||
|
*/
|
||||||
|
bpf_cpumask_clear_cpu(cpu, active);
|
||||||
|
clear = clear_cpu_periodically(cpu, ovrflw);
|
||||||
|
if (!clear)
|
||||||
|
scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cur_big_core_ratio = (1000 * big_capacity) / sum_capacity;
|
||||||
|
stat_cur->nr_active = nr_active;
|
||||||
|
|
||||||
|
unlock_out:
|
||||||
|
bpf_rcu_read_unlock();
|
||||||
|
}
|
||||||
|
|
||||||
|
static void update_power_mode_time(void)
|
||||||
|
{
|
||||||
|
u64 now = bpf_ktime_get_ns();
|
||||||
|
u64 delta;
|
||||||
|
|
||||||
|
if (last_power_mode_clk == 0)
|
||||||
|
last_power_mode_clk = now;
|
||||||
|
|
||||||
|
delta = now - last_power_mode_clk;
|
||||||
|
last_power_mode_clk = now;
|
||||||
|
|
||||||
|
switch (power_mode) {
|
||||||
|
case LAVD_PM_PERFORMANCE:
|
||||||
|
__sync_fetch_and_add(&performance_mode_ns, delta);
|
||||||
|
break;
|
||||||
|
case LAVD_PM_BALANCED:
|
||||||
|
__sync_fetch_and_add(&balanced_mode_ns, delta);
|
||||||
|
break;
|
||||||
|
case LAVD_PM_POWERSAVE:
|
||||||
|
__sync_fetch_and_add(&powersave_mode_ns, delta);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static int do_set_power_profile(s32 pm, int util)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Skip setting the mode if already in the same mode.
|
||||||
|
*/
|
||||||
|
if (power_mode == pm)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Update power mode time
|
||||||
|
*/
|
||||||
|
update_power_mode_time();
|
||||||
|
power_mode = pm;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Change the power mode.
|
||||||
|
*/
|
||||||
|
switch (pm) {
|
||||||
|
case LAVD_PM_PERFORMANCE:
|
||||||
|
no_core_compaction = true;
|
||||||
|
no_freq_scaling = true;
|
||||||
|
no_prefer_turbo_core = false;
|
||||||
|
is_powersave_mode = false;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Since the core compaction becomes off, we need to
|
||||||
|
* reinitialize the active and overflow cpumask for performance
|
||||||
|
* mode.
|
||||||
|
*
|
||||||
|
* Note that a verifier in an old kernel does not allow calling
|
||||||
|
* bpf_cpumask_set_cpu(), so we defer the actual update to our
|
||||||
|
* timer handler, update_sys_stat().
|
||||||
|
*/
|
||||||
|
reinit_cpumask_for_performance = true;
|
||||||
|
debugln("Set the scheduler's power profile to performance mode: %d", util);
|
||||||
|
break;
|
||||||
|
case LAVD_PM_BALANCED:
|
||||||
|
no_core_compaction = false;
|
||||||
|
no_freq_scaling = false;
|
||||||
|
no_prefer_turbo_core = false;
|
||||||
|
is_powersave_mode = false;
|
||||||
|
reinit_cpumask_for_performance = false;
|
||||||
|
debugln("Set the scheduler's power profile to balanced mode: %d", util);
|
||||||
|
break;
|
||||||
|
case LAVD_PM_POWERSAVE:
|
||||||
|
no_core_compaction = false;
|
||||||
|
no_freq_scaling = false;
|
||||||
|
no_prefer_turbo_core = true;
|
||||||
|
is_powersave_mode = true;
|
||||||
|
reinit_cpumask_for_performance = false;
|
||||||
|
debugln("Set the scheduler's power profile to power-save mode: %d", util);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int do_autopilot(void)
|
||||||
|
{
|
||||||
|
struct sys_stat *stat_cur = get_sys_stat_cur();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the CPU utiulization is very low (say <= 5%), it means high
|
||||||
|
* performance is not required. We run the scheduler in powersave mode
|
||||||
|
* to save energy consumption.
|
||||||
|
*/
|
||||||
|
if (stat_cur->util <= LAVD_AP_LOW_UTIL)
|
||||||
|
return do_set_power_profile(LAVD_PM_POWERSAVE, stat_cur->util);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the CPU utiulization is moderate (say > 5%, <= 30%), we run the
|
||||||
|
* scheduler in balanced mode. Actually, balanced mode can save energy
|
||||||
|
* consumption only under moderate CPU load.
|
||||||
|
*/
|
||||||
|
if (stat_cur->util <= LAVD_AP_HIGH_UTIL)
|
||||||
|
return do_set_power_profile(LAVD_PM_BALANCED, stat_cur->util);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the CPU utilization is high enough (say > 30%), we run the
|
||||||
|
* scheduler in performance mode. The system indeed needs perrformance
|
||||||
|
* also there is little energy benefit even under balanced mode anyway.
|
||||||
|
*/
|
||||||
|
return do_set_power_profile(LAVD_PM_PERFORMANCE, stat_cur->util);
|
||||||
|
}
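
To make the thresholds concrete (illustrative numbers only): init_autopilot_low_util() further below sets LAVD_AP_LOW_UTIL to 1000 / nr_cpus_onln when little cores are present and to (2 * 1000) / nr_cpus_onln otherwise. So on an 8-CPU machine with only big cores, utilization up to 250 (25.0% of LAVD_CPU_UTIL_MAX) selects powersave, utilization up to LAVD_AP_HIGH_UTIL = 700 selects balanced, and anything above that selects performance.
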
|
||||||
|
|
||||||
|
static void update_thr_perf_cri(void)
|
||||||
|
{
|
||||||
|
struct sys_stat *stat_cur = get_sys_stat_cur();
|
||||||
|
u32 little_core_ratio, delta, diff, thr;
|
||||||
|
|
||||||
|
if (no_core_compaction || !have_little_core)
|
||||||
|
cur_big_core_ratio = default_big_core_ratio;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If all active cores are big, all tasks should run on the big cores.
|
||||||
|
*/
|
||||||
|
if (cur_big_core_ratio == 1000) {
|
||||||
|
stat_cur->thr_perf_cri = 0;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We approximate the distribution of performance criticality of tasks
|
||||||
|
* using min, avg, and max performance criticality of a given period.
|
||||||
|
*
|
||||||
|
* min_perf_cri
|
||||||
|
* | avg_perf_cri
|
||||||
|
* | | max_perf_cri
|
||||||
|
* | | |
|
||||||
|
* <--------><----------------------->
|
||||||
|
*
|
||||||
|
* The half of compute capacity should be assigned to the below average
|
||||||
|
* tasks (< avg_perf_cri), and the other half should assigned to the
|
||||||
|
* above average tasks (>= avg_perf_cri).
|
||||||
|
*
|
||||||
|
* <------------><------------------->
|
||||||
|
* | | |
|
||||||
|
* | | 1000
|
||||||
|
* | 1000 - big_core_ratio (i.e., little_core_ratio)
|
||||||
|
* 0
|
||||||
|
*/
|
||||||
|
little_core_ratio = 1000 - cur_big_core_ratio;
|
||||||
|
if (little_core_ratio < 500) {
|
||||||
|
/*
|
||||||
|
* min_perf_cri
|
||||||
|
* | avg_perf_cri
|
||||||
|
* | | max_perf_cri
|
||||||
|
* | | |
|
||||||
|
* <--------><----------------------->
|
||||||
|
*
|
||||||
|
* <-///-><-------------------------->
|
||||||
|
* | | |
|
||||||
|
* | | 1000
|
||||||
|
* | little_core_ratio
|
||||||
|
* 0
|
||||||
|
*/
|
||||||
|
delta = stat_cur->avg_perf_cri - stat_cur->min_perf_cri;
|
||||||
|
diff = (delta * little_core_ratio) / 1000;
|
||||||
|
thr = diff + stat_cur->min_perf_cri;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
/*
|
||||||
|
* min_perf_cri
|
||||||
|
* | avg_perf_cri
|
||||||
|
* | | max_perf_cri
|
||||||
|
* | | |
|
||||||
|
* <--------><----------------------->
|
||||||
|
*
|
||||||
|
* <---------------------><-////////->
|
||||||
|
* | | |
|
||||||
|
* | | 1000
|
||||||
|
* | little_core_ratio
|
||||||
|
* 0
|
||||||
|
*/
|
||||||
|
delta = stat_cur->max_perf_cri - stat_cur->avg_perf_cri;
|
||||||
|
diff = (delta * cur_big_core_ratio) / 1000;
|
||||||
|
thr = stat_cur->max_perf_cri - diff;
|
||||||
|
}
|
||||||
|
|
||||||
|
stat_cur->thr_perf_cri = thr;
|
||||||
|
}
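
A numeric sketch of the interpolation above (all values invented): with min_perf_cri = 100, avg_perf_cri = 400, max_perf_cri = 900 and cur_big_core_ratio = 800, little_core_ratio = 200 < 500, so delta = 400 - 100 = 300, diff = 300 * 200 / 1000 = 60 and thr = 160; tasks whose perf_cri falls below 160 are treated as a fit for the little cores. With little_core_ratio = 700 instead, the else branch gives delta = 900 - 400 = 500, diff = 500 * 300 / 1000 = 150 and thr = 900 - 150 = 750.
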
|
||||||
|
|
||||||
|
static int reinit_active_cpumask_for_performance(void)
|
||||||
|
{
|
||||||
|
struct cpu_ctx *cpuc;
|
||||||
|
struct bpf_cpumask *active, *ovrflw;
|
||||||
|
int cpu, err = 0;
|
||||||
|
|
||||||
|
barrier();
|
||||||
|
bpf_rcu_read_lock();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Prepare cpumasks.
|
||||||
|
*/
|
||||||
|
active = active_cpumask;
|
||||||
|
ovrflw = ovrflw_cpumask;
|
||||||
|
if (!active || !ovrflw) {
|
||||||
|
scx_bpf_error("Failed to prepare cpumasks.");
|
||||||
|
err = -ENOMEM;
|
||||||
|
goto unlock_out;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Once core compaction becomes off in performance mode,
|
||||||
|
* reinitialize active/overflow cpumasks to reflect the mode change.
|
||||||
|
*/
|
||||||
|
bpf_for(cpu, 0, nr_cpu_ids) {
|
||||||
|
cpuc = get_cpu_ctx_id(cpu);
|
||||||
|
if (!cpuc) {
|
||||||
|
scx_bpf_error("Failed to lookup cpu_ctx: %d", cpu);
|
||||||
|
err = -ESRCH;
|
||||||
|
goto unlock_out;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cpuc->big_core)
|
||||||
|
bpf_cpumask_set_cpu(cpu, active);
|
||||||
|
else
|
||||||
|
bpf_cpumask_set_cpu(cpu, ovrflw);
|
||||||
|
}
|
||||||
|
|
||||||
|
unlock_out:
|
||||||
|
bpf_rcu_read_unlock();
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int calc_cpuperf_target(struct sys_stat *stat_cur,
|
||||||
|
struct task_ctx *taskc, struct cpu_ctx *cpuc)
|
||||||
|
{
|
||||||
|
u64 max_load, cpu_load;
|
||||||
|
u32 cpuperf_target;
|
||||||
|
|
||||||
|
if (!stat_cur || !taskc || !cpuc)
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
if (no_freq_scaling) {
|
||||||
|
cpuc->cpuperf_task = SCX_CPUPERF_ONE;
|
||||||
|
cpuc->cpuperf_avg = SCX_CPUPERF_ONE;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We determine the clock frequency of a CPU using two factors: 1) the
|
||||||
|
* current CPU utilization (cpuc->util) and 2) the current task's
|
||||||
|
* performance criticality (taskc->perf_cri) compared to the
|
||||||
|
* system-wide average performance criticality
|
||||||
|
* (stat_cur->thr_perf_cri).
|
||||||
|
*
|
||||||
|
* When a current CPU utilization is 85% and the current task's
|
||||||
|
* performance criticality is the same as the system-wide average
|
||||||
|
* criticality, we set the target CPU frequency to the maximum.
|
||||||
|
*
|
||||||
|
* In other words, even if CPU utilization is not so high, the target
|
||||||
|
* CPU frequency could be high when the task's performance criticality
|
||||||
|
* is high enough (i.e., boosting CPU frequency). On the other hand,
|
||||||
|
* the target CPU frequency could be low even if CPU utilization is
|
||||||
|
* high when a non-performance-critical task is running (i.e.,
|
||||||
|
* deboosting CPU frequency).
|
||||||
|
*/
|
||||||
|
max_load = stat_cur->thr_perf_cri * LAVD_CPU_UTIL_MAX_FOR_CPUPERF;
|
||||||
|
cpu_load = taskc->perf_cri * cpuc->util;
|
||||||
|
cpuperf_target = (cpu_load * SCX_CPUPERF_ONE) / max_load;
|
||||||
|
cpuperf_target = min(cpuperf_target, SCX_CPUPERF_ONE);
|
||||||
|
|
||||||
|
cpuc->cpuperf_task = cpuperf_target;
|
||||||
|
cpuc->cpuperf_avg = calc_avg32(cpuc->cpuperf_avg, cpuperf_target);
|
||||||
|
return 0;
|
||||||
|
}
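
Put as one formula, the target above is cpuperf_target = min(SCX_CPUPERF_ONE, SCX_CPUPERF_ONE * (taskc->perf_cri * cpuc->util) / (stat_cur->thr_perf_cri * 850)). For example, a task whose perf_cri equals the threshold running on a CPU at util = 850 gets exactly SCX_CPUPERF_ONE, while the same task on a CPU at util = 425 gets half of it; anything that would exceed the maximum is clamped by the min().
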
|
||||||
|
|
||||||
|
static bool try_increase_cpuperf_target(struct cpu_ctx *cpuc)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* When a task becomes running, update CPU's performance target only
|
||||||
|
* when the current task's target performance is higher. This helps
|
||||||
|
* rapidly adopt workload changes by rapidly increasing CPU's
|
||||||
|
* performance target.
|
||||||
|
*/
|
||||||
|
u32 target;
|
||||||
|
|
||||||
|
if (!cpuc)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
target = max(cpuc->cpuperf_task, cpuc->cpuperf_avg);
|
||||||
|
if (cpuc->cpuperf_cur < target) {
|
||||||
|
cpuc->cpuperf_cur = target;
|
||||||
|
scx_bpf_cpuperf_set(cpuc->cpu_id, target);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool try_decrease_cpuperf_target(struct cpu_ctx *cpuc)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Upon every tick interval, we try to decrease the CPU's performance
|
||||||
|
* target if the current one is higher than both the current task's
|
||||||
|
* target and EWMA of past targets. This helps gradually adopt workload
|
||||||
|
* changes upon sudden down falls.
|
||||||
|
*/
|
||||||
|
u32 target;
|
||||||
|
|
||||||
|
if (!cpuc)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
target = max(cpuc->cpuperf_task, cpuc->cpuperf_avg);
|
||||||
|
if (cpuc->cpuperf_cur != target) {
|
||||||
|
cpuc->cpuperf_cur = target;
|
||||||
|
scx_bpf_cpuperf_set(cpuc->cpu_id, target);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static u16 get_cpuperf_cap(s32 cpu)
|
||||||
|
{
|
||||||
|
if (cpu >= 0 && cpu < nr_cpu_ids && cpu < LAVD_CPU_ID_MAX)
|
||||||
|
return __cpu_capacity_hint[cpu];
|
||||||
|
|
||||||
|
debugln("Infeasible CPU id: %d", cpu);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static u16 get_cputurbo_cap(void)
|
||||||
|
{
|
||||||
|
u16 turbo_cap = 0;
|
||||||
|
int nr_turbo = 0, cpu;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Find the maximum CPU frequency
|
||||||
|
*/
|
||||||
|
for (cpu = 0; cpu < nr_cpu_ids && cpu < LAVD_CPU_ID_MAX; cpu++) {
|
||||||
|
if (__cpu_capacity_hint[cpu] > turbo_cap) {
|
||||||
|
turbo_cap = __cpu_capacity_hint[cpu];
|
||||||
|
nr_turbo++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If all CPU's frequencies are the same, ignore the turbo.
|
||||||
|
*/
|
||||||
|
if (nr_turbo <= 1)
|
||||||
|
turbo_cap = 0;
|
||||||
|
|
||||||
|
return turbo_cap;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void init_autopilot_low_util(void)
|
||||||
|
{
|
||||||
|
if (nr_cpus_big < nr_cpus_onln) {
|
||||||
|
/*
|
||||||
|
* When there are little cores, we move up to the balanced mode
|
||||||
|
* if one little core is fully utilized.
|
||||||
|
*/
|
||||||
|
LAVD_AP_LOW_UTIL = 1000 / nr_cpus_onln;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
/*
|
||||||
|
* When there are only big cores, we move up to the balanced
|
||||||
|
* mode if two big cores are fully utilized.
|
||||||
|
*/
|
||||||
|
LAVD_AP_LOW_UTIL = (2 * 1000) / nr_cpus_onln;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
SEC("syscall")
|
||||||
|
int set_power_profile(struct power_arg *input)
|
||||||
|
{
|
||||||
|
return do_set_power_profile(input->power_mode, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
scheds/rust/scx_lavd/src/bpf/preempt.bpf.c (new file, 326 lines)
@ -0,0 +1,326 @@
|
|||||||
|
/* SPDX-License-Identifier: GPL-2.0 */
|
||||||
|
/*
|
||||||
|
* To be included to the main.bpf.c
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Preemption related ones
|
||||||
|
*/
|
||||||
|
struct preemption_info {
|
||||||
|
u64 stopping_tm_est_ns;
|
||||||
|
u64 last_kick_clk;
|
||||||
|
u64 lat_cri;
|
||||||
|
struct cpu_ctx *cpuc;
|
||||||
|
};
|
||||||
|
|
||||||
|
static u64 get_est_stopping_time(struct task_ctx *taskc)
|
||||||
|
{
|
||||||
|
return bpf_ktime_get_ns() + taskc->run_time_ns;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int comp_preemption_info(struct preemption_info *prm_a,
|
||||||
|
struct preemption_info *prm_b)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Check if one's latency priority _or_ deadline is smaller or not.
|
||||||
|
*/
|
||||||
|
if ((prm_a->lat_cri < prm_b->lat_cri) ||
|
||||||
|
(prm_a->stopping_tm_est_ns < prm_b->stopping_tm_est_ns))
|
||||||
|
return -1;
|
||||||
|
if ((prm_a->lat_cri > prm_b->lat_cri) ||
|
||||||
|
(prm_a->stopping_tm_est_ns > prm_b->stopping_tm_est_ns))
|
||||||
|
return 1;
|
||||||
|
return 0;
|
||||||
|
}
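
Concretely (invented numbers): with prm_a = {lat_cri 150, stopping_tm_est_ns 3 ms} and prm_b = {lat_cri 100, stopping_tm_est_ns 5 ms}, the first condition (150 < 100) is false but the second (3 ms < 5 ms) is true, so comp_preemption_info() returns -1; the helpers below only ever test this result with < 0, via can_task1_kick_task2().
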
|
||||||
|
|
||||||
|
static bool can_task1_kick_task2(struct preemption_info *prm_task1,
|
||||||
|
struct preemption_info *prm_task2)
|
||||||
|
{
|
||||||
|
return comp_preemption_info(prm_task1, prm_task2) < 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool can_cpu1_kick_cpu2(struct preemption_info *prm_cpu1,
|
||||||
|
struct preemption_info *prm_cpu2,
|
||||||
|
struct cpu_ctx *cpuc2)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Set a CPU information
|
||||||
|
*/
|
||||||
|
prm_cpu2->stopping_tm_est_ns = cpuc2->stopping_tm_est_ns;
|
||||||
|
prm_cpu2->lat_cri = cpuc2->lat_cri;
|
||||||
|
prm_cpu2->cpuc = cpuc2;
|
||||||
|
prm_cpu2->last_kick_clk = cpuc2->last_kick_clk;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If that CPU runs a lower priority task, that's a victim
|
||||||
|
* candidate.
|
||||||
|
*/
|
||||||
|
return can_task1_kick_task2(prm_cpu1, prm_cpu2);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool is_worth_kick_other_task(struct task_ctx *taskc)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* The scx_bpf_kick_cpu() used for preemption is expensive as an IPI is
|
||||||
|
* involved. Hence, we first judiciously check whether it is worth
|
||||||
|
* trying to victimize another CPU as the current task is urgent
|
||||||
|
* enough.
|
||||||
|
*/
|
||||||
|
struct sys_stat *stat_cur = get_sys_stat_cur();
|
||||||
|
|
||||||
|
return (taskc->lat_cri >= stat_cur->thr_lat_cri);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool can_cpu_be_kicked(u64 now, struct cpu_ctx *cpuc)
|
||||||
|
{
|
||||||
|
return cpuc->is_online &&
|
||||||
|
(now - cpuc->last_kick_clk) >= LAVD_PREEMPT_KICK_MARGIN;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct cpu_ctx *find_victim_cpu(const struct cpumask *cpumask,
|
||||||
|
struct task_ctx *taskc,
|
||||||
|
u64 *p_old_last_kick_clk)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* We see preemption as a load-balancing problem. In a system with N
|
||||||
|
* CPUs, ideally, the top N tasks with the highest latency priorities
|
||||||
|
* should run on the N CPUs all the time. This is the same as the
|
||||||
|
* load-balancing problem; the load-balancing problem finds a least
|
||||||
|
* loaded server, and the preemption problem finds a CPU running a
|
||||||
|
* least latency critical task. Hence, we use the 'power of two random
|
||||||
|
* choices' technique.
|
||||||
|
*/
|
||||||
|
u64 now = bpf_ktime_get_ns();
|
||||||
|
struct cpu_ctx *cpuc;
|
||||||
|
struct preemption_info prm_task, prm_cpus[2], *victim_cpu;
|
||||||
|
int cpu, nr_cpus;
|
||||||
|
int i, v = 0, cur_cpu = bpf_get_smp_processor_id();
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Get task's preemption information for comparison.
|
||||||
|
*/
|
||||||
|
prm_task.stopping_tm_est_ns = get_est_stopping_time(taskc) +
|
||||||
|
LAVD_PREEMPT_KICK_MARGIN;
|
||||||
|
prm_task.lat_cri = taskc->lat_cri;
|
||||||
|
prm_task.cpuc = cpuc = get_cpu_ctx();
|
||||||
|
if (!cpuc) {
|
||||||
|
scx_bpf_error("Failed to lookup the current cpu_ctx");
|
||||||
|
goto null_out;
|
||||||
|
}
|
||||||
|
prm_task.last_kick_clk = cpuc->last_kick_clk;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* First, test the current CPU since it can skip the expensive IPI.
|
||||||
|
*/
|
||||||
|
if (can_cpu_be_kicked(now, cpuc) &&
|
||||||
|
bpf_cpumask_test_cpu(cur_cpu, cpumask) &&
|
||||||
|
can_cpu1_kick_cpu2(&prm_task, &prm_cpus[0], cpuc)) {
|
||||||
|
victim_cpu = &prm_task;
|
||||||
|
goto bingo_out;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the current CPU cannot be a victim, let's check if it is worth to
|
||||||
|
* try to kick other CPU at the expense of IPI.
|
||||||
|
*/
|
||||||
|
if (!is_worth_kick_other_task(taskc))
|
||||||
|
goto null_out;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Randomly find _two_ CPUs that run lower-priority tasks than @p. To
|
||||||
|
* traverse CPUs in a random order, we start from a random CPU ID in a
|
||||||
|
* random direction (left or right). The random-order traversal helps
|
||||||
|
* to mitigate the thundering herd problem. Otherwise, all CPUs may end
|
||||||
|
* up finding the same victim CPU.
|
||||||
|
*
|
||||||
|
* In the worst case, the current logic traverses _all_ CPUs. It would
|
||||||
|
* be too expensive to perform every task queue. We need to revisit
|
||||||
|
* this if the traversal cost becomes problematic.
|
||||||
|
*/
|
||||||
|
barrier();
|
||||||
|
nr_cpus = bpf_cpumask_weight(cpumask);
|
||||||
|
bpf_for(i, 0, nr_cpus) {
|
||||||
|
/*
|
||||||
|
* Decide a CPU ID to examine.
|
||||||
|
*/
|
||||||
|
cpu = bpf_cpumask_any_distribute(cpumask);
|
||||||
|
|
||||||
|
if (cpu >= nr_cpu_ids || cur_cpu == cpu)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check whether that CPU is qualified to run @p.
|
||||||
|
*/
|
||||||
|
cpuc = get_cpu_ctx_id(cpu);
|
||||||
|
if (!cpuc) {
|
||||||
|
scx_bpf_error("Failed to lookup cpu_ctx: %d", cpu);
|
||||||
|
goto null_out;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!can_cpu_be_kicked(now, cpuc))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If that CPU runs a lower priority task, that's a victim
|
||||||
|
* candidate.
|
||||||
|
*/
|
||||||
|
ret = can_cpu1_kick_cpu2(&prm_task, &prm_cpus[v], cpuc);
|
||||||
|
if (ret == true && ++v >= 2)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Choose a final victim CPU.
|
||||||
|
*/
|
||||||
|
switch(v) {
|
||||||
|
case 2: /* two dandidates */
|
||||||
|
victim_cpu = can_task1_kick_task2(&prm_cpus[0], &prm_cpus[1]) ?
|
||||||
|
&prm_cpus[0] : &prm_cpus[1];
|
||||||
|
goto bingo_out;
|
||||||
|
case 1: /* one candidate */
|
||||||
|
victim_cpu = &prm_cpus[0];
|
||||||
|
goto bingo_out;
|
||||||
|
case 0: /* no candidate */
|
||||||
|
goto null_out;
|
||||||
|
default:/* something wrong */
|
||||||
|
goto null_out;
|
||||||
|
}
|
||||||
|
|
||||||
|
bingo_out:
|
||||||
|
taskc->victim_cpu = victim_cpu->cpuc->cpu_id;
|
||||||
|
*p_old_last_kick_clk = victim_cpu->last_kick_clk;
|
||||||
|
return victim_cpu->cpuc;
|
||||||
|
|
||||||
|
null_out:
|
||||||
|
taskc->victim_cpu = (s32)LAVD_CPU_ID_NONE;
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool kick_cpu(struct cpu_ctx *victim_cpuc, u64 victim_last_kick_clk)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* If the current CPU is a victim, we just reset the current task's
|
||||||
|
* time slice as an optimization. Othewise, kick the remote CPU for
|
||||||
|
* preemption.
|
||||||
|
*
|
||||||
|
* Kicking the victim CPU does _not_ guarantee that task @p will run on
|
||||||
|
* that CPU. Enqueuing @p to the global queue is one operation, and
|
||||||
|
* kicking the victim is another asynchronous operation. However, it is
|
||||||
|
* okay because, anyway, the victim CPU will run a higher-priority task
|
||||||
|
* than @p.
|
||||||
|
*/
|
||||||
|
if (bpf_get_smp_processor_id() == victim_cpuc->cpu_id) {
|
||||||
|
struct task_struct *tsk = bpf_get_current_task_btf();
|
||||||
|
tsk->scx.slice = 0;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Kick the remote victim CPU if it is not victimized yet by another
|
||||||
|
* concurrent kick task.
|
||||||
|
*/
|
||||||
|
bool ret = __sync_bool_compare_and_swap(&victim_cpuc->last_kick_clk,
|
||||||
|
victim_last_kick_clk,
|
||||||
|
bpf_ktime_get_ns());
|
||||||
|
if (ret)
|
||||||
|
scx_bpf_kick_cpu(victim_cpuc->cpu_id, SCX_KICK_PREEMPT);
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool try_find_and_kick_victim_cpu(struct task_struct *p,
|
||||||
|
struct task_ctx *taskc,
|
||||||
|
struct cpu_ctx *cpuc_cur,
|
||||||
|
u64 dsq_id)
|
||||||
|
{
|
||||||
|
struct bpf_cpumask *cd_cpumask, *cpumask;
|
||||||
|
struct cpdom_ctx *cpdomc;
|
||||||
|
struct cpu_ctx *victim_cpuc;
|
||||||
|
u64 victim_last_kick_clk;
|
||||||
|
bool ret = false;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Prepare a cpumak so we find a victim @p's compute domain.
|
||||||
|
*/
|
||||||
|
cpumask = cpuc_cur->tmp_t_mask;
|
||||||
|
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
|
||||||
|
cd_cpumask = MEMBER_VPTR(cpdom_cpumask, [dsq_id]);
|
||||||
|
if (!cpdomc || !cd_cpumask || !cpumask)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
bpf_cpumask_and(cpumask, cast_mask(cd_cpumask), p->cpus_ptr);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Find a victim CPU among CPUs that run lower-priority tasks.
|
||||||
|
*/
|
||||||
|
victim_cpuc = find_victim_cpu(cast_mask(cpumask), taskc, &victim_last_kick_clk);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If a victim CPU is chosen, preempt the victim by kicking it.
|
||||||
|
*/
|
||||||
|
if (victim_cpuc)
|
||||||
|
ret = kick_cpu(victim_cpuc, victim_last_kick_clk);
|
||||||
|
|
||||||
|
if (!ret)
|
||||||
|
taskc->victim_cpu = (s32)LAVD_CPU_ID_NONE;
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool try_yield_current_cpu(struct task_struct *p_run,
|
||||||
|
struct cpu_ctx *cpuc_run,
|
||||||
|
struct task_ctx *taskc_run)
|
||||||
|
{
|
||||||
|
struct task_struct *p_wait;
|
||||||
|
struct task_ctx *taskc_wait;
|
||||||
|
struct preemption_info prm_run, prm_wait;
|
||||||
|
s32 cpu_id = scx_bpf_task_cpu(p_run), wait_vtm_cpu_id;
|
||||||
|
bool ret = false;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If there is a higher priority task waiting on the global rq, the
|
||||||
|
* current running task yield the CPU by shrinking its time slice to
|
||||||
|
* zero.
|
||||||
|
*/
|
||||||
|
prm_run.stopping_tm_est_ns = taskc_run->last_running_clk +
|
||||||
|
taskc_run->run_time_ns -
|
||||||
|
LAVD_PREEMPT_TICK_MARGIN;
|
||||||
|
prm_run.lat_cri = taskc_run->lat_cri;
|
||||||
|
|
||||||
|
bpf_rcu_read_lock();
|
||||||
|
bpf_for_each(scx_dsq, p_wait, cpuc_run->cpdom_id, 0) {
|
||||||
|
taskc_wait = get_task_ctx(p_wait);
|
||||||
|
if (!taskc_wait)
|
||||||
|
break;
|
||||||
|
|
||||||
|
wait_vtm_cpu_id = taskc_wait->victim_cpu;
|
||||||
|
if (wait_vtm_cpu_id != (s32)LAVD_CPU_ID_NONE)
|
||||||
|
break;
|
||||||
|
|
||||||
|
prm_wait.stopping_tm_est_ns = get_est_stopping_time(taskc_wait);
|
||||||
|
prm_wait.lat_cri = taskc_wait->lat_cri;
|
||||||
|
|
||||||
|
if (can_task1_kick_task2(&prm_wait, &prm_run)) {
|
||||||
|
/*
|
||||||
|
* The atomic CAS guarantees only one task yield its
|
||||||
|
* CPU for the waiting task.
|
||||||
|
*/
|
||||||
|
ret = __sync_bool_compare_and_swap(
|
||||||
|
&taskc_wait->victim_cpu,
|
||||||
|
(s32)LAVD_CPU_ID_NONE, cpu_id);
|
||||||
|
if (ret)
|
||||||
|
p_run->scx.slice = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Test only the first entry on the DSQ.
|
||||||
|
*/
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
bpf_rcu_read_unlock();
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
scheds/rust/scx_lavd/src/bpf/sys_stat.bpf.c (new file, 376 lines)
@ -0,0 +1,376 @@
|
|||||||
|
/* SPDX-License-Identifier: GPL-2.0 */
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2023, 2024 Valve Corporation.
|
||||||
|
* Author: Changwoo Min <changwoo@igalia.com>
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* To be included to the main.bpf.c
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Timer for updating system-wide status periorically
|
||||||
|
*/
|
||||||
|
struct update_timer {
|
||||||
|
struct bpf_timer timer;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct {
|
||||||
|
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||||
|
__uint(max_entries, 1);
|
||||||
|
__type(key, u32);
|
||||||
|
__type(value, struct update_timer);
|
||||||
|
} update_timer SEC(".maps");
|
||||||
|
|
||||||
|
struct sys_stat_ctx {
|
||||||
|
struct sys_stat *stat_cur;
|
||||||
|
struct sys_stat *stat_next;
|
||||||
|
u64 now;
|
||||||
|
u64 duration;
|
||||||
|
u64 duration_total;
|
||||||
|
u64 idle_total;
|
||||||
|
u64 compute_total;
|
||||||
|
u64 load_actual;
|
||||||
|
u64 tot_svc_time;
|
||||||
|
u64 nr_queued_task;
|
||||||
|
u64 load_run_time_ns;
|
||||||
|
s32 max_lat_cri;
|
||||||
|
s32 avg_lat_cri;
|
||||||
|
u64 sum_lat_cri;
|
||||||
|
u32 nr_sched;
|
||||||
|
u32 nr_migration;
|
||||||
|
u32 nr_preemption;
|
||||||
|
u32 nr_greedy;
|
||||||
|
u32 nr_perf_cri;
|
||||||
|
u32 nr_lat_cri;
|
||||||
|
u32 nr_big;
|
||||||
|
u32 nr_pc_on_big;
|
||||||
|
u32 nr_lc_on_big;
|
||||||
|
u64 min_perf_cri;
|
||||||
|
u64 avg_perf_cri;
|
||||||
|
u64 max_perf_cri;
|
||||||
|
u64 sum_perf_cri;
|
||||||
|
u32 thr_perf_cri;
|
||||||
|
u64 new_util;
|
||||||
|
u32 nr_violation;
|
||||||
|
};
|
||||||
|
|
||||||
|
static void init_sys_stat_ctx(struct sys_stat_ctx *c)
|
||||||
|
{
|
||||||
|
memset(c, 0, sizeof(*c));
|
||||||
|
|
||||||
|
c->stat_cur = get_sys_stat_cur();
|
||||||
|
c->stat_next = get_sys_stat_next();
|
||||||
|
c->min_perf_cri = 1000;
|
||||||
|
c->now = bpf_ktime_get_ns();
|
||||||
|
c->duration = c->now - c->stat_cur->last_update_clk;
|
||||||
|
c->stat_next->last_update_clk = c->now;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void collect_sys_stat(struct sys_stat_ctx *c)
|
||||||
|
{
|
||||||
|
	u64 dsq_id;
	int cpu, nr;

	bpf_for(cpu, 0, nr_cpu_ids) {
		struct cpu_ctx *cpuc = get_cpu_ctx_id(cpu);
		if (!cpuc) {
			c->compute_total = 0;
			break;
		}

		/*
		 * Accumulate the CPUs' loads.
		 */
		c->load_actual += cpuc->load_actual;
		c->load_run_time_ns += cpuc->load_run_time_ns;
		c->tot_svc_time += cpuc->tot_svc_time;
		cpuc->tot_svc_time = 0;

		/*
		 * Accumulate statistics.
		 */
		if (cpuc->big_core) {
			c->nr_big += cpuc->nr_sched;
			c->nr_pc_on_big += cpuc->nr_perf_cri;
			c->nr_lc_on_big += cpuc->nr_lat_cri;
		}
		c->nr_perf_cri += cpuc->nr_perf_cri;
		cpuc->nr_perf_cri = 0;

		c->nr_lat_cri += cpuc->nr_lat_cri;
		cpuc->nr_lat_cri = 0;

		c->nr_migration += cpuc->nr_migration;
		cpuc->nr_migration = 0;

		c->nr_preemption += cpuc->nr_preemption;
		cpuc->nr_preemption = 0;

		c->nr_greedy += cpuc->nr_greedy;
		cpuc->nr_greedy = 0;

		/*
		 * Accumulate the tasks' latency criticality information.
		 *
		 * While updating cpu->* is racy, the resulting impact on
		 * accuracy should be small and very rare and thus should be
		 * fine.
		 */
		c->sum_lat_cri += cpuc->sum_lat_cri;
		cpuc->sum_lat_cri = 0;

		c->nr_sched += cpuc->nr_sched;
		cpuc->nr_sched = 0;

		if (cpuc->max_lat_cri > c->max_lat_cri)
			c->max_lat_cri = cpuc->max_lat_cri;
		cpuc->max_lat_cri = 0;

		/*
		 * Accumulate the tasks' performance criticality information.
		 */
		if (cpuc->min_perf_cri < c->min_perf_cri)
			c->min_perf_cri = cpuc->min_perf_cri;
		cpuc->min_perf_cri = 1000;

		if (cpuc->max_perf_cri > c->max_perf_cri)
			c->max_perf_cri = cpuc->max_perf_cri;
		cpuc->max_perf_cri = 0;

		c->sum_perf_cri += cpuc->sum_perf_cri;
		cpuc->sum_perf_cri = 0;

		/*
		 * If the CPU is in an idle state (i.e., idle_start_clk is
		 * non-zero), accumulate the current idle period so far.
		 */
		for (int i = 0; i < LAVD_MAX_RETRY; i++) {
			u64 old_clk = cpuc->idle_start_clk;
			if (old_clk == 0)
				break;

			bool ret = __sync_bool_compare_and_swap(
					&cpuc->idle_start_clk, old_clk, c->now);
			if (ret) {
				cpuc->idle_total += c->now - old_clk;
				break;
			}
		}

		/*
		 * Calculate the per-CPU utilization.
		 */
		u64 compute = 0;
		if (c->duration > cpuc->idle_total)
			compute = c->duration - cpuc->idle_total;

		c->new_util = (compute * LAVD_CPU_UTIL_MAX) / c->duration;
		cpuc->util = calc_avg(cpuc->util, c->new_util);

		if (cpuc->turbo_core) {
			if (cpuc->util > LAVD_CC_PER_TURBO_CORE_MAX_CTUIL)
				c->nr_violation += 1000;
		} else {
			if (cpuc->util > LAVD_CC_PER_CORE_MAX_CTUIL)
				c->nr_violation += 1000;
		}

		/*
		 * Accumulate the system-wide idle time.
		 */
		c->idle_total += cpuc->idle_total;
		cpuc->idle_total = 0;
	}

	bpf_for(dsq_id, 0, LAVD_CPDOM_MAX_NR) {
		nr = scx_bpf_dsq_nr_queued(dsq_id);
		if (nr > 0)
			c->nr_queued_task += nr;
	}
}

static void calc_sys_stat(struct sys_stat_ctx *c)
{
	c->duration_total = c->duration * nr_cpus_onln;
	if (c->duration_total > c->idle_total)
		c->compute_total = c->duration_total - c->idle_total;
	else
		c->compute_total = 0;
	c->new_util = (c->compute_total * LAVD_CPU_UTIL_MAX) / c->duration_total;

	if (c->nr_sched == 0) {
		/*
		 * When a system is completely idle, it is indeed possible
		 * that nothing was scheduled for an interval.
		 */
		c->max_lat_cri = c->stat_cur->max_lat_cri;
		c->avg_lat_cri = c->stat_cur->avg_lat_cri;

		c->min_perf_cri = c->stat_cur->min_perf_cri;
		c->max_perf_cri = c->stat_cur->max_perf_cri;
		c->avg_perf_cri = c->stat_cur->avg_perf_cri;
	} else {
		c->avg_lat_cri = c->sum_lat_cri / c->nr_sched;
		c->avg_perf_cri = c->sum_perf_cri / c->nr_sched;
	}
}

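/*
 * A rough worked example of the utilization math above (numbers are purely
 * illustrative, assuming LAVD_CPU_UTIL_MAX scales 100% to 1000): with 8
 * online CPUs and a 50 msec measurement window, duration_total is 400 msec;
 * if the CPUs were idle for 100 msec in total, compute_total is 300 msec and
 * new_util becomes (300 * 1000) / 400 = 750, i.e., 75%.
 */
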
static void update_sys_stat_next(struct sys_stat_ctx *c)
{
	static int cnt = 0;
	u64 avg_svc_time = 0;

	/*
	 * Update the CPU utilization to the next version.
	 */
	struct sys_stat *stat_cur = c->stat_cur;
	struct sys_stat *stat_next = c->stat_next;

	stat_next->load_actual =
		calc_avg(stat_cur->load_actual, c->load_actual);
	stat_next->util =
		calc_avg(stat_cur->util, c->new_util);

	stat_next->max_lat_cri =
		calc_avg32(stat_cur->max_lat_cri, c->max_lat_cri);
	stat_next->avg_lat_cri =
		calc_avg32(stat_cur->avg_lat_cri, c->avg_lat_cri);
	stat_next->thr_lat_cri = stat_next->max_lat_cri -
		((stat_next->max_lat_cri - stat_next->avg_lat_cri) >> 1);

	stat_next->min_perf_cri =
		calc_avg32(stat_cur->min_perf_cri, c->min_perf_cri);
	stat_next->avg_perf_cri =
		calc_avg32(stat_cur->avg_perf_cri, c->avg_perf_cri);
	stat_next->max_perf_cri =
		calc_avg32(stat_cur->max_perf_cri, c->max_perf_cri);
	stat_next->thr_perf_cri =
		c->stat_cur->thr_perf_cri; /* will be updated later */

	stat_next->nr_violation =
		calc_avg32(stat_cur->nr_violation, c->nr_violation);

	if (c->nr_sched > 0)
		avg_svc_time = c->tot_svc_time / c->nr_sched;
	stat_next->avg_svc_time =
		calc_avg(stat_cur->avg_svc_time, avg_svc_time);

	stat_next->nr_queued_task =
		calc_avg(stat_cur->nr_queued_task, c->nr_queued_task);

	/*
	 * Halve the statistics periodically so that they only hold
	 * information from the last few minutes.
	 */
	if (cnt++ == LAVD_SYS_STAT_DECAY_TIMES) {
		cnt = 0;
		stat_next->nr_sched >>= 1;
		stat_next->nr_migration >>= 1;
		stat_next->nr_preemption >>= 1;
		stat_next->nr_greedy >>= 1;
		stat_next->nr_perf_cri >>= 1;
		stat_next->nr_lat_cri >>= 1;
		stat_next->nr_big >>= 1;
		stat_next->nr_pc_on_big >>= 1;
		stat_next->nr_lc_on_big >>= 1;

		__sync_fetch_and_sub(&performance_mode_ns, performance_mode_ns/2);
		__sync_fetch_and_sub(&balanced_mode_ns, balanced_mode_ns/2);
		__sync_fetch_and_sub(&powersave_mode_ns, powersave_mode_ns/2);
	}

	stat_next->nr_sched += c->nr_sched;
	stat_next->nr_migration += c->nr_migration;
	stat_next->nr_preemption += c->nr_preemption;
	stat_next->nr_greedy += c->nr_greedy;
	stat_next->nr_perf_cri += c->nr_perf_cri;
	stat_next->nr_lat_cri += c->nr_lat_cri;
	stat_next->nr_big += c->nr_big;
	stat_next->nr_pc_on_big += c->nr_pc_on_big;
	stat_next->nr_lc_on_big += c->nr_lc_on_big;

	update_power_mode_time();
}

static void do_update_sys_stat(void)
{
	struct sys_stat_ctx c;

	/*
	 * Collect and prepare the next version of stat.
	 */
	init_sys_stat_ctx(&c);
	collect_sys_stat(&c);
	calc_sys_stat(&c);
	update_sys_stat_next(&c);

	/*
	 * Make the next version atomically visible.
	 */
	flip_sys_stat();
}

static void update_sys_stat(void)
{
	do_update_sys_stat();

	if (is_autopilot_on)
		do_autopilot();

	if (!no_core_compaction)
		do_core_compaction();

	update_thr_perf_cri();

	if (reinit_cpumask_for_performance) {
		reinit_cpumask_for_performance = false;
		reinit_active_cpumask_for_performance();
	}
}

static int update_timer_cb(void *map, int *key, struct bpf_timer *timer)
{
	int err;

	update_sys_stat();

	err = bpf_timer_start(timer, LAVD_SYS_STAT_INTERVAL_NS, 0);
	if (err)
		scx_bpf_error("Failed to arm update timer");

	return 0;
}

static s32 init_sys_stat(u64 now)
{
	struct bpf_timer *timer;
	u32 key = 0;
	int err;

	memset(__sys_stats, 0, sizeof(__sys_stats));
	__sys_stats[0].last_update_clk = now;
	__sys_stats[1].last_update_clk = now;
	__sys_stats[0].nr_active = nr_cpus_big;
	__sys_stats[1].nr_active = nr_cpus_big;

	timer = bpf_map_lookup_elem(&update_timer, &key);
	if (!timer) {
		scx_bpf_error("Failed to lookup update timer");
		return -ESRCH;
	}
	bpf_timer_init(timer, &update_timer, CLOCK_BOOTTIME);
	bpf_timer_set_callback(timer, update_timer_cb);
	err = bpf_timer_start(timer, LAVD_SYS_STAT_INTERVAL_NS, 0);
	if (err) {
		scx_bpf_error("Failed to arm update timer");
		return err;
	}

	return 0;
}

298	scheds/rust/scx_lavd/src/bpf/util.bpf.c	Normal file
@ -0,0 +1,298 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2023, 2024 Valve Corporation.
 * Author: Changwoo Min <changwoo@igalia.com>
 */

/*
 * To be included in main.bpf.c
 */

/*
 * Sched related globals
 */
private(LAVD) struct bpf_cpumask __kptr *turbo_cpumask; /* CPU mask for turbo CPUs */
private(LAVD) struct bpf_cpumask __kptr *big_cpumask; /* CPU mask for big CPUs */
private(LAVD) struct bpf_cpumask __kptr *little_cpumask; /* CPU mask for little CPUs */
private(LAVD) struct bpf_cpumask __kptr *active_cpumask; /* CPU mask for active CPUs */
private(LAVD) struct bpf_cpumask __kptr *ovrflw_cpumask; /* CPU mask for overflow CPUs */
private(LAVD) struct bpf_cpumask cpdom_cpumask[LAVD_CPDOM_MAX_NR]; /* CPU mask for each compute domain */

const volatile u64 nr_cpu_ids;		/* maximum CPU IDs */
static volatile u64 nr_cpus_onln;	/* current number of online CPUs */
static volatile u64 nr_cpus_big;

struct sys_stat __sys_stats[2];
volatile int __sys_stat_idx;

/*
 * Options
 */
volatile bool no_core_compaction;
volatile bool no_freq_scaling;
volatile bool no_prefer_turbo_core;
volatile bool is_powersave_mode;
volatile bool reinit_cpumask_for_performance;
const volatile bool is_autopilot_on;
const volatile u32 is_smt_active;
const volatile u8 verbose;

/*
 * Exit information
 */
UEI_DEFINE(uei);

/*
 * per-CPU globals
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct cpu_ctx);
	__uint(max_entries, 1);
} cpu_ctx_stor SEC(".maps");

/*
 * Per-task scheduling context
 */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");


#define debugln(fmt, ...)						\
({									\
	if (verbose > 0)						\
		bpf_printk("[%s:%d] " fmt, __func__, __LINE__,		\
			   ##__VA_ARGS__);				\
})

#define traceln(fmt, ...)						\
({									\
	if (verbose > 1)						\
		bpf_printk("[%s:%d] " fmt, __func__, __LINE__,		\
			   ##__VA_ARGS__);				\
})

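/*
 * Usage sketch (illustrative only): with verbose >= 1, something like
 *	debugln("cpu%d: util=%llu", cpu, cpuc->util);
 * ends up in the kernel trace buffer, prefixed with the enclosing function
 * name and line number.
 */
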
#ifndef min
#define min(X, Y) (((X) < (Y)) ? (X) : (Y))
#endif

#ifndef max
#define max(X, Y) (((X) < (Y)) ? (Y) : (X))
#endif

static struct sys_stat *get_sys_stat_cur(void)
{
	if (READ_ONCE(__sys_stat_idx) == 0)
		return &__sys_stats[0];
	return &__sys_stats[1];
}

static struct sys_stat *get_sys_stat_next(void)
{
	if (READ_ONCE(__sys_stat_idx) == 0)
		return &__sys_stats[1];
	return &__sys_stats[0];
}

static void flip_sys_stat(void)
{
	WRITE_ONCE(__sys_stat_idx, __sys_stat_idx ^ 0x1);
}

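/*
 * __sys_stats[] acts as a two-slot flip/flop buffer: readers always go
 * through get_sys_stat_cur(), while the stat-update path fills in the other
 * slot and then publishes it. A minimal sketch of the intended update cycle
 * (assuming a single updater):
 *
 *	struct sys_stat *next = get_sys_stat_next();
 *	... recompute *next from the per-CPU counters ...
 *	flip_sys_stat();	// readers now observe the new snapshot
 */
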
static u64 sigmoid_u64(u64 v, u64 max)
{
	/*
	 * An integer approximation of the sigmoid function. It is convenient
	 * to use the sigmoid function since it has a known upper and lower
	 * bound, [0, max].
	 *
	 *	|
	 *	|      +------ <= max
	 *	|     /
	 *	|    /
	 *	|   /
	 *	|  /
	 *	+------------->
	 */
	return (v > max) ? max : v;
}

static u64 rsigmoid_u64(u64 v, u64 max)
{
	/*
	 * A horizontally flipped version of the sigmoid function. Again, it
	 * is convenient since the upper and lower bound of the function is
	 * known, [0, max].
	 *
	 *	|
	 *	|\ <= max
	 *	| \
	 *	|  \
	 *	|   \
	 *	+----+-------->
	 */
	return (v >= max) ? 0 : max - v;
}

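/*
 * For instance (illustrative numbers): rsigmoid_u64(300, 1000) == 700 and
 * rsigmoid_u64(1200, 1000) == 0, so the result shrinks linearly as v grows
 * and saturates at 0 once v reaches max.
 */
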
static struct task_ctx *try_get_task_ctx(struct task_struct *p)
{
	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
}

static struct task_ctx *get_task_ctx(struct task_struct *p)
{
	struct task_ctx *taskc;

	taskc = try_get_task_ctx(p);
	if (!taskc)
		scx_bpf_error("task_ctx lookup failed for %s[%d]",
			      p->comm, p->pid);
	return taskc;
}

static struct cpu_ctx *get_cpu_ctx(void)
{
	const u32 idx = 0;
	struct cpu_ctx *cpuc;

	cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &idx);
	if (!cpuc)
		scx_bpf_error("cpu_ctx lookup failed for current cpu");

	return cpuc;
}

static struct cpu_ctx *get_cpu_ctx_id(s32 cpu_id)
{
	const u32 idx = 0;
	struct cpu_ctx *cpuc;

	cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &idx, cpu_id);
	if (!cpuc)
		scx_bpf_error("cpu_ctx lookup failed for %d", cpu_id);

	return cpuc;
}

static u32 calc_avg32(u32 old_val, u32 new_val)
{
	/*
	 * Calculate the exponentially weighted moving average (EWMA):
	 *  - EWMA = (0.75 * old) + (0.25 * new)
	 */
	return (old_val - (old_val >> 2)) + (new_val >> 2);
}

static u64 calc_avg(u64 old_val, u64 new_val)
{
	/*
	 * Calculate the exponentially weighted moving average (EWMA):
	 *  - EWMA = (0.75 * old) + (0.25 * new)
	 */
	return (old_val - (old_val >> 2)) + (new_val >> 2);
}

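/*
 * A quick worked example of the EWMA above (illustrative numbers):
 * calc_avg(800, 400) = (800 - 800/4) + 400/4 = 600 + 100 = 700, so each
 * update moves the average a quarter of the way toward the new sample.
 */
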
static u64 calc_avg_freq(u64 old_freq, u64 interval)
{
	u64 new_freq, ewma_freq;

	/*
	 * Calculate the exponentially weighted moving average (EWMA) of a
	 * frequency with a newly measured interval.
	 */
	new_freq = LAVD_TIME_ONE_SEC / interval;
	ewma_freq = calc_avg(old_freq, new_freq);
	return ewma_freq;
}

static bool is_lat_cri(struct task_ctx *taskc, struct sys_stat *stat_cur)
{
	return taskc->lat_cri >= stat_cur->avg_lat_cri;
}

static bool is_perf_cri(struct task_ctx *taskc, struct sys_stat *stat_cur)
{
	if (READ_ONCE(taskc->on_big) && READ_ONCE(taskc->on_little))
		return taskc->perf_cri >= stat_cur->thr_perf_cri;
	return READ_ONCE(taskc->on_big);
}

static bool is_greedy(struct task_ctx *taskc)
{
	return taskc->greedy_ratio > 1000;
}

static bool is_eligible(struct task_ctx *taskc)
{
	return !is_greedy(taskc);
}

static bool have_scheduled(struct task_ctx *taskc)
{
	/*
	 * If the task's time slice has never been updated (i.e., it is still
	 * zero), the task has not yet been scheduled by this scheduler.
	 */
	return taskc->slice_ns != 0;
}

static u16 get_nice_prio(struct task_struct *p)
{
	u16 prio = p->static_prio - MAX_RT_PRIO; /* [0, 40) */
	return prio;
}

static bool use_full_cpus(void)
{
	struct sys_stat *stat_cur = get_sys_stat_cur();
	return (stat_cur->nr_active + LAVD_CC_NR_OVRFLW) >= nr_cpus_onln;
}

static u64 pick_any_bit(u64 bitmap, u64 nuance)
{
	u64 i, pos;

	bpf_for(i, 0, 64) {
		pos = (i + nuance) % 64;
		if (bitmap & (1LLU << pos))
			return pos;
	}

	return -ENOENT;
}

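/*
 * For example (illustrative): pick_any_bit(0x5, 1) scans bit positions
 * 1, 2, 3, ... and returns 2, the first set bit at or after the rotated
 * starting point, so different nuance values spread the picks across the
 * set bits instead of always returning the lowest one.
 */
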
static void set_on_core_type(struct task_ctx *taskc,
			     const struct cpumask *cpumask)
{
	bool on_big = false, on_little = false;
	struct cpu_ctx *cpuc;
	int cpu;

	bpf_for(cpu, 0, nr_cpu_ids) {
		if (!bpf_cpumask_test_cpu(cpu, cpumask))
			continue;

		cpuc = get_cpu_ctx_id(cpu);
		if (!cpuc) {
			scx_bpf_error("Failed to look up cpu_ctx: %d", cpu);
			return;
		}

		if (cpuc->big_core)
			on_big = true;
		else
			on_little = true;

		if (on_big && on_little)
			break;
	}

	WRITE_ONCE(taskc->on_big, on_big);
	WRITE_ONCE(taskc->on_little, on_little);
}

@ -65,8 +65,6 @@ use stats::StatsReq;
 use stats::StatsRes;
 use stats::SysStats;
 
-const LAVD_CPU_ID_MAX: usize = bpf_intf::consts_LAVD_CPU_ID_MAX as usize;
-
 /// scx_lavd: Latency-criticality Aware Virtual Deadline (LAVD) scheduler
 ///
 /// The rust part is minimal. It processes command line options and logs out
@ -484,7 +482,7 @@ struct Scheduler<'a> {
 
 impl<'a> Scheduler<'a> {
     fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
-        if *NR_CPU_IDS > LAVD_CPU_ID_MAX {
+        if *NR_CPU_IDS > LAVD_CPU_ID_MAX as usize {
             panic!(
                 "Num possible CPU IDs ({}) exceeds maximum of ({})",
                 *NR_CPU_IDS, LAVD_CPU_ID_MAX
@ -559,15 +557,13 @@ impl<'a> Scheduler<'a> {
                     skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].__cpumask[i] |= 0x01 << j;
                 }
 
-            const LAVD_CPDOM_MAX_NR: u8 = 32;
-            const LAVD_CPDOM_MAX_DIST: usize = 4;
-            if v.neighbor_map.borrow().iter().len() > LAVD_CPDOM_MAX_DIST {
+            if v.neighbor_map.borrow().iter().len() > LAVD_CPDOM_MAX_DIST as usize {
                 panic!("The processor topology is too complex to handle in BPF.");
             }
 
             for (k, (_d, neighbors)) in v.neighbor_map.borrow().iter().enumerate() {
                 let nr_neighbors = neighbors.borrow().len() as u8;
-                if nr_neighbors > LAVD_CPDOM_MAX_NR {
+                if nr_neighbors > LAVD_CPDOM_MAX_NR as u8 {
                     panic!("The processor topology is too complex to handle in BPF.");
                 }
                 skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].nr_neighbors[k] = nr_neighbors;
@ -666,12 +662,8 @@ impl<'a> Scheduler<'a> {
         return 100. * x as f64 / y as f64;
     }
 
-    fn get_power_mode(power_mode: s32) -> &'static str {
-        const LAVD_PM_PERFORMANCE: s32 = 0;
-        const LAVD_PM_BALANCED: s32 = 1;
-        const LAVD_PM_POWERSAVE: s32 = 2;
-
-        match power_mode {
+    fn get_power_mode(power_mode: i32) -> &'static str {
+        match power_mode as u32 {
         LAVD_PM_PERFORMANCE => {
             return &"performance";
         }