Mirror of https://github.com/JakeHillion/scx.git (synced 2024-11-26 03:20:24 +00:00)

Merge pull request #655 from sched-ext/bpfland-refine-wake-sync

    scx_bpfland: refine idle CPU selection logic

commit 8656157ee4
@@ -139,11 +139,6 @@ UEI_DEFINE(uei);
  */
 private(BPFLAND) struct bpf_cpumask __kptr *primary_cpumask;
 
-/*
- * Mask of preferred CPUs in the system.
- */
-private(BPFLAND) struct bpf_cpumask __kptr *preferred_cpumask;
-
 /*
  * Mask of offline CPUs, used to properly support CPU hotplugging.
  */
@@ -290,7 +285,7 @@ static bool is_task_interactive(struct task_struct *p)
  */
 static inline bool is_kthread(const struct task_struct *p)
 {
-	return !!(p->flags & PF_KTHREAD);
+	return p->flags & PF_KTHREAD;
 }
 
 /*
@@ -538,11 +533,10 @@ static void handle_sync_wakeup(struct task_struct *p)
  * to handle these mistakes in favor of a more efficient response and a reduced
  * scheduling overhead.
  */
-static s32 pick_idle_cpu(struct task_struct *p,
-			 s32 prev_cpu, u64 wake_flags, bool do_preferred)
+static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags)
 {
 	const struct cpumask *online_cpumask, *idle_smtmask, *idle_cpumask;
-	struct bpf_cpumask *primary, *preferred, *l2_domain, *l3_domain;
+	struct bpf_cpumask *primary, *l2_domain, *l3_domain;
 	struct bpf_cpumask *p_mask, *l2_mask, *l3_mask;
 	struct task_ctx *tctx;
 	struct cpu_ctx *cctx;
@@ -558,9 +552,6 @@ static s32 pick_idle_cpu(struct task_struct *p,
 	primary = primary_cpumask;
 	if (!primary)
 		return -ENOENT;
-	preferred = preferred_cpumask;
-	if (!preferred)
-		return -ENOENT;
 
 	/*
 	 * If the task isn't allowed to use its previously used CPU it means
@@ -574,14 +565,9 @@ static s32 pick_idle_cpu(struct task_struct *p,
 	/*
 	 * For tasks that can run only on a single CPU, we can simply verify if
 	 * their only allowed CPU is still idle.
-	 *
-	 * Moreover, if local_kthreads is enabled, always allow to dispatch
-	 * per-CPU kthreads directly to their target CPU, independently on the
-	 * idle state.
	 */
 	if (p->nr_cpus_allowed == 1) {
-		if ((is_kthread(p) && local_kthreads) ||
-		    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
+		if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
 			return prev_cpu;
 		return -ENOENT;
 	}
@@ -628,18 +614,9 @@ static s32 pick_idle_cpu(struct task_struct *p,
 
 	/*
 	 * Determine the task's scheduling domain.
-	 *
-	 * Try to dispatch on the preferred CPUs first. If we can't find any
-	 * idle CPU, re-try again with the primary scheduling domain.
 	 */
-	if (do_preferred &&
-	    !bpf_cpumask_empty(cast_mask(preferred)) &&
-	    !bpf_cpumask_equal(cast_mask(preferred), cast_mask(primary))) {
-		bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(preferred));
-	} else {
-		bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(primary));
-		do_preferred = false;
-	}
+	bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(primary));
 
 	/*
 	 * Determine the L2 cache domain as the intersection of the task's
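A minimal plain-C sketch of how the candidate mask is now built in a single pass (task affinity intersected with the primary domain, then narrowed by L2/L3 cache locality), assuming 64-bit bitmask cpumasks and example values; this is an illustration, not the bpf_cpumask API used above:

/* Illustrative model: p_mask is the task's allowed CPUs restricted to the
 * primary domain; L2/L3 masks narrow it further, and an empty mask at any
 * level simply means that level is skipped.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t allowed = 0x00ff;	/* p->cpus_ptr */
	uint64_t primary = 0x0f0f;	/* primary scheduling domain */
	uint64_t l2      = 0x0003;	/* CPUs sharing prev_cpu's L2 cache */
	uint64_t l3      = 0x000f;	/* CPUs sharing prev_cpu's L3 cache */

	uint64_t p_mask  = allowed & primary;	/* single pass: no preferred domain */
	uint64_t l2_mask = p_mask & l2;
	uint64_t l3_mask = p_mask & l3;

	printf("p_mask=%#llx l2_mask=%#llx l3_mask=%#llx\n",
	       (unsigned long long)p_mask,
	       (unsigned long long)l2_mask,
	       (unsigned long long)l3_mask);
	return 0;
}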
@@ -661,33 +638,51 @@ static s32 pick_idle_cpu(struct task_struct *p,
 	if (bpf_cpumask_empty(cast_mask(l3_mask)))
 		l3_mask = NULL;
 
-	/*
-	 * Try to prioritize newly awakened tasks by immediately promoting them
-	 * as interactive.
-	 */
 	if (wake_flags & SCX_WAKE_SYNC) {
 		struct task_struct *current = (void *)bpf_get_current_task_btf();
+		bool share_llc, has_idle;
 
+		/*
+		 * Prioritize newly awakened tasks by immediately promoting
+		 * them as interactive.
+		 */
 		handle_sync_wakeup(p);
 
 		/*
-		 * If CPUs of the waker and the wakee share the same L3 cache,
-		 * try to re-use the same CPU, if idle.
+		 * Determine waker CPU scheduling domain.
 		 */
 		cpu = bpf_get_smp_processor_id();
-		if (l3_mask && bpf_cpumask_test_cpu(cpu, cast_mask(l3_mask)) &&
-		    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
+
+		cctx = try_lookup_cpu_ctx(cpu);
+		if (!cctx) {
+			cpu = -ENOENT;
+			goto out_put_cpumask;
+		}
+
+		l3_domain = cctx->l3_cpumask;
+		if (!l3_domain) {
+			scx_bpf_error("CPU L3 cpumask not initialized");
+			cpu = -ENOENT;
+			goto out_put_cpumask;
+		}
+
+		/*
+		 * If both the waker and wakee share the same L3 cache keep
+		 * using the same CPU if possible.
+		 */
+		share_llc = bpf_cpumask_test_cpu(prev_cpu, cast_mask(l3_domain));
+		if (share_llc && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
 			cpu = prev_cpu;
 			goto out_put_cpumask;
 		}
 
 		/*
-		 * Try to run the task on the same CPU as the waker if it's in
-		 * the same scheduling domain and if it's not completely
-		 * saturated.
+		 * If the waker's L3 domain is not saturated attempt to migrate
+		 * the wakee on the same CPU as the waker.
 		 */
-		if (bpf_cpumask_intersects(cast_mask(p_mask), idle_cpumask) &&
-		    bpf_cpumask_test_cpu(cpu, cast_mask(p_mask)) &&
+		has_idle = bpf_cpumask_intersects(cast_mask(l3_domain), idle_cpumask);
+		if (has_idle &&
+		    bpf_cpumask_test_cpu(cpu, p->cpus_ptr) &&
 		    !(current->flags & PF_EXITING) &&
 		    scx_bpf_dsq_nr_queued(cpu_to_dsq(cpu)) == 0)
 			goto out_put_cpumask;
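A minimal plain-C model of the new WAKE_SYNC placement policy above, assuming 64-bit bitmask cpumasks and a hypothetical sync_wakeup_cpu() helper (the PF_EXITING check is omitted for brevity); it is a sketch of the decision order, not the BPF implementation:

/* Decision order on a synchronous wakeup:
 * 1) keep the wakee on prev_cpu if it shares the waker's L3 and is idle;
 * 2) otherwise move it next to the waker if the waker's L3 still has idle
 *    CPUs, the waker's CPU is allowed and its local queue is empty;
 * 3) otherwise fall back to the regular idle-CPU search.
 */
#include <stdint.h>
#include <stdio.h>

static int sync_wakeup_cpu(int waker_cpu, int prev_cpu,
			   uint64_t waker_l3,	/* CPUs sharing the waker's L3 */
			   uint64_t allowed,	/* wakee's allowed CPUs */
			   uint64_t idle,	/* currently idle CPUs */
			   int waker_queued)	/* tasks queued on the waker's CPU */
{
	if ((waker_l3 & (1ULL << prev_cpu)) && (idle & (1ULL << prev_cpu)))
		return prev_cpu;

	if ((waker_l3 & idle) && (allowed & (1ULL << waker_cpu)) && !waker_queued)
		return waker_cpu;

	return -1;
}

int main(void)
{
	/* Waker on CPU 2 (L3 = CPUs 0-3), wakee previously on CPU 9. */
	int cpu = sync_wakeup_cpu(2, 9, 0xf, ~0ULL, 0xd, 0);

	printf("wakee placed on CPU %d\n", cpu);	/* -> 2, next to the waker */
	return 0;
}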
@@ -730,15 +725,6 @@ static s32 pick_idle_cpu(struct task_struct *p,
 		goto out_put_cpumask;
 	}
 
-	/*
-	 * When considering the preferred domain (first idle CPU
-	 * selection pass) try to stay on the same LLC.
-	 */
-	if (do_preferred) {
-		cpu = -ENOENT;
-		goto out_put_cpumask;
-	}
-
 	/*
 	 * Search for any other full-idle core in the primary domain.
 	 */
@@ -780,15 +766,6 @@ static s32 pick_idle_cpu(struct task_struct *p,
 		goto out_put_cpumask;
 	}
 
-	/*
-	 * When considering the preferred domain (first idle CPU selection
-	 * pass) try to stay on the same LLC.
-	 */
-	if (do_preferred) {
-		cpu = -ENOENT;
-		goto out_put_cpumask;
-	}
-
 	/*
 	 * Search for any idle CPU in the scheduling domain.
 	 */
@@ -815,7 +792,7 @@ s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p, s32 prev_cpu, u64
 {
 	s32 cpu;
 
-	cpu = pick_idle_cpu(p, prev_cpu, wake_flags, true);
+	cpu = pick_idle_cpu(p, prev_cpu, wake_flags);
 	if (cpu >= 0 && !dispatch_direct_cpu(p, cpu, 0)) {
 		__sync_fetch_and_add(&nr_direct_dispatches, 1);
 		return cpu;
@@ -832,16 +809,18 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
 {
-	struct bpf_cpumask *primary;
 	u64 deadline = task_deadline(p);
-	s32 cpu, prev_cpu = scx_bpf_task_cpu(p);
+	s32 cpu;
 
 	/*
-	 * If we couldn't find an idle CPU in ops.select_cpu(), give the task
-	 * another chance here to keep using the same CPU / cache / domain.
+	 * If local_kthreads is enabled, always dispatch per-CPU kthreads
+	 * directly to their target CPU.
 	 */
-	cpu = pick_idle_cpu(p, prev_cpu, 0, false);
-	if (cpu >= 0 && !dispatch_direct_cpu(p, cpu, 0)) {
-		__sync_fetch_and_add(&nr_direct_dispatches, 1);
-		return;
+	if (local_kthreads && is_kthread(p) && p->nr_cpus_allowed == 1) {
+		cpu = scx_bpf_task_cpu(p);
+		if (!dispatch_direct_cpu(p, cpu, enq_flags)) {
+			__sync_fetch_and_add(&nr_direct_dispatches, 1);
+			return;
+		}
 	}
 
 	/*
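The enqueue path now short-circuits per-CPU kthreads when local_kthreads is enabled; a small plain-C sketch of that test follows, where PF_KTHREAD is the real kernel flag value and the task structure is purely illustrative:

#include <stdbool.h>
#include <stdio.h>

#define PF_KTHREAD 0x00200000	/* kernel's per-task "I am a kthread" flag */

struct toy_task {
	unsigned int flags;
	int nr_cpus_allowed;
	int cpu;			/* the only CPU the task may run on */
};

/* Mirror of the new enqueue-time condition: per-CPU kthreads go straight
 * to their bound CPU when local_kthreads is set. */
static bool dispatch_kthread_locally(const struct toy_task *p, bool local_kthreads)
{
	return local_kthreads && (p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1;
}

int main(void)
{
	struct toy_task kworker = { .flags = PF_KTHREAD, .nr_cpus_allowed = 1, .cpu = 3 };

	if (dispatch_kthread_locally(&kworker, true))
		printf("dispatch directly to CPU %d\n", kworker.cpu);
	return 0;
}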
@@ -868,14 +847,9 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
 	 * task, wake them up to see whether they'd be able to steal the just
 	 * queued task.
 	 */
-	primary = primary_cpumask;
-	if (!primary)
-		return;
-	if (bpf_cpumask_subset(cast_mask(primary), p->cpus_ptr)) {
-		cpu = scx_bpf_pick_idle_cpu(cast_mask(primary), 0);
-		if (cpu >= 0)
-			scx_bpf_kick_cpu(cpu, 0);
-	}
+	cpu = scx_bpf_pick_idle_cpu(cast_mask(p->cpus_ptr), 0);
+	if (cpu >= 0)
+		scx_bpf_kick_cpu(cpu, 0);
 }
 
 /*
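The idle-CPU kick at the end of enqueue is reduced to "pick any idle CPU the task may run on"; a plain-C sketch with bitmasks standing in for scx_bpf_pick_idle_cpu():

#include <stdint.h>
#include <stdio.h>

/* Return the lowest-numbered idle CPU the task is allowed to use, or -1. */
static int pick_idle(uint64_t allowed, uint64_t idle)
{
	uint64_t m = allowed & idle;

	return m ? __builtin_ctzll(m) : -1;
}

int main(void)
{
	int cpu = pick_idle(0xf0 /* task may run on CPUs 4-7 */,
			    0x44 /* CPUs 2 and 6 are idle */);

	if (cpu >= 0)
		printf("kick CPU %d\n", cpu);	/* -> kick CPU 6 */
	return 0;
}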
@@ -1366,34 +1340,6 @@ int enable_sibling_cpu(struct domain_arg *input)
 	return err;
 }
 
-SEC("syscall")
-int enable_preferred_cpu(struct cpu_arg *input)
-{
-	struct bpf_cpumask *mask;
-	int err = 0;
-
-	/* Make sure the primary CPU mask is initialized */
-	err = init_cpumask(&preferred_cpumask);
-	if (err)
-		return err;
-	/*
-	 * Enable the target CPU in the preferred scheduling domain.
-	 */
-	bpf_rcu_read_lock();
-	mask = preferred_cpumask;
-	if (mask) {
-		s32 cpu = input->cpu_id;
-
-		if (cpu < 0)
-			bpf_cpumask_clear(mask);
-		else
-			bpf_cpumask_set_cpu(cpu, mask);
-	}
-	bpf_rcu_read_unlock();
-
-	return err;
-}
-
 SEC("syscall")
 int enable_primary_cpu(struct cpu_arg *input)
 {
@@ -1481,11 +1427,6 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
 	if (err)
 		return err;
 
-	/* Initialize the preferred scheduling domain */
-	err = init_cpumask(&preferred_cpumask);
-	if (err)
-		return err;
-
 	return 0;
 }
@@ -52,7 +52,6 @@ const SCHEDULER_NAME: &'static str = "scx_bpfland";
 
 #[derive(PartialEq)]
 enum Powermode {
-    Turbo,
     Performance,
     Powersave,
     Any,
@@ -66,8 +65,6 @@ fn get_primary_cpus(mode: Powermode) -> std::io::Result<Vec<usize>> {
         .into_iter()
         .flat_map(|core| core.cpus())
         .filter_map(|(cpu_id, cpu)| match (&mode, &cpu.core_type) {
-            // Turbo mode: only add turbo-boosted CPUs
-            (Powermode::Turbo, CoreType::Big { turbo: true }) |
             // Performance mode: add all the Big CPUs (either Turbo or non-Turbo)
             (Powermode::Performance, CoreType::Big { .. }) |
             // Powersave mode: add all the Little CPUs
@@ -158,15 +155,6 @@ struct Opts {
     #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
     local_kthreads: bool,
 
-    /// Specifies a group of preferred CPUs, represented as a bitmask in hex (e.g., 0xff), that the
-    /// scheduler will try to prioritize to dispatch tasks.
-    ///
-    /// Special values:
-    /// - "auto" = automaticlly detect the fastest CPUs based on the current scheduler and system
-    ///   energy profiles.
-    #[clap(short = 'M', long, default_value = "auto")]
-    preferred_domain: String,
-
     /// Specifies the initial set of CPUs, represented as a bitmask in hex (e.g., 0xff), that the
     /// scheduler will use to dispatch tasks, until the system becomes saturated, at which point
     /// tasks may overflow to other available CPUs.
@@ -288,9 +276,6 @@ impl<'a> Scheduler<'a> {
 
         // Initialize the primary scheduling domain and the preferred domain.
         let energy_profile = Self::read_energy_profile();
-        if let Err(err) = Self::init_preferred_domain(&mut skel, &opts.preferred_domain) {
-            warn!("failed to initialize preferred domain: error {}", err);
-        }
         if let Err(err) = Self::init_energy_domain(&mut skel, &opts.primary_domain, &energy_profile)
         {
             warn!("failed to initialize primary domain: error {}", err);
@@ -372,28 +357,6 @@ impl<'a> Scheduler<'a> {
         res.unwrap_or_else(|_: String| "none".to_string())
     }
 
-    fn enable_preferred_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
-        let prog = &mut skel.progs.enable_preferred_cpu;
-        let mut args = cpu_arg {
-            cpu_id: cpu as c_int,
-        };
-        let input = ProgramInput {
-            context_in: Some(unsafe {
-                std::slice::from_raw_parts_mut(
-                    &mut args as *mut _ as *mut u8,
-                    std::mem::size_of_val(&args),
-                )
-            }),
-            ..Default::default()
-        };
-        let out = prog.test_run(input).unwrap();
-        if out.return_value != 0 {
-            return Err(out.return_value);
-        }
-
-        Ok(())
-    }
-
     fn epp_to_cpumask(profile: Powermode) -> Result<Cpumask> {
         let mut cpus = get_primary_cpus(profile).unwrap_or(Vec::new());
         if cpus.is_empty() {
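For context, the removed Rust helper above drove the enable_preferred_cpu SEC("syscall") program through libbpf's test-run interface; a rough C analogue, assuming an already-loaded program and the same single-field cpu_arg context, might look like this sketch:

#include <bpf/bpf.h>
#include <bpf/libbpf.h>

struct cpu_arg { int cpu_id; };	/* matches the BPF side's context struct */

/* Run a SEC("syscall") program once, passing cpu_id as its context. */
static int run_cpu_prog(const struct bpf_program *prog, int cpu_id)
{
	struct cpu_arg args = { .cpu_id = cpu_id };
	LIBBPF_OPTS(bpf_test_run_opts, opts,
		.ctx_in = &args,
		.ctx_size_in = sizeof(args),
	);
	int err = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts);

	return err ? err : (int)opts.retval;	/* non-zero retval = program error */
}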
@@ -402,32 +365,6 @@ impl<'a> Scheduler<'a> {
         Cpumask::from_str(&cpus_to_cpumask(&cpus))
     }
 
-    fn init_preferred_domain(skel: &mut BpfSkel<'_>, preferred_domain: &String) -> Result<()> {
-        let domain = match preferred_domain.as_str() {
-            "auto" => Self::epp_to_cpumask(Powermode::Turbo)?,
-            &_ => Cpumask::from_str(&preferred_domain)?,
-        };
-
-        info!("preferred CPU domain = 0x{:x}", domain);
-
-        // Clear the preferred domain by passing a negative CPU id.
-        if let Err(err) = Self::enable_preferred_cpu(skel, -1) {
-            warn!("failed to reset preferred domain: error {}", err);
-        }
-        for cpu in 0..*NR_CPU_IDS {
-            if domain.test_cpu(cpu) {
-                if let Err(err) = Self::enable_preferred_cpu(skel, cpu as i32) {
-                    warn!(
-                        "failed to add CPU {} to preferred domain: error {}",
-                        cpu, err
-                    );
-                }
-            }
-        }
-
-        Ok(())
-    }
-
     fn init_energy_domain(
         skel: &mut BpfSkel<'_>,
         primary_domain: &String,
@@ -504,11 +441,6 @@ impl<'a> Scheduler<'a> {
         self.energy_profile = energy_profile.clone();
 
         if self.opts.primary_domain == "auto" {
-            if let Err(err) =
-                Self::init_preferred_domain(&mut self.skel, &self.opts.preferred_domain)
-            {
-                warn!("failed to refresh preferred domain: error {}", err);
-            }
             if let Err(err) = Self::init_energy_domain(
                 &mut self.skel,
                 &self.opts.primary_domain,