scx_rustland: improve scheduler's idle CPU selection

The current CPU selection logic in the scheduler has some
inefficiencies.

When a task is drained from the BPF queue, the scheduler immediately
checks whether the CPU previously assigned to the task is still idle
and, if so, assigns it. Otherwise, it iterates over the available CPUs,
always starting from CPU #0, and picks the first idle one without
updating its idle state. This logic is applied to the entire batch of
tasks drained from the BPF queue, so all of them end up assigned to the
same idle CPU (with a bias toward lower CPU ids over higher ones).
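
For reference, the per-task selection removed below boils down to the
following (a simplified sketch; get_cpu_pid() returns 0 when a CPU is
idle):

    // Prefer the previously used CPU if idle; otherwise scan from
    // CPU #0 and return the first idle CPU, without marking it busy.
    fn select_task_cpu(&self, prev_cpu: i32) -> i32 {
        if self.get_cpu_pid(prev_cpu as u32) != 0 {
            for cpu in 0..self.nr_cpus_online {
                if self.get_cpu_pid(cpu as u32) == 0 {
                    // The idle state is never updated, so every task
                    // in the batch converges on this same CPU.
                    return cpu as i32;
                }
            }
        }
        // prev_cpu is idle (or all CPUs are busy): keep the previous CPU.
        prev_cpu
    }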

While dispatching a batch of tasks to the same idle CPU is not
necessarily problematic, a fairer distribution among the idle CPUs
would be preferable.

Therefore, change the CPU selection logic to distribute tasks evenly
among the idle CPUs, while still preferring the previously used one.
Additionally, apply the CPU selection logic just before tasks are
dispatched, rather than assigning a CPU when tasks are drained from the
BPF queue. This adjustment is important because tasks may linger in the
scheduler's internal structures for a while, and the idle state of the
CPUs in the system may change during that period.
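
In simplified form, the new dispatch path (see the dispatch_tasks()
hunk below) snapshots the idle CPUs once per batch and pairs each
popped task with one of them:

    // One task per idle CPU; keep the previously used CPU when it is
    // still idle, to mitigate migration overhead.
    let idle_cpus = self.get_idle_cpus();
    for cpu in &idle_cpus {
        if let Some(mut task) = self.task_pool.pop() {
            self.min_vruntime = task.vruntime;
            if !idle_cpus.contains(&task.cpu) {
                task.cpu = *cpu;
            }
            // ... dispatch the task to the BPF side ...
        }
    }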

Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
Author: Andrea Righi <andrea.righi@canonical.com>
Date: 2023-12-29 23:49:00 +01:00
parent e90bc923f9
commit 1a2c9f5fd4


@@ -216,7 +216,7 @@ struct Scheduler<'a> {
     task_pool: TaskTree,    // tasks ordered by vruntime
     task_map: TaskInfoMap,  // map pids to the corresponding task information
     min_vruntime: u64,      // Keep track of the minimum vruntime across all tasks
-    nr_cpus_online: u64,    // Amount of the available CPUs in the system
+    nr_cpus_online: i32,    // Amount of the available CPUs in the system
     struct_ops: Option<libbpf_rs::Link>,
 }
@@ -240,7 +240,7 @@ impl<'a> Scheduler<'a> {
         //
         // We should probably refresh this counter during the normal execution to support cpu
         // hotplugging, but for now let's keep it simple and set this only at initialization).
-        let nr_cpus_online = libbpf_rs::num_possible_cpus().unwrap() as u64;
+        let nr_cpus_online = libbpf_rs::num_possible_cpus().unwrap() as i32;
 
         // Set scheduler options (defined in the BPF part).
         skel.bss_mut().usersched_pid = pid;
@@ -292,7 +292,7 @@ impl<'a> Scheduler<'a> {
     }
 
     // Get the pid running on a certain CPU, if no tasks are running return 0
-    fn get_cpu_pid(&self, cpu: u32) -> u32 {
+    fn get_cpu_pid(&self, cpu: i32) -> u32 {
         let maps = self.skel.maps();
         let cpu_map = maps.cpu_map();
@@ -307,35 +307,18 @@ impl<'a> Scheduler<'a> {
         pid
     }
 
-    // Return the amount of idle CPUs in the system.
-    fn get_idle_cpus(&self) -> u32 {
-        let mut count = 0;
+    // Return the array of idle CPU ids.
+    fn get_idle_cpus(&self) -> Vec<i32> {
+        let mut idle_cpus = Vec::new();
         for cpu in 0..self.nr_cpus_online {
-            let pid = self.get_cpu_pid(cpu as u32);
+            let pid = self.get_cpu_pid(cpu);
             if pid == 0 {
-                count += 1;
+                idle_cpus.push(cpu);
             }
         }
 
-        return count;
-    }
-
-    // Search for an idle CPU in the system.
-    //
-    // First check the previously used CPU, that is always the best choice (to mitigate migration
-    // overhead), otherwise check all the others in order.
-    //
-    // If all the CPUs are busy return the previouly used CPU.
-    fn select_task_cpu(&self, prev_cpu: i32) -> i32 {
-        if self.get_cpu_pid(prev_cpu as u32) != 0 {
-            for cpu in 0..self.nr_cpus_online {
-                let pid = self.get_cpu_pid(cpu as u32);
-                if pid == 0 {
-                    return cpu as i32;
-                }
-            }
-        }
-        prev_cpu
+        idle_cpus
     }
 
     // Update task's vruntime based on the information collected from the kernel part.
@@ -382,9 +365,8 @@ impl<'a> Scheduler<'a> {
                         sum_exec_runtime: task.sum_exec_runtime,
                         vruntime: self.min_vruntime,
                     };
-                    let cpu = self.select_task_cpu(task.cpu);
                     self.task_map.insert(task.pid, task_info);
-                    self.task_pool.push(task.pid, cpu, self.min_vruntime);
+                    self.task_pool.push(task.pid, task.cpu, self.min_vruntime);
                 }
             }
             Ok(None) => {
@@ -407,18 +389,27 @@ impl<'a> Scheduler<'a> {
     fn dispatch_tasks(&mut self) {
         let maps = self.skel.maps();
         let dispatched = maps.dispatched();
+        let idle_cpus = self.get_idle_cpus();
 
         // Dispatch only a batch of tasks equal to the amount of idle CPUs in the system.
         //
         // This allows to have more tasks sitting in the task pool, reducing the pressure on the
         // dispatcher queues and giving a chance to higher priority tasks to come in and get
        // dispatched earlier, mitigating potential priority inversion issues.
-        for _ in 0..self.get_idle_cpus() {
+        for cpu in &idle_cpus {
             match self.task_pool.pop() {
-                Some(task) => {
+                Some(mut task) => {
                     // Update global minimum vruntime.
                     self.min_vruntime = task.vruntime;
 
+                    // Select a CPU to dispatch the task.
+                    //
+                    // Use the previously used CPU if idle, that is always the best choice (to
+                    // mitigate migration overhead), otherwise pick the next idle CPU available.
+                    if !idle_cpus.contains(&task.cpu) {
+                        task.cpu = *cpu;
+                    }
+
                     // Send task to the dispatcher.
                     let msg = DispatchedMessage::from_task(&task);
                     match dispatched.update(&[], msg.as_bytes(), libbpf_rs::MapFlags::ANY) {
@@ -478,7 +469,7 @@ impl<'a> Scheduler<'a> {
         // Show tasks that are currently running.
         info!("Running tasks:");
         for cpu in 0..self.nr_cpus_online {
-            let pid = self.get_cpu_pid(cpu as u32);
+            let pid = self.get_cpu_pid(cpu);
             info!(" cpu={} pid={}", cpu, pid);
         }