rusty: Add separate flag for x NUMA greedy task stealing

In scx_rusty, a CPU that is about to go idle will attempt to steal tasks
from remote domains when its own domain has no tasks to run and a remote
domain has at least greedy_threshold enqueued tasks. The stealing is
temporary, but it still has a cost: the stolen task may suffer cache
misses on the stealing CPU, or, in the case of multi-node machines,
remote NUMA accesses and a working set split across multiple domains.

Given the higher cost of x NUMA work stealing, let's add a separate flag
that lets users tune the threshold for doing cross NUMA greedy task
stealing.

Signed-off-by: David Vernet <void@manifault.com>
Author: David Vernet
Date: 2024-03-08 11:41:17 -06:00
Parent: 1c3168d2a4
Commit: 3d2507e6f2
3 changed files with 158 additions and 32 deletions
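
Before the diff itself, here is a minimal, self-contained sketch of the stealing order this change implements: a CPU first consumes from its own domain, then tries other domains on its NUMA node, and only crosses NUMA nodes when the new, separately tunable greedy_threshold_x_numa is set and the remote domain is busy enough. The struct, helper, and field names below (struct dom, pick_dom, nr_queued) are invented for illustration; the real logic is in rusty_dispatch() in the BPF changes that follow.

/*
 * Illustrative model of the stealing order; not the scheduler's code.
 */
#include <stdio.h>

#define NR_DOMS 4

struct dom {
	unsigned int node_id;	/* NUMA node the domain belongs to */
	unsigned int nr_queued;	/* tasks currently enqueued in the domain */
};

/*
 * Pick a domain for a CPU whose own domain is @my_dom to consume from.
 * Returns -1 if the CPU should go idle instead of stealing.
 */
static int pick_dom(const struct dom *doms, int my_dom,
		    unsigned int greedy_threshold,
		    unsigned int greedy_threshold_x_numa)
{
	unsigned int my_node = doms[my_dom].node_id;
	int i;

	/* 1. Always prefer the CPU's own domain. */
	if (doms[my_dom].nr_queued)
		return my_dom;

	/* Greedy stealing is disabled entirely when greedy_threshold is 0. */
	if (!greedy_threshold)
		return -1;

	/* 2. Steal from another domain on the same NUMA node. */
	for (i = 0; i < NR_DOMS; i++)
		if (i != my_dom && doms[i].node_id == my_node &&
		    doms[i].nr_queued > 0)
			return i;

	/*
	 * 3. Cross NUMA nodes only when the separate (typically higher)
	 * threshold is enabled and the remote domain is busy enough.
	 */
	if (!greedy_threshold_x_numa)
		return -1;
	for (i = 0; i < NR_DOMS; i++)
		if (doms[i].node_id != my_node &&
		    doms[i].nr_queued >= greedy_threshold_x_numa)
			return i;

	return -1;	/* nothing worth stealing; go idle */
}

int main(void)
{
	/* Two domains per node; the CPU's own domain (index 0) is empty. */
	struct dom doms[NR_DOMS] = {
		{ .node_id = 0, .nr_queued = 0 },
		{ .node_id = 0, .nr_queued = 0 },
		{ .node_id = 1, .nr_queued = 3 },
		{ .node_id = 1, .nr_queued = 50 },
	};

	printf("%d\n", pick_dom(doms, 0, 1, 0));	/* -1: x NUMA stealing off */
	printf("%d\n", pick_dom(doms, 0, 1, 16));	/* 3: domain 3 exceeds the x NUMA threshold */
	return 0;
}

Note that, per the diff below, the same-node path only requires the remote DSQ to have something to consume once greedy_threshold is enabled; the per-domain queue-depth check is applied on the cross-NUMA path.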


@@ -55,7 +55,8 @@ enum stat_idx {
 RUSTY_STAT_DIRECT_GREEDY,
 RUSTY_STAT_DIRECT_GREEDY_FAR,
 RUSTY_STAT_DSQ_DISPATCH,
-RUSTY_STAT_GREEDY,
+RUSTY_STAT_GREEDY_LOCAL,
+RUSTY_STAT_GREEDY_XNUMA,
 /* Extra stats that don't contribute to total */
 RUSTY_STAT_REPATRIATE,
@@ -102,6 +103,7 @@ struct dom_ctx {
 struct bpf_cpumask __kptr *cpumask;
 struct bpf_cpumask __kptr *direct_greedy_cpumask;
 struct bpf_cpumask __kptr *node_cpumask;
+u32 node_id;
 u64 dbg_dcycle_printed_at;
 struct bucket_ctx buckets[LB_LOAD_BUCKETS];


@@ -72,6 +72,7 @@ const volatile bool fifo_sched;
 const volatile bool switch_partial;
 const volatile bool direct_greedy_numa;
 const volatile u32 greedy_threshold;
+const volatile u32 greedy_threshold_x_numa;
 const volatile u32 debug;
 /* base slice duration */
@@ -82,9 +83,12 @@ const volatile u64 slice_ns = SCX_SLICE_DFL;
 */
 struct pcpu_ctx {
 u32 dom_rr_cur; /* used when scanning other doms */
+u32 dom_id;
+u32 nr_node_doms;
+u32 node_doms[MAX_DOMS];
 /* libbpf-rs does not respect the alignment, so pad out the struct explicitly */
-u8 _padding[CACHELINE_SIZE - sizeof(u32)];
+u8 _padding[CACHELINE_SIZE - ((3 + MAX_DOMS) * sizeof(u32) % CACHELINE_SIZE)];
 } __attribute__((aligned(CACHELINE_SIZE)));
 struct pcpu_ctx pcpu_ctx[MAX_CPUS];
@@ -854,24 +858,43 @@ static bool cpumask_intersects_domain(const struct cpumask *cpumask, u32 dom_id)
 static u32 dom_rr_next(s32 cpu)
 {
 struct pcpu_ctx *pcpuc;
-u32 dom_id;
+u32 idx, *dom_id;
 pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]);
-if (!pcpuc)
+if (!pcpuc || !pcpuc->nr_node_doms)
 return 0;
-dom_id = (pcpuc->dom_rr_cur + 1) % nr_doms;
+idx = (pcpuc->dom_rr_cur + 1) % pcpuc->nr_node_doms;
+dom_id = MEMBER_VPTR(pcpuc->node_doms, [idx]);
+if (!dom_id) {
+scx_bpf_error("Failed to lookup dom for %d", cpu);
+return 0;
+}
-if (dom_id == cpu_to_dom_id(cpu))
-dom_id = (dom_id + 1) % nr_doms;
+if (*dom_id == cpu_to_dom_id(cpu))
+scx_bpf_error("%d found current dom in node_doms array", cpu);
-pcpuc->dom_rr_cur = dom_id;
-return dom_id;
+pcpuc->dom_rr_cur++;
+return *dom_id;
 }
+u32 dom_node_id(u32 dom_id)
+{
+u32 *nid_ptr;
+nid_ptr = MEMBER_VPTR(dom_numa_id_map, [dom_id]);
+if (!nid_ptr) {
+scx_bpf_error("Couldn't look up node ID for %s", dom_id);
+return 0;
+}
+return *nid_ptr;
+}
 void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev)
 {
 u32 dom = cpu_to_dom_id(cpu);
+struct pcpu_ctx *pcpuc;
+u32 node_doms, my_node, i;
 if (scx_bpf_consume(dom)) {
 stat_add(RUSTY_STAT_DSQ_DISPATCH, 1);
@@ -881,13 +904,35 @@ void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev)
 if (!greedy_threshold)
 return;
-bpf_repeat(nr_doms - 1) {
-u32 dom_id = dom_rr_next(cpu);
+pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]);
+if (!pcpuc) {
+scx_bpf_error("Failed to get PCPU context");
+return;
+}
+node_doms = pcpuc->nr_node_doms;
-if (scx_bpf_dsq_nr_queued(dom_id) >= greedy_threshold &&
-scx_bpf_consume(dom_id)) {
-stat_add(RUSTY_STAT_GREEDY, 1);
-break;
+/* try to steal a task from domains on the current NUMA node */
+bpf_for(i, 0, node_doms) {
+dom = (pcpuc->dom_rr_cur + 1 + i) % node_doms;
+if (scx_bpf_consume(dom)) {
+stat_add(RUSTY_STAT_GREEDY_LOCAL, 1);
+return;
+}
+}
+if (!greedy_threshold_x_numa || nr_nodes == 1)
+return;
+/* try to steal a task from domains on other NUMA nodes */
+my_node = dom_node_id(pcpuc->dom_id);
+bpf_repeat(nr_doms - 1) {
+dom = (pcpuc->dom_rr_cur + 1) % nr_doms;
+pcpuc->dom_rr_cur++;
+if (dom_node_id(dom) != my_node &&
+scx_bpf_dsq_nr_queued(dom) >= greedy_threshold_x_numa &&
+scx_bpf_consume(dom)) {
+stat_add(RUSTY_STAT_GREEDY_XNUMA, 1);
+return;
+}
+}
 }
@@ -1192,7 +1237,7 @@ static s32 create_dom(u32 dom_id)
 struct dom_ctx *domc;
 struct node_ctx *nodec;
 struct bpf_cpumask *cpumask, *node_mask;
-u32 cpu, node_id, *nid_ptr;
+u32 cpu, node_id;
 s32 ret;
 if (dom_id >= MAX_DOMS) {
@@ -1260,12 +1305,7 @@
 return -EEXIST;
 }
-nid_ptr = MEMBER_VPTR(dom_numa_id_map, [dom_id]);
-if (!nid_ptr) {
-scx_bpf_error("Couldn't look up node ID for %s", dom_id);
-return -EEXIST;
-}
-node_id = *nid_ptr;
+node_id = dom_node_id(dom_id);
 nodec = bpf_map_lookup_elem(&node_data, &node_id);
 if (!nodec) {
 /* Should never happen, it's created statically at load time. */
@@ -1300,6 +1340,67 @@
 return 0;
 }
+static s32 initialize_cpu(s32 cpu)
+{
+struct bpf_cpumask *cpumask;
+struct dom_ctx *domc;
+int i, j = 0;
+struct pcpu_ctx *pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]);
+u32 *dom_nodes;
+if (!pcpuc) {
+scx_bpf_error("Failed to lookup pcpu ctx %d", cpu);
+return -ENOENT;
+}
+pcpuc->dom_rr_cur = cpu;
+bpf_for(i, 0, nr_doms) {
+domc = bpf_map_lookup_elem(&dom_data, &i);
+if (!domc) {
+scx_bpf_error("Failed to lookup dom_ctx");
+return -ENOENT;
+}
+bpf_rcu_read_lock();
+cpumask = domc->node_cpumask;
+if (!cpumask) {
+bpf_rcu_read_unlock();
+scx_bpf_error("Failed to lookup dom node cpumask");
+return -ENOENT;
+}
+if (bpf_cpumask_test_cpu(cpu, (const struct cpumask *)cpumask)) {
+cpumask = domc->cpumask;
+if (!cpumask) {
+bpf_rcu_read_unlock();
+scx_bpf_error("Failed to lookup dom cpumask");
+return -ENOENT;
+}
+/*
+ * Only record the remote domains in this array, as
+ * we'll only ever consume from them on the greedy
+ * threshold path.
+ */
+if (!bpf_cpumask_test_cpu(cpu,
+(const struct cpumask *)cpumask)) {
+dom_nodes = MEMBER_VPTR(pcpuc->node_doms, [j]);
+if (!dom_nodes) {
+bpf_rcu_read_unlock();
+scx_bpf_error("Failed to lookup doms ptr");
+return -EINVAL;
+}
+*dom_nodes = i;
+j++;
+} else {
+pcpuc->dom_id = i;
+}
+}
+bpf_rcu_read_unlock();
+}
+pcpuc->nr_node_doms = j;
+return 0;
+}
 s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init)
 {
 struct bpf_cpumask *cpumask;
@@ -1334,15 +1435,17 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init)
 if (ret)
 return ret;
 }
 bpf_for(i, 0, nr_doms) {
 ret = create_dom(i);
 if (ret)
 return ret;
 }
-bpf_for(i, 0, nr_cpus)
-pcpu_ctx[i].dom_rr_cur = i;
+bpf_for(i, 0, nr_cpus) {
+ret = initialize_cpu(i);
+if (ret)
+return ret;
+}
 return 0;
 }


@@ -110,13 +110,27 @@ struct Opts {
 #[clap(short = 'C', long, num_args = 1.., conflicts_with = "cache_level")]
 cpumasks: Vec<String>,
-/// When non-zero, enable greedy task stealing. When a domain is idle, a
-/// cpu will attempt to steal tasks from a domain with at least
-/// greedy_threshold tasks enqueued. These tasks aren't permanently
-/// stolen from the domain.
+/// When non-zero, enable greedy task stealing. When a domain is idle, a cpu
+/// will attempt to steal tasks from another domain as follows:
+///
+/// 1. Try to consume a task from the current domain
+/// 2. Try to consume a task from another domain in the current NUMA node
+/// (or globally, if running on a single-socket system), if the domain
+/// has at least this specified number of tasks enqueued.
+///
+/// See greedy_threshold_x_numa to enable task stealing across NUMA nodes.
+/// Tasks stolen in this manner are not permanently stolen from their
+/// domain.
 #[clap(short = 'g', long, default_value = "1")]
 greedy_threshold: u32,
+/// When non-zero, enable greedy task stealing across NUMA nodes. The order
+/// of greedy task stealing follows greedy_threshold as described above, and
+/// greedy_threshold must be nonzero to enable task stealing across NUMA
+/// nodes.
+#[clap(long, default_value = "0")]
+greedy_threshold_x_numa: u32,
 /// Disable load balancing. Unless disabled, periodically userspace will
 /// calculate the load factor of each domain and instruct BPF which
 /// processes to move.
@@ -286,6 +300,7 @@ impl<'a> Scheduler<'a> {
 skel.rodata_mut().fifo_sched = opts.fifo_sched;
 skel.rodata_mut().switch_partial = opts.partial;
 skel.rodata_mut().greedy_threshold = opts.greedy_threshold;
+skel.rodata_mut().greedy_threshold_x_numa = opts.greedy_threshold_x_numa;
 skel.rodata_mut().direct_greedy_numa = opts.direct_greedy_numa;
 skel.rodata_mut().debug = opts.verbose as u32;
@@ -423,7 +438,8 @@
 + stat(bpf_intf::stat_idx_RUSTY_STAT_DIRECT_GREEDY)
 + stat(bpf_intf::stat_idx_RUSTY_STAT_DIRECT_GREEDY_FAR)
 + stat(bpf_intf::stat_idx_RUSTY_STAT_DSQ_DISPATCH)
-+ stat(bpf_intf::stat_idx_RUSTY_STAT_GREEDY);
++ stat(bpf_intf::stat_idx_RUSTY_STAT_GREEDY_LOCAL)
++ stat(bpf_intf::stat_idx_RUSTY_STAT_GREEDY_XNUMA);
 let numa_load_avg = lb_stats[0].load.load_avg();
 let dom_load_avg = lb_stats[0].domains[0].load.load_avg();
@@ -456,9 +472,14 @@
 );
 info!(
-"dsq={:5.2} greedy={:5.2} kick_greedy={:5.2} rep={:5.2}",
+"dsq={:5.2} greedy_local={:5.2} greedy_xnuma={:5.2}",
 stat_pct(bpf_intf::stat_idx_RUSTY_STAT_DSQ_DISPATCH),
-stat_pct(bpf_intf::stat_idx_RUSTY_STAT_GREEDY),
+stat_pct(bpf_intf::stat_idx_RUSTY_STAT_GREEDY_LOCAL),
+stat_pct(bpf_intf::stat_idx_RUSTY_STAT_GREEDY_XNUMA),
+);
+info!(
+"kick_greedy={:5.2} rep={:5.2}",
 stat_pct(bpf_intf::stat_idx_RUSTY_STAT_KICK_GREEDY),
 stat_pct(bpf_intf::stat_idx_RUSTY_STAT_REPATRIATE),
 );
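
Taken together with the option documentation above, the cross-NUMA path is opt-in: greedy_threshold defaults to 1, while greedy_threshold_x_numa defaults to 0 and leaves cross-node stealing disabled until it is set. Assuming clap's usual kebab-case conversion of the field name, an invocation along the lines of scx_rusty -g 2 --greedy-threshold-x-numa 16 would allow same-node stealing while only crossing NUMA nodes when a remote domain has at least 16 tasks queued; check scx_rusty --help for the exact flag spelling.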