Mirror of https://github.com/sched-ext/scx.git (synced 2024-11-24 20:00:22 +00:00)
rusty: Add separate flag for x NUMA greedy task stealing
In scx_rusty, a CPU that is going to go idle will attempt to steal tasks from remote domains when its domain has no tasks to run and a remote domain has at least greedy_threshold enqueued tasks. This stealing is temporary, but it still has a cost: the stealing CPU may cause the task to suffer from cache misses, or, on multi-node machines, remote NUMA accesses and working sets split across multiple domains.

Given the higher cost of cross-NUMA work stealing, let's add a separate flag that lets users tune the threshold for doing cross-NUMA greedy task stealing.

Signed-off-by: David Vernet <void@manifault.com>
commit 3d2507e6f2
parent 1c3168d2a4
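For orientation before the diff: the stealing order this commit establishes can be summarized in a small, self-contained C sketch. try_consume(), nr_queued(), and node_of() below are illustrative stand-ins for the scheduler's real helpers, not its API; note that, as in the patch, only the cross-NUMA pass is gated on the victim's queue depth, while greedy_threshold gates entry to the whole stealing path.

#include <stdbool.h>
#include <stdio.h>

/* illustrative stand-ins for the scheduler's real helpers */
static bool try_consume(unsigned dom) { (void)dom; return false; }
static unsigned nr_queued(unsigned dom) { (void)dom; return 0; }
static unsigned node_of(unsigned dom) { return dom / 2; }	/* toy topology */

/* stealing order on an idle CPU: same-node domains first, then remote
 * nodes, the latter gated by its own (higher-cost) threshold */
static bool steal(unsigned my_dom, unsigned nr_doms,
		  unsigned greedy_threshold, unsigned greedy_threshold_x_numa)
{
	unsigned dom;

	if (!greedy_threshold)
		return false;

	for (dom = 0; dom < nr_doms; dom++)
		if (dom != my_dom && node_of(dom) == node_of(my_dom) &&
		    try_consume(dom))
			return true;	/* cheap: stays on the NUMA node */

	if (!greedy_threshold_x_numa)
		return false;

	for (dom = 0; dom < nr_doms; dom++)
		if (node_of(dom) != node_of(my_dom) &&
		    nr_queued(dom) >= greedy_threshold_x_numa &&
		    try_consume(dom))
			return true;	/* expensive: crosses NUMA nodes */

	return false;
}

int main(void)
{
	printf("stole: %s\n", steal(0, 4, 1, 4) ? "yes" : "no");
	return 0;
}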
@@ -55,7 +55,8 @@ enum stat_idx {
 	RUSTY_STAT_DIRECT_GREEDY,
 	RUSTY_STAT_DIRECT_GREEDY_FAR,
 	RUSTY_STAT_DSQ_DISPATCH,
-	RUSTY_STAT_GREEDY,
+	RUSTY_STAT_GREEDY_LOCAL,
+	RUSTY_STAT_GREEDY_XNUMA,
 
 	/* Extra stats that don't contribute to total */
 	RUSTY_STAT_REPATRIATE,
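This splits the old RUSTY_STAT_GREEDY counter into _LOCAL and _XNUMA variants so the two stealing paths can be accounted separately; the stats summation and log output at the end of the diff are updated to match.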
@@ -102,6 +103,7 @@ struct dom_ctx {
 	struct bpf_cpumask __kptr *cpumask;
 	struct bpf_cpumask __kptr *direct_greedy_cpumask;
 	struct bpf_cpumask __kptr *node_cpumask;
+	u32 node_id;
 
 	u64 dbg_dcycle_printed_at;
 	struct bucket_ctx buckets[LB_LOAD_BUCKETS];
@@ -72,6 +72,7 @@ const volatile bool fifo_sched;
 const volatile bool switch_partial;
 const volatile bool direct_greedy_numa;
 const volatile u32 greedy_threshold;
+const volatile u32 greedy_threshold_x_numa;
 const volatile u32 debug;
 
 /* base slice duration */
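These const volatile globals live in BPF rodata and are populated from userspace before the skeleton is loaded; the corresponding skel.rodata_mut().greedy_threshold_x_numa assignment appears in the Rust hunks further down.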
@@ -82,9 +83,12 @@ const volatile u64 slice_ns = SCX_SLICE_DFL;
  */
 struct pcpu_ctx {
 	u32 dom_rr_cur; /* used when scanning other doms */
+	u32 dom_id;
+	u32 nr_node_doms;
+	u32 node_doms[MAX_DOMS];
 
 	/* libbpf-rs does not respect the alignment, so pad out the struct explicitly */
-	u8 _padding[CACHELINE_SIZE - sizeof(u32)];
+	u8 _padding[CACHELINE_SIZE - ((3 + MAX_DOMS) * sizeof(u32) % CACHELINE_SIZE)];
 } __attribute__((aligned(CACHELINE_SIZE)));
 
 struct pcpu_ctx pcpu_ctx[MAX_CPUS];
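The padding expression has to track every member declared above it, which is easy to get wrong silently. A compile-time check, sketched with illustrative values for CACHELINE_SIZE and MAX_DOMS (the real values come from the scheduler's headers):

#include <stdio.h>

#define CACHELINE_SIZE 64	/* illustrative; the real value comes from the build */
#define MAX_DOMS 64		/* illustrative */

typedef unsigned int u32;
typedef unsigned char u8;

struct pcpu_ctx {
	u32 dom_rr_cur;
	u32 dom_id;
	u32 nr_node_doms;
	u32 node_doms[MAX_DOMS];
	/* pad the tail so sizeof() rounds up to whole cachelines */
	u8 _padding[CACHELINE_SIZE - ((3 + MAX_DOMS) * sizeof(u32) % CACHELINE_SIZE)];
} __attribute__((aligned(CACHELINE_SIZE)));

/* the new expression generalizes the old "CACHELINE_SIZE - sizeof(u32)",
 * which only accounted for the single dom_rr_cur member */
_Static_assert(sizeof(struct pcpu_ctx) % CACHELINE_SIZE == 0,
	       "pcpu_ctx must be a whole number of cachelines");

int main(void)
{
	printf("sizeof(struct pcpu_ctx) = %zu\n", sizeof(struct pcpu_ctx));
	return 0;
}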
@@ -854,24 +858,43 @@ static bool cpumask_intersects_domain(const struct cpumask *cpumask, u32 dom_id)
 static u32 dom_rr_next(s32 cpu)
 {
 	struct pcpu_ctx *pcpuc;
-	u32 dom_id;
+	u32 idx, *dom_id;
 
 	pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]);
-	if (!pcpuc)
+	if (!pcpuc || !pcpuc->nr_node_doms)
 		return 0;
 
-	dom_id = (pcpuc->dom_rr_cur + 1) % nr_doms;
+	idx = (pcpuc->dom_rr_cur + 1) % pcpuc->nr_node_doms;
+	dom_id = MEMBER_VPTR(pcpuc->node_doms, [idx]);
+	if (!dom_id) {
+		scx_bpf_error("Failed to lookup dom for %d", cpu);
+		return 0;
+	}
 
-	if (dom_id == cpu_to_dom_id(cpu))
-		dom_id = (dom_id + 1) % nr_doms;
+	if (*dom_id == cpu_to_dom_id(cpu))
+		scx_bpf_error("%d found current dom in node_doms array", cpu);
 
-	pcpuc->dom_rr_cur = dom_id;
-	return dom_id;
+	pcpuc->dom_rr_cur++;
+	return *dom_id;
 }
 
+u32 dom_node_id(u32 dom_id)
+{
+	u32 *nid_ptr;
+
+	nid_ptr = MEMBER_VPTR(dom_numa_id_map, [dom_id]);
+	if (!nid_ptr) {
+		scx_bpf_error("Couldn't look up node ID for %u", dom_id);
+		return 0;
+	}
+	return *nid_ptr;
+}
+
 void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev)
 {
 	u32 dom = cpu_to_dom_id(cpu);
+	struct pcpu_ctx *pcpuc;
+	u32 node_doms, my_node, i;
 
 	if (scx_bpf_consume(dom)) {
 		stat_add(RUSTY_STAT_DSQ_DISPATCH, 1);
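The rewritten dom_rr_next() advances a monotonically increasing per-CPU cursor and wraps it over the precomputed node_doms array rather than over all nr_doms domains. A userspace toy of just the cursor arithmetic (an illustrative sketch, not the BPF code):

#include <stdio.h>

/* cycle a cursor over a precomputed candidate array */
static unsigned dom_rr_next(unsigned *rr_cur, const unsigned *node_doms,
			    unsigned nr_node_doms)
{
	unsigned idx = (*rr_cur + 1) % nr_node_doms;

	(*rr_cur)++;	/* monotonically advancing cursor, wrapped on use */
	return node_doms[idx];
}

int main(void)
{
	unsigned doms[] = { 1, 3, 5 };	/* remote same-node domains */
	unsigned cur = 0;
	int i;

	for (i = 0; i < 5; i++)
		printf("%u ", dom_rr_next(&cur, doms, 3));
	printf("\n");	/* prints: 3 5 1 3 5 */
	return 0;
}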
@@ -881,13 +904,35 @@ void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev)
 	if (!greedy_threshold)
 		return;
 
-	bpf_repeat(nr_doms - 1) {
-		u32 dom_id = dom_rr_next(cpu);
+	pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]);
+	if (!pcpuc) {
+		scx_bpf_error("Failed to get PCPU context");
+		return;
+	}
+	node_doms = pcpuc->nr_node_doms;
 
-		if (scx_bpf_dsq_nr_queued(dom_id) >= greedy_threshold &&
-		    scx_bpf_consume(dom_id)) {
-			stat_add(RUSTY_STAT_GREEDY, 1);
-			break;
+	/* try to steal a task from domains on the current NUMA node */
+	bpf_for(i, 0, node_doms) {
+		dom = (pcpuc->dom_rr_cur + 1 + i) % node_doms;
+		if (scx_bpf_consume(dom)) {
+			stat_add(RUSTY_STAT_GREEDY_LOCAL, 1);
+			return;
+		}
+	}
+
+	if (!greedy_threshold_x_numa || nr_nodes == 1)
+		return;
+
+	/* try to steal a task from domains on other NUMA nodes */
+	my_node = dom_node_id(pcpuc->dom_id);
+	bpf_repeat(nr_doms - 1) {
+		dom = (pcpuc->dom_rr_cur + 1) % nr_doms;
+		pcpuc->dom_rr_cur++;
+		if (dom_node_id(dom) != my_node &&
+		    scx_bpf_dsq_nr_queued(dom) >= greedy_threshold_x_numa &&
+		    scx_bpf_consume(dom)) {
+			stat_add(RUSTY_STAT_GREEDY_XNUMA, 1);
+			return;
 		}
 	}
 }
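Design note on the dispatch path above: the same-node pass consumes from a peer domain without checking its queue depth (greedy_threshold only gates entry to the whole stealing path), while the cross-node pass additionally requires scx_bpf_dsq_nr_queued(dom) >= greedy_threshold_x_numa, reflecting the higher cost of remote steals. With greedy_threshold_x_numa=4, for example, a domain on another node is only raided while it has at least 4 tasks queued.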
@@ -1192,7 +1237,7 @@ static s32 create_dom(u32 dom_id)
 	struct dom_ctx *domc;
 	struct node_ctx *nodec;
 	struct bpf_cpumask *cpumask, *node_mask;
-	u32 cpu, node_id, *nid_ptr;
+	u32 cpu, node_id;
 	s32 ret;
 
 	if (dom_id >= MAX_DOMS) {
@@ -1260,12 +1305,7 @@ static s32 create_dom(u32 dom_id)
 		return -EEXIST;
 	}
 
-	nid_ptr = MEMBER_VPTR(dom_numa_id_map, [dom_id]);
-	if (!nid_ptr) {
-		scx_bpf_error("Couldn't look up node ID for %s", dom_id);
-		return -EEXIST;
-	}
-	node_id = *nid_ptr;
+	node_id = dom_node_id(dom_id);
 	nodec = bpf_map_lookup_elem(&node_data, &node_id);
 	if (!nodec) {
 		/* Should never happen, it's created statically at load time. */
@@ -1300,6 +1340,67 @@ static s32 create_dom(u32 dom_id)
 	return 0;
 }
 
+static s32 initialize_cpu(s32 cpu)
+{
+	struct bpf_cpumask *cpumask;
+	struct dom_ctx *domc;
+	int i, j = 0;
+	struct pcpu_ctx *pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]);
+	u32 *dom_nodes;
+
+	if (!pcpuc) {
+		scx_bpf_error("Failed to lookup pcpu ctx %d", cpu);
+		return -ENOENT;
+	}
+
+	pcpuc->dom_rr_cur = cpu;
+	bpf_for(i, 0, nr_doms) {
+		domc = bpf_map_lookup_elem(&dom_data, &i);
+		if (!domc) {
+			scx_bpf_error("Failed to lookup dom_ctx");
+			return -ENOENT;
+		}
+		bpf_rcu_read_lock();
+		cpumask = domc->node_cpumask;
+		if (!cpumask) {
+			bpf_rcu_read_unlock();
+			scx_bpf_error("Failed to lookup dom node cpumask");
+			return -ENOENT;
+		}
+
+		if (bpf_cpumask_test_cpu(cpu, (const struct cpumask *)cpumask)) {
+			cpumask = domc->cpumask;
+			if (!cpumask) {
+				bpf_rcu_read_unlock();
+				scx_bpf_error("Failed to lookup dom cpumask");
+				return -ENOENT;
+			}
+			/*
+			 * Only record the remote domains in this array, as
+			 * we'll only ever consume from them on the greedy
+			 * threshold path.
+			 */
+			if (!bpf_cpumask_test_cpu(cpu,
+						  (const struct cpumask *)cpumask)) {
+				dom_nodes = MEMBER_VPTR(pcpuc->node_doms, [j]);
+				if (!dom_nodes) {
+					bpf_rcu_read_unlock();
+					scx_bpf_error("Failed to lookup doms ptr");
+					return -EINVAL;
+				}
+				*dom_nodes = i;
+				j++;
+			} else {
+				pcpuc->dom_id = i;
+			}
+		}
+		bpf_rcu_read_unlock();
+	}
+	pcpuc->nr_node_doms = j;
+
+	return 0;
+}
+
 s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init)
 {
 	struct bpf_cpumask *cpumask;
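What initialize_cpu() precomputes per CPU is easy to model in userspace: the IDs of the remote domains that share the CPU's NUMA node. A minimal sketch with a made-up two-node, four-domain topology (the dom_numa_id_map contents and sizes here are illustrative, not from this diff):

#include <stdio.h>

#define MAX_DOMS 4

/* toy domain -> NUMA node map (illustrative) */
static const unsigned dom_numa_id_map[MAX_DOMS] = { 0, 0, 1, 1 };

int main(void)
{
	unsigned cpu_dom = 0;	/* assume this CPU lives in domain 0 */
	unsigned node_doms[MAX_DOMS];
	unsigned nr_node_doms = 0, i;

	/* record only the *remote* domains on our node, mirroring the
	 * node_doms array that initialize_cpu() fills per CPU */
	for (i = 0; i < MAX_DOMS; i++)
		if (i != cpu_dom &&
		    dom_numa_id_map[i] == dom_numa_id_map[cpu_dom])
			node_doms[nr_node_doms++] = i;

	for (i = 0; i < nr_node_doms; i++)
		printf("node_doms[%u] = %u\n", i, node_doms[i]);
	printf("nr_node_doms = %u\n", nr_node_doms);
	return 0;
}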
@@ -1334,15 +1435,18 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init)
 		if (ret)
 			return ret;
 	}
 
 	bpf_for(i, 0, nr_doms) {
 		ret = create_dom(i);
 		if (ret)
 			return ret;
 	}
 
-	bpf_for(i, 0, nr_cpus)
-		pcpu_ctx[i].dom_rr_cur = i;
+	bpf_for(i, 0, nr_cpus) {
+		ret = initialize_cpu(i);
+		if (ret)
+			return ret;
+	}
 
 	return 0;
 }
@@ -110,13 +110,27 @@ struct Opts {
     #[clap(short = 'C', long, num_args = 1.., conflicts_with = "cache_level")]
     cpumasks: Vec<String>,
 
-    /// When non-zero, enable greedy task stealing. When a domain is idle, a
-    /// cpu will attempt to steal tasks from a domain with at least
-    /// greedy_threshold tasks enqueued. These tasks aren't permanently
-    /// stolen from the domain.
+    /// When non-zero, enable greedy task stealing. When a domain is idle, a cpu
+    /// will attempt to steal tasks from another domain as follows:
+    ///
+    /// 1. Try to consume a task from the current domain
+    /// 2. Try to consume a task from another domain in the current NUMA node
+    ///    (or globally, if running on a single-socket system), if the domain
+    ///    has at least this specified number of tasks enqueued.
+    ///
+    /// See greedy_threshold_x_numa to enable task stealing across NUMA nodes.
+    /// Tasks stolen in this manner are not permanently stolen from their
+    /// domain.
     #[clap(short = 'g', long, default_value = "1")]
     greedy_threshold: u32,
 
+    /// When non-zero, enable greedy task stealing across NUMA nodes. The order
+    /// of greedy task stealing follows greedy_threshold as described above, and
+    /// greedy_threshold must be nonzero to enable task stealing across NUMA
+    /// nodes.
+    #[clap(long, default_value = "0")]
+    greedy_threshold_x_numa: u32,
+
     /// Disable load balancing. Unless disabled, periodically userspace will
     /// calculate the load factor of each domain and instruct BPF which
     /// processes to move.
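Usage note: given clap's default kebab-case flag naming, these options should surface as -g/--greedy-threshold and --greedy-threshold-x-numa (flag spellings inferred from the field names, not shown in this diff). For example, scx_rusty -g 2 --greedy-threshold-x-numa 4 would set the same-node threshold to 2 and the cross-node threshold to 4, while the default of 0 leaves cross-NUMA stealing disabled.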
@@ -286,6 +300,7 @@ impl<'a> Scheduler<'a> {
         skel.rodata_mut().fifo_sched = opts.fifo_sched;
         skel.rodata_mut().switch_partial = opts.partial;
         skel.rodata_mut().greedy_threshold = opts.greedy_threshold;
+        skel.rodata_mut().greedy_threshold_x_numa = opts.greedy_threshold_x_numa;
         skel.rodata_mut().direct_greedy_numa = opts.direct_greedy_numa;
         skel.rodata_mut().debug = opts.verbose as u32;
 
@@ -423,7 +438,8 @@ impl<'a> Scheduler<'a> {
             + stat(bpf_intf::stat_idx_RUSTY_STAT_DIRECT_GREEDY)
             + stat(bpf_intf::stat_idx_RUSTY_STAT_DIRECT_GREEDY_FAR)
             + stat(bpf_intf::stat_idx_RUSTY_STAT_DSQ_DISPATCH)
-            + stat(bpf_intf::stat_idx_RUSTY_STAT_GREEDY);
+            + stat(bpf_intf::stat_idx_RUSTY_STAT_GREEDY_LOCAL)
+            + stat(bpf_intf::stat_idx_RUSTY_STAT_GREEDY_XNUMA);
 
         let numa_load_avg = lb_stats[0].load.load_avg();
         let dom_load_avg = lb_stats[0].domains[0].load.load_avg();
@@ -456,9 +472,14 @@ impl<'a> Scheduler<'a> {
         );
 
         info!(
-            "dsq={:5.2} greedy={:5.2} kick_greedy={:5.2} rep={:5.2}",
+            "dsq={:5.2} greedy_local={:5.2} greedy_xnuma={:5.2}",
             stat_pct(bpf_intf::stat_idx_RUSTY_STAT_DSQ_DISPATCH),
-            stat_pct(bpf_intf::stat_idx_RUSTY_STAT_GREEDY),
+            stat_pct(bpf_intf::stat_idx_RUSTY_STAT_GREEDY_LOCAL),
+            stat_pct(bpf_intf::stat_idx_RUSTY_STAT_GREEDY_XNUMA),
+        );
+
+        info!(
+            "kick_greedy={:5.2} rep={:5.2}",
             stat_pct(bpf_intf::stat_idx_RUSTY_STAT_KICK_GREEDY),
             stat_pct(bpf_intf::stat_idx_RUSTY_STAT_REPATRIATE),
         );