Merge pull request #178 from sched-ext/multi_numa_rusty

rusty: Implement NUMA-aware load balancing
David Vernet 2024-03-12 15:50:27 -05:00 committed by GitHub
commit 91cb5ce8ab
10 changed files with 1806 additions and 806 deletions


@ -207,7 +207,14 @@ impl Cpumask {
impl fmt::Display for Cpumask {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}:<{}>", self.nr_cpus, self.mask)
let slice = self.as_raw_slice();
let mut remaining_width = self.nr_cpus + 2;
write!(f, "{:#0width$b}", slice[0], width = remaining_width.min(66))?;
for submask in &slice[1..] {
remaining_width -= 64;
write!(f, "{:0width$b}", submask, width = remaining_width.min(64))?;
}
Ok(())
}
}
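To illustrate the new formatting, a brief hedged sketch (not part of the commit), assuming `mask` is a Cpumask on an 8-CPU host with CPUs 0-3 set: the first u64 word is printed as a zero-padded binary literal whose width is nr_cpus + 2 (the two extra characters cover the 0b prefix), capped at 66 characters, and every further word of the raw slice is appended as its own zero-padded chunk.

// Hypothetical sketch: 8-CPU host, `mask` covers CPUs 0-3. Hosts with more
// than 64 CPUs emit one extra zero-padded chunk per additional u64 word.
assert_eq!(format!("{}", mask), "0b00001111");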


@ -181,7 +181,7 @@ impl Node {
#[derive(Debug)]
pub struct Topology {
nodes: Vec<Node>,
cores: BTreeMap<usize, Core>,
cores: Vec<Core>,
cpus: BTreeMap<usize, Cpu>,
nr_cpus: usize,
span: Cpumask,
@ -199,14 +199,12 @@ impl Topology {
// system. We clone the objects that are located further down in the
// hierarchy rather than dealing with references, as the entire
// Topology is read-only anyway.
let mut cores = BTreeMap::new();
let mut cores = Vec::new();
let mut cpus = BTreeMap::new();
for node in nodes.iter() {
for (_, llc) in node.llcs.iter() {
for (core_id, core) in llc.cores.iter() {
if let Some(_) = cores.insert(*core_id, core.clone()) {
bail!("Found duplicate core ID {}", core_id);
}
for llc in node.llcs.values() {
for core in llc.cores.values() {
cores.push(core.clone());
for (cpu_id, cpu) in core.cpus.iter() {
if let Some(_) = cpus.insert(*cpu_id, cpu.clone()) {
bail!("Found duplicate CPU ID {}", cpu_id);
@ -219,13 +217,13 @@ impl Topology {
Ok(Topology { nodes, nr_cpus, cores, cpus, span })
}
/// Get a slice of the NUMA nodes on the host
/// Get a slice of the NUMA nodes on the host.
pub fn nodes(&self) -> &[Node] {
&self.nodes
}
/// Get a hashmap of <core ID, Core> for all Cores on the host.
pub fn cores(&self) -> &BTreeMap<usize, Core> {
/// Get a slice of all Cores on the host.
pub fn cores(&self) -> &[Core] {
&self.cores
}


@ -296,7 +296,7 @@ impl<'a> Scheduler<'a> {
let mut idle_cpu_count = 0;
// Count the number of cores where all the CPUs are idle.
for (_, core) in self.topo.cores().iter() {
for core in self.topo.cores().iter() {
let mut all_idle = true;
for (cpu_id, _) in core.cpus().iter() {
if self.bpf.get_cpu_pid(*cpu_id as i32) != 0 {
@ -669,14 +669,14 @@ impl<'a> Scheduler<'a> {
Err(_) => -1,
};
info!("Running tasks:");
for (core_id, core) in self.topo.cores().iter() {
for core in self.topo.cores().iter() {
for (cpu_id, _) in core.cpus().iter() {
let pid = if *cpu_id as i32 == sched_cpu {
"[self]".to_string()
} else {
self.bpf.get_cpu_pid(*cpu_id as i32).to_string()
};
info!(" core {:2} cpu {:2} pid={}", core_id, cpu_id, pid);
info!(" core {:2} cpu {:2} pid={}", core.id(), cpu_id, pid);
}
}


@ -17,6 +17,7 @@ log = "0.4.17"
ordered-float = "3.4.0"
scx_utils = { path = "../../../rust/scx_utils", version = "0.6" }
simplelog = "0.12.0"
sorted-vec = "0.8.3"
static_assertions = "1.1.0"
[build-dependencies]


@ -24,6 +24,7 @@ typedef unsigned long long u64;
enum consts {
MAX_CPUS = 512,
MAX_DOMS = 64, /* limited to avoid complex bitmask ops */
MAX_NUMA_NODES = MAX_DOMS, /* Assume at least 1 domain per NUMA node */
CACHELINE_SIZE = 64,
LB_DEFAULT_WEIGHT = 100,
@ -54,7 +55,8 @@ enum stat_idx {
RUSTY_STAT_DIRECT_GREEDY,
RUSTY_STAT_DIRECT_GREEDY_FAR,
RUSTY_STAT_DSQ_DISPATCH,
RUSTY_STAT_GREEDY,
RUSTY_STAT_GREEDY_LOCAL,
RUSTY_STAT_GREEDY_XNUMA,
/* Extra stats that don't contribute to total */
RUSTY_STAT_REPATRIATE,
@ -72,6 +74,7 @@ struct task_ctx {
u64 dom_mask;
struct bpf_cpumask __kptr *cpumask;
struct bpf_cpumask __kptr *tmp_cpumask;
u32 dom_id;
u32 weight;
bool runnable;
@ -99,9 +102,15 @@ struct dom_ctx {
u64 vtime_now;
struct bpf_cpumask __kptr *cpumask;
struct bpf_cpumask __kptr *direct_greedy_cpumask;
struct bpf_cpumask __kptr *node_cpumask;
u32 node_id;
u64 dbg_dcycle_printed_at;
struct bucket_ctx buckets[LB_LOAD_BUCKETS];
};
struct node_ctx {
struct bpf_cpumask __kptr *cpumask;
};
#endif /* __INTF_H */


@ -59,15 +59,20 @@ struct user_exit_info uei;
* Domains and cpus
*/
const volatile u32 nr_doms = 32; /* !0 for veristat, set during init */
const volatile u32 nr_nodes = 32; /* !0 for veristat, set during init */
const volatile u32 nr_cpus = 64; /* !0 for veristat, set during init */
const volatile u32 cpu_dom_id_map[MAX_CPUS];
const volatile u32 dom_numa_id_map[MAX_DOMS];
const volatile u64 dom_cpumasks[MAX_DOMS][MAX_CPUS / 64];
const volatile u64 numa_cpumasks[MAX_NUMA_NODES][MAX_CPUS / 64];
const volatile u32 load_half_life = 1000000000 /* 1s */;
const volatile bool kthreads_local;
const volatile bool fifo_sched;
const volatile bool switch_partial;
const volatile bool direct_greedy_numa;
const volatile u32 greedy_threshold;
const volatile u32 greedy_threshold_x_numa;
const volatile u32 debug;
/* base slice duration */
@ -78,13 +83,27 @@ const volatile u64 slice_ns = SCX_SLICE_DFL;
*/
struct pcpu_ctx {
u32 dom_rr_cur; /* used when scanning other doms */
u32 dom_id;
u32 nr_node_doms;
u32 node_doms[MAX_DOMS];
/* libbpf-rs does not respect the alignment, so pad out the struct explicitly */
u8 _padding[CACHELINE_SIZE - sizeof(u32)];
u8 _padding[CACHELINE_SIZE - ((3 + MAX_DOMS) * sizeof(u32) % CACHELINE_SIZE)];
} __attribute__((aligned(CACHELINE_SIZE)));
struct pcpu_ctx pcpu_ctx[MAX_CPUS];
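For the constants in this commit (MAX_DOMS = 64, CACHELINE_SIZE = 64), the non-padding members occupy (3 + 64) * 4 = 268 bytes, and 268 % 64 = 12, so the explicit padding adds 64 - 12 = 52 bytes and rounds each per-CPU context up to 320 bytes, an exact multiple of the cache line size.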
/*
* Numa node context
*/
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, struct node_ctx);
__uint(max_entries, MAX_NUMA_NODES);
__uint(map_flags, 0);
} node_data SEC(".maps");
/*
* Domain context
*/
@ -467,7 +486,7 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,
{
const struct cpumask *idle_smtmask = scx_bpf_get_idle_smtmask();
struct task_ctx *taskc;
struct bpf_cpumask *p_cpumask;
struct bpf_cpumask *p_cpumask, *tmp_cpumask = NULL;
bool prev_domestic, has_idle_cores;
s32 cpu;
@ -599,19 +618,51 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,
/*
* Domestic domain is fully booked. If there are CPUs which are idle and
* under-utilized, ignore domain boundaries and push the task there. Try
* to find an idle core first.
* under-utilized, ignore domain boundaries (while still respecting NUMA
* boundaries) and push the task there. Try to find an idle core first.
*/
if (taskc->all_cpus && direct_greedy_cpumask &&
!bpf_cpumask_empty((const struct cpumask *)direct_greedy_cpumask)) {
u32 dom_id = cpu_to_dom_id(prev_cpu);
struct dom_ctx *domc;
struct bpf_cpumask *tmp_direct_greedy, *node_mask;
if (!(domc = bpf_map_lookup_elem(&dom_data, &dom_id))) {
scx_bpf_error("Failed to lookup dom[%u]", dom_id);
goto enoent;
}
tmp_direct_greedy = direct_greedy_cpumask;
if (!tmp_direct_greedy) {
scx_bpf_error("Failed to lookup direct_greedy mask");
goto enoent;
}
/*
* By default, only look for an idle core in the current NUMA
* node when looking for direct greedy CPUs outside of the
* current domain. Stealing work temporarily is fine when
* you're going across domain boundaries, but it may be less
* desirable when crossing NUMA boundaries as the task's
* working set may end up spanning multiple NUMA nodes.
*/
if (!direct_greedy_numa) {
node_mask = domc->node_cpumask;
if (!node_mask) {
scx_bpf_error("Failed to lookup node mask");
goto enoent;
}
tmp_cpumask = bpf_kptr_xchg(&taskc->tmp_cpumask, NULL);
if (!tmp_cpumask) {
scx_bpf_error("Failed to lookup tmp cpumask");
goto enoent;
}
bpf_cpumask_and(tmp_cpumask,
(const struct cpumask *)node_mask,
(const struct cpumask *)tmp_direct_greedy);
tmp_direct_greedy = tmp_cpumask;
}
/* Try to find an idle core in the previous and then any domain */
if (has_idle_cores) {
if (domc->direct_greedy_cpumask) {
@ -626,7 +677,7 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,
if (direct_greedy_cpumask) {
cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)
direct_greedy_cpumask,
tmp_direct_greedy,
SCX_PICK_IDLE_CORE);
if (cpu >= 0) {
stat_add(RUSTY_STAT_DIRECT_GREEDY_FAR, 1);
@ -649,7 +700,7 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,
if (direct_greedy_cpumask) {
cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)
direct_greedy_cpumask, 0);
tmp_direct_greedy, 0);
if (cpu >= 0) {
stat_add(RUSTY_STAT_DIRECT_GREEDY_FAR, 1);
goto direct;
@ -668,10 +719,20 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,
else
cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0);
if (tmp_cpumask) {
tmp_cpumask = bpf_kptr_xchg(&taskc->tmp_cpumask, tmp_cpumask);
if (tmp_cpumask)
bpf_cpumask_release(tmp_cpumask);
}
scx_bpf_put_idle_cpumask(idle_smtmask);
return cpu;
direct:
if (tmp_cpumask) {
tmp_cpumask = bpf_kptr_xchg(&taskc->tmp_cpumask, tmp_cpumask);
if (tmp_cpumask)
bpf_cpumask_release(tmp_cpumask);
}
taskc->dispatch_local = true;
scx_bpf_put_idle_cpumask(idle_smtmask);
return cpu;
@ -797,24 +858,43 @@ static bool cpumask_intersects_domain(const struct cpumask *cpumask, u32 dom_id)
static u32 dom_rr_next(s32 cpu)
{
struct pcpu_ctx *pcpuc;
u32 dom_id;
u32 idx, *dom_id;
pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]);
if (!pcpuc)
if (!pcpuc || !pcpuc->nr_node_doms)
return 0;
dom_id = (pcpuc->dom_rr_cur + 1) % nr_doms;
idx = (pcpuc->dom_rr_cur + 1) % pcpuc->nr_node_doms;
dom_id = MEMBER_VPTR(pcpuc->node_doms, [idx]);
if (!dom_id) {
scx_bpf_error("Failed to lookup dom for %d", cpu);
return 0;
}
if (dom_id == cpu_to_dom_id(cpu))
dom_id = (dom_id + 1) % nr_doms;
if (*dom_id == cpu_to_dom_id(cpu))
scx_bpf_error("%d found current dom in node_doms array", cpu);
pcpuc->dom_rr_cur = dom_id;
return dom_id;
pcpuc->dom_rr_cur++;
return *dom_id;
}
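As a worked example of the new round-robin: a CPU whose node has three other domains recorded in node_doms, and whose dom_rr_cur is currently 7, computes idx = (7 + 1) % 3 = 2, returns node_doms[2], and bumps dom_rr_cur to 8 so the following call starts again from node_doms[0].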
u32 dom_node_id(u32 dom_id)
{
u32 *nid_ptr;
nid_ptr = MEMBER_VPTR(dom_numa_id_map, [dom_id]);
if (!nid_ptr) {
scx_bpf_error("Couldn't look up node ID for %s", dom_id);
return 0;
}
return *nid_ptr;
}
void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev)
{
u32 dom = cpu_to_dom_id(cpu);
struct pcpu_ctx *pcpuc;
u32 node_doms, my_node, i;
if (scx_bpf_consume(dom)) {
stat_add(RUSTY_STAT_DSQ_DISPATCH, 1);
@ -824,13 +904,35 @@ void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev)
if (!greedy_threshold)
return;
bpf_repeat(nr_doms - 1) {
u32 dom_id = dom_rr_next(cpu);
pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]);
if (!pcpuc) {
scx_bpf_error("Failed to get PCPU context");
return;
}
node_doms = pcpuc->nr_node_doms;
if (scx_bpf_dsq_nr_queued(dom_id) >= greedy_threshold &&
scx_bpf_consume(dom_id)) {
stat_add(RUSTY_STAT_GREEDY, 1);
break;
/* try to steal a task from domains on the current NUMA node */
bpf_for(i, 0, node_doms) {
dom = (pcpuc->dom_rr_cur + 1 + i) % node_doms;
if (scx_bpf_consume(dom)) {
stat_add(RUSTY_STAT_GREEDY_LOCAL, 1);
return;
}
}
if (!greedy_threshold_x_numa || nr_nodes == 1)
return;
/* try to steal a task from domains on other NUMA nodes */
my_node = dom_node_id(pcpuc->dom_id);
bpf_repeat(nr_doms - 1) {
dom = (pcpuc->dom_rr_cur + 1) % nr_doms;
pcpuc->dom_rr_cur++;
if (dom_node_id(dom) != my_node &&
scx_bpf_dsq_nr_queued(dom) >= greedy_threshold_x_numa &&
scx_bpf_consume(dom)) {
stat_add(RUSTY_STAT_GREEDY_XNUMA, 1);
return;
}
}
}
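In the reworked dispatch path above, same-node stealing is attempted whenever greedy_threshold is non-zero, while cross-node stealing is additionally gated by greedy_threshold_x_numa: for example, with greedy_threshold_x_numa = 4, a CPU that finds nothing to consume on its own node only pulls from a remote-node domain once that domain's DSQ has at least four queued tasks.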
@ -1053,6 +1155,18 @@ s32 BPF_STRUCT_OPS(rusty_init_task, struct task_struct *p,
return -EINVAL;
}
cpumask = bpf_cpumask_create();
if (!cpumask) {
scx_bpf_error("Failed to create BPF cpumask for task");
return -ENOMEM;
}
cpumask = bpf_kptr_xchg(&map_value->tmp_cpumask, cpumask);
if (cpumask) {
scx_bpf_error("%s[%d] tmp_cpumask already present", p->comm, p->pid);
bpf_cpumask_release(cpumask);
return -EEXIST;
}
task_pick_and_set_domain(map_value, p, p->cpus_ptr, true);
return 0;
@ -1077,11 +1191,53 @@ void BPF_STRUCT_OPS(rusty_exit_task, struct task_struct *p,
}
}
static s32 create_node(u32 node_id)
{
u32 cpu;
struct bpf_cpumask *cpumask;
struct node_ctx *nodec;
nodec = bpf_map_lookup_elem(&node_data, &node_id);
if (!nodec) {
/* Should never happen, it's created statically at load time. */
scx_bpf_error("No node%u", node_id);
return -ENOENT;
}
cpumask = bpf_cpumask_create();
if (!cpumask)
return -ENOMEM;
for (cpu = 0; cpu < MAX_CPUS; cpu++) {
const volatile u64 *nmask;
nmask = MEMBER_VPTR(numa_cpumasks, [node_id][cpu / 64]);
if (!nmask) {
scx_bpf_error("array index error");
bpf_cpumask_release(cpumask);
return -ENOENT;
}
if (*nmask & (1LLU << (cpu % 64)))
bpf_cpumask_set_cpu(cpu, cpumask);
}
cpumask = bpf_kptr_xchg(&nodec->cpumask, cpumask);
if (cpumask) {
scx_bpf_error("Node %u cpumask already present", node_id);
bpf_cpumask_release(cpumask);
return -EEXIST;
}
return 0;
}
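The bit arithmetic in create_node() maps each CPU to one u64 word of numa_cpumasks: CPU 70, for example, is tested against bit 70 % 64 = 6 of numa_cpumasks[node_id][70 / 64], i.e. word 1.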
static s32 create_dom(u32 dom_id)
{
struct dom_ctx *domc;
struct bpf_cpumask *cpumask;
u32 cpu;
struct node_ctx *nodec;
struct bpf_cpumask *cpumask, *node_mask;
u32 cpu, node_id;
s32 ret;
if (dom_id >= MAX_DOMS) {
@ -1141,7 +1297,6 @@ static s32 create_dom(u32 dom_id)
dom_id);
return -ENOMEM;
}
cpumask = bpf_kptr_xchg(&domc->direct_greedy_cpumask, cpumask);
if (cpumask) {
scx_bpf_error("Domain %u direct_greedy_cpumask already present",
@ -1150,6 +1305,99 @@ static s32 create_dom(u32 dom_id)
return -EEXIST;
}
node_id = dom_node_id(dom_id);
nodec = bpf_map_lookup_elem(&node_data, &node_id);
if (!nodec) {
/* Should never happen, it's created statically at load time. */
scx_bpf_error("No node%u", node_id);
return -ENOENT;
}
bpf_rcu_read_lock();
node_mask = nodec->cpumask;
if (!node_mask) {
bpf_rcu_read_unlock();
scx_bpf_error("NUMA %d mask not found for domain %u",
node_id, dom_id);
return -ENOENT;
}
cpumask = bpf_cpumask_create();
if (!cpumask) {
bpf_rcu_read_unlock();
scx_bpf_error("Failed to create BPF cpumask for domain %u",
dom_id);
return -ENOMEM;
}
bpf_cpumask_copy(cpumask, (const struct cpumask *)node_mask);
bpf_rcu_read_unlock();
cpumask = bpf_kptr_xchg(&domc->node_cpumask, cpumask);
if (cpumask) {
scx_bpf_error("Domain %u node_cpumask already present",
dom_id);
bpf_cpumask_release(cpumask);
return -EEXIST;
}
return 0;
}
static s32 initialize_cpu(s32 cpu)
{
struct bpf_cpumask *cpumask;
struct dom_ctx *domc;
int i, j = 0;
struct pcpu_ctx *pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]);
u32 *dom_nodes;
if (!pcpuc) {
scx_bpf_error("Failed to lookup pcpu ctx %d", cpu);
return -ENOENT;
}
pcpuc->dom_rr_cur = cpu;
bpf_for(i, 0, nr_doms) {
domc = bpf_map_lookup_elem(&dom_data, &i);
if (!domc) {
scx_bpf_error("Failed to lookup dom_ctx");
return -ENOENT;
}
bpf_rcu_read_lock();
cpumask = domc->node_cpumask;
if (!cpumask) {
bpf_rcu_read_unlock();
scx_bpf_error("Failed to lookup dom node cpumask");
return -ENOENT;
}
if (bpf_cpumask_test_cpu(cpu, (const struct cpumask *)cpumask)) {
cpumask = domc->cpumask;
if (!cpumask) {
bpf_rcu_read_unlock();
scx_bpf_error("Failed to lookup dom cpumask");
return -ENOENT;
}
/*
* Only record the remote domains in this array, as
* we'll only ever consume from them on the greedy
* threshold path.
*/
if (!bpf_cpumask_test_cpu(cpu,
(const struct cpumask *)cpumask)) {
dom_nodes = MEMBER_VPTR(pcpuc->node_doms, [j]);
if (!dom_nodes) {
bpf_rcu_read_unlock();
scx_bpf_error("Failed to lookup doms ptr");
return -EINVAL;
}
*dom_nodes = i;
j++;
} else {
pcpuc->dom_id = i;
}
}
bpf_rcu_read_unlock();
}
pcpuc->nr_node_doms = j;
return 0;
}
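As a hypothetical example of what initialize_cpu() produces: on a two-node host with domains 0 and 1 on node 0 and domains 2 and 3 on node 1, a CPU belonging to domain 0 ends up with dom_id = 0, node_doms = [1], and nr_node_doms = 1; domains 2 and 3 are skipped entirely because the CPU is not in their node cpumask.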
@ -1182,14 +1430,22 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init)
if (!switch_partial)
scx_bpf_switch_all();
bpf_for(i, 0, nr_nodes) {
ret = create_node(i);
if (ret)
return ret;
}
bpf_for(i, 0, nr_doms) {
ret = create_dom(i);
if (ret)
return ret;
}
bpf_for(i, 0, nr_cpus)
pcpu_ctx[i].dom_rr_cur = i;
bpf_for(i, 0, nr_cpus) {
ret = initialize_cpu(i);
if (ret)
return ret;
}
return 0;
}


@ -0,0 +1,112 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
use std::collections::BTreeMap;
use std::sync::Arc;
use anyhow::Result;
use scx_utils::Cpumask;
use scx_utils::Topology;
#[derive(Clone, Debug)]
pub struct Domain {
id: usize,
mask: Cpumask,
}
impl Domain {
/// Get the Domain's ID.
pub fn id(&self) -> usize {
self.id
}
/// Get a copy of the domain's cpumask.
pub fn mask(&self) -> Cpumask {
self.mask.clone()
}
/// Get a raw slice of the domain's cpumask as a set of one or more u64
/// variables whose bits represent CPUs in the mask.
pub fn mask_slice(&self) -> &[u64] {
self.mask.as_raw_slice()
}
/// The number of CPUs in the domain.
pub fn weight(&self) -> usize {
self.mask.len()
}
}
#[derive(Debug)]
pub struct DomainGroup {
doms: BTreeMap<usize, Domain>,
cpu_dom_map: BTreeMap<usize, usize>,
dom_numa_map: BTreeMap<usize, usize>,
num_numa_nodes: usize,
}
impl DomainGroup {
pub fn new(top: Arc<Topology>, cpumasks: &[String]) -> Result<Self> {
let mut dom_numa_map = BTreeMap::new();
let (doms, num_numa_nodes) = if !cpumasks.is_empty() {
let mut doms: BTreeMap<usize, Domain> = BTreeMap::new();
let mut id = 0;
for mask_str in cpumasks.iter() {
let mask = Cpumask::from_str(&mask_str)?;
doms.insert(id, Domain { id, mask, });
dom_numa_map.insert(id, 0);
id += 1;
}
(doms, 1)
} else {
let mut doms: BTreeMap<usize, Domain> = BTreeMap::new();
for (node_id, node) in top.nodes().iter().enumerate() {
for (id, llc) in node.llcs().iter() {
let mask = llc.span();
doms.insert(*id, Domain { id: id.clone(), mask, });
dom_numa_map.insert(*id, node_id.clone());
}
}
(doms, top.nodes().len())
};
let mut cpu_dom_map = BTreeMap::new();
for (id, dom) in doms.iter() {
for cpu in dom.mask.clone().into_iter() {
cpu_dom_map.insert(cpu, *id);
}
}
Ok(Self { doms, cpu_dom_map, dom_numa_map, num_numa_nodes, })
}
pub fn numa_doms(&self, numa_id: &usize) -> Vec<Domain> {
let mut numa_doms = Vec::new();
for (d_id, n_id) in self.dom_numa_map.iter() {
if n_id == numa_id {
let dom = self.doms.get(d_id).unwrap();
numa_doms.push(dom.clone());
}
}
numa_doms
}
pub fn nr_doms(&self) -> usize {
self.doms.len()
}
pub fn nr_nodes(&self) -> usize {
self.num_numa_nodes
}
pub fn cpu_dom_id(&self, cpu: &usize) -> Option<usize> {
self.cpu_dom_map.get(cpu).copied()
}
pub fn dom_numa_id(&self, dom_id: &usize) -> Option<usize> {
self.dom_numa_map.get(dom_id).copied()
}
}
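A hedged usage sketch (not part of the commit) of how a scheduler might build and query a DomainGroup when no explicit cpumasks are passed on the command line; Topology::new() is assumed as the scx_utils entry point for host discovery, and dump_domains is an illustrative helper name:

use std::sync::Arc;
use anyhow::Result;
use scx_utils::Topology;

fn dump_domains() -> Result<()> {
    // Build one domain per LLC, grouped under the NUMA node that owns it.
    let top = Arc::new(Topology::new()?);
    let doms = DomainGroup::new(top, &[])?;
    for node in 0..doms.nr_nodes() {
        for dom in doms.numa_doms(&node) {
            println!("node {} dom {} spans {} CPUs", node, dom.id(), dom.weight());
        }
    }
    Ok(())
}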

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -0,0 +1,183 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
use std::collections::BTreeMap;
use std::sync::Arc;
use crate::sub_or_zero;
use crate::DomainGroup;
use crate::BpfSkel;
use ::fb_procfs as procfs;
use anyhow::anyhow;
use anyhow::bail;
use anyhow::Result;
use scx_utils::Topology;
fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result<f64> {
match (curr, prev) {
(
procfs::CpuStat {
user_usec: Some(curr_user),
nice_usec: Some(curr_nice),
system_usec: Some(curr_system),
idle_usec: Some(curr_idle),
iowait_usec: Some(curr_iowait),
irq_usec: Some(curr_irq),
softirq_usec: Some(curr_softirq),
stolen_usec: Some(curr_stolen),
..
},
procfs::CpuStat {
user_usec: Some(prev_user),
nice_usec: Some(prev_nice),
system_usec: Some(prev_system),
idle_usec: Some(prev_idle),
iowait_usec: Some(prev_iowait),
irq_usec: Some(prev_irq),
softirq_usec: Some(prev_softirq),
stolen_usec: Some(prev_stolen),
..
},
) => {
let idle_usec = sub_or_zero(curr_idle, prev_idle);
let iowait_usec = sub_or_zero(curr_iowait, prev_iowait);
let user_usec = sub_or_zero(curr_user, prev_user);
let system_usec = sub_or_zero(curr_system, prev_system);
let nice_usec = sub_or_zero(curr_nice, prev_nice);
let irq_usec = sub_or_zero(curr_irq, prev_irq);
let softirq_usec = sub_or_zero(curr_softirq, prev_softirq);
let stolen_usec = sub_or_zero(curr_stolen, prev_stolen);
let busy_usec =
user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec;
let total_usec = idle_usec + busy_usec + iowait_usec;
if total_usec > 0 {
Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0))
} else {
Ok(1.0)
}
}
_ => {
bail!("Missing stats in cpustat");
}
}
}
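For example, if between two samples a CPU accumulated 750,000 µs of busy time (user + system + nice + irq + softirq + stolen), 200,000 µs of idle time, and 50,000 µs of iowait, calc_util() reports 750,000 / 1,000,000 = 0.75; a CPU with no accounted time at all is treated as fully utilized (1.0).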
pub struct Tuner {
top: Arc<Topology>,
dom_group: Arc<DomainGroup>,
direct_greedy_under: f64,
kick_greedy_under: f64,
proc_reader: procfs::ProcReader,
prev_cpu_stats: BTreeMap<u32, procfs::CpuStat>,
pub fully_utilized: bool,
dom_utils: Vec<f64>,
}
impl Tuner {
pub fn new(top: Arc<Topology>,
dom_group: Arc<DomainGroup>,
direct_greedy_under: f64,
kick_greedy_under: f64) -> Result<Self> {
let proc_reader = procfs::ProcReader::new();
let prev_cpu_stats = proc_reader
.read_stat()?
.cpus_map
.ok_or_else(|| anyhow!("Expected cpus_map to exist"))?;
Ok(Self {
direct_greedy_under: direct_greedy_under / 100.0,
kick_greedy_under: kick_greedy_under / 100.0,
proc_reader,
prev_cpu_stats,
dom_utils: vec![0.0; dom_group.nr_doms()],
fully_utilized: false,
top,
dom_group,
})
}
pub fn dom_util(&self, dom: usize) -> f64 {
self.dom_utils[dom]
}
/// Apply a step in the Tuner by:
///
/// 1. Recording CPU stats from procfs
/// 2. Calculating current per-domain and host-wide utilization
/// 3. Updating direct_greedy_under and kick_greedy_under cpumasks according
/// to the observed utilization
pub fn step(&mut self, skel: &mut BpfSkel) -> Result<()> {
let curr_cpu_stats = self
.proc_reader
.read_stat()?
.cpus_map
.ok_or_else(|| anyhow!("Expected cpus_map to exist"))?;
let mut dom_nr_cpus = vec![0; self.dom_group.nr_doms()];
let mut dom_util_sum = vec![0.0; self.dom_group.nr_doms()];
let mut avg_util = 0.0f64;
for cpu in 0..self.top.nr_cpus() {
let cpu32 = cpu as u32;
// None domain indicates the CPU was offline during
// initialization and None CpuStat indicates the CPU has gone
// down since then. Ignore both.
if let (Some(dom), Some(curr), Some(prev)) = (
self.dom_group.cpu_dom_id(&cpu),
curr_cpu_stats.get(&cpu32),
self.prev_cpu_stats.get(&cpu32),
) {
let util = calc_util(curr, prev)?;
dom_nr_cpus[dom] += 1;
dom_util_sum[dom] += util;
avg_util += util;
}
}
avg_util /= self.top.nr_cpus() as f64;
self.fully_utilized = avg_util >= 0.99999;
let ti = &mut skel.bss_mut().tune_input;
for dom in 0..self.dom_group.nr_doms() {
// Calculate the domain avg util. If there are no active CPUs,
// it doesn't really matter. Go with 0.0 as that's less likely
// to confuse users.
let util = match dom_nr_cpus[dom] {
0 => 0.0,
nr => dom_util_sum[dom] / nr as f64,
};
self.dom_utils[dom] = util;
// This could be implemented better.
let update_dom_bits = |target: &mut [u64; 8], val: bool| {
for cpu in 0..self.top.nr_cpus() {
if let Some(cdom) = self.dom_group.cpu_dom_id(&cpu) {
if cdom == dom {
if val {
target[cpu / 64] |= 1u64 << (cpu % 64);
} else {
target[cpu / 64] &= !(1u64 << (cpu % 64));
}
}
}
}
};
update_dom_bits(
&mut ti.direct_greedy_cpumask,
self.direct_greedy_under > 0.99999 || util < self.direct_greedy_under,
);
update_dom_bits(
&mut ti.kick_greedy_cpumask,
self.kick_greedy_under > 0.99999 || util < self.kick_greedy_under,
);
}
ti.gen += 1;
self.prev_cpu_stats = curr_cpu_stats;
Ok(())
}
}
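A hedged sketch of how the scheduler's main loop might drive the Tuner; the 90 %/100 % thresholds, the 100 ms interval, and the tuning_loop, skel, and dom_group names are illustrative assumptions rather than scx_rusty's actual defaults:

use std::sync::Arc;
use std::time::Duration;
use anyhow::Result;
use scx_utils::Topology;

fn tuning_loop(top: Arc<Topology>, dom_group: Arc<DomainGroup>, skel: &mut BpfSkel) -> Result<()> {
    // Retune the direct/kick greedy cpumasks from fresh /proc/stat samples.
    let mut tuner = Tuner::new(top, dom_group.clone(), 90.0, 100.0)?;
    loop {
        std::thread::sleep(Duration::from_millis(100));
        tuner.step(skel)?;
        for dom in 0..dom_group.nr_doms() {
            println!("dom {} util {:.2}", dom, tuner.dom_util(dom));
        }
    }
}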