scx_flatcg: Keep cgroup rb nodes stashed
The flatcg scheduler uses an rb_node type - struct cgv_node - to keep track of vtime. On cgroup init, a cgv_node is created and stashed in a hashmap - cgv_node_stash - for later use. In cgrp_enqueued and try_pick_next_cgroup, the node is inserted into the rbtree, which, before this patch's changes, required removing it from the stash.

This patch makes cgv_node refcounted, which allows keeping it in the stash for the entirety of the cgroup's lifetime. Unnecessary bpf_kptr_xchg's and other boilerplate can be removed as a result.

Note that in addition to the bpf_refcount patches, which have been upstream for quite some time, this change depends on a more recent series [0].

[0]: https://lore.kernel.org/bpf/20231107085639.3016113-1-davemarchevsky@fb.com/

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
parent b7c06b9ed9
commit 3b7f33ea1b
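A minimal sketch of the enqueue pattern before and after this change, simplified rather than copied from the scheduler source: the map size, stats, and the enqueue_before()/enqueue_after() helper names are illustrative, and the usual scx/BPF boilerplate (the private() macro, container_of, and the rbtree/refcount kfunc declarations) is assumed to be in scope.

/* Refcounted node that can sit in the stash and on the rbtree at once. */
struct cgv_node {
	struct bpf_rb_node	rb_node;
	__u64			cvtime;
	__u64			cgid;
	struct bpf_refcount	refcount;
};

/* Per-cgroup stash slot; the kptr owns a reference for the cgroup's lifetime. */
struct cgv_node_stash {
	struct cgv_node __kptr *node;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 16384);
	__type(key, __u64);
	__type(value, struct cgv_node_stash);
} cgv_node_stash SEC(".maps");

private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock;
private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node);

static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
{
	struct cgv_node *ca = container_of(a, struct cgv_node, rb_node);
	const struct cgv_node *cb = container_of(b, struct cgv_node, rb_node);

	return ca->cvtime < cb->cvtime;
}

/*
 * Before: the stash's kptr is the node's only reference, so enqueueing means
 * moving the node out of the stash and xchg'ing it back in once it comes off
 * the tree.
 */
static void enqueue_before(struct cgv_node_stash *stash)
{
	struct cgv_node *cgv_node;

	cgv_node = bpf_kptr_xchg(&stash->node, NULL);
	if (!cgv_node)
		return;	/* NULL means the node is already on the rbtree */

	bpf_spin_lock(&cgv_tree_lock);
	bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
	bpf_spin_unlock(&cgv_tree_lock);
}

/*
 * After: the stash keeps its reference; enqueueing acquires an extra one and
 * hands that to the rbtree, so nothing has to be put back afterwards.
 */
static void enqueue_after(struct cgv_node_stash *stash)
{
	struct cgv_node *cgv_node;

	if (!stash->node)
		return;

	cgv_node = bpf_refcount_acquire(stash->node);
	if (!cgv_node)
		return;	/* only possible if the stashed node was just deleted */

	bpf_spin_lock(&cgv_tree_lock);
	bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
	bpf_spin_unlock(&cgv_tree_lock);
}

The difference is ownership: in the old scheme the node shuttles between the stash and the rbtree, while in the new scheme the stash holds a reference for the cgroup's whole lifetime and the rbtree gets its own.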
@@ -100,6 +100,7 @@ struct cgv_node {
 	struct bpf_rb_node	rb_node;
 	__u64			cvtime;
 	__u64			cgid;
+	struct bpf_refcount	refcount;
 };
 
 private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock;
@@ -289,14 +290,17 @@ static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
 	}
 
 	stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
-	if (!stash) {
+	if (!stash || !stash->node) {
 		scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid);
 		return;
 	}
 
-	/* NULL if the node is already on the rbtree */
-	cgv_node = bpf_kptr_xchg(&stash->node, NULL);
+	cgv_node = bpf_refcount_acquire(stash->node);
 	if (!cgv_node) {
+		/*
+		 * Node never leaves cgv_node_stash, this should only happen if
+		 * fcg_cgroup_exit deletes the stashed node
+		 */
 		stat_inc(FCG_STAT_ENQ_RACE);
 		return;
 	}
@@ -609,7 +613,6 @@ void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
 static bool try_pick_next_cgroup(u64 *cgidp)
 {
 	struct bpf_rb_node *rb_node;
-	struct cgv_node_stash *stash;
 	struct cgv_node *cgv_node;
 	struct fcg_cgrp_ctx *cgc;
 	struct cgroup *cgrp;
@@ -693,12 +696,6 @@ static bool try_pick_next_cgroup(u64 *cgidp)
 	return true;
 
 out_stash:
-	stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
-	if (!stash) {
-		stat_inc(FCG_STAT_PNC_GONE);
-		goto out_free;
-	}
-
 	/*
 	 * Paired with cmpxchg in cgrp_enqueued(). If they see the following
 	 * transition, they'll enqueue the cgroup. If they are earlier, we'll
@@ -711,16 +708,9 @@ out_stash:
 		bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
 		bpf_spin_unlock(&cgv_tree_lock);
 		stat_inc(FCG_STAT_PNC_RACE);
-	} else {
-		cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
-		if (cgv_node) {
-			scx_bpf_error("unexpected !NULL cgv_node stash");
-			goto out_free;
-		}
+		return false;
 	}
 
-	return false;
-
 out_free:
 	bpf_obj_drop(cgv_node);
 	return false;
@@ -200,6 +200,9 @@ int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
 
 struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym;
 
+extern void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym;
+#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL)
+
 /* task */
struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
 struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
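The same ownership rule drives the simplification in try_pick_next_cgroup() above: every reference must end up either back in a collection or in bpf_obj_drop(). A hedged sketch of the post-patch disposal logic, reusing the declarations from the earlier sketch; pick_front_sketch() and the requeue flag are illustrative names, and the real code makes this decision via the cmpxchg paired with cgrp_enqueued(), as the comment in the diff notes.

/*
 * Pop the front node and either give it back to the tree (requeue race) or
 * drop our reference; the stash still holds its own, so no kptr_xchg back
 * into cgv_node_stash is needed. Sketch only, not the scheduler source.
 */
static void pick_front_sketch(bool requeue)
{
	struct bpf_rb_node *rb_node;
	struct cgv_node *cgv_node;

	bpf_spin_lock(&cgv_tree_lock);
	rb_node = bpf_rbtree_first(&cgv_tree);
	if (!rb_node) {
		bpf_spin_unlock(&cgv_tree_lock);
		return;
	}

	rb_node = bpf_rbtree_remove(&cgv_tree, rb_node);
	bpf_spin_unlock(&cgv_tree_lock);
	if (!rb_node)
		return;

	cgv_node = container_of(rb_node, struct cgv_node, rb_node);

	if (requeue) {
		/* the cgroup was enqueued again: hand the reference back to the tree */
		bpf_spin_lock(&cgv_tree_lock);
		bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
		bpf_spin_unlock(&cgv_tree_lock);
	} else {
		/* the stash keeps the node alive, so this reference can simply go */
		bpf_obj_drop(cgv_node);
	}
}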