Merge pull request #82 from sched-ext/htejun

scx_flatcg: Fix fallout from direct dispatch API update
Author: Tejun Heo
Date:   2024-01-10 11:25:36 -10:00 (committed by GitHub)
Commit: b32d73ae4e
3 changed files with 71 additions and 40 deletions


@@ -307,6 +307,17 @@ static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
bpf_spin_unlock(&cgv_tree_lock);
}
+ static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc)
+ {
+ /*
+ * Tell fcg_stopping() that this bypassed the regular scheduling path
+ * and should be force charged to the cgroup. 0 is used to indicate that
+ * the task isn't bypassing, so if the current runtime is 0, go back by
+ * one nanosecond.
+ */
+ taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1;
+ }
s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
struct fcg_task_ctx *taskc;
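set_bypassed_at() only records the bypass; the charge is applied later in fcg_stopping(), which is outside this diff. A rough sketch of that consuming side, assuming the bypassed runtime is force-charged to the cgroup (the task_ctx map name, the cvtime_delta field, and the bpf_cgroup_release() pairing are assumptions here, not taken from this commit):

void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
{
	struct fcg_task_ctx *taskc;
	struct fcg_cgrp_ctx *cgc;
	struct cgroup *cgrp;

	taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
	if (!taskc || !taskc->bypassed_at)	/* 0 means there is no bypass to charge */
		return;

	cgrp = scx_bpf_task_cgroup(p);
	cgc = find_cgrp_ctx(cgrp);
	if (cgc) {
		/* force-charge everything run since the bypass was recorded */
		__sync_fetch_and_add(&cgc->cvtime_delta,
				     p->se.sum_exec_runtime - taskc->bypassed_at);
		taskc->bypassed_at = 0;
	}
	bpf_cgroup_release(cgrp);
}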
@@ -324,35 +335,12 @@ s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake
/*
* If select_cpu_dfl() is recommending local enqueue, the target CPU is
* idle. Follow it and charge the cgroup later in fcg_stopping() after
- * the fact. Use the same mechanism to deal with tasks with custom
- * affinities so that we don't have to worry about per-cgroup dq's
- * containing tasks that can't be executed from some CPUs.
+ * the fact.
*/
- if (is_idle || p->nr_cpus_allowed != nr_cpus) {
- /*
- * Tell fcg_stopping() that this bypassed the regular scheduling
- * path and should be force charged to the cgroup. 0 is used to
- * indicate that the task isn't bypassing, so if the current
- * runtime is 0, go back by one nanosecond.
- */
- taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1;
- /*
- * The global dq is deprioritized as we don't want to let tasks
- * to boost themselves by constraining its cpumask. The
- * deprioritization is rather severe, so let's not apply that to
- * per-cpu kernel threads. This is ham-fisted. We probably wanna
- * implement per-cgroup fallback dq's instead so that we have
- * more control over when tasks with custom cpumask get issued.
- */
- if (is_idle ||
- (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD))) {
- stat_inc(FCG_STAT_LOCAL);
- scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
- } else {
- stat_inc(FCG_STAT_GLOBAL);
- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
- }
+ if (is_idle) {
+ set_bypassed_at(p, taskc);
+ stat_inc(FCG_STAT_LOCAL);
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
}
return cpu;
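For context, is_idle above comes from the default CPU selection helper earlier in fcg_select_cpu(), outside the lines shown. A minimal sketch of how the new fast path fits together (the scx_bpf_select_cpu_dfl() call and the taskc lookup are not part of this hunk and are assumed):

	bool is_idle = false;
	s32 cpu;

	/* pick a CPU with the default policy; is_idle is set if that CPU is idle now */
	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

	/* taskc is looked up from task-local storage before this point */
	if (is_idle) {
		/* record the bypass and dispatch directly to the local DSQ */
		set_bypassed_at(p, taskc);
		stat_inc(FCG_STAT_LOCAL);
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	}

	return cpu;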
@@ -370,6 +358,32 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
return;
}
+ /*
+ * Use the direct dispatching and force charging to deal with tasks with
+ * custom affinities so that we don't have to worry about per-cgroup
+ * dq's containing tasks that can't be executed from some CPUs.
+ */
+ if (p->nr_cpus_allowed != nr_cpus) {
+ set_bypassed_at(p, taskc);
+ /*
+ * The global dq is deprioritized as we don't want to let tasks
+ * to boost themselves by constraining its cpumask. The
+ * deprioritization is rather severe, so let's not apply that to
+ * per-cpu kernel threads. This is ham-fisted. We probably wanna
+ * implement per-cgroup fallback dq's instead so that we have
+ * more control over when tasks with custom cpumask get issued.
+ */
+ if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) {
+ stat_inc(FCG_STAT_LOCAL);
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
+ } else {
+ stat_inc(FCG_STAT_GLOBAL);
+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+ }
+ return;
+ }
cgrp = scx_bpf_task_cgroup(p);
cgc = find_cgrp_ctx(cgrp);
if (!cgc)
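The nr_cpus that the new branch compares against is a global defined elsewhere in the scheduler. A minimal sketch of how such a value is typically plumbed in, assuming a const volatile rodata variable filled in by the loader (the skeleton variable name is an assumption):

/* BPF side */
const volatile u32 nr_cpus = 1;		/* overridden from userspace before load */

/* userspace side, before loading/attaching the scheduler */
skel->rodata->nr_cpus = libbpf_num_possible_cpus();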
@@ -696,6 +710,7 @@ out_stash:
bpf_spin_lock(&cgv_tree_lock);
bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
bpf_spin_unlock(&cgv_tree_lock);
+ stat_inc(FCG_STAT_PNC_RACE);
} else {
cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
if (cgv_node) {
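stat_inc() itself is defined elsewhere in scx_flatcg. A typical shape for it, assuming the counters sit in a per-CPU array map sized to the fcg_stat_idx enum (the map layout and the FCG_NR_STATS terminator are assumptions here):

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(u32));
	__uint(value_size, sizeof(u64));
	__uint(max_entries, FCG_NR_STATS);
} stats SEC(".maps");

static void stat_inc(enum fcg_stat_idx idx)
{
	u32 idx_v = idx;
	u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v);

	/* bump this CPU's slot; userspace sums the per-CPU values */
	if (cnt_p)
		(*cnt_p)++;
}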
@@ -717,6 +732,7 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
struct fcg_cgrp_ctx *cgc;
struct cgroup *cgrp;
u64 now = bpf_ktime_get_ns();
+ bool picked_next = false;
cpuc = find_cpu_ctx();
if (!cpuc)
@@ -772,9 +788,20 @@ pick_next_cgroup:
}
bpf_repeat(CGROUP_MAX_RETRIES) {
- if (try_pick_next_cgroup(&cpuc->cur_cgid))
+ if (try_pick_next_cgroup(&cpuc->cur_cgid)) {
+ picked_next = true;
break;
+ }
}
+ /*
+ * This only happens if try_pick_next_cgroup() races against enqueue
+ * path for more than CGROUP_MAX_RETRIES times, which is extremely
+ * unlikely and likely indicates an underlying bug. There shouldn't be
+ * any stall risk as the race is against enqueue.
+ */
+ if (!picked_next)
+ stat_inc(FCG_STAT_PNC_FAIL);
}
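Putting the two dispatch-path hunks together (picked_next is declared at the top of fcg_dispatch()), the retry loop now reads roughly as follows; nothing here beyond the lines above, just the non-diff view:

	bpf_repeat(CGROUP_MAX_RETRIES) {
		if (try_pick_next_cgroup(&cpuc->cur_cgid)) {
			picked_next = true;
			break;
		}
	}

	/* every retry lost the race against the enqueue path; just count it */
	if (!picked_next)
		stat_inc(FCG_STAT_PNC_FAIL);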
s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p,


@@ -186,29 +186,31 @@ int main(int argc, char **argv)
printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n",
seq++, cpu_util * 100.0, skel->data->hweight_gen);
- printf(" act:%6llu deact:%6llu local:%6llu global:%6llu\n",
+ printf(" act:%6llu deact:%6llu global:%6llu local:%6llu\n",
stats[FCG_STAT_ACT],
stats[FCG_STAT_DEACT],
- stats[FCG_STAT_LOCAL],
- stats[FCG_STAT_GLOBAL]);
- printf("HWT skip:%6llu race:%6llu cache:%6llu update:%6llu\n",
- stats[FCG_STAT_HWT_SKIP],
- stats[FCG_STAT_HWT_RACE],
+ stats[FCG_STAT_GLOBAL],
+ stats[FCG_STAT_LOCAL]);
+ printf("HWT cache:%6llu update:%6llu skip:%6llu race:%6llu\n",
stats[FCG_STAT_HWT_CACHE],
- stats[FCG_STAT_HWT_UPDATES]);
+ stats[FCG_STAT_HWT_UPDATES],
+ stats[FCG_STAT_HWT_SKIP],
+ stats[FCG_STAT_HWT_RACE]);
printf("ENQ skip:%6llu race:%6llu\n",
stats[FCG_STAT_ENQ_SKIP],
stats[FCG_STAT_ENQ_RACE]);
printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n",
printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n",
stats[FCG_STAT_CNS_KEEP],
stats[FCG_STAT_CNS_EXPIRE],
stats[FCG_STAT_CNS_EMPTY],
stats[FCG_STAT_CNS_GONE]);
printf("PNC nocgrp:%6llu next:%6llu empty:%6llu gone:%6llu\n",
stats[FCG_STAT_PNC_NO_CGRP],
printf("PNC next:%6llu empty:%6llu nocgrp:%6llu gone:%6llu race:%6llu fail:%6llu\n",
stats[FCG_STAT_PNC_NEXT],
stats[FCG_STAT_PNC_EMPTY],
stats[FCG_STAT_PNC_GONE]);
stats[FCG_STAT_PNC_NO_CGRP],
stats[FCG_STAT_PNC_GONE],
stats[FCG_STAT_PNC_RACE],
stats[FCG_STAT_PNC_FAIL]);
printf("BAD remove:%6llu\n",
acc_stats[FCG_STAT_BAD_REMOVAL]);
fflush(stdout);
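The stats[] array these printf() calls consume is summed up from the scheduler's per-CPU counters. A rough sketch of that read path, assuming a per-CPU stats map on the BPF side and the generated skeleton type (nr_cpus, FCG_NR_STATS, and the scx_flatcg skeleton/map names are assumptions here):

static void read_stats(struct scx_flatcg *skel, __u64 *stats)
{
	__u64 cnts[FCG_NR_STATS][nr_cpus];
	__u32 idx;

	memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS);

	for (idx = 0; idx < FCG_NR_STATS; idx++) {
		int cpu;

		/* a per-CPU array lookup returns one value per possible CPU */
		if (bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
					&idx, cnts[idx]) < 0)
			continue;
		for (cpu = 0; cpu < nr_cpus; cpu++)
			stats[idx] += cnts[idx][cpu];
	}
}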


@@ -28,6 +28,8 @@ enum fcg_stat_idx {
FCG_STAT_PNC_NEXT,
FCG_STAT_PNC_EMPTY,
FCG_STAT_PNC_GONE,
+ FCG_STAT_PNC_RACE,
+ FCG_STAT_PNC_FAIL,
FCG_STAT_BAD_REMOVAL,