Merge pull request #82 from sched-ext/htejun

scx_flatcg: Fix fallout from direct dispatch API update
Author: Tejun Heo
Date:   2024-01-10 11:25:36 -10:00 (committed by GitHub)
Commit: b32d73ae4e
3 changed files with 71 additions and 40 deletions


@@ -307,6 +307,17 @@ static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
bpf_spin_unlock(&cgv_tree_lock);
}
+ static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc)
+ {
+ /*
+ * Tell fcg_stopping() that this bypassed the regular scheduling path
+ * and should be force charged to the cgroup. 0 is used to indicate that
+ * the task isn't bypassing, so if the current runtime is 0, go back by
+ * one nanosecond.
+ */
+ taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1;
+ }
s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
struct fcg_task_ctx *taskc;
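set_bypassed_at() only records the bypass; the charge is applied later in fcg_stopping(), which is outside this diff. A rough sketch of that consuming side, assuming the bypassed runtime is force-charged to the cgroup (the task_ctx map name, the cvtime_delta field, and the bpf_cgroup_release() pairing are assumptions here, not taken from this commit):

void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
{
	struct fcg_task_ctx *taskc;
	struct fcg_cgrp_ctx *cgc;
	struct cgroup *cgrp;

	taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
	if (!taskc || !taskc->bypassed_at)	/* 0 means there is no bypass to charge */
		return;

	cgrp = scx_bpf_task_cgroup(p);
	cgc = find_cgrp_ctx(cgrp);
	if (cgc) {
		/* force-charge everything run since the bypass was recorded */
		__sync_fetch_and_add(&cgc->cvtime_delta,
				     p->se.sum_exec_runtime - taskc->bypassed_at);
		taskc->bypassed_at = 0;
	}
	bpf_cgroup_release(cgrp);
}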
@@ -324,35 +335,12 @@ s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake
/*
* If select_cpu_dfl() is recommending local enqueue, the target CPU is
* idle. Follow it and charge the cgroup later in fcg_stopping() after
- * the fact. Use the same mechanism to deal with tasks with custom
- * affinities so that we don't have to worry about per-cgroup dq's
- * containing tasks that can't be executed from some CPUs.
+ * the fact.
*/
- if (is_idle || p->nr_cpus_allowed != nr_cpus) {
- /*
- * Tell fcg_stopping() that this bypassed the regular scheduling
- * path and should be force charged to the cgroup. 0 is used to
- * indicate that the task isn't bypassing, so if the current
- * runtime is 0, go back by one nanosecond.
- */
- taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1;
- /*
- * The global dq is deprioritized as we don't want to let tasks
- * to boost themselves by constraining its cpumask. The
- * deprioritization is rather severe, so let's not apply that to
- * per-cpu kernel threads. This is ham-fisted. We probably wanna
- * implement per-cgroup fallback dq's instead so that we have
- * more control over when tasks with custom cpumask get issued.
- */
- if (is_idle ||
- (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD))) {
- stat_inc(FCG_STAT_LOCAL);
- scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
- } else {
- stat_inc(FCG_STAT_GLOBAL);
- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
- }
+ if (is_idle) {
+ set_bypassed_at(p, taskc);
+ stat_inc(FCG_STAT_LOCAL);
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
}
return cpu;
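For context, is_idle above comes from the default CPU selection helper earlier in fcg_select_cpu(), outside the lines shown. A minimal sketch of how the new fast path fits together (the scx_bpf_select_cpu_dfl() call and the taskc lookup are not part of this hunk and are assumed):

	bool is_idle = false;
	s32 cpu;

	/* pick a CPU with the default policy; is_idle is set if that CPU is idle now */
	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

	/* taskc is looked up from task-local storage before this point */
	if (is_idle) {
		/* record the bypass and dispatch directly to the local DSQ */
		set_bypassed_at(p, taskc);
		stat_inc(FCG_STAT_LOCAL);
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	}

	return cpu;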
@@ -370,6 +358,32 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
return;
}
+ /*
+ * Use the direct dispatching and force charging to deal with tasks with
+ * custom affinities so that we don't have to worry about per-cgroup
+ * dq's containing tasks that can't be executed from some CPUs.
+ */
+ if (p->nr_cpus_allowed != nr_cpus) {
+ set_bypassed_at(p, taskc);
+ /*
+ * The global dq is deprioritized as we don't want to let tasks
+ * to boost themselves by constraining its cpumask. The
+ * deprioritization is rather severe, so let's not apply that to
+ * per-cpu kernel threads. This is ham-fisted. We probably wanna
+ * implement per-cgroup fallback dq's instead so that we have
+ * more control over when tasks with custom cpumask get issued.
+ */
+ if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) {
+ stat_inc(FCG_STAT_LOCAL);
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
+ } else {
+ stat_inc(FCG_STAT_GLOBAL);
+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+ }
+ return;
+ }
cgrp = scx_bpf_task_cgroup(p);
cgc = find_cgrp_ctx(cgrp);
if (!cgc)
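The nr_cpus that the new branch compares against is a global defined elsewhere in the scheduler. A minimal sketch of how such a value is typically plumbed in, assuming a const volatile rodata variable filled in by the loader (the skeleton variable name is an assumption):

/* BPF side */
const volatile u32 nr_cpus = 1;		/* overridden from userspace before load */

/* userspace side, before loading/attaching the scheduler */
skel->rodata->nr_cpus = libbpf_num_possible_cpus();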
@@ -696,6 +710,7 @@ out_stash:
bpf_spin_lock(&cgv_tree_lock);
bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
bpf_spin_unlock(&cgv_tree_lock);
+ stat_inc(FCG_STAT_PNC_RACE);
} else {
cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
if (cgv_node) {
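stat_inc() itself is defined elsewhere in scx_flatcg. A typical shape for it, assuming the counters sit in a per-CPU array map sized to the fcg_stat_idx enum (the map layout and the FCG_NR_STATS terminator are assumptions here):

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(u32));
	__uint(value_size, sizeof(u64));
	__uint(max_entries, FCG_NR_STATS);
} stats SEC(".maps");

static void stat_inc(enum fcg_stat_idx idx)
{
	u32 idx_v = idx;
	u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v);

	/* bump this CPU's slot; userspace sums the per-CPU values */
	if (cnt_p)
		(*cnt_p)++;
}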
@@ -717,6 +732,7 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
struct fcg_cgrp_ctx *cgc;
struct cgroup *cgrp;
u64 now = bpf_ktime_get_ns();
+ bool picked_next = false;
cpuc = find_cpu_ctx();
if (!cpuc)
@@ -772,9 +788,20 @@ pick_next_cgroup:
}
bpf_repeat(CGROUP_MAX_RETRIES) {
- if (try_pick_next_cgroup(&cpuc->cur_cgid))
+ if (try_pick_next_cgroup(&cpuc->cur_cgid)) {
+ picked_next = true;
break;
+ }
}
+ /*
+ * This only happens if try_pick_next_cgroup() races against enqueue
+ * path for more than CGROUP_MAX_RETRIES times, which is extremely
+ * unlikely and likely indicates an underlying bug. There shouldn't be
+ * any stall risk as the race is against enqueue.
+ */
+ if (!picked_next)
+ stat_inc(FCG_STAT_PNC_FAIL);
}
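Putting the two dispatch-path hunks together (picked_next is declared at the top of fcg_dispatch()), the retry loop now reads roughly as follows; nothing here beyond the lines above, just the non-diff view:

	bpf_repeat(CGROUP_MAX_RETRIES) {
		if (try_pick_next_cgroup(&cpuc->cur_cgid)) {
			picked_next = true;
			break;
		}
	}

	/* every retry lost the race against the enqueue path; just count it */
	if (!picked_next)
		stat_inc(FCG_STAT_PNC_FAIL);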
s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p,


@@ -186,29 +186,31 @@ int main(int argc, char **argv)
printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n",
seq++, cpu_util * 100.0, skel->data->hweight_gen);
- printf(" act:%6llu deact:%6llu local:%6llu global:%6llu\n",
+ printf(" act:%6llu deact:%6llu global:%6llu local:%6llu\n",
stats[FCG_STAT_ACT],
stats[FCG_STAT_DEACT],
- stats[FCG_STAT_LOCAL],
- stats[FCG_STAT_GLOBAL]);
- printf("HWT skip:%6llu race:%6llu cache:%6llu update:%6llu\n",
- stats[FCG_STAT_HWT_SKIP],
- stats[FCG_STAT_HWT_RACE],
+ stats[FCG_STAT_GLOBAL],
+ stats[FCG_STAT_LOCAL]);
+ printf("HWT cache:%6llu update:%6llu skip:%6llu race:%6llu\n",
stats[FCG_STAT_HWT_CACHE],
- stats[FCG_STAT_HWT_UPDATES]);
+ stats[FCG_STAT_HWT_UPDATES],
+ stats[FCG_STAT_HWT_SKIP],
+ stats[FCG_STAT_HWT_RACE]);
printf("ENQ skip:%6llu race:%6llu\n",
stats[FCG_STAT_ENQ_SKIP],
stats[FCG_STAT_ENQ_RACE]);
printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n",
printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n",
stats[FCG_STAT_CNS_KEEP],
stats[FCG_STAT_CNS_EXPIRE],
stats[FCG_STAT_CNS_EMPTY],
stats[FCG_STAT_CNS_GONE]);
printf("PNC nocgrp:%6llu next:%6llu empty:%6llu gone:%6llu\n",
stats[FCG_STAT_PNC_NO_CGRP],
printf("PNC next:%6llu empty:%6llu nocgrp:%6llu gone:%6llu race:%6llu fail:%6llu\n",
stats[FCG_STAT_PNC_NEXT],
stats[FCG_STAT_PNC_EMPTY],
stats[FCG_STAT_PNC_GONE]);
stats[FCG_STAT_PNC_NO_CGRP],
stats[FCG_STAT_PNC_GONE],
stats[FCG_STAT_PNC_RACE],
stats[FCG_STAT_PNC_FAIL]);
printf("BAD remove:%6llu\n",
acc_stats[FCG_STAT_BAD_REMOVAL]);
fflush(stdout);
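The stats[] array these printf() calls consume is summed up from the scheduler's per-CPU counters. A rough sketch of that read path, assuming a per-CPU stats map on the BPF side and the generated skeleton type (nr_cpus, FCG_NR_STATS, and the scx_flatcg skeleton/map names are assumptions here):

static void read_stats(struct scx_flatcg *skel, __u64 *stats)
{
	__u64 cnts[FCG_NR_STATS][nr_cpus];
	__u32 idx;

	memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS);

	for (idx = 0; idx < FCG_NR_STATS; idx++) {
		int cpu;

		/* a per-CPU array lookup returns one value per possible CPU */
		if (bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
					&idx, cnts[idx]) < 0)
			continue;
		for (cpu = 0; cpu < nr_cpus; cpu++)
			stats[idx] += cnts[idx][cpu];
	}
}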


@@ -28,6 +28,8 @@ enum fcg_stat_idx {
FCG_STAT_PNC_NEXT,
FCG_STAT_PNC_EMPTY,
FCG_STAT_PNC_GONE,
+ FCG_STAT_PNC_RACE,
+ FCG_STAT_PNC_FAIL,
FCG_STAT_BAD_REMOVAL,