Mirror of https://github.com/sched-ext/scx.git (synced 2024-12-01 07:00:23 +00:00)
layered: Use TLS map instead of hash map
In scx_layered, we're using a BPF_MAP_TYPE_HASH map (indexed by pid) rather than a BPF_MAP_TYPE_TASK_STORAGE map to track local storage for a task. As far as I can tell, there's no reason we need to be doing this. We never access the map from user space, and we're even passing a struct task_struct * to a helper subprog to look up the task context rather than doing it only by pid.

Using a hash map here is error prone because we end up having to manually track the lifecycle of each entry rather than relying on BPF to do it for us; with task local storage, BPF automatically frees a task's entry when the task exits. Let's use TLS rather than a hash map to avoid the issues that come with manual tracking (e.g. we've observed the scheduler getting evicted because it accessed a stale map entry after a task had been destroyed).

Reported-by: Valentin Andrei <vandrei@meta.com>
Signed-off-by: David Vernet <void@manifault.com>
parent 63bae69d2a
commit e857dd90ab
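For readers unfamiliar with BPF task local storage, the sketch below shows the general pattern this commit moves to: a BPF_MAP_TYPE_TASK_STORAGE map whose entries hang off the task_struct itself, so the kernel creates them on demand and frees them when the task exits. This is a minimal, hypothetical illustration rather than code from scx_layered: the file name, the single-field struct task_ctx, the helper names (get_or_create_task_ctx, lookup_task_ctx) and the tp_btf/sys_enter hook are placeholders chosen for the example, and it assumes a kernel recent enough to allow task local storage from tracing programs.

/* task_ctx_tls_sketch.bpf.c - hypothetical, for illustration only */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

/* Placeholder per-task context; the real scheduler's struct has more fields. */
struct task_ctx {
        int layer;
};

/*
 * Task local storage: each entry is tied to a task_struct, so BPF frees it
 * automatically when the task exits. No pid keys, no manual deletion, and
 * no risk of reading a stale entry for a task that is already gone.
 */
struct {
        __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
        __uint(map_flags, BPF_F_NO_PREALLOC);   /* required for task storage */
        __type(key, int);
        __type(value, struct task_ctx);
} task_ctxs SEC(".maps");

/* Allocate the task's entry on first use. */
static struct task_ctx *get_or_create_task_ctx(struct task_struct *p)
{
        return bpf_task_storage_get(&task_ctxs, p, NULL,
                                    BPF_LOCAL_STORAGE_GET_F_CREATE);
}

/* Plain lookup; returns NULL if the task has no entry yet. */
static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
        return bpf_task_storage_get(&task_ctxs, p, NULL, 0);
}

/* Hypothetical consumer: touch the current task's context on every syscall. */
SEC("tp_btf/sys_enter")
int BPF_PROG(on_sys_enter, struct pt_regs *regs, long id)
{
        struct task_struct *p = bpf_get_current_task_btf();
        struct task_ctx *tctx = lookup_task_ctx(p);

        /* First time we see this task: create its entry. */
        if (!tctx)
                tctx = get_or_create_task_ctx(p);
        if (tctx)
                tctx->layer = -1;       /* placeholder update */
        return 0;
}

The helpers above mirror the shape of the scheduler's lookup_task_ctx_may_fail()/lookup_task_ctx() pair in the diff below; the key difference from the old hash-map version is that there is no bpf_map_update_elem()/bpf_map_delete_elem() lifecycle left to get wrong.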
@@ -37,7 +37,6 @@ enum consts {
 
 /* Statistics */
 enum global_stat_idx {
-        GSTAT_TASK_CTX_FREE_FAILED,
         GSTAT_EXCL_IDLE,
         GSTAT_EXCL_WAKEUP,
         NR_GSTATS,
@@ -238,31 +238,25 @@ struct task_ctx {
 };
 
 struct {
-        __uint(type, BPF_MAP_TYPE_HASH);
-        __type(key, pid_t);
+        __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+        __uint(map_flags, BPF_F_NO_PREALLOC);
+        __type(key, int);
         __type(value, struct task_ctx);
-        __uint(max_entries, MAX_TASKS);
-        __uint(map_flags, 0);
 } task_ctxs SEC(".maps");
 
 static struct task_ctx *lookup_task_ctx_may_fail(struct task_struct *p)
 {
-        s32 pid = p->pid;
-
-        return bpf_map_lookup_elem(&task_ctxs, &pid);
+        return bpf_task_storage_get(&task_ctxs, p, 0, 0);
 }
 
 static struct task_ctx *lookup_task_ctx(struct task_struct *p)
 {
-        struct task_ctx *tctx;
-        s32 pid = p->pid;
+        struct task_ctx *tctx = lookup_task_ctx_may_fail(p);
 
-        if ((tctx = bpf_map_lookup_elem(&task_ctxs, &pid))) {
-                return tctx;
-        } else {
+        if (!tctx)
                 scx_bpf_error("task_ctx lookup failed");
-                return NULL;
-        }
+
+        return tctx;
 }
 
 static struct layer *lookup_layer(int idx)
@@ -852,29 +846,21 @@ s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p,
          * fail spuriously due to BPF recursion protection triggering
          * unnecessarily.
          */
-        if ((ret = bpf_map_update_elem(&task_ctxs, &pid, &tctx_init, 0 /*BPF_NOEXIST*/))) {
-                scx_bpf_error("task_ctx allocation failure, ret=%d", ret);
-                return ret;
-        }
-
-        /*
-         * Read the entry from the map immediately so we can add the cpumask
-         * with bpf_kptr_xchg().
-         */
-        if (!(tctx = lookup_task_ctx(p)))
-                return -ENOENT;
-
-        cpumask = bpf_cpumask_create();
-        if (!cpumask) {
-                bpf_map_delete_elem(&task_ctxs, &pid);
+        tctx = bpf_task_storage_get(&task_ctxs, p, 0,
+                                    BPF_LOCAL_STORAGE_GET_F_CREATE);
+        if (!tctx) {
+                scx_bpf_error("task_ctx allocation failure");
                 return -ENOMEM;
         }
 
+        cpumask = bpf_cpumask_create();
+        if (!cpumask)
+                return -ENOMEM;
+
         cpumask = bpf_kptr_xchg(&tctx->layered_cpumask, cpumask);
         if (cpumask) {
                 /* Should never happen as we just inserted it above. */
                 bpf_cpumask_release(cpumask);
-                bpf_map_delete_elem(&task_ctxs, &pid);
                 return -EINVAL;
         }
 
@@ -901,16 +887,6 @@ void BPF_STRUCT_OPS(layered_exit_task, struct task_struct *p,
 
         if (tctx->layer >= 0 && tctx->layer < nr_layers)
                 __sync_fetch_and_add(&layers[tctx->layer].nr_tasks, -1);
-
-        /*
-         * XXX - There's no reason delete should fail here but BPF's recursion
-         * protection can unnecessarily fail the operation. The fact that
-         * deletions aren't reliable means that we sometimes leak task_ctx and
-         * can't use BPF_NOEXIST on allocation in .prep_enable().
-         */
-        ret = bpf_map_delete_elem(&task_ctxs, &pid);
-        if (ret)
-                gstat_inc(GSTAT_TASK_CTX_FREE_FAILED, cctx);
 }
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
@@ -199,7 +199,7 @@ lazy_static::lazy_static! {
 /// scx_layered will print out a set of statistics every monitoring
 /// interval.
 ///
-/// tot= 117909 local=86.20 open_idle= 0.21 affn_viol= 1.37 tctx_err=9 proc=6ms
+/// tot= 117909 local=86.20 open_idle= 0.21 affn_viol= 1.37 proc=6ms
 /// busy= 34.2 util= 1733.6 load= 21744.1 fallback_cpu= 1
 /// batch : util/frac= 11.8/ 0.7 load/frac= 29.7: 0.1 tasks= 2597
 /// tot= 3478 local=67.80 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00
@@ -1084,7 +1084,6 @@ struct OpenMetricsStats {
     local: Gauge<f64, AtomicU64>,
     open_idle: Gauge<f64, AtomicU64>,
     affn_viol: Gauge<f64, AtomicU64>,
-    tctx_err: Gauge<i64, AtomicI64>,
     excl_idle: Gauge<f64, AtomicU64>,
     excl_wakeup: Gauge<f64, AtomicU64>,
     proc_ms: Gauge<i64, AtomicI64>,
@@ -1137,7 +1136,6 @@ impl OpenMetricsStats {
             affn_viol,
             "% which violated configured policies due to CPU affinity restrictions"
         );
-        register!(tctx_err, "Failures to free task contexts");
         register!(
             excl_idle,
             "Number of times a CPU skipped dispatching due to sibling running an exclusive task"
@@ -1508,10 +1506,6 @@ impl<'a> Scheduler<'a> {
         self.om_stats
             .affn_viol
             .set(lsum_pct(bpf_intf::layer_stat_idx_LSTAT_AFFN_VIOL));
-        self.om_stats.tctx_err.set(
-            stats.prev_bpf_stats.gstats
-                [bpf_intf::global_stat_idx_GSTAT_TASK_CTX_FREE_FAILED as usize] as i64,
-        );
         self.om_stats.excl_idle.set(
             stats.bpf_stats.gstats[bpf_intf::global_stat_idx_GSTAT_EXCL_IDLE as usize] as f64
                 / total as f64,
@@ -1527,12 +1521,11 @@ impl<'a> Scheduler<'a> {
 
         if !self.om_format {
             info!(
-                "tot={:7} local={} open_idle={} affn_viol={} tctx_err={} proc={:?}ms",
+                "tot={:7} local={} open_idle={} affn_viol={} proc={:?}ms",
                 self.om_stats.total.get(),
                 fmt_pct(self.om_stats.local.get()),
                 fmt_pct(self.om_stats.open_idle.get()),
                 fmt_pct(self.om_stats.affn_viol.get()),
-                self.om_stats.tctx_err.get(),
                 self.om_stats.proc_ms.get(),
             );
 