scx_lavd: Perform load balancing at consume_task()

Upon ops.dispatch, perform load balancing based on the previously
set-up plan, migrating a task from a stealee domain to a stealer
domain. To avoid a thundering herd of concurrent stealers, a stealer
steals a task only probabilistically. Also, to keep the task migration
distance short, the stealing probability decreases exponentially with
each hop in distance. Finally, within each stat cycle (50 ms), a
stealer migrates only one task from a stealee, so the load is
rebalanced gradually.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
Author: Changwoo Min <changwoo@igalia.com>
Date:   2024-11-28 14:52:42 +09:00
Commit: 047e8c81e9 (parent: 4f1ffc1bc6)
3 changed files with 123 additions and 15 deletions
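To see how the probabilistic plan behaves, here is a minimal user-space sketch of the stealing decisions described above (plain C with rand(); NR_CPUS_PER_DOM, X_PROB_FT, and MAX_DIST are illustrative stand-ins for the real nr_cpus, LAVD_CPDOM_X_PROB_FT, and LAVD_CPDOM_MAX_DIST, and the harness itself is not part of the patch):

#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-ins; the real values live in the scheduler source. */
#define NR_CPUS_PER_DOM	8
#define X_PROB_FT	5	/* assumed LAVD_CPDOM_X_PROB_FT */
#define MAX_DIST	4	/* assumed LAVD_CPDOM_MAX_DIST */

/* Same idea as the new prob_x_out_of_y(): true with probability x/y. */
static int prob_x_out_of_y(unsigned int x, unsigned int y)
{
	return (unsigned int)rand() % y < x;
}

int main(void)
{
	const int trials = 1000000;
	int go = 0, reached[MAX_DIST] = { 0 };

	for (int t = 0; t < trials; t++) {
		/* Go/no-go gate against the thundering herd. */
		if (!prob_x_out_of_y(1, NR_CPUS_PER_DOM * X_PROB_FT))
			continue;
		go++;
		/* Each extra hop is attempted X_PROB_FT times less often. */
		for (int i = 0; i < MAX_DIST; i++) {
			reached[i]++;
			if (!prob_x_out_of_y(1, X_PROB_FT))
				break;
		}
	}
	printf("steal attempts: %.4f of dispatches\n", (double)go / trials);
	for (int i = 0; i < MAX_DIST; i++)
		printf("hop %d reached: %.6f of dispatches\n",
		       i, (double)reached[i] / trials);
	return 0;
}

With these assumed numbers, roughly 1 in 40 dispatches attempts a steal at all, and each additional hop is reached about 5x less often than the previous one, which is the exponential decay the commit message refers to.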


@@ -51,7 +51,9 @@ enum consts_internal {
performance mode when cpu util > 40% */
LAVD_CPDOM_STARV_NS = (2 * LAVD_SLICE_MAX_NS_DFL),
-	LAVD_CPDOM_MIGRATION_SHIFT = 2, /* 1/2**2 = +/- 25% */
+	LAVD_CPDOM_MIGRATION_SHIFT = 3, /* 1/2**3 = +/- 12.5% */
+	LAVD_CPDOM_X_PROB_FT = (LAVD_SYS_STAT_INTERVAL_NS /
+				(2 * LAVD_SLICE_MAX_NS_DFL)), /* roughly twice per interval */
};
/*
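For a back-of-the-envelope reading of the two constants touched above: the tighter MIGRATION_SHIFT of 3 means a domain's load has to deviate from the cross-domain average by more than 1/2^3 = 12.5% (instead of 25%) before it is treated as a stealer or stealee, and X_PROB_FT is sized so that the go decision fires roughly twice per stat interval. The sketch below assumes a 5 ms LAVD_SLICE_MAX_NS_DFL purely for illustration; the 50 ms stat interval comes from the commit message.

#include <stdio.h>

int main(void)
{
	/* 50 ms stat interval per the commit message; the slice length is an
	 * assumed, illustrative value for LAVD_SLICE_MAX_NS_DFL. */
	const double interval_ns = 50.0e6;
	const double slice_max_ns = 5.0e6;
	const int migration_shift = 3;

	const double margin_pct = 100.0 / (double)(1 << migration_shift);
	const double x_prob_ft = interval_ns / (2.0 * slice_max_ns);
	/* A CPU dispatches roughly once per slice, so per stat interval it
	 * makes about interval/slice go/no-go decisions at 1/x_prob_ft each. */
	const double go_per_interval =
		(interval_ns / slice_max_ns) * (1.0 / x_prob_ft);

	printf("stealer/stealee margin: +/- %.1f%%\n", margin_pct);
	printf("X_PROB_FT: %.1f\n", x_prob_ft);
	printf("expected go decisions per interval: %.1f\n", go_per_interval);
	return 0;
}

The last number does not depend on the assumed slice length: (interval/slice) * (2*slice/interval) always cancels to 2, which is what the "roughly twice per interval" comment encodes.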


@@ -1129,18 +1129,109 @@ static bool consume_dsq(u64 dsq_id)
return false;
}
static bool try_to_steal_task(struct cpdom_ctx *cpdomc)
{
struct cpdom_ctx *cpdomc_pick;
u64 nr_nbr, dsq_id;
s64 nuance;
/*
* If not all CPUs are in use -- i.e., the system is under-utilized --
* there is no point in load balancing. It is better to make an
* effort to increase overall system utilization first.
*/
if (!use_full_cpus())
return false;
/*
* Probabilistically make a go or no-go decision to avoid the
* thundering herd problem. In other words, roughly one out of nr_cpus
* CPUs will try to steal a task at any given moment.
*/
if (!prob_x_out_of_y(1, cpdomc->nr_cpus * LAVD_CPDOM_X_PROB_FT))
return false;
/*
* Traverse neighbor compute domains in distance order.
*/
nuance = bpf_get_prandom_u32();
for (int i = 0; i < LAVD_CPDOM_MAX_DIST; i++) {
nr_nbr = min(cpdomc->nr_neighbors[i], LAVD_CPDOM_MAX_NR);
if (nr_nbr == 0)
break;
/*
* Traverse neighbors at the same distance in arbitrary order.
*/
for (int j = 0; j < LAVD_CPDOM_MAX_NR; j++, nuance++) {
if (j >= nr_nbr)
break;
dsq_id = pick_any_bit(cpdomc->neighbor_bits[i], nuance);
if (dsq_id == -ENOENT)
continue;
cpdomc_pick = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (!cpdomc_pick) {
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
return false;
}
if (!cpdomc_pick->is_stealee || !cpdomc_pick->is_active)
continue;
/*
* If task stealing is successful, mark the stealer
* and the stealee's job done. By marking done,
* those compute domains would not be involved in
* load balancing until the end of this round,
* so this helps gradual migration. Note that multiple
* stealers can steal tasks from the same stealee.
* However, we don't coordinate concurrent stealing
* because the chance is low and there is no harm
* in slight over-stealing.
*/
if (consume_dsq(dsq_id)) {
WRITE_ONCE(cpdomc_pick->is_stealee, false);
WRITE_ONCE(cpdomc->is_stealer, false);
return true;
}
}
/*
* Now, we need to steal a task from a farther neighbor
* for load balancing. Since task migration from a farther
* neighbor is more expensive (e.g., crossing a NUMA boundary),
* we do so with a lot of hesitation. The chance of stealing
* from a farther neighbor decreases exponentially as the
* distance grows, which in turn favors migration from closer
* neighbors.
*/
if (!prob_x_out_of_y(1, LAVD_CPDOM_X_PROB_FT))
break;
}
return false;
}
static bool force_to_steal_task(struct cpdom_ctx *cpdomc)
{
struct cpdom_ctx *cpdomc_pick;
u64 nr_nbr, dsq_id;
s64 nuance;
/*
* Traverse neighbor compute domains in distance order.
*/
nuance = bpf_get_prandom_u32();
for (int i = 0; i < LAVD_CPDOM_MAX_DIST; i++) {
nr_nbr = min(cpdomc->nr_neighbors[i], LAVD_CPDOM_MAX_NR);
if (nr_nbr == 0)
break;
nuance = bpf_get_prandom_u32();
/*
* Traverse neighbors at the same distance in arbitrary order.
*/
for (int j = 0; j < LAVD_CPDOM_MAX_NR; j++, nuance++) {
if (j >= nr_nbr)
break;
@@ -1171,10 +1262,23 @@ static bool consume_task(struct cpu_ctx *cpuc)
struct cpdom_ctx *cpdomc;
u64 dsq_id;
/*
* Try to consume from CPU's associated DSQ.
*/
dsq_id = cpuc->cpdom_id;
cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
if (!cpdomc) {
scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
return false;
}
/*
* If the current compute domain is a stealer, try to steal
* a task from any of the stealee domains probabilistically.
*/
if (cpdomc->is_stealer && try_to_steal_task(cpdomc))
goto x_domain_migration_out;
/*
* Try to consume a task from CPU's associated DSQ.
*/
if (consume_dsq(dsq_id))
return true;
@@ -1182,12 +1286,6 @@ static bool consume_task(struct cpu_ctx *cpuc)
* If there is no task in the associated DSQ, traverse the neighbor
* compute domains in distance order -- task stealing.
*/
-	cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
-	if (!cpdomc) {
-		scx_bpf_error("Failed to lookup cpdom_ctx for %llu", dsq_id);
-		return false;
-	}
if (force_to_steal_task(cpdomc))
goto x_domain_migration_out;
@@ -1337,10 +1435,7 @@ consume_out:
/*
* Consume a task if requested.
*/
-	if (!try_consume)
-		return;
-	if (consume_task(cpuc))
+	if (try_consume && consume_task(cpuc))
return;
/*


@@ -299,3 +299,14 @@ static void set_on_core_type(struct task_ctx *taskc,
WRITE_ONCE(taskc->on_big, on_big);
WRITE_ONCE(taskc->on_little, on_little);
}
static bool prob_x_out_of_y(u32 x, u32 y)
{
/*
* Pick r uniformly from [0, y) and return true if it lands in the
* first x slots, i.e., with probability x/y.
*/
u32 r = bpf_get_prandom_u32() % y;
return r < x;
}
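A quick sanity check of the new helper: it draws one pseudo-random value in [0, y) and reports true when that value lands below x, so callers see a true with probability x/y (plus a negligible modulo bias for the small y values used here). The user-space twin below swaps bpf_get_prandom_u32() for rand() and is only meant to demonstrate the frequency:

#include <stdio.h>
#include <stdlib.h>

/* User-space twin of prob_x_out_of_y(); rand() stands in for
 * bpf_get_prandom_u32(). */
static int prob_x_out_of_y(unsigned int x, unsigned int y)
{
	unsigned int r = (unsigned int)rand() % y;

	return r < x;
}

int main(void)
{
	const unsigned int x = 1, y = 40;	/* e.g., 8 CPUs times an X_PROB_FT of 5 */
	const int trials = 1000000;
	int hits = 0;

	for (int i = 0; i < trials; i++)
		hits += prob_x_out_of_y(x, y);

	printf("observed %.5f, expected %.5f\n",
	       (double)hits / trials, (double)x / y);
	return 0;
}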