Merge branch 'main' into htejun/scx_lavd-stats

Tejun Heo 2024-08-24 15:53:53 -10:00
commit fdfb7f60f4
8 changed files with 44 additions and 35 deletions

View File

@@ -1,3 +1,3 @@
 #!/bin/bash
-"$1" --version | sed -nr 's/^.*clang version ([\.0-9]*)(git)?(\+.*)?( .*)?$/\1/p'
+"$1" --version | sed -nr 's/^.*clang version ([\.0-9]*)(git)?(-rc.*)?(\+.*)?( .*)?$/\1/p'

View File

@@ -384,8 +384,7 @@ static bool task_avg_nvcsw(struct task_struct *p)
  */
 static inline u64 task_deadline(struct task_struct *p)
 {
-	u64 dl_boost = lowlatency ?
-		       MIN(task_avg_nvcsw(p), nvcsw_max_thresh) * slice_ns : 0;
+	u64 dl_boost = lowlatency ? task_avg_nvcsw(p) * slice_ns : 0;
 
 	/*
 	 * Limit the vruntime to (vtime_now - slice_ns_lag) to avoid
@@ -406,8 +405,11 @@ static inline u64 task_deadline(struct task_struct *p)
 	 * Return the task's deadline as its vruntime, with a bonus that is
 	 * proportional to the task's average number of voluntary context
	 * switches.
+	 *
+	 * Also make sure the bonus is limited to the starvation threshold (to
+	 * prevent starvation).
 	 */
-	return p->scx.dsq_vtime - dl_boost;
+	return p->scx.dsq_vtime - MIN(dl_boost, starvation_thresh_ns);
 }
 
 /*
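Taken together, the two hunks above change the boost from a per-task clamp to a single cap at the starvation threshold. A minimal sketch of the resulting arithmetic, with stand-in constants rather than scx_bpfland's actual defaults:

/* Sketch of the new deadline math: the voluntary-context-switch bonus is
 * no longer clamped per task, but the total boost is capped at the
 * starvation threshold. Constants here are illustrative stand-ins. */
#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

static const uint64_t slice_ns = 5000000;             /* assumed 5ms slice */
static const uint64_t starvation_thresh_ns = 5000000; /* assumed boost cap */

static uint64_t task_deadline(uint64_t dsq_vtime, uint64_t avg_nvcsw,
			      int lowlatency)
{
	/* Bonus proportional to avg voluntary context switches per second. */
	uint64_t dl_boost = lowlatency ? avg_nvcsw * slice_ns : 0;

	/* Cap the bonus so a very interactive task cannot starve others. */
	return dsq_vtime - MIN(dl_boost, starvation_thresh_ns);
}

int main(void)
{
	/* avg_nvcsw = 10: the uncapped boost would be 50ms, capped to 5ms. */
	printf("%llu\n", (unsigned long long)
	       task_deadline(100000000ULL, 10, 1));	/* 95000000 */
	return 0;
}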
@@ -1062,18 +1064,19 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
 	 * Evaluate the average number of voluntary context switches per second
 	 * using an exponentially weighted moving average, see calc_avg().
 	 */
+	if (!lowlatency && !is_nvcsw_enabled())
+		return;
+
 	delta_t = (s64)(now - tctx->nvcsw_ts);
-	if (is_nvcsw_enabled() && delta_t > NSEC_PER_SEC) {
+	if (delta_t > NSEC_PER_SEC) {
 		u64 delta_nvcsw = p->nvcsw - tctx->nvcsw;
 		u64 avg_nvcsw = delta_nvcsw * NSEC_PER_SEC / delta_t;
 
 		/*
 		 * Evaluate the average nvcsw for the task, limited to the
-		 * range [0 .. nvcsw_max_thresh * 8] to prevent excessive
-		 * spikes.
+		 * range [0 .. 1000] to prevent excessive spikes.
 		 */
 		tctx->avg_nvcsw = calc_avg_clamp(tctx->avg_nvcsw, avg_nvcsw,
-						 0, nvcsw_max_thresh << 3);
+						 0, MAX(nvcsw_max_thresh, 1000));
 		tctx->nvcsw = p->nvcsw;
 		tctx->nvcsw_ts = now;
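The clamped moving average above can be sketched in isolation. The 1/4 smoothing weight below is an assumption for illustration; see calc_avg() in the scheduler source for the real coefficient:

/* Sketch of an exponentially weighted moving average with clamping, in the
 * spirit of calc_avg_clamp() above. The 1/4 weight is an assumed value. */
#include <stdint.h>
#include <stdio.h>

static uint64_t calc_avg(uint64_t old_val, uint64_t new_val)
{
	/* EWMA: keep 3/4 of the old value, blend in 1/4 of the new one. */
	return old_val - (old_val >> 2) + (new_val >> 2);
}

static uint64_t clamp_u64(uint64_t v, uint64_t lo, uint64_t hi)
{
	return v < lo ? lo : v > hi ? hi : v;
}

static uint64_t calc_avg_clamp(uint64_t old_val, uint64_t new_val,
			       uint64_t lo, uint64_t hi)
{
	return clamp_u64(calc_avg(old_val, new_val), lo, hi);
}

int main(void)
{
	/* A spike of 100000 nvcsw/s is held to the [0, 1000] range
	 * instead of blowing up the average. */
	uint64_t avg = 50;
	for (int i = 0; i < 3; i++) {
		avg = calc_avg_clamp(avg, 100000, 0, 1000);
		printf("avg = %llu\n", (unsigned long long)avg);	/* 1000 */
	}
	return 0;
}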
@@ -1098,10 +1101,6 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
 		 * Refresh task status: interactive or regular.
 		 */
 		update_task_interactive(tctx);
-
-		dbg_msg("%d (%s) avg_nvcsw = %llu [%s]",
-			p->pid, p->comm, tctx->avg_nvcsw,
-			tctx->avg_nvcsw < nvcsw_avg_thresh ? "regular" : "interactive");
 	}
 }

View File

@@ -81,28 +81,27 @@ struct Opts {
     #[clap(long = "balanced", action = clap::ArgAction::SetTrue)]
     balanced: bool,
 
-    /// The following 4 options are set automatically by the power mode (above), but they can be
-    /// set independently if desired:
-
     /// Disable core compaction and schedule tasks across all online CPUs. Core compaction attempts
     /// to keep idle CPUs idle in favor of scheduling tasks on CPUs that are already
-    /// awake. See main.bpf.c for more info.
+    /// awake. See main.bpf.c for more info. Normally set by the power mode, but can be set independently if
+    /// desired.
     #[clap(long = "no-core-compaction", action = clap::ArgAction::SetTrue)]
     no_core_compaction: bool,
 
     /// Schedule tasks on SMT siblings before using other physical cores when core compaction is
-    /// enabled.
+    /// enabled. Normally set by the power mode, but can be set independently if desired.
     #[clap(long = "prefer-smt-core", action = clap::ArgAction::SetTrue)]
     prefer_smt_core: bool,
 
     /// Schedule tasks on little (efficiency) cores before big (performance) cores when core compaction is
-    /// enabled.
+    /// enabled. Normally set by the power mode, but can be set independently if desired.
     #[clap(long = "prefer-little-core", action = clap::ArgAction::SetTrue)]
     prefer_little_core: bool,
 
     /// Disable controlling the CPU frequency. In order to improve latency and responsiveness of
     /// performance-critical tasks, scx_lavd increases the CPU frequency even if CPU usage is low.
-    /// See main.bpf.c for more info.
+    /// See main.bpf.c for more info. Normally set by the power mode, but can be set independently
+    /// if desired.
     #[clap(long = "no-freq-scaling", action = clap::ArgAction::SetTrue)]
     no_freq_scaling: bool,

View File

@@ -111,6 +111,7 @@ enum layer_match_kind {
 	MATCH_GROUP_ID_EQUALS,
 	MATCH_PID_EQUALS,
 	MATCH_PPID_EQUALS,
+	MATCH_TGID_EQUALS,
 
 	NR_LAYER_MATCH_KINDS,
 };
@@ -125,6 +126,7 @@ struct layer_match {
 	u32 group_id;
 	u32 pid;
 	u32 ppid;
+	u32 tgid;
 };
 
 struct layer_match_ands {

View File

@@ -1018,6 +1018,8 @@ static __noinline bool match_one(struct layer_match *match,
 		return p->pid == match->pid;
 	case MATCH_PPID_EQUALS:
 		return p->real_parent->pid == match->ppid;
+	case MATCH_TGID_EQUALS:
+		return p->tgid == match->tgid;
 	default:
 		scx_bpf_error("invalid match kind %d", match->kind);
 		return result;
@@ -1625,6 +1627,9 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 		case MATCH_PPID_EQUALS:
 			dbg("%s PPID %u", header, match->ppid);
 			break;
+		case MATCH_TGID_EQUALS:
+			dbg("%s TGID %u", header, match->tgid);
+			break;
 		default:
 			scx_bpf_error("%s Invalid kind", header);
 			return -EINVAL;

View File

@@ -436,6 +436,7 @@ enum LayerMatch {
     GIDEquals(u32),
     PIDEquals(u32),
     PPIDEquals(u32),
+    TGIDEquals(u32),
 }
 
 #[derive(Clone, Debug, Serialize, Deserialize)]
@@ -1443,6 +1444,10 @@ impl<'a, 'b> Scheduler<'a, 'b> {
                     mt.kind = bpf_intf::layer_match_kind_MATCH_PPID_EQUALS as i32;
                     mt.ppid = *ppid;
                 }
+                LayerMatch::TGIDEquals(tgid) => {
+                    mt.kind = bpf_intf::layer_match_kind_MATCH_TGID_EQUALS as i32;
+                    mt.tgid = *tgid;
+                }
             }
         }
         layer.matches[or_i].nr_match_ands = or.len() as i32;
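The practical difference from the existing PID matcher is granularity: p->pid identifies a single thread, while p->tgid is shared by every thread of a process, so a TGID rule captures the whole thread group. A simplified standalone mock, not scx_layered's actual types:

/* Mock of the match dispatch to show PID vs TGID granularity. The task
 * and match structs here are simplified stand-ins. */
#include <stdbool.h>
#include <stdio.h>

enum match_kind { MATCH_PID_EQUALS, MATCH_PPID_EQUALS, MATCH_TGID_EQUALS };

struct task { int pid, ppid, tgid; };
struct match { enum match_kind kind; int id; };

static bool match_one(const struct match *m, const struct task *p)
{
	switch (m->kind) {
	case MATCH_PID_EQUALS:	return p->pid == m->id;   /* one thread */
	case MATCH_PPID_EQUALS:	return p->ppid == m->id;  /* children */
	case MATCH_TGID_EQUALS:	return p->tgid == m->id;  /* whole process */
	}
	return false;
}

int main(void)
{
	/* Two threads of the same process: tgid 100, distinct pids. */
	struct task main_thr = { .pid = 100, .ppid = 1, .tgid = 100 };
	struct task worker   = { .pid = 101, .ppid = 1, .tgid = 100 };

	struct match by_pid  = { .kind = MATCH_PID_EQUALS,  .id = 100 };
	struct match by_tgid = { .kind = MATCH_TGID_EQUALS, .id = 100 };

	printf("PID=100:  main %d, worker %d\n",
	       match_one(&by_pid, &main_thr), match_one(&by_pid, &worker));
	printf("TGID=100: main %d, worker %d\n",
	       match_one(&by_tgid, &main_thr), match_one(&by_tgid, &worker));
	return 0;	/* PID matches only the main thread; TGID matches both */
}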

View File

@@ -15,15 +15,15 @@ Available as a [Rust crate](https://crates.io/crates/scx_rusty): `cargo add scx_
 ## Typical Use Case
 
-Rusty is designed to be flexible, and accommodate different architectures and
-workloads. Various load balancing thresholds (e.g. greediness, frequenty, etc),
+Rusty is designed to be flexible, accommodating different architectures and
+workloads. Various load balancing thresholds (e.g. greediness, frequency, etc),
 as well as how Rusty should partition the system into scheduling domains, can
 be tuned to achieve the optimal configuration for any given system or workload.
 
 ## Production Ready?
 
 Yes. If tuned correctly, rusty should be performant across various CPU
-architectures and workloads. Rusty by default creates a separate scheduling
+architectures and workloads. By default, rusty creates a separate scheduling
 domain per-LLC, so its default configuration may be performant as well. Note
 however that scx_rusty does not yet disambiguate between LLCs in different NUMA
 nodes, so it may perform better on multi-CCX machines where all the LLCs share

View File

@@ -69,7 +69,7 @@ const MAX_CPUS: usize = bpf_intf::consts_MAX_CPUS as usize;
 ///
 /// The userspace part performs two roles. First, it makes higher frequency
 /// (100ms) tuning decisions. It identifies CPUs which are not too heavily
-/// loaded and mark them so that they can pull tasks from other overloaded
+/// loaded and marks them so that they can pull tasks from other overloaded
 /// domains on the fly.
 ///
 /// Second, it drives lower frequency (2s) load balancing. It determines
@@ -79,10 +79,10 @@ const MAX_CPUS: usize = bpf_intf::consts_MAX_CPUS as usize;
 /// migrated.
 ///
 /// The overhead of userspace operations is low. Load balancing is not
-/// performed frequently but work-conservation is still maintained through
+/// performed frequently, but work-conservation is still maintained through
 /// tuning and greedy execution. Load balancing itself is not that expensive
-/// either. It only accesses per-domain load metrics to determine the
-/// domains that need load balancing and limited number of per-task metrics
+/// either. It only accesses per-domain load metrics to determine the domains
+/// that need load balancing, as well as a limited number of per-task metrics
 /// for each pushing domain.
 ///
 /// An earlier variant of this scheduler was used to balance across six
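The two-cadence split described in that comment can be sketched as a single loop; the intervals and helper bodies below are illustrative stand-ins, not scx_rusty's actual code:

/* Sketch of the two-cadence control loop: a fast tuning pass every
 * ~100ms and a slow load-balancing pass every ~2s. */
#include <stdio.h>
#include <unistd.h>

static void tune(void)
{
	/* High frequency: mark lightly loaded CPUs as pull targets. */
	puts("tune: refresh pull-target CPU set");
}

static void load_balance(void)
{
	/* Low frequency: compute per-domain load, pick tasks to migrate. */
	puts("lb: rebalance domains");
}

int main(void)
{
	const int tune_ms = 100, lb_ms = 2000;

	for (int elapsed = 0; elapsed <= 4000; elapsed += tune_ms) {
		if (elapsed % lb_ms == 0)
			load_balance();
		tune();
		usleep(tune_ms * 1000);
	}
	return 0;
}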
@@ -106,7 +106,7 @@ struct Opts {
     #[clap(short = 'i', long, default_value = "2.0")]
     interval: f64,
 
-    /// Tuner runs at higher frequency than the load balancer to dynamically
+    /// The tuner runs at a higher frequency than the load balancer to dynamically
     /// tune scheduling behavior. Tuning interval in seconds.
     #[clap(short = 'I', long, default_value = "0.1")]
     tune_interval: f64,
@@ -121,8 +121,8 @@ struct Opts {
     cache_level: u32,
 
     /// Instead of using cache locality, set the cpumask for each domain
-    /// manually, provide multiple --cpumasks, one for each domain. E.g.
-    /// --cpumasks 0xff_00ff --cpumasks 0xff00 will create two domains with
+    /// manually. Provide multiple --cpumasks, one for each domain. E.g.
+    /// --cpumasks 0xff_00ff --cpumasks 0xff00 will create two domains, with
     /// the corresponding CPUs belonging to each domain. Each CPU must
     /// belong to precisely one domain.
     #[clap(short = 'C', long, num_args = 1.., conflicts_with = "cache_level")]
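Decoding the example masks from that help text shows the resulting partition; the helper below is illustrative (the real option parsing, including the `_` digit separators, lives in scx_rusty):

/* Sketch: how the example --cpumasks values map to CPU ids per domain. */
#include <stdint.h>
#include <stdio.h>

static void print_domain(int id, uint64_t mask)
{
	printf("domain %d:", id);
	for (int cpu = 0; cpu < 64; cpu++)
		if (mask & (1ULL << cpu))
			printf(" %d", cpu);
	putchar('\n');
}

int main(void)
{
	uint64_t d0 = 0xff00ff;	/* --cpumasks 0xff_00ff -> CPUs 0-7, 16-23 */
	uint64_t d1 = 0xff00;	/* --cpumasks 0xff00    -> CPUs 8-15 */

	print_domain(0, d0);
	print_domain(1, d1);

	/* "Each CPU must belong to precisely one domain": masks are disjoint. */
	printf("overlap: %s\n", (d0 & d1) ? "yes" : "none");
	return 0;
}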
@@ -149,9 +149,8 @@ struct Opts {
     #[clap(long, default_value = "0")]
     greedy_threshold_x_numa: u32,
 
-    /// Disable load balancing. Unless disabled, periodically userspace will
-    /// calculate the load factor of each domain and instruct BPF which
-    /// processes to move.
+    /// Disable load balancing. Unless disabled, userspace will periodically calculate
+    /// the load factor of each domain and instruct BPF which processes to move.
     #[clap(long, action = clap::ArgAction::SetTrue)]
     no_load_balance: bool,
@@ -170,7 +169,7 @@ struct Opts {
     fifo_sched: bool,
 
     /// Idle CPUs with utilization lower than this will get remote tasks
-    /// directly pushed on them. 0 disables, 100 enables always.
+    /// directly pushed onto them. 0 disables, 100 always enables.
     #[clap(short = 'D', long, default_value = "90.0")]
     direct_greedy_under: f64,
@@ -181,7 +180,7 @@ struct Opts {
     kick_greedy_under: f64,
 
     /// Whether tasks can be pushed directly to idle CPUs on NUMA nodes
-    /// different than its domain's node. If direct-greedy-under is disabled,
+    /// different than their domain's node. If direct-greedy-under is disabled,
     /// this option is a no-op. Otherwise, if this option is set to false
     /// (default), tasks will only be directly pushed to idle CPUs if they
     /// reside on the same NUMA node as the task's domain.
@@ -203,7 +202,7 @@ struct Opts {
     #[clap(long)]
     stats: Option<f64>,
 
-    /// Run in stats monitoring mode with the specified interval. Scheduler
+    /// Run in stats monitoring mode with the specified interval. The scheduler
     /// is not launched.
     #[clap(long)]
     monitor: Option<f64>,