Mirror of https://github.com/sched-ext/scx.git (synced 2024-11-25 04:00:24 +00:00)

Commit fdfb7f60f4: Merge branch 'main' into htejun/scx_lavd-stats

@@ -1,3 +1,3 @@
 #!/bin/bash
 
-"$1" --version | sed -nr 's/^.*clang version ([\.0-9]*)(git)?(\+.*)?( .*)?$/\1/p'
+"$1" --version | sed -nr 's/^.*clang version ([\.0-9]*)(git)?(-rc.*)?(\+.*)?( .*)?$/\1/p'
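
The only change above is the extra (-rc.*)? group, so the script also reports a version for release-candidate clang builds. A minimal C sketch of the same extraction, for illustration only; the function name and the sample banner are assumptions, not part of the repository:

#include <stdio.h>
#include <string.h>

/* Copy the numeric version that follows "clang version ", stopping before a
 * "git", "-rc", "+" or space suffix -- the same value the sed command prints. */
static int clang_version(const char *banner, char *out, size_t out_len)
{
        const char *p = strstr(banner, "clang version ");

        if (!p)
                return -1;
        p += strlen("clang version ");

        size_t n = strspn(p, "0123456789.");
        if (!n || n + 1 > out_len)
                return -1;
        memcpy(out, p, n);
        out[n] = '\0';
        return 0;
}

int main(void)
{
        char ver[32];

        /* A release-candidate banner like this did not match the old pattern. */
        if (!clang_version("Ubuntu clang version 19.1.0-rc2", ver, sizeof(ver)))
                printf("%s\n", ver);    /* prints 19.1.0 */
        return 0;
}
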
@@ -384,8 +384,7 @@ static bool task_avg_nvcsw(struct task_struct *p)
  */
 static inline u64 task_deadline(struct task_struct *p)
 {
-        u64 dl_boost = lowlatency ?
-                       MIN(task_avg_nvcsw(p), nvcsw_max_thresh) * slice_ns : 0;
+        u64 dl_boost = lowlatency ? task_avg_nvcsw(p) * slice_ns : 0;
 
         /*
          * Limit the vruntime to (vtime_now - slice_ns_lag) to avoid
@@ -406,8 +405,11 @@ static inline u64 task_deadline(struct task_struct *p)
          * Return the task's deadline as its vruntime, with a bonus that is
          * proportional to the task's average number of voluntary context
          * switches.
+         *
+         * Also make sure the bonus is limited to the starvation threshold (to
+         * prevent starvation).
          */
-        return p->scx.dsq_vtime - dl_boost;
+        return p->scx.dsq_vtime - MIN(dl_boost, starvation_thresh_ns);
 }
 
 /*
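
Taken together, the two hunks above stop clamping the voluntary-context-switch average itself and instead cap the resulting vruntime bonus at the starvation threshold. A standalone C sketch of that arithmetic with made-up parameter values; the real code reads the scheduler's globals and per-task context rather than function arguments:

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Deadline = vruntime minus a boost proportional to the task's average
 * voluntary context switch rate, capped at the starvation threshold so a
 * highly interactive task cannot be scheduled arbitrarily far in the past
 * of the vruntime timeline. */
static uint64_t task_deadline_sketch(uint64_t dsq_vtime, uint64_t avg_nvcsw,
                                     uint64_t slice_ns,
                                     uint64_t starvation_thresh_ns,
                                     int lowlatency)
{
        uint64_t dl_boost = lowlatency ? avg_nvcsw * slice_ns : 0;

        return dsq_vtime - MIN(dl_boost, starvation_thresh_ns);
}

int main(void)
{
        /* Made-up numbers: 5 ms slice, 5 ms starvation threshold, a task
         * averaging 100 voluntary switches/sec. The raw boost (500 ms of
         * vruntime credit) is capped at the 5 ms threshold. */
        uint64_t dl = task_deadline_sketch(1000000000ULL, 100, 5000000ULL,
                                           5000000ULL, 1);

        printf("deadline = %llu\n", (unsigned long long)dl);
        return 0;
}
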
@@ -1062,18 +1064,19 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
          * Evaluate the average number of voluntary context switches per second
          * using an exponentially weighted moving average, see calc_avg().
          */
+        if (!lowlatency && !is_nvcsw_enabled())
+                return;
         delta_t = (s64)(now - tctx->nvcsw_ts);
-        if (is_nvcsw_enabled() && delta_t > NSEC_PER_SEC) {
+        if (delta_t > NSEC_PER_SEC) {
                 u64 delta_nvcsw = p->nvcsw - tctx->nvcsw;
                 u64 avg_nvcsw = delta_nvcsw * NSEC_PER_SEC / delta_t;
 
                 /*
                  * Evaluate the average nvcsw for the task, limited to the
-                 * range [0 .. nvcsw_max_thresh * 8] to prevent excessive
-                 * spikes.
+                 * range [0 .. 1000] to prevent excessive spikes.
                  */
                 tctx->avg_nvcsw = calc_avg_clamp(tctx->avg_nvcsw, avg_nvcsw,
-                                                 0, nvcsw_max_thresh << 3);
+                                                 0, MAX(nvcsw_max_thresh, 1000));
                 tctx->nvcsw = p->nvcsw;
                 tctx->nvcsw_ts = now;
 
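For reference, a sketch of the clamped moving average used above. The 3/4-old, 1/4-new weighting mirrors the calc_avg() helper shared by the scx schedulers, but both helpers here should be read as assumptions; only the call site and the new [0, MAX(nvcsw_max_thresh, 1000)] clamp come from the diff:

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* Exponentially weighted moving average: keep 3/4 of the old value and
 * blend in 1/4 of the new sample. */
static uint64_t calc_avg(uint64_t old_val, uint64_t new_val)
{
        return (old_val - (old_val >> 2)) + (new_val >> 2);
}

/* Same average, clamped to [low, high] to absorb one-off spikes. */
static uint64_t calc_avg_clamp(uint64_t old_val, uint64_t new_val,
                               uint64_t low, uint64_t high)
{
        uint64_t avg = calc_avg(old_val, new_val);

        return avg < low ? low : avg > high ? high : avg;
}

/* Usage as in the hunk: the sample is voluntary context switches per second
 * over the last interval, and the averaged value is clamped to at most
 * MAX(nvcsw_max_thresh, 1000). */
static uint64_t update_avg_nvcsw(uint64_t avg, uint64_t delta_nvcsw,
                                 uint64_t delta_t_ns, uint64_t nvcsw_max_thresh)
{
        uint64_t sample = delta_nvcsw * 1000000000ULL / delta_t_ns;

        return calc_avg_clamp(avg, sample, 0, MAX(nvcsw_max_thresh, 1000));
}

int main(void)
{
        /* Made-up sample: 5000 voluntary switches over 1 s, previous avg 200,
         * nvcsw_max_thresh 10 -> the result is clamped to 1000. */
        printf("%llu\n", (unsigned long long)
               update_avg_nvcsw(200, 5000, 1000000000ULL, 10));
        return 0;
}
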
@@ -1098,10 +1101,6 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
                  * Reresh task status: interactive or regular.
                  */
                 update_task_interactive(tctx);
-
-                dbg_msg("%d (%s) avg_nvcsw = %llu [%s]",
-                        p->pid, p->comm, tctx->avg_nvcsw,
-                        tctx->avg_nvcsw < nvcsw_avg_thresh ? "regular" : "interactive");
         }
 }
 
@@ -81,28 +81,27 @@ struct Opts {
     #[clap(long = "balanced", action = clap::ArgAction::SetTrue)]
     balanced: bool,
 
-    /// The following 4 options are set automatically by the power mode (above), but they can be
-    /// set independently if desired:
-
     /// Disable core compaction and schedule tasks across all online CPUs. Core compaction attempts
     /// to keep idle CPUs idle in favor of scheduling tasks on CPUs that are already
-    /// awake. See main.bpf.c for more info.
+    /// awake. See main.bpf.c for more info. Normally set by the power mode, but can be set independently if
+    /// desired.
     #[clap(long = "no-core-compaction", action = clap::ArgAction::SetTrue)]
     no_core_compaction: bool,
 
     /// Schedule tasks on SMT siblings before using other physcial cores when core compaction is
-    /// enabled.
+    /// enabled. Normally set by the power mode, but can be set independently if desired.
     #[clap(long = "prefer-smt-core", action = clap::ArgAction::SetTrue)]
     prefer_smt_core: bool,
 
     /// Schedule tasks on little (efficiency) cores before big (performance) cores when core compaction is
-    /// enabled.
+    /// enabled. Normally set by the power mode, but can be set independently if desired.
     #[clap(long = "prefer-little-core", action = clap::ArgAction::SetTrue)]
     prefer_little_core: bool,
 
     /// Disable controlling the CPU frequency. In order to improve latency and responsiveness of
     /// performance-critical tasks, scx_lavd increases the CPU frequency even if CPU usage is low.
-    /// See main.bpf.c for more info.
+    /// See main.bpf.c for more info. Normally set by the power mode, but can be set independently
+    /// if desired.
     #[clap(long = "no-freq-scaling", action = clap::ArgAction::SetTrue)]
     no_freq_scaling: bool,
 
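The reworded help text above describes a layering: the selected power mode seeds defaults for the four knobs, and an explicit flag simply overrides the seeded value. A hypothetical C sketch of that relationship; the enum, struct, and the particular flag combinations are illustrative guesses, not scx_lavd's actual defaults:

#include <stdbool.h>
#include <stdio.h>

enum power_mode { MODE_PERFORMANCE, MODE_BALANCED, MODE_POWERSAVE };

struct knobs {
        bool no_core_compaction;
        bool prefer_smt_core;
        bool prefer_little_core;
        bool no_freq_scaling;
};

/* Seed the four knobs from the power mode; the combinations shown are only
 * plausible examples of the idea, not the scheduler's real policy. */
static struct knobs knobs_for_mode(enum power_mode mode)
{
        switch (mode) {
        case MODE_PERFORMANCE:
                /* e.g. spread work across all CPUs */
                return (struct knobs){ .no_core_compaction = true };
        case MODE_POWERSAVE:
                /* e.g. pack work onto few, efficient cores */
                return (struct knobs){ .prefer_smt_core = true,
                                       .prefer_little_core = true };
        case MODE_BALANCED:
        default:
                return (struct knobs){ 0 };
        }
}

/* A CLI flag that was passed explicitly wins over the mode default. */
static bool resolve(bool mode_default, bool cli_flag_set, bool cli_value)
{
        return cli_flag_set ? cli_value : mode_default;
}

int main(void)
{
        struct knobs k = knobs_for_mode(MODE_POWERSAVE);

        /* User passed --prefer-little-core=false on top of powersave. */
        bool little = resolve(k.prefer_little_core, true, false);

        printf("prefer_smt_core=%d prefer_little_core=%d\n",
               k.prefer_smt_core, little);
        return 0;
}
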
@@ -111,6 +111,7 @@ enum layer_match_kind {
         MATCH_GROUP_ID_EQUALS,
         MATCH_PID_EQUALS,
         MATCH_PPID_EQUALS,
+        MATCH_TGID_EQUALS,
 
         NR_LAYER_MATCH_KINDS,
 };
@@ -125,6 +126,7 @@ struct layer_match {
         u32 group_id;
         u32 pid;
         u32 ppid;
+        u32 tgid;
 };
 
 struct layer_match_ands {
@@ -1018,6 +1018,8 @@ static __noinline bool match_one(struct layer_match *match,
                 return p->pid == match->pid;
         case MATCH_PPID_EQUALS:
                 return p->real_parent->pid == match->ppid;
+        case MATCH_TGID_EQUALS:
+                return p->tgid == match->tgid;
         default:
                 scx_bpf_error("invalid match kind %d", match->kind);
                 return result;
@@ -1625,6 +1627,9 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
                 case MATCH_PPID_EQUALS:
                         dbg("%s PPID %u", header, match->ppid);
                         break;
+                case MATCH_TGID_EQUALS:
+                        dbg("%s TGID %u", header, match->tgid);
+                        break;
                 default:
                         scx_bpf_error("%s Invalid kind", header);
                         return -EINVAL;
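
The new match kind matters because, in kernel terms, p->pid is the thread ID while p->tgid is the thread-group (process) ID, so MATCH_TGID_EQUALS places every thread of a process into the layer whereas MATCH_PID_EQUALS targets a single thread. A small userspace C illustration of that distinction (build with -pthread; gettid() needs glibc 2.30 or newer; the program itself is not part of the repository):

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static void *worker(void *arg)
{
        (void)arg;
        /* Same tgid (process ID) as main, but a different pid (thread ID). */
        printf("worker: getpid()=%d gettid()=%d\n", getpid(), gettid());
        return NULL;
}

int main(void)
{
        pthread_t t;

        /* For the main thread the two IDs coincide. */
        printf("main:   getpid()=%d gettid()=%d\n", getpid(), gettid());
        pthread_create(&t, NULL, worker, NULL);
        pthread_join(t, NULL);
        return 0;
}
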
@@ -436,6 +436,7 @@ enum LayerMatch {
     GIDEquals(u32),
     PIDEquals(u32),
     PPIDEquals(u32),
+    TGIDEquals(u32),
 }
 
 #[derive(Clone, Debug, Serialize, Deserialize)]
@@ -1443,6 +1444,10 @@ impl<'a, 'b> Scheduler<'a, 'b> {
                         mt.kind = bpf_intf::layer_match_kind_MATCH_PPID_EQUALS as i32;
                         mt.ppid = *ppid;
                     }
+                    LayerMatch::TGIDEquals(tgid) => {
+                        mt.kind = bpf_intf::layer_match_kind_MATCH_TGID_EQUALS as i32;
+                        mt.tgid = *tgid;
+                    }
                 }
             }
             layer.matches[or_i].nr_match_ands = or.len() as i32;
@@ -15,15 +15,15 @@ Available as a [Rust crate](https://crates.io/crates/scx_rusty): `cargo add scx_
 
 ## Typical Use Case
 
-Rusty is designed to be flexible, and accommodate different architectures and
-workloads. Various load balancing thresholds (e.g. greediness, frequenty, etc),
+Rusty is designed to be flexible, accommodating different architectures and
+workloads. Various load balancing thresholds (e.g. greediness, frequency, etc),
 as well as how Rusty should partition the system into scheduling domains, can
 be tuned to achieve the optimal configuration for any given system or workload.
 
 ## Production Ready?
 
 Yes. If tuned correctly, rusty should be performant across various CPU
-architectures and workloads. Rusty by default creates a separate scheduling
+architectures and workloads. By default, rusty creates a separate scheduling
 domain per-LLC, so its default configuration may be performant as well. Note
 however that scx_rusty does not yet disambiguate between LLCs in different NUMA
 nodes, so it may perform better on multi-CCX machines where all the LLCs share
@@ -69,7 +69,7 @@ const MAX_CPUS: usize = bpf_intf::consts_MAX_CPUS as usize;
 ///
 /// The userspace part performs two roles. First, it makes higher frequency
 /// (100ms) tuning decisions. It identifies CPUs which are not too heavily
-/// loaded and mark them so that they can pull tasks from other overloaded
+/// loaded and marks them so that they can pull tasks from other overloaded
 /// domains on the fly.
 ///
 /// Second, it drives lower frequency (2s) load balancing. It determines
@@ -79,10 +79,10 @@ const MAX_CPUS: usize = bpf_intf::consts_MAX_CPUS as usize;
 /// migrated.
 ///
 /// The overhead of userspace operations is low. Load balancing is not
-/// performed frequently but work-conservation is still maintained through
+/// performed frequently, but work-conservation is still maintained through
 /// tuning and greedy execution. Load balancing itself is not that expensive
-/// either. It only accesses per-domain load metrics to determine the
-/// domains that need load balancing and limited number of per-task metrics
+/// either. It only accesses per-domain load metrics to determine the domains
+/// that need load balancing, as well as limited number of per-task metrics
 /// for each pushing domain.
 ///
 /// An earlier variant of this scheduler was used to balance across six
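
The doc comment above describes two userspace cadences: fast (~100 ms) tuning and slow (~2 s) load balancing. A minimal C sketch of that loop structure; the intervals come from the text, while tune() and load_balance() are empty placeholders rather than scx_rusty's real logic:

#include <stdio.h>
#include <time.h>

static void tune(void)         { /* mark lightly loaded CPUs as pull targets */ }
static void load_balance(void) { /* compute domain loads, pick tasks to migrate */ }

int main(void)
{
        const long tune_ms = 100, lb_ms = 2000;
        long elapsed_ms = 0;

        /* Bounded here for the sketch; a real scheduler daemon loops forever. */
        for (int i = 0; i < 60; i++) {
                struct timespec ts = { .tv_sec = 0,
                                       .tv_nsec = tune_ms * 1000000L };

                nanosleep(&ts, NULL);
                elapsed_ms += tune_ms;

                tune();                         /* every 100 ms */
                if (elapsed_ms >= lb_ms) {      /* every 2 s */
                        load_balance();
                        elapsed_ms = 0;
                }
        }
        return 0;
}
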
@@ -106,7 +106,7 @@ struct Opts {
     #[clap(short = 'i', long, default_value = "2.0")]
     interval: f64,
 
-    /// Tuner runs at higher frequency than the load balancer to dynamically
+    /// The tuner runs at a higher frequency than the load balancer to dynamically
     /// tune scheduling behavior. Tuning interval in seconds.
     #[clap(short = 'I', long, default_value = "0.1")]
     tune_interval: f64,
@@ -121,8 +121,8 @@ struct Opts {
     cache_level: u32,
 
     /// Instead of using cache locality, set the cpumask for each domain
-    /// manually, provide multiple --cpumasks, one for each domain. E.g.
-    /// --cpumasks 0xff_00ff --cpumasks 0xff00 will create two domains with
+    /// manually. Provide multiple --cpumasks, one for each domain. E.g.
+    /// --cpumasks 0xff_00ff --cpumasks 0xff00 will create two domains, with
     /// the corresponding CPUs belonging to each domain. Each CPU must
     /// belong to precisely one domain.
     #[clap(short = 'C', long, num_args = 1.., conflicts_with = "cache_level")]
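
As a worked example of the --cpumasks values quoted above (reading the underscore as a digit separator): 0xff_00ff selects CPUs 0-7 and 16-23 for the first domain, and 0xff00 selects CPUs 8-15 for the second. A tiny C check of that bit arithmetic:

#include <stdio.h>

/* Print the CPU indices whose bits are set in a hex cpumask. */
static void print_cpus(const char *name, unsigned long long mask)
{
        printf("%s:", name);
        for (int cpu = 0; cpu < 64; cpu++)
                if (mask & (1ULL << cpu))
                        printf(" %d", cpu);
        printf("\n");
}

int main(void)
{
        print_cpus("domain 0 (0xff_00ff)", 0xff00ffULL);  /* CPUs 0-7, 16-23 */
        print_cpus("domain 1 (0xff00)",    0xff00ULL);    /* CPUs 8-15 */
        return 0;
}
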
@@ -149,9 +149,8 @@ struct Opts {
     #[clap(long, default_value = "0")]
     greedy_threshold_x_numa: u32,
 
-    /// Disable load balancing. Unless disabled, periodically userspace will
-    /// calculate the load factor of each domain and instruct BPF which
-    /// processes to move.
+    /// Disable load balancing. Unless disabled, userspace will periodically calculate
+    /// the load factor of each domain and instruct BPF which processes to move.
     #[clap(long, action = clap::ArgAction::SetTrue)]
     no_load_balance: bool,
 
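To illustrate the load balancing that this flag disables, a simplified C sketch: compute each domain's deviation from the average load, then treat domains above the average as push candidates and those below as pull candidates. The numbers are made up and the real scx_rusty algorithm is considerably more involved:

#include <stdio.h>

#define NR_DOMS 4

int main(void)
{
        double load[NR_DOMS] = { 4.0, 1.0, 2.5, 0.5 };  /* made-up per-domain loads */
        double total = 0.0;

        for (int i = 0; i < NR_DOMS; i++)
                total += load[i];

        double avg = total / NR_DOMS;

        for (int i = 0; i < NR_DOMS; i++) {
                double imbalance = load[i] - avg;

                /* Domains above the average push work; those below pull it. */
                printf("dom %d: load %.2f -> %s %.2f\n", i, load[i],
                       imbalance > 0 ? "push" : "pull",
                       imbalance > 0 ? imbalance : -imbalance);
        }
        return 0;
}
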
@@ -170,7 +169,7 @@ struct Opts {
     fifo_sched: bool,
 
     /// Idle CPUs with utilization lower than this will get remote tasks
-    /// directly pushed on them. 0 disables, 100 enables always.
+    /// directly pushed onto them. 0 disables, 100 always enables.
     #[clap(short = 'D', long, default_value = "90.0")]
     direct_greedy_under: f64,
 
@@ -181,7 +180,7 @@ struct Opts {
     kick_greedy_under: f64,
 
     /// Whether tasks can be pushed directly to idle CPUs on NUMA nodes
-    /// different than its domain's node. If direct-greedy-under is disabled,
+    /// different than their domain's node. If direct-greedy-under is disabled,
     /// this option is a no-op. Otherwise, if this option is set to false
     /// (default), tasks will only be directly pushed to idle CPUs if they
     /// reside on the same NUMA node as the task's domain.
@@ -203,7 +202,7 @@ struct Opts {
     #[clap(long)]
     stats: Option<f64>,
 
-    /// Run in stats monitoring mode with the specified interval. Scheduler
+    /// Run in stats monitoring mode with the specified interval. The scheduler
     /// is not launched.
     #[clap(long)]
     monitor: Option<f64>,