linux: Add cgroups patches for 4.9, 4.10, 4.11
This commit is contained in:
parent
d7501b986a
commit
b5169fd277
784
pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.10.patch
Normal file
784
pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.10.patch
Normal file
@ -0,0 +1,784 @@
|
||||
commit d0273888226b264d34795970c073d6e935d5114f
|
||||
Author: Tejun Heo <tj@kernel.org>
|
||||
Date: Fri Mar 11 07:31:23 2016 -0500
|
||||
|
||||
sched: Misc preps for cgroup unified hierarchy interface
|
||||
|
||||
Make the following changes in preparation for the cpu controller
|
||||
interface implementation for the unified hierarchy. This patch
|
||||
doesn't cause any functional differences.
|
||||
|
||||
* s/cpu_stats_show()/cpu_cfs_stats_show()/
|
||||
|
||||
* s/cpu_files/cpu_legacy_files/
|
||||
|
||||
* Separate out cpuacct_stats_read() from cpuacct_stats_show(). While
|
||||
at it, make the @val array u64 for consistency.
|
||||
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
Cc: Ingo Molnar <mingo@redhat.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Li Zefan <lizefan@huawei.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
|
||||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
||||
index c56fb57f2991..112037890e9b 100644
|
||||
--- a/kernel/sched/core.c
|
||||
+++ b/kernel/sched/core.c
|
||||
@@ -8724,7 +8724,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
|
||||
return ret;
|
||||
}
|
||||
|
||||
-static int cpu_stats_show(struct seq_file *sf, void *v)
|
||||
+static int cpu_cfs_stats_show(struct seq_file *sf, void *v)
|
||||
{
|
||||
struct task_group *tg = css_tg(seq_css(sf));
|
||||
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
|
||||
@@ -8764,7 +8764,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
|
||||
}
|
||||
#endif /* CONFIG_RT_GROUP_SCHED */
|
||||
|
||||
-static struct cftype cpu_files[] = {
|
||||
+static struct cftype cpu_legacy_files[] = {
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
{
|
||||
.name = "shares",
|
||||
@@ -8785,7 +8785,7 @@ static struct cftype cpu_files[] = {
|
||||
},
|
||||
{
|
||||
.name = "stat",
|
||||
- .seq_show = cpu_stats_show,
|
||||
+ .seq_show = cpu_cfs_stats_show,
|
||||
},
|
||||
#endif
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
@@ -8810,7 +8810,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
|
||||
.fork = cpu_cgroup_fork,
|
||||
.can_attach = cpu_cgroup_can_attach,
|
||||
.attach = cpu_cgroup_attach,
|
||||
- .legacy_cftypes = cpu_files,
|
||||
+ .legacy_cftypes = cpu_legacy_files,
|
||||
.early_init = true,
|
||||
};
|
||||
|
||||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
|
||||
index 9add206b5608..4dd7b8588b69 100644
|
||||
--- a/kernel/sched/cpuacct.c
|
||||
+++ b/kernel/sched/cpuacct.c
|
||||
@@ -276,26 +276,33 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static int cpuacct_stats_show(struct seq_file *sf, void *v)
|
||||
+static void cpuacct_stats_read(struct cpuacct *ca,
|
||||
+ u64 (*val)[CPUACCT_STAT_NSTATS])
|
||||
{
|
||||
- struct cpuacct *ca = css_ca(seq_css(sf));
|
||||
- s64 val[CPUACCT_STAT_NSTATS];
|
||||
int cpu;
|
||||
- int stat;
|
||||
|
||||
- memset(val, 0, sizeof(val));
|
||||
+ memset(val, 0, sizeof(*val));
|
||||
+
|
||||
for_each_possible_cpu(cpu) {
|
||||
u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
|
||||
|
||||
- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
|
||||
- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
|
||||
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
|
||||
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
|
||||
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
|
||||
+ (*val)[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
|
||||
+ (*val)[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
|
||||
+ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
|
||||
+ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
|
||||
+ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
|
||||
}
|
||||
+}
|
||||
+
|
||||
+static int cpuacct_stats_show(struct seq_file *sf, void *v)
|
||||
+{
|
||||
+ u64 val[CPUACCT_STAT_NSTATS];
|
||||
+ int stat;
|
||||
+
|
||||
+ cpuacct_stats_read(css_ca(seq_css(sf)), &val);
|
||||
|
||||
for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
|
||||
- seq_printf(sf, "%s %lld\n",
|
||||
+ seq_printf(sf, "%s %llu\n",
|
||||
cpuacct_stat_desc[stat],
|
||||
(long long)cputime64_to_clock_t(val[stat]));
|
||||
}
|
||||
|
||||
commit 41103511fa43b4aa04fea259c5c60fef752ddefb
|
||||
Author: Tejun Heo <tj@kernel.org>
|
||||
Date: Fri Mar 11 07:31:23 2016 -0500
|
||||
|
||||
sched: Implement interface for cgroup unified hierarchy
|
||||
|
||||
While the cpu controller doesn't have any functional problems, there
|
||||
are a couple interface issues which can be addressed in the v2
|
||||
interface.
|
||||
|
||||
* cpuacct being a separate controller. This separation is artificial
|
||||
and rather pointless as demonstrated by most use cases co-mounting
|
||||
the two controllers. It also forces certain information to be
|
||||
accounted twice.
|
||||
|
||||
* Use of different time units. Writable control knobs use
|
||||
microseconds, some stat fields use nanoseconds while other cpuacct
|
||||
stat fields use centiseconds.
|
||||
|
||||
* Control knobs which can't be used in the root cgroup still show up
|
||||
in the root.
|
||||
|
||||
* Control knob names and semantics aren't consistent with other
|
||||
controllers.
|
||||
|
||||
This patchset implements cpu controller's interface on the unified
|
||||
hierarchy which adheres to the controller file conventions described
|
||||
in Documentation/cgroups/unified-hierarchy.txt. Overall, the
|
||||
following changes are made.
|
||||
|
||||
* cpuacct is implicitly enabled and disabled by cpu and its information
|
||||
is reported through "cpu.stat" which now uses microseconds for all
|
||||
time durations. All time duration fields now have "_usec" appended
|
||||
to them for clarity. While this doesn't solve the double accounting
|
||||
immediately, once majority of users switch to v2, cpu can directly
|
||||
account and report the relevant stats and cpuacct can be disabled on
|
||||
the unified hierarchy.
|
||||
|
||||
Note that cpuacct.usage_percpu is currently not included in
|
||||
"cpu.stat". If this information is actually called for, it can be
|
||||
added later.
|
||||
|
||||
* "cpu.shares" is replaced with "cpu.weight" and operates on the
|
||||
standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000).
|
||||
The weight is scaled to scheduler weight so that 100 maps to 1024
|
||||
and the ratio relationship is preserved - if weight is W and its
|
||||
scaled value is S, W / 100 == S / 1024. While the mapped range is a
|
||||
bit smaller than the original scheduler weight range, the dead zones
|
||||
on both sides are relatively small and covers wider range than the
|
||||
nice value mappings. This file doesn't make sense in the root
|
||||
cgroup and isn't created on root.
|
||||
|
||||
* "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max"
|
||||
which contains both quota and period.
|
||||
|
||||
* "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by
|
||||
"cpu.rt.max" which contains both runtime and period.
|
||||
|
||||
v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for
|
||||
CFS bandwidth stats and also using raw division for u64. Use
|
||||
CONFIG_CFS_BANDWIDTH and do_div() instead.
|
||||
|
||||
The semantics of "cpu.rt.max" is not fully decided yet. Dropped
|
||||
for now.
|
||||
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
Cc: Ingo Molnar <mingo@redhat.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Li Zefan <lizefan@huawei.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
|
||||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
||||
index 112037890e9b..a80d586a4317 100644
|
||||
--- a/kernel/sched/core.c
|
||||
+++ b/kernel/sched/core.c
|
||||
@@ -8803,6 +8803,139 @@ static struct cftype cpu_legacy_files[] = {
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
+static int cpu_stats_show(struct seq_file *sf, void *v)
|
||||
+{
|
||||
+ cpuacct_cpu_stats_show(sf);
|
||||
+
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+ {
|
||||
+ struct task_group *tg = css_tg(seq_css(sf));
|
||||
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
|
||||
+ u64 throttled_usec;
|
||||
+
|
||||
+ throttled_usec = cfs_b->throttled_time;
|
||||
+ do_div(throttled_usec, NSEC_PER_USEC);
|
||||
+
|
||||
+ seq_printf(sf, "nr_periods %d\n"
|
||||
+ "nr_throttled %d\n"
|
||||
+ "throttled_usec %llu\n",
|
||||
+ cfs_b->nr_periods, cfs_b->nr_throttled,
|
||||
+ throttled_usec);
|
||||
+ }
|
||||
+#endif
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
|
||||
+ struct cftype *cft)
|
||||
+{
|
||||
+ struct task_group *tg = css_tg(css);
|
||||
+ u64 weight = scale_load_down(tg->shares);
|
||||
+
|
||||
+ return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
|
||||
+}
|
||||
+
|
||||
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
|
||||
+ struct cftype *cftype, u64 weight)
|
||||
+{
|
||||
+ /*
|
||||
+ * cgroup weight knobs should use the common MIN, DFL and MAX
|
||||
+ * values which are 1, 100 and 10000 respectively. While it loses
|
||||
+ * a bit of range on both ends, it maps pretty well onto the shares
|
||||
+ * value used by scheduler and the round-trip conversions preserve
|
||||
+ * the original value over the entire range.
|
||||
+ */
|
||||
+ if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
|
||||
+ return -ERANGE;
|
||||
+
|
||||
+ weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
|
||||
+
|
||||
+ return sched_group_set_shares(css_tg(css), scale_load(weight));
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
|
||||
+ long period, long quota)
|
||||
+{
|
||||
+ if (quota < 0)
|
||||
+ seq_puts(sf, "max");
|
||||
+ else
|
||||
+ seq_printf(sf, "%ld", quota);
|
||||
+
|
||||
+ seq_printf(sf, " %ld\n", period);
|
||||
+}
|
||||
+
|
||||
+/* Parse "<quota|max> [<period>]" from @buf. Caller should put the current value in *@periodp before calling; it is scaled by NSEC_PER_USEC even when @buf carries no period. Returns 0 or -EINVAL. */
|
||||
+static int __maybe_unused cpu_period_quota_parse(char *buf,
|
||||
+ u64 *periodp, u64 *quotap)
|
||||
+{
|
||||
+ char tok[21]; /* U64_MAX: 20 digits + NUL */
|
||||
+
|
||||
+ if (sscanf(buf, "%20s %llu", tok, periodp) < 1) /* bound %s to sizeof(tok) - 1 so a long token cannot overflow tok */
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ *periodp *= NSEC_PER_USEC;
|
||||
+
|
||||
+ if (sscanf(tok, "%llu", quotap))
|
||||
+ *quotap *= NSEC_PER_USEC;
|
||||
+ else if (!strcmp(tok, "max"))
|
||||
+ *quotap = RUNTIME_INF;
|
||||
+ else
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+static int cpu_max_show(struct seq_file *sf, void *v)
|
||||
+{
|
||||
+ struct task_group *tg = css_tg(seq_css(sf));
|
||||
+
|
||||
+ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
|
||||
+ char *buf, size_t nbytes, loff_t off)
|
||||
+{
|
||||
+ struct task_group *tg = css_tg(of_css(of));
|
||||
+ u64 period = tg_get_cfs_period(tg);
|
||||
+ u64 quota;
|
||||
+ int ret;
|
||||
+
|
||||
+ ret = cpu_period_quota_parse(buf, &period, &quota);
|
||||
+ if (!ret)
|
||||
+ ret = tg_set_cfs_bandwidth(tg, period, quota);
|
||||
+ return ret ?: nbytes;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+static struct cftype cpu_files[] = {
|
||||
+ {
|
||||
+ .name = "stat",
|
||||
+ .flags = CFTYPE_NOT_ON_ROOT,
|
||||
+ .seq_show = cpu_stats_show,
|
||||
+ },
|
||||
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
+ {
|
||||
+ .name = "weight",
|
||||
+ .flags = CFTYPE_NOT_ON_ROOT,
|
||||
+ .read_u64 = cpu_weight_read_u64,
|
||||
+ .write_u64 = cpu_weight_write_u64,
|
||||
+ },
|
||||
+#endif
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+ {
|
||||
+ .name = "max",
|
||||
+ .flags = CFTYPE_NOT_ON_ROOT,
|
||||
+ .seq_show = cpu_max_show,
|
||||
+ .write = cpu_max_write,
|
||||
+ },
|
||||
+#endif
|
||||
+ { } /* terminate */
|
||||
+};
|
||||
+
|
||||
struct cgroup_subsys cpu_cgrp_subsys = {
|
||||
.css_alloc = cpu_cgroup_css_alloc,
|
||||
.css_released = cpu_cgroup_css_released,
|
||||
@@ -8811,7 +8944,15 @@ struct cgroup_subsys cpu_cgrp_subsys = {
|
||||
.can_attach = cpu_cgroup_can_attach,
|
||||
.attach = cpu_cgroup_attach,
|
||||
.legacy_cftypes = cpu_legacy_files,
|
||||
+ .dfl_cftypes = cpu_files,
|
||||
.early_init = true,
|
||||
+#ifdef CONFIG_CGROUP_CPUACCT
|
||||
+ /*
|
||||
+ * cpuacct is enabled together with cpu on the unified hierarchy
|
||||
+ * and its stats are reported through "cpu.stat".
|
||||
+ */
|
||||
+ .depends_on = 1 << cpuacct_cgrp_id,
|
||||
+#endif
|
||||
};
|
||||
|
||||
#endif /* CONFIG_CGROUP_SCHED */
|
||||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
|
||||
index 4dd7b8588b69..97c6dd7d8f59 100644
|
||||
--- a/kernel/sched/cpuacct.c
|
||||
+++ b/kernel/sched/cpuacct.c
|
||||
@@ -347,6 +347,31 @@ static struct cftype files[] = {
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
+/* used to print cpuacct stats in cpu.stat on the unified hierarchy */
|
||||
+void cpuacct_cpu_stats_show(struct seq_file *sf)
|
||||
+{
|
||||
+ struct cgroup_subsys_state *css;
|
||||
+ u64 usage, val[CPUACCT_STAT_NSTATS];
|
||||
+
|
||||
+ css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys);
|
||||
+
|
||||
+ usage = cpuusage_read(css, seq_cft(sf));
|
||||
+ cpuacct_stats_read(css_ca(css), &val);
|
||||
+
|
||||
+ val[CPUACCT_STAT_USER] *= TICK_NSEC;
|
||||
+ val[CPUACCT_STAT_SYSTEM] *= TICK_NSEC;
|
||||
+ do_div(usage, NSEC_PER_USEC);
|
||||
+ do_div(val[CPUACCT_STAT_USER], NSEC_PER_USEC);
|
||||
+ do_div(val[CPUACCT_STAT_SYSTEM], NSEC_PER_USEC);
|
||||
+
|
||||
+ seq_printf(sf, "usage_usec %llu\n"
|
||||
+ "user_usec %llu\n"
|
||||
+ "system_usec %llu\n",
|
||||
+ usage, val[CPUACCT_STAT_USER], val[CPUACCT_STAT_SYSTEM]);
|
||||
+
|
||||
+ css_put(css);
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* charge this task's execution time to its accounting group.
|
||||
*
|
||||
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
|
||||
index ba72807c73d4..ddf7af466d35 100644
|
||||
--- a/kernel/sched/cpuacct.h
|
||||
+++ b/kernel/sched/cpuacct.h
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
|
||||
extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
|
||||
+extern void cpuacct_cpu_stats_show(struct seq_file *sf);
|
||||
|
||||
#else
|
||||
|
||||
@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
|
||||
{
|
||||
}
|
||||
|
||||
+static inline void cpuacct_cpu_stats_show(struct seq_file *sf)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
#endif
|
||||
|
||||
commit 2dae6b0ec091c93131e02eb56987ec6c26818f42
|
||||
Author: Tejun Heo <tj@kernel.org>
|
||||
Date: Fri Aug 5 12:41:01 2016 -0400
|
||||
|
||||
cgroup: add documentation regarding CPU controller cgroup v2 support
|
||||
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
|
||||
diff --git a/Documentation/cgroup-v2-cpu.txt b/Documentation/cgroup-v2-cpu.txt
|
||||
new file mode 100644
|
||||
index 000000000000..1ed7032d4472
|
||||
--- /dev/null
|
||||
+++ b/Documentation/cgroup-v2-cpu.txt
|
||||
@@ -0,0 +1,368 @@
|
||||
+
|
||||
+
|
||||
+CPU Controller on Control Group v2
|
||||
+
|
||||
+August, 2016 Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+
|
||||
+While most controllers have support for cgroup v2 now, the CPU
|
||||
+controller support is not upstream yet due to objections from the
|
||||
+scheduler maintainers on the basic designs of cgroup v2. This
|
||||
+document explains the current situation as well as an interim
|
||||
+solution, and details the disagreements and arguments. The latest
|
||||
+version of this document can be found at the following URL.
|
||||
+
|
||||
+ https://git.kernel.org/cgit/linux/kernel/git/tj/cgroup.git/tree/Documentation/cgroup-v2-cpu.txt?h=cgroup-v2-cpu
|
||||
+
|
||||
+This document was posted to the linux-kernel and cgroup mailing lists.
|
||||
+Unfortunately, no consensus was reached as of Oct, 2016. The thread
|
||||
+can be found at the following URL.
|
||||
+
|
||||
+ http://lkml.kernel.org/r/20160805170752.GK2542@mtj.duckdns.org
|
||||
+
|
||||
+
|
||||
+CONTENTS
|
||||
+
|
||||
+1. Current Situation and Interim Solution
|
||||
+2. Disagreements and Arguments
|
||||
+ 2-1. Contentious Restrictions
|
||||
+ 2-1-1. Process Granularity
|
||||
+ 2-1-2. No Internal Process Constraint
|
||||
+ 2-2. Impact on CPU Controller
|
||||
+ 2-2-1. Impact of Process Granularity
|
||||
+ 2-2-2. Impact of No Internal Process Constraint
|
||||
+ 2-3. Arguments for cgroup v2
|
||||
+3. Way Forward
|
||||
+4. References
|
||||
+
|
||||
+
|
||||
+1. Current Situation and Interim Solution
|
||||
+
|
||||
+All objections from the scheduler maintainers apply to cgroup v2 core
|
||||
+design, and there are no known objections to the specifics of the CPU
|
||||
+controller cgroup v2 interface. The only blocked part is changes to
|
||||
+expose the CPU controller interface on cgroup v2, which comprises the
|
||||
+following two patches:
|
||||
+
|
||||
+ [1] sched: Misc preps for cgroup unified hierarchy interface
|
||||
+ [2] sched: Implement interface for cgroup unified hierarchy
|
||||
+
|
||||
+The necessary changes are superficial and implement the interface
|
||||
+files on cgroup v2. The combined diffstat is as follows.
|
||||
+
|
||||
+ kernel/sched/core.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++--
|
||||
+ kernel/sched/cpuacct.c | 57 ++++++++++++------
|
||||
+ kernel/sched/cpuacct.h | 5 +
|
||||
+ 3 files changed, 189 insertions(+), 22 deletions(-)
|
||||
+
|
||||
+The patches are easy to apply and forward-port. The following git
|
||||
+branch will always carry the two patches on top of the latest release
|
||||
+of the upstream kernel.
|
||||
+
|
||||
+ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu
|
||||
+
|
||||
+There also are versioned branches going back to v4.4.
|
||||
+
|
||||
+ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu-$KERNEL_VER
|
||||
+
|
||||
+While it's difficult to tell whether the CPU controller support will
|
||||
+be merged, there are crucial resource control features in cgroup v2
|
||||
+that are only possible due to the design choices that are being
|
||||
+objected to, and every effort will be made to ease enabling the CPU
|
||||
+controller cgroup v2 support out-of-tree for parties which choose to.
|
||||
+
|
||||
+
|
||||
+2. Disagreements and Arguments
|
||||
+
|
||||
+There have been several lengthy discussion threads [3][4] on LKML
|
||||
+around the structural constraints of cgroup v2. The two that affect
|
||||
+the CPU controller are process granularity and no internal process
|
||||
+constraint. Both arise primarily from the need for common resource
|
||||
+domain definition across different resources.
|
||||
+
|
||||
+The common resource domain is a powerful concept in cgroup v2 that
|
||||
+allows controllers to make basic assumptions about the structural
|
||||
+organization of processes and controllers inside the cgroup hierarchy,
|
||||
+and thus solve problems spanning multiple types of resources. The
|
||||
+prime example for this is page cache writeback: dirty page cache is
|
||||
+regulated through throttling buffered writers based on memory
|
||||
+availability, and initiating batched write outs to the disk based on
|
||||
+IO capacity. Tracking and controlling writeback inside a cgroup thus
|
||||
+requires the direct cooperation of the memory and the IO controller.
|
||||
+
|
||||
+This easily extends to other areas, such as CPU cycles consumed while
|
||||
+performing memory reclaim or IO encryption.
|
||||
+
|
||||
+
|
||||
+2-1. Contentious Restrictions
|
||||
+
|
||||
+For controllers of different resources to work together, they must
|
||||
+agree on a common organization. This uniform model across controllers
|
||||
+imposes two contentious restrictions on the CPU controller: process
|
||||
+granularity and the no-internal-process constraint.
|
||||
+
|
||||
+
|
||||
+ 2-1-1. Process Granularity
|
||||
+
|
||||
+ For memory, because an address space is shared between all threads
|
||||
+ of a process, the terminal consumer is a process, not a thread.
|
||||
+ Separating the threads of a single process into different memory
|
||||
+ control domains doesn't make semantical sense. cgroup v2 ensures
|
||||
+ that all controllers can agree on the same organization by requiring
|
||||
+ that threads of the same process belong to the same cgroup.
|
||||
+
|
||||
+ There are other reasons to enforce process granularity. One
|
||||
+ important one is isolating system-level management operations from
|
||||
+ in-process application operations. The cgroup interface, being a
|
||||
+ virtual filesystem, is very unfit for multiple independent
|
||||
+ operations taking place at the same time as most operations have to
|
||||
+ be multi-step and there is no way to synchronize multiple accessors.
|
||||
+ See also [5] Documentation/cgroup-v2.txt, "R-2. Thread Granularity"
|
||||
+
|
||||
+
|
||||
+ 2-1-2. No Internal Process Constraint
|
||||
+
|
||||
+ cgroup v2 does not allow processes to belong to any cgroup which has
|
||||
+ child cgroups when resource controllers are enabled on it (the
|
||||
+ notable exception being the root cgroup itself). This is because,
|
||||
+ for some resources, a resource domain (cgroup) is not directly
|
||||
+ comparable to the terminal consumer (process/task) of said resource,
|
||||
+ and so putting the two into a sibling relationship isn't meaningful.
|
||||
+
|
||||
+ - Differing Control Parameters and Capabilities
|
||||
+
|
||||
+ A cgroup controller has different resource control parameters and
|
||||
+ capabilities from a terminal consumer, be that a task or process.
|
||||
+ There are a couple cases where a cgroup control knob can be mapped
|
||||
+ to a per-task or per-process API but they are exceptions and the
|
||||
+ mappings aren't obvious even in those cases.
|
||||
+
|
||||
+ For example, task priorities (also known as nice values) set
|
||||
+ through setpriority(2) are mapped to the CPU controller
|
||||
+ "cpu.shares" values. However, how exactly the two ranges map and
|
||||
+ even the fact that they map to each other at all are not obvious.
|
||||
+
|
||||
+ The situation gets further muddled when considering other resource
|
||||
+ types and control knobs. IO priorities set through ioprio_set(2)
|
||||
+ cannot be mapped to IO controller weights and most cgroup resource
|
||||
+ control knobs including the bandwidth control knobs of the CPU
|
||||
+ controller don't have counterparts in the terminal consumers.
|
||||
+
|
||||
+ - Anonymous Resource Consumption
|
||||
+
|
||||
+ For CPU, every time slice consumed from inside a cgroup, which
|
||||
+ comprises most but not all of consumed CPU time for the cgroup,
|
||||
+ can be clearly attributed to a specific task or process. Because
|
||||
+ these two types of entities are directly comparable as consumers
|
||||
+ of CPU time, it's theoretically possible to mix tasks and cgroups
|
||||
+ on the same tree levels and let them directly compete for the time
|
||||
+ quota available to their common ancestor.
|
||||
+
|
||||
+ However, the same can't be said for resource types like memory or
|
||||
+ IO: the memory consumed by the page cache, for example, can be
|
||||
+ tracked on a per-cgroup level, but due to mismatches in lifetimes
|
||||
+ of involved objects (page cache can persist long after processes
|
||||
+ are gone), shared usages and the implementation overhead of
|
||||
+ tracking persistent state, it can no longer be attributed to
|
||||
+ individual processes after instantiation. Consequently, any IO
|
||||
+ incurred by page cache writeback can be attributed to a cgroup,
|
||||
+ but not to the individual consumers inside the cgroup.
|
||||
+
|
||||
+ For memory and IO, this makes a resource domain (cgroup) an object
|
||||
+ of a fundamentally different type than a terminal consumer
|
||||
+ (process). A process can't be a first class object in the resource
|
||||
+ distribution graph as its total resource consumption can't be
|
||||
+ described without the containing resource domain.
|
||||
+
|
||||
+ Disallowing processes in internal cgroups avoids competition between
|
||||
+ cgroups and processes which cannot be meaningfully defined for these
|
||||
+ resources. All resource control takes place among cgroups and a
|
||||
+ terminal consumer interacts with the containing cgroup the same way
|
||||
+ it would with the system without cgroup.
|
||||
+
|
||||
+ Root cgroup is exempt from this constraint, which is in line with
|
||||
+ how root cgroup is handled in general - it's excluded from cgroup
|
||||
+ resource accounting and control.
|
||||
+
|
||||
+
|
||||
+Enforcing process granularity and no internal process constraint
|
||||
+allows all controllers to be on the same footing in terms of resource
|
||||
+distribution hierarchy.
|
||||
+
|
||||
+
|
||||
+2-2. Impact on CPU Controller
|
||||
+
|
||||
+As indicated earlier, the CPU controller's resource distribution graph
|
||||
+is the simplest. Every schedulable resource consumption can be
|
||||
+attributed to a specific task. In addition, for weight based control,
|
||||
+the per-task priority set through setpriority(2) can be translated to
|
||||
+and from a per-cgroup weight. As such, the CPU controller can treat a
|
||||
+task and a cgroup symmetrically, allowing support for any tree layout
|
||||
+of cgroups and tasks. Both process granularity and the no internal
|
||||
+process constraint restrict how the CPU controller can be used.
|
||||
+
|
||||
+
|
||||
+ 2-2-1. Impact of Process Granularity
|
||||
+
|
||||
+ Process granularity prevents tasks belonging to the same process to
|
||||
+ be assigned to different cgroups. It was pointed out [6] that this
|
||||
+ excludes the valid use case of hierarchical CPU distribution within
|
||||
+ processes.
|
||||
+
|
||||
+ To address this issue, the rgroup (resource group) [7][8][9]
|
||||
+ interface, an extension of the existing setpriority(2) API, was
|
||||
+ proposed, which is in line with other programmable priority
|
||||
+ mechanisms and eliminates the risk of in-application configuration
|
||||
+ and system configuration stepping on each other's toes.
|
||||
+ Unfortunately, the proposal quickly turned into discussions around
|
||||
+ cgroup v2 design decisions [4] and no consensus could be reached.
|
||||
+
|
||||
+
|
||||
+ 2-2-2. Impact of No Internal Process Constraint
|
||||
+
|
||||
+ The no internal process constraint disallows tasks from competing
|
||||
+ directly against cgroups. Here is an excerpt from Peter Zijlstra
|
||||
+ pointing out the issue [10] - R, L and A are cgroups; t1, t2, t3 and
|
||||
+ t4 are tasks:
|
||||
+
|
||||
+
|
||||
+ R
|
||||
+ / | \
|
||||
+ t1 t2 A
|
||||
+ / \
|
||||
+ t3 t4
|
||||
+
|
||||
+
|
||||
+ Is fundamentally different from:
|
||||
+
|
||||
+
|
||||
+ R
|
||||
+ / \
|
||||
+ L A
|
||||
+ / \ / \
|
||||
+ t1 t2 t3 t4
|
||||
+
|
||||
+
|
||||
+ Because if in the first hierarchy you add a task (t5) to R, all of
|
||||
+ its A will run at 1/4th of total bandwidth where before it had
|
||||
+ 1/3rd, whereas with the second example, if you add our t5 to L, A
|
||||
+ doesn't get any less bandwidth.
|
||||
+
|
||||
+
|
||||
+ It is true that the trees are semantically different from each other
|
||||
+ and the symmetric handling of tasks and cgroups is aesthetically
|
||||
+ pleasing. However, it isn't clear what the practical usefulness of
|
||||
+ a layout with direct competition between tasks and cgroups would be,
|
||||
+ considering that number and behavior of tasks are controlled by each
|
||||
+ application, and cgroups primarily deal with system level resource
|
||||
+ distribution; changes in the number of active threads would directly
|
||||
+ impact resource distribution. Real world use cases of such layouts
|
||||
+ could not be established during the discussions.
|
||||
+
|
||||
+
|
||||
+2-3. Arguments for cgroup v2
|
||||
+
|
||||
+There are strong demands for comprehensive hierarchical resource
|
||||
+control across all major resources, and establishing a common resource
|
||||
+hierarchy is an essential step. As with most engineering decisions,
|
||||
+common resource hierarchy definition comes with its trade-offs. With
|
||||
+cgroup v2, the trade-offs are in the form of structural constraints
|
||||
+which, among others, restrict the CPU controller's space of possible
|
||||
+configurations.
|
||||
+
|
||||
+However, even with the restrictions, cgroup v2, in combination with
|
||||
+rgroup, covers most of identified real world use cases while enabling
|
||||
+new important use cases of resource control across multiple resource
|
||||
+types that were fundamentally broken previously.
|
||||
+
|
||||
+Furthermore, for resource control, treating resource domains as
|
||||
+objects of a different type from terminal consumers has important
|
||||
+advantages - it can account for resource consumptions which are not
|
||||
+tied to any specific terminal consumer, be that a task or process, and
|
||||
+allows decoupling resource distribution controls from in-application
|
||||
+APIs. Even the CPU controller may benefit from it as the kernel can
|
||||
+consume significant amount of CPU cycles in interrupt context or tasks
|
||||
+shared across multiple resource domains (e.g. softirq).
|
||||
+
|
||||
+Finally, it's important to note that enabling cgroup v2 support for
|
||||
+the CPU controller doesn't block use cases which require the features
|
||||
+which are not available on cgroup v2. Unlikely, but should anybody
|
||||
+actually rely on the CPU controller's symmetric handling of tasks and
|
||||
+cgroups, backward compatibility is and will be maintained by being
|
||||
+able to disconnect the controller from the cgroup v2 hierarchy and use
|
||||
+it standalone. This also holds for cpuset which is often used in
|
||||
+highly customized configurations which might be a poor fit for common
|
||||
+resource domains.
|
||||
+
|
||||
+The required changes are minimal, the benefits for the target use
|
||||
+cases are critical and obvious, and use cases which have to use v1 can
|
||||
+continue to do so.
|
||||
+
|
||||
+
|
||||
+3. Way Forward
|
||||
+
|
||||
+cgroup v2 primarily aims to solve the problem of comprehensive
|
||||
+hierarchical resource control across all major computing resources,
|
||||
+which is one of the core problems of modern server infrastructure
|
||||
+engineering. The trade-offs that cgroup v2 took are results of
|
||||
+pursuing that goal and gaining a better understanding of the nature of
|
||||
+resource control in the process.
|
||||
+
|
||||
+I believe that real world usages will prove cgroup v2's model right,
|
||||
+considering the crucial pieces of comprehensive resource control that
|
||||
+cannot be implemented without common resource domains. This is not to
|
||||
+say that cgroup v2 is fixed in stone and can't be updated; if there is
|
||||
+an approach which better serves both comprehensive resource control
|
||||
+and the CPU controller's flexibility, we will surely move towards
|
||||
+that. It goes without saying that discussions around such approach
|
||||
+should consider practical aspects of resource control as a whole
|
||||
+rather than absolutely focusing on a particular controller.
|
||||
+
|
||||
+Until such consensus can be reached, the CPU controller cgroup v2
|
||||
+support will be maintained out of the mainline kernel in an easily
|
||||
+accessible form. If there is anything cgroup developers can do to
|
||||
+ease the pain, please feel free to contact us on the cgroup mailing
|
||||
+list at cgroups@vger.kernel.org.
|
||||
+
|
||||
+
|
||||
+4. References
|
||||
+
|
||||
+[1] http://lkml.kernel.org/r/20160105164834.GE5995@mtj.duckdns.org
|
||||
+ [PATCH 1/2] sched: Misc preps for cgroup unified hierarchy interface
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[2] http://lkml.kernel.org/r/20160105164852.GF5995@mtj.duckdns.org
|
||||
+ [PATCH 2/2] sched: Implement interface for cgroup unified hierarchy
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[3] http://lkml.kernel.org/r/1438641689-14655-4-git-send-email-tj@kernel.org
|
||||
+ [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[4] http://lkml.kernel.org/r/20160407064549.GH3430@twins.programming.kicks-ass.net
|
||||
+ Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP
|
||||
+ Peter Zijlstra <peterz@infradead.org>
|
||||
+
|
||||
+[5] https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/cgroup-v2.txt
|
||||
+ Control Group v2
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[6] http://lkml.kernel.org/r/CAPM31RJNy3jgG=DYe6GO=wyL4BPPxwUm1f2S6YXacQmo7viFZA@mail.gmail.com
|
||||
+ Re: [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy
|
||||
+ Paul Turner <pjt@google.com>
|
||||
+
|
||||
+[7] http://lkml.kernel.org/r/20160105154503.GC5995@mtj.duckdns.org
|
||||
+ [RFD] cgroup: thread granularity support for cpu controller
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[8] http://lkml.kernel.org/r/1457710888-31182-1-git-send-email-tj@kernel.org
|
||||
+ [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[9] http://lkml.kernel.org/r/20160311160522.GA24046@htj.duckdns.org
|
||||
+ Example program for PRIO_RGRP
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[10] http://lkml.kernel.org/r/20160407082810.GN3430@twins.programming.kicks-ass.net
|
||||
+ Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource
|
||||
+ Peter Zijlstra <peterz@infradead.org>
|
784
pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.11.patch
Normal file
784
pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.11.patch
Normal file
@ -0,0 +1,784 @@
|
||||
commit 827b86ad1dd21feed4c0b99faf6059f245f7dadb
|
||||
Author: Tejun Heo <tj@kernel.org>
|
||||
Date: Fri Mar 11 07:31:23 2016 -0500
|
||||
|
||||
sched: Misc preps for cgroup unified hierarchy interface
|
||||
|
||||
Make the following changes in preparation for the cpu controller
|
||||
interface implementation for the unified hierarchy. This patch
|
||||
doesn't cause any functional differences.
|
||||
|
||||
* s/cpu_stats_show()/cpu_cfs_stats_show()/
|
||||
|
||||
* s/cpu_files/cpu_legacy_files/
|
||||
|
||||
* Separate out cpuacct_stats_read() from cpuacct_stats_show(). While
|
||||
at it, make the @val array u64 for consistency.
|
||||
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
Cc: Ingo Molnar <mingo@redhat.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Li Zefan <lizefan@huawei.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
|
||||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
||||
index 3b31fc05a0f1..a1b95e83fa87 100644
|
||||
--- a/kernel/sched/core.c
|
||||
+++ b/kernel/sched/core.c
|
||||
@@ -7174,7 +7174,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
|
||||
return ret;
|
||||
}
|
||||
|
||||
-static int cpu_stats_show(struct seq_file *sf, void *v)
|
||||
+static int cpu_cfs_stats_show(struct seq_file *sf, void *v)
|
||||
{
|
||||
struct task_group *tg = css_tg(seq_css(sf));
|
||||
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
|
||||
@@ -7214,7 +7214,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
|
||||
}
|
||||
#endif /* CONFIG_RT_GROUP_SCHED */
|
||||
|
||||
-static struct cftype cpu_files[] = {
|
||||
+static struct cftype cpu_legacy_files[] = {
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
{
|
||||
.name = "shares",
|
||||
@@ -7235,7 +7235,7 @@ static struct cftype cpu_files[] = {
|
||||
},
|
||||
{
|
||||
.name = "stat",
|
||||
- .seq_show = cpu_stats_show,
|
||||
+ .seq_show = cpu_cfs_stats_show,
|
||||
},
|
||||
#endif
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
@@ -7261,7 +7261,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
|
||||
.fork = cpu_cgroup_fork,
|
||||
.can_attach = cpu_cgroup_can_attach,
|
||||
.attach = cpu_cgroup_attach,
|
||||
- .legacy_cftypes = cpu_files,
|
||||
+ .legacy_cftypes = cpu_legacy_files,
|
||||
.early_init = true,
|
||||
};
|
||||
|
||||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
|
||||
index f95ab29a45d0..6151c23f722f 100644
|
||||
--- a/kernel/sched/cpuacct.c
|
||||
+++ b/kernel/sched/cpuacct.c
|
||||
@@ -276,26 +276,33 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static int cpuacct_stats_show(struct seq_file *sf, void *v)
|
||||
+static void cpuacct_stats_read(struct cpuacct *ca,
|
||||
+ u64 (*val)[CPUACCT_STAT_NSTATS])
|
||||
{
|
||||
- struct cpuacct *ca = css_ca(seq_css(sf));
|
||||
- s64 val[CPUACCT_STAT_NSTATS];
|
||||
int cpu;
|
||||
- int stat;
|
||||
|
||||
- memset(val, 0, sizeof(val));
|
||||
+ memset(val, 0, sizeof(*val));
|
||||
+
|
||||
for_each_possible_cpu(cpu) {
|
||||
u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
|
||||
|
||||
- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
|
||||
- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
|
||||
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
|
||||
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
|
||||
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
|
||||
+ (*val)[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
|
||||
+ (*val)[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
|
||||
+ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
|
||||
+ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
|
||||
+ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
|
||||
}
|
||||
+}
|
||||
+
|
||||
+static int cpuacct_stats_show(struct seq_file *sf, void *v)
|
||||
+{
|
||||
+ u64 val[CPUACCT_STAT_NSTATS];
|
||||
+ int stat;
|
||||
+
|
||||
+ cpuacct_stats_read(css_ca(seq_css(sf)), &val);
|
||||
|
||||
for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
|
||||
- seq_printf(sf, "%s %lld\n",
|
||||
+ seq_printf(sf, "%s %llu\n",
|
||||
cpuacct_stat_desc[stat],
|
||||
(long long)nsec_to_clock_t(val[stat]));
|
||||
}
|
||||
|
||||
commit fdb64d002b3a223ce4bb11aa4448a42050470052
|
||||
Author: Tejun Heo <tj@kernel.org>
|
||||
Date: Fri Mar 11 07:31:23 2016 -0500
|
||||
|
||||
sched: Implement interface for cgroup unified hierarchy
|
||||
|
||||
While the cpu controller doesn't have any functional problems, there
|
||||
are a couple interface issues which can be addressed in the v2
|
||||
interface.
|
||||
|
||||
* cpuacct being a separate controller. This separation is artificial
|
||||
and rather pointless as demonstrated by most use cases co-mounting
|
||||
the two controllers. It also forces certain information to be
|
||||
accounted twice.
|
||||
|
||||
* Use of different time units. Writable control knobs use
|
||||
microseconds, some stat fields use nanoseconds while other cpuacct
|
||||
stat fields use centiseconds.
|
||||
|
||||
* Control knobs which can't be used in the root cgroup still show up
|
||||
in the root.
|
||||
|
||||
* Control knob names and semantics aren't consistent with other
|
||||
controllers.
|
||||
|
||||
This patchset implements cpu controller's interface on the unified
|
||||
hierarchy which adheres to the controller file conventions described
|
||||
in Documentation/cgroups/unified-hierarchy.txt. Overall, the
|
||||
following changes are made.
|
||||
|
||||
* cpuacct is implicitly enabled and disabled by cpu and its information
|
||||
is reported through "cpu.stat" which now uses microseconds for all
|
||||
time durations. All time duration fields now have "_usec" appended
|
||||
to them for clarity. While this doesn't solve the double accounting
|
||||
immediately, once majority of users switch to v2, cpu can directly
|
||||
account and report the relevant stats and cpuacct can be disabled on
|
||||
the unified hierarchy.
|
||||
|
||||
Note that cpuacct.usage_percpu is currently not included in
|
||||
"cpu.stat". If this information is actually called for, it can be
|
||||
added later.
|
||||
|
||||
* "cpu.shares" is replaced with "cpu.weight" and operates on the
|
||||
standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000).
|
||||
The weight is scaled to scheduler weight so that 100 maps to 1024
|
||||
and the ratio relationship is preserved - if weight is W and its
|
||||
scaled value is S, W / 100 == S / 1024. While the mapped range is a
|
||||
bit smaller than the original scheduler weight range, the dead zones
|
||||
on both sides are relatively small and covers wider range than the
|
||||
nice value mappings. This file doesn't make sense in the root
|
||||
cgroup and isn't created on root.
|
||||
|
||||
* "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max"
|
||||
which contains both quota and period.
|
||||
|
||||
* "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by
|
||||
"cpu.rt.max" which contains both runtime and period.
|
||||
|
||||
v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for
|
||||
CFS bandwidth stats and also using raw division for u64. Use
|
||||
CONFIG_CFS_BANDWIDTH and do_div() instead.
|
||||
|
||||
The semantics of "cpu.rt.max" is not fully decided yet. Dropped
|
||||
for now.
|
||||
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
Cc: Ingo Molnar <mingo@redhat.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Li Zefan <lizefan@huawei.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
|
||||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
||||
index a1b95e83fa87..f01d56e58a1b 100644
|
||||
--- a/kernel/sched/core.c
|
||||
+++ b/kernel/sched/core.c
|
||||
@@ -7253,6 +7253,139 @@ static struct cftype cpu_legacy_files[] = {
|
||||
{ } /* Terminate */
|
||||
};
|
||||
|
||||
+static int cpu_stats_show(struct seq_file *sf, void *v)
|
||||
+{
|
||||
+ cpuacct_cpu_stats_show(sf);
|
||||
+
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+ {
|
||||
+ struct task_group *tg = css_tg(seq_css(sf));
|
||||
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
|
||||
+ u64 throttled_usec;
|
||||
+
|
||||
+ throttled_usec = cfs_b->throttled_time;
|
||||
+ do_div(throttled_usec, NSEC_PER_USEC);
|
||||
+
|
||||
+ seq_printf(sf, "nr_periods %d\n"
|
||||
+ "nr_throttled %d\n"
|
||||
+ "throttled_usec %llu\n",
|
||||
+ cfs_b->nr_periods, cfs_b->nr_throttled,
|
||||
+ throttled_usec);
|
||||
+ }
|
||||
+#endif
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
|
||||
+ struct cftype *cft)
|
||||
+{
|
||||
+ struct task_group *tg = css_tg(css);
|
||||
+ u64 weight = scale_load_down(tg->shares);
|
||||
+
|
||||
+ return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
|
||||
+}
|
||||
+
|
||||
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
|
||||
+ struct cftype *cftype, u64 weight)
|
||||
+{
|
||||
+ /*
|
||||
+ * cgroup weight knobs should use the common MIN, DFL and MAX
|
||||
+ * values which are 1, 100 and 10000 respectively. While it loses
|
||||
+ * a bit of range on both ends, it maps pretty well onto the shares
|
||||
+ * value used by scheduler and the round-trip conversions preserve
|
||||
+ * the original value over the entire range.
|
||||
+ */
|
||||
+ if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
|
||||
+ return -ERANGE;
|
||||
+
|
||||
+ weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
|
||||
+
|
||||
+ return sched_group_set_shares(css_tg(css), scale_load(weight));
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
|
||||
+ long period, long quota)
|
||||
+{
|
||||
+ if (quota < 0)
|
||||
+ seq_puts(sf, "max");
|
||||
+ else
|
||||
+ seq_printf(sf, "%ld", quota);
|
||||
+
|
||||
+ seq_printf(sf, " %ld\n", period);
|
||||
+}
|
||||
+
|
||||
+/* caller should put the current value in *@periodp before calling */
|
||||
+static int __maybe_unused cpu_period_quota_parse(char *buf,
|
||||
+ u64 *periodp, u64 *quotap)
|
||||
+{
|
||||
+ char tok[21]; /* U64_MAX */
|
||||
+
|
||||
+ if (!sscanf(buf, "%s %llu", tok, periodp))
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ *periodp *= NSEC_PER_USEC;
|
||||
+
|
||||
+ if (sscanf(tok, "%llu", quotap))
|
||||
+ *quotap *= NSEC_PER_USEC;
|
||||
+ else if (!strcmp(tok, "max"))
|
||||
+ *quotap = RUNTIME_INF;
|
||||
+ else
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+static int cpu_max_show(struct seq_file *sf, void *v)
|
||||
+{
|
||||
+ struct task_group *tg = css_tg(seq_css(sf));
|
||||
+
|
||||
+ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
|
||||
+ char *buf, size_t nbytes, loff_t off)
|
||||
+{
|
||||
+ struct task_group *tg = css_tg(of_css(of));
|
||||
+ u64 period = tg_get_cfs_period(tg);
|
||||
+ u64 quota;
|
||||
+ int ret;
|
||||
+
|
||||
+ ret = cpu_period_quota_parse(buf, &period, &quota);
|
||||
+ if (!ret)
|
||||
+ ret = tg_set_cfs_bandwidth(tg, period, quota);
|
||||
+ return ret ?: nbytes;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+static struct cftype cpu_files[] = {
|
||||
+ {
|
||||
+ .name = "stat",
|
||||
+ .flags = CFTYPE_NOT_ON_ROOT,
|
||||
+ .seq_show = cpu_stats_show,
|
||||
+ },
|
||||
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
+ {
|
||||
+ .name = "weight",
|
||||
+ .flags = CFTYPE_NOT_ON_ROOT,
|
||||
+ .read_u64 = cpu_weight_read_u64,
|
||||
+ .write_u64 = cpu_weight_write_u64,
|
||||
+ },
|
||||
+#endif
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+ {
|
||||
+ .name = "max",
|
||||
+ .flags = CFTYPE_NOT_ON_ROOT,
|
||||
+ .seq_show = cpu_max_show,
|
||||
+ .write = cpu_max_write,
|
||||
+ },
|
||||
+#endif
|
||||
+ { } /* terminate */
|
||||
+};
|
||||
+
|
||||
struct cgroup_subsys cpu_cgrp_subsys = {
|
||||
.css_alloc = cpu_cgroup_css_alloc,
|
||||
.css_online = cpu_cgroup_css_online,
|
||||
@@ -7262,7 +7395,15 @@ struct cgroup_subsys cpu_cgrp_subsys = {
|
||||
.can_attach = cpu_cgroup_can_attach,
|
||||
.attach = cpu_cgroup_attach,
|
||||
.legacy_cftypes = cpu_legacy_files,
|
||||
+ .dfl_cftypes = cpu_files,
|
||||
.early_init = true,
|
||||
+#ifdef CONFIG_CGROUP_CPUACCT
|
||||
+ /*
|
||||
+ * cpuacct is enabled together with cpu on the unified hierarchy
|
||||
+ * and its stats are reported through "cpu.stat".
|
||||
+ */
|
||||
+ .depends_on = 1 << cpuacct_cgrp_id,
|
||||
+#endif
|
||||
};
|
||||
|
||||
#endif /* CONFIG_CGROUP_SCHED */
|
||||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
|
||||
index 6151c23f722f..fc1cf13c3af1 100644
|
||||
--- a/kernel/sched/cpuacct.c
|
||||
+++ b/kernel/sched/cpuacct.c
|
||||
@@ -347,6 +347,31 @@ static struct cftype files[] = {
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
+/* used to print cpuacct stats in cpu.stat on the unified hierarchy */
|
||||
+void cpuacct_cpu_stats_show(struct seq_file *sf)
|
||||
+{
|
||||
+ struct cgroup_subsys_state *css;
|
||||
+ u64 usage, val[CPUACCT_STAT_NSTATS];
|
||||
+
|
||||
+ css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys);
|
||||
+
|
||||
+ usage = cpuusage_read(css, seq_cft(sf));
|
||||
+ cpuacct_stats_read(css_ca(css), &val);
|
||||
+
|
||||
+ val[CPUACCT_STAT_USER] *= TICK_NSEC;
|
||||
+ val[CPUACCT_STAT_SYSTEM] *= TICK_NSEC;
|
||||
+ do_div(usage, NSEC_PER_USEC);
|
||||
+ do_div(val[CPUACCT_STAT_USER], NSEC_PER_USEC);
|
||||
+ do_div(val[CPUACCT_STAT_SYSTEM], NSEC_PER_USEC);
|
||||
+
|
||||
+ seq_printf(sf, "usage_usec %llu\n"
|
||||
+ "user_usec %llu\n"
|
||||
+ "system_usec %llu\n",
|
||||
+ usage, val[CPUACCT_STAT_USER], val[CPUACCT_STAT_SYSTEM]);
|
||||
+
|
||||
+ css_put(css);
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* charge this task's execution time to its accounting group.
|
||||
*
|
||||
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
|
||||
index ba72807c73d4..ddf7af466d35 100644
|
||||
--- a/kernel/sched/cpuacct.h
|
||||
+++ b/kernel/sched/cpuacct.h
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
|
||||
extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
|
||||
+extern void cpuacct_cpu_stats_show(struct seq_file *sf);
|
||||
|
||||
#else
|
||||
|
||||
@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
|
||||
{
|
||||
}
|
||||
|
||||
+static inline void cpuacct_cpu_stats_show(struct seq_file *sf)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
#endif
|
||||
|
||||
commit 8dde150866b8c433216105c50b7e889d5242d583
|
||||
Author: Tejun Heo <tj@kernel.org>
|
||||
Date: Fri Aug 5 12:41:01 2016 -0400
|
||||
|
||||
cgroup: add documentation regarding CPU controller cgroup v2 support
|
||||
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
|
||||
diff --git a/Documentation/cgroup-v2-cpu.txt b/Documentation/cgroup-v2-cpu.txt
|
||||
new file mode 100644
|
||||
index 000000000000..1ed7032d4472
|
||||
--- /dev/null
|
||||
+++ b/Documentation/cgroup-v2-cpu.txt
|
||||
@@ -0,0 +1,368 @@
|
||||
+
|
||||
+
|
||||
+CPU Controller on Control Group v2
|
||||
+
|
||||
+August, 2016 Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+
|
||||
+While most controllers have support for cgroup v2 now, the CPU
|
||||
+controller support is not upstream yet due to objections from the
|
||||
+scheduler maintainers on the basic designs of cgroup v2. This
|
||||
+document explains the current situation as well as an interim
|
||||
+solution, and details the disagreements and arguments. The latest
|
||||
+version of this document can be found at the following URL.
|
||||
+
|
||||
+ https://git.kernel.org/cgit/linux/kernel/git/tj/cgroup.git/tree/Documentation/cgroup-v2-cpu.txt?h=cgroup-v2-cpu
|
||||
+
|
||||
+This document was posted to the linux-kernel and cgroup mailing lists.
|
||||
+Unfortunately, no consensus was reached as of Oct, 2016. The thread
|
||||
+can be found at the following URL.
|
||||
+
|
||||
+ http://lkml.kernel.org/r/20160805170752.GK2542@mtj.duckdns.org
|
||||
+
|
||||
+
|
||||
+CONTENTS
|
||||
+
|
||||
+1. Current Situation and Interim Solution
|
||||
+2. Disagreements and Arguments
|
||||
+ 2-1. Contentious Restrictions
|
||||
+ 2-1-1. Process Granularity
|
||||
+ 2-1-2. No Internal Process Constraint
|
||||
+ 2-2. Impact on CPU Controller
|
||||
+ 2-2-1. Impact of Process Granularity
|
||||
+ 2-2-2. Impact of No Internal Process Constraint
|
||||
+ 2-3. Arguments for cgroup v2
|
||||
+3. Way Forward
|
||||
+4. References
|
||||
+
|
||||
+
|
||||
+1. Current Situation and Interim Solution
|
||||
+
|
||||
+All objections from the scheduler maintainers apply to cgroup v2 core
|
||||
+design, and there are no known objections to the specifics of the CPU
|
||||
+controller cgroup v2 interface. The only blocked part is changes to
|
||||
+expose the CPU controller interface on cgroup v2, which comprises the
|
||||
+following two patches:
|
||||
+
|
||||
+ [1] sched: Misc preps for cgroup unified hierarchy interface
|
||||
+ [2] sched: Implement interface for cgroup unified hierarchy
|
||||
+
|
||||
+The necessary changes are superficial and implement the interface
|
||||
+files on cgroup v2. The combined diffstat is as follows.
|
||||
+
|
||||
+ kernel/sched/core.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++--
|
||||
+ kernel/sched/cpuacct.c | 57 ++++++++++++------
|
||||
+ kernel/sched/cpuacct.h | 5 +
|
||||
+ 3 files changed, 189 insertions(+), 22 deletions(-)
|
||||
+
|
||||
+The patches are easy to apply and forward-port. The following git
|
||||
+branch will always carry the two patches on top of the latest release
|
||||
+of the upstream kernel.
|
||||
+
|
||||
+ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu
|
||||
+
|
||||
+There also are versioned branches going back to v4.4.
|
||||
+
|
||||
+ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu-$KERNEL_VER
|
||||
+
|
||||
+While it's difficult to tell whether the CPU controller support will
|
||||
+be merged, there are crucial resource control features in cgroup v2
|
||||
+that are only possible due to the design choices that are being
|
||||
+objected to, and every effort will be made to ease enabling the CPU
|
||||
+controller cgroup v2 support out-of-tree for parties which choose to.
|
||||
+
|
||||
+
|
||||
+2. Disagreements and Arguments
|
||||
+
|
||||
+There have been several lengthy discussion threads [3][4] on LKML
|
||||
+around the structural constraints of cgroup v2. The two that affect
|
||||
+the CPU controller are process granularity and no internal process
|
||||
+constraint. Both arise primarily from the need for common resource
|
||||
+domain definition across different resources.
|
||||
+
|
||||
+The common resource domain is a powerful concept in cgroup v2 that
|
||||
+allows controllers to make basic assumptions about the structural
|
||||
+organization of processes and controllers inside the cgroup hierarchy,
|
||||
+and thus solve problems spanning multiple types of resources. The
|
||||
+prime example for this is page cache writeback: dirty page cache is
|
||||
+regulated through throttling buffered writers based on memory
|
||||
+availability, and initiating batched write outs to the disk based on
|
||||
+IO capacity. Tracking and controlling writeback inside a cgroup thus
|
||||
+requires the direct cooperation of the memory and the IO controller.
|
||||
+
|
||||
+This easily extends to other areas, such as CPU cycles consumed while
|
||||
+performing memory reclaim or IO encryption.
|
||||
+
|
||||
+
|
||||
+2-1. Contentious Restrictions
|
||||
+
|
||||
+For controllers of different resources to work together, they must
|
||||
+agree on a common organization. This uniform model across controllers
|
||||
+imposes two contentious restrictions on the CPU controller: process
|
||||
+granularity and the no-internal-process constraint.
|
||||
+
|
||||
+
|
||||
+ 2-1-1. Process Granularity
|
||||
+
|
||||
+ For memory, because an address space is shared between all threads
|
||||
+ of a process, the terminal consumer is a process, not a thread.
|
||||
+ Separating the threads of a single process into different memory
|
||||
+ control domains doesn't make semantical sense. cgroup v2 ensures
|
||||
+ that all controllers can agree on the same organization by requiring
|
||||
+ that threads of the same process belong to the same cgroup.
|
||||
+
|
||||
+ There are other reasons to enforce process granularity. One
|
||||
+ important one is isolating system-level management operations from
|
||||
+ in-process application operations. The cgroup interface, being a
|
||||
+ virtual filesystem, is very unfit for multiple independent
|
||||
+ operations taking place at the same time as most operations have to
|
||||
+ be multi-step and there is no way to synchronize multiple accessors.
|
||||
+ See also [5] Documentation/cgroup-v2.txt, "R-2. Thread Granularity"
|
||||
+
|
||||
+
|
||||
+ 2-1-2. No Internal Process Constraint
|
||||
+
|
||||
+ cgroup v2 does not allow processes to belong to any cgroup which has
|
||||
+ child cgroups when resource controllers are enabled on it (the
|
||||
+ notable exception being the root cgroup itself). This is because,
|
||||
+ for some resources, a resource domain (cgroup) is not directly
|
||||
+ comparable to the terminal consumer (process/task) of said resource,
|
||||
+ and so putting the two into a sibling relationship isn't meaningful.
|
||||
+
|
||||
+ - Differing Control Parameters and Capabilities
|
||||
+
|
||||
+ A cgroup controller has different resource control parameters and
|
||||
+ capabilities from a terminal consumer, be that a task or process.
|
||||
+ There are a couple cases where a cgroup control knob can be mapped
|
||||
+ to a per-task or per-process API but they are exceptions and the
|
||||
+ mappings aren't obvious even in those cases.
|
||||
+
|
||||
+ For example, task priorities (also known as nice values) set
|
||||
+ through setpriority(2) are mapped to the CPU controller
|
||||
+ "cpu.shares" values. However, how exactly the two ranges map and
|
||||
+ even the fact that they map to each other at all are not obvious.
|
||||
+
|
||||
+ The situation gets further muddled when considering other resource
|
||||
+ types and control knobs. IO priorities set through ioprio_set(2)
|
||||
+ cannot be mapped to IO controller weights and most cgroup resource
|
||||
+ control knobs including the bandwidth control knobs of the CPU
|
||||
+ controller don't have counterparts in the terminal consumers.
|
||||
+
|
||||
+ - Anonymous Resource Consumption
|
||||
+
|
||||
+ For CPU, every time slice consumed from inside a cgroup, which
|
||||
+ comprises most but not all of consumed CPU time for the cgroup,
|
||||
+ can be clearly attributed to a specific task or process. Because
|
||||
+ these two types of entities are directly comparable as consumers
|
||||
+ of CPU time, it's theoretically possible to mix tasks and cgroups
|
||||
+ on the same tree levels and let them directly compete for the time
|
||||
+ quota available to their common ancestor.
|
||||
+
|
||||
+ However, the same can't be said for resource types like memory or
|
||||
+ IO: the memory consumed by the page cache, for example, can be
|
||||
+ tracked on a per-cgroup level, but due to mismatches in lifetimes
|
||||
+ of involved objects (page cache can persist long after processes
|
||||
+ are gone), shared usages and the implementation overhead of
|
||||
+ tracking persistent state, it can no longer be attributed to
|
||||
+ individual processes after instantiation. Consequently, any IO
|
||||
+ incurred by page cache writeback can be attributed to a cgroup,
|
||||
+ but not to the individual consumers inside the cgroup.
|
||||
+
|
||||
+ For memory and IO, this makes a resource domain (cgroup) an object
|
||||
+ of a fundamentally different type than a terminal consumer
|
||||
+ (process). A process can't be a first class object in the resource
|
||||
+ distribution graph as its total resource consumption can't be
|
||||
+ described without the containing resource domain.
|
||||
+
|
||||
+ Disallowing processes in internal cgroups avoids competition between
|
||||
+ cgroups and processes which cannot be meaningfully defined for these
|
||||
+ resources. All resource control takes place among cgroups and a
|
||||
+ terminal consumer interacts with the containing cgroup the same way
|
||||
+ it would with the system without cgroup.
|
||||
+
|
||||
+ Root cgroup is exempt from this constraint, which is in line with
|
||||
+ how root cgroup is handled in general - it's excluded from cgroup
|
||||
+ resource accounting and control.
|
||||
+
|
||||
+
|
||||
+Enforcing process granularity and no internal process constraint
|
||||
+allows all controllers to be on the same footing in terms of resource
|
||||
+distribution hierarchy.
|
||||
+
|
||||
+
|
||||
+2-2. Impact on CPU Controller
|
||||
+
|
||||
+As indicated earlier, the CPU controller's resource distribution graph
|
||||
+is the simplest. Every schedulable resource consumption can be
|
||||
+attributed to a specific task. In addition, for weight based control,
|
||||
+the per-task priority set through setpriority(2) can be translated to
|
||||
+and from a per-cgroup weight. As such, the CPU controller can treat a
|
||||
+task and a cgroup symmetrically, allowing support for any tree layout
|
||||
+of cgroups and tasks. Both process granularity and the no internal
|
||||
+process constraint restrict how the CPU controller can be used.
|
||||
+
|
||||
+
|
||||
+ 2-2-1. Impact of Process Granularity
|
||||
+
|
||||
+ Process granularity prevents tasks belonging to the same process from
|
||||
+ being assigned to different cgroups. It was pointed out [6] that this
|
||||
+ excludes the valid use case of hierarchical CPU distribution within
|
||||
+ processes.
|
||||
+
|
||||
+ To address this issue, the rgroup (resource group) [7][8][9]
|
||||
+ interface, an extension of the existing setpriority(2) API, was
|
||||
+ proposed, which is in line with other programmable priority
|
||||
+ mechanisms and eliminates the risk of in-application configuration
|
||||
+ and system configuration stepping on each other's toes.
|
||||
+ Unfortunately, the proposal quickly turned into discussions around
|
||||
+ cgroup v2 design decisions [4] and no consensus could be reached.
|
||||
+
|
||||
+
|
||||
+ 2-2-2. Impact of No Internal Process Constraint
|
||||
+
|
||||
+ The no internal process constraint disallows tasks from competing
|
||||
+ directly against cgroups. Here is an excerpt from Peter Zijlstra
|
||||
+ pointing out the issue [10] - R, L and A are cgroups; t1, t2, t3 and
|
||||
+ t4 are tasks:
|
||||
+
|
||||
+
|
||||
+ R
|
||||
+ / | \
|
||||
+ t1 t2 A
|
||||
+ / \
|
||||
+ t3 t4
|
||||
+
|
||||
+
|
||||
+ Is fundamentally different from:
|
||||
+
|
||||
+
|
||||
+ R
|
||||
+ / \
|
||||
+ L A
|
||||
+ / \ / \
|
||||
+ t1 t2 t3 t4
|
||||
+
|
||||
+
|
||||
+ Because if in the first hierarchy you add a task (t5) to R, all of
|
||||
+ its A will run at 1/4th of total bandwidth where before it had
|
||||
+ 1/3rd, whereas with the second example, if you add our t5 to L, A
|
||||
+ doesn't get any less bandwidth.
|
||||
+
|
||||
+
|
||||
+ It is true that the trees are semantically different from each other
|
||||
+ and the symmetric handling of tasks and cgroups is aesthetically
|
||||
+ pleasing. However, it isn't clear what the practical usefulness of
|
||||
+ a layout with direct competition between tasks and cgroups would be,
|
||||
+ considering that number and behavior of tasks are controlled by each
|
||||
+ application, and cgroups primarily deal with system level resource
|
||||
+ distribution; changes in the number of active threads would directly
|
||||
+ impact resource distribution. Real world use cases of such layouts
|
||||
+ could not be established during the discussions.
|
||||
+
|
||||
+
|
||||
+2-3. Arguments for cgroup v2
|
||||
+
|
||||
+There are strong demands for comprehensive hierarchical resource
|
||||
+control across all major resources, and establishing a common resource
|
||||
+hierarchy is an essential step. As with most engineering decisions,
|
||||
+common resource hierarchy definition comes with its trade-offs. With
|
||||
+cgroup v2, the trade-offs are in the form of structural constraints
|
||||
+which, among others, restrict the CPU controller's space of possible
|
||||
+configurations.
|
||||
+
|
||||
+However, even with the restrictions, cgroup v2, in combination with
|
||||
+rgroup, covers most of identified real world use cases while enabling
|
||||
+new important use cases of resource control across multiple resource
|
||||
+types that were fundamentally broken previously.
|
||||
+
|
||||
+Furthermore, for resource control, treating resource domains as
|
||||
+objects of a different type from terminal consumers has important
|
||||
+advantages - it can account for resource consumptions which are not
|
||||
+tied to any specific terminal consumer, be that a task or process, and
|
||||
+allows decoupling resource distribution controls from in-application
|
||||
+APIs. Even the CPU controller may benefit from it as the kernel can
|
||||
+consume a significant amount of CPU cycles in interrupt context or tasks
|
||||
+shared across multiple resource domains (e.g. softirq).
|
||||
+
|
||||
+Finally, it's important to note that enabling cgroup v2 support for
|
||||
+the CPU controller doesn't block use cases which require the features
|
||||
+which are not available on cgroup v2. Unlikely, but should anybody
|
||||
+actually rely on the CPU controller's symmetric handling of tasks and
|
||||
+cgroups, backward compatibility is and will be maintained by being
|
||||
+able to disconnect the controller from the cgroup v2 hierarchy and use
|
||||
+it standalone. This also holds for cpuset which is often used in
|
||||
+highly customized configurations which might be a poor fit for common
|
||||
+resource domains.
|
||||
+
|
||||
+The required changes are minimal, the benefits for the target use
|
||||
+cases are critical and obvious, and use cases which have to use v1 can
|
||||
+continue to do so.
|
||||
+
|
||||
+
|
||||
+3. Way Forward
|
||||
+
|
||||
+cgroup v2 primarily aims to solve the problem of comprehensive
|
||||
+hierarchical resource control across all major computing resources,
|
||||
+which is one of the core problems of modern server infrastructure
|
||||
+engineering. The trade-offs that cgroup v2 took are results of
|
||||
+pursuing that goal and gaining a better understanding of the nature of
|
||||
+resource control in the process.
|
||||
+
|
||||
+I believe that real world usages will prove cgroup v2's model right,
|
||||
+considering the crucial pieces of comprehensive resource control that
|
||||
+cannot be implemented without common resource domains. This is not to
|
||||
+say that cgroup v2 is fixed in stone and can't be updated; if there is
|
||||
+an approach which better serves both comprehensive resource control
|
||||
+and the CPU controller's flexibility, we will surely move towards
|
||||
+that. It goes without saying that discussions around such an approach
|
||||
+should consider practical aspects of resource control as a whole
|
||||
+rather than absolutely focusing on a particular controller.
|
||||
+
|
||||
+Until such consensus can be reached, the CPU controller cgroup v2
|
||||
+support will be maintained out of the mainline kernel in an easily
|
||||
+accessible form. If there is anything cgroup developers can do to
|
||||
+ease the pain, please feel free to contact us on the cgroup mailing
|
||||
+list at cgroups@vger.kernel.org.
|
||||
+
|
||||
+
|
||||
+4. References
|
||||
+
|
||||
+[1] http://lkml.kernel.org/r/20160105164834.GE5995@mtj.duckdns.org
|
||||
+ [PATCH 1/2] sched: Misc preps for cgroup unified hierarchy interface
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[2] http://lkml.kernel.org/r/20160105164852.GF5995@mtj.duckdns.org
|
||||
+ [PATCH 2/2] sched: Implement interface for cgroup unified hierarchy
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[3] http://lkml.kernel.org/r/1438641689-14655-4-git-send-email-tj@kernel.org
|
||||
+ [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[4] http://lkml.kernel.org/r/20160407064549.GH3430@twins.programming.kicks-ass.net
|
||||
+ Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP
|
||||
+ Peter Zijlstra <peterz@infradead.org>
|
||||
+
|
||||
+[5] https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/cgroup-v2.txt
|
||||
+ Control Group v2
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[6] http://lkml.kernel.org/r/CAPM31RJNy3jgG=DYe6GO=wyL4BPPxwUm1f2S6YXacQmo7viFZA@mail.gmail.com
|
||||
+ Re: [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy
|
||||
+ Paul Turner <pjt@google.com>
|
||||
+
|
||||
+[7] http://lkml.kernel.org/r/20160105154503.GC5995@mtj.duckdns.org
|
||||
+ [RFD] cgroup: thread granularity support for cpu controller
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[8] http://lkml.kernel.org/r/1457710888-31182-1-git-send-email-tj@kernel.org
|
||||
+ [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[9] http://lkml.kernel.org/r/20160311160522.GA24046@htj.duckdns.org
|
||||
+ Example program for PRIO_RGRP
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[10] http://lkml.kernel.org/r/20160407082810.GN3430@twins.programming.kicks-ass.net
|
||||
+ Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource
|
||||
+ Peter Zijlstra <peterz@infradead.org>
|
@ -1,407 +0,0 @@
|
||||
commit 6426c5b02d4aab620219b08a5d97ad8851b56b0d
|
||||
Author: Tejun Heo <tj@kernel.org>
|
||||
Date: Fri Mar 11 07:31:23 2016 -0500
|
||||
|
||||
sched: Misc preps for cgroup unified hierarchy interface
|
||||
|
||||
Make the following changes in preparation for the cpu controller
|
||||
interface implementation for the unified hierarchy. This patch
|
||||
doesn't cause any functional differences.
|
||||
|
||||
* s/cpu_stats_show()/cpu_cfs_stats_show()/
|
||||
|
||||
* s/cpu_files/cpu_legacy_files/
|
||||
|
||||
* Separate out cpuacct_stats_read() from cpuacct_stats_show(). While
|
||||
at it, remove pointless cpuacct_stat_desc[] array.
|
||||
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
Cc: Ingo Molnar <mingo@redhat.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Li Zefan <lizefan@huawei.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
|
||||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
||||
index d1f7149..0d34f35 100644
|
||||
--- a/kernel/sched/core.c
|
||||
+++ b/kernel/sched/core.c
|
||||
@@ -8371,7 +8371,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
|
||||
return ret;
|
||||
}
|
||||
|
||||
-static int cpu_stats_show(struct seq_file *sf, void *v)
|
||||
+static int cpu_cfs_stats_show(struct seq_file *sf, void *v)
|
||||
{
|
||||
struct task_group *tg = css_tg(seq_css(sf));
|
||||
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
|
||||
@@ -8411,7 +8411,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
|
||||
}
|
||||
#endif /* CONFIG_RT_GROUP_SCHED */
|
||||
|
||||
-static struct cftype cpu_files[] = {
|
||||
+static struct cftype cpu_legacy_files[] = {
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
{
|
||||
.name = "shares",
|
||||
@@ -8432,7 +8432,7 @@ static struct cftype cpu_files[] = {
|
||||
},
|
||||
{
|
||||
.name = "stat",
|
||||
- .seq_show = cpu_stats_show,
|
||||
+ .seq_show = cpu_cfs_stats_show,
|
||||
},
|
||||
#endif
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
@@ -8457,7 +8457,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
|
||||
.fork = cpu_cgroup_fork,
|
||||
.can_attach = cpu_cgroup_can_attach,
|
||||
.attach = cpu_cgroup_attach,
|
||||
- .legacy_cftypes = cpu_files,
|
||||
+ .legacy_cftypes = cpu_legacy_files,
|
||||
.early_init = true,
|
||||
};
|
||||
|
||||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
|
||||
index 4a81120..b99030a 100644
|
||||
--- a/kernel/sched/cpuacct.c
|
||||
+++ b/kernel/sched/cpuacct.c
|
||||
@@ -180,36 +180,33 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static const char * const cpuacct_stat_desc[] = {
|
||||
- [CPUACCT_STAT_USER] = "user",
|
||||
- [CPUACCT_STAT_SYSTEM] = "system",
|
||||
-};
|
||||
-
|
||||
-static int cpuacct_stats_show(struct seq_file *sf, void *v)
|
||||
+static void cpuacct_stats_read(struct cpuacct *ca, u64 *userp, u64 *sysp)
|
||||
{
|
||||
- struct cpuacct *ca = css_ca(seq_css(sf));
|
||||
int cpu;
|
||||
- s64 val = 0;
|
||||
|
||||
+ *userp = 0;
|
||||
for_each_online_cpu(cpu) {
|
||||
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
|
||||
- val += kcpustat->cpustat[CPUTIME_USER];
|
||||
- val += kcpustat->cpustat[CPUTIME_NICE];
|
||||
+ *userp += kcpustat->cpustat[CPUTIME_USER];
|
||||
+ *userp += kcpustat->cpustat[CPUTIME_NICE];
|
||||
}
|
||||
- val = cputime64_to_clock_t(val);
|
||||
- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
|
||||
|
||||
- val = 0;
|
||||
+ *sysp = 0;
|
||||
for_each_online_cpu(cpu) {
|
||||
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
|
||||
- val += kcpustat->cpustat[CPUTIME_SYSTEM];
|
||||
- val += kcpustat->cpustat[CPUTIME_IRQ];
|
||||
- val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
|
||||
+ *sysp += kcpustat->cpustat[CPUTIME_SYSTEM];
|
||||
+ *sysp += kcpustat->cpustat[CPUTIME_IRQ];
|
||||
+ *sysp += kcpustat->cpustat[CPUTIME_SOFTIRQ];
|
||||
}
|
||||
+}
|
||||
|
||||
- val = cputime64_to_clock_t(val);
|
||||
- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
|
||||
+static int cpuacct_stats_show(struct seq_file *sf, void *v)
|
||||
+{
|
||||
+ cputime64_t user, sys;
|
||||
|
||||
+ cpuacct_stats_read(css_ca(seq_css(sf)), &user, &sys);
|
||||
+ seq_printf(sf, "user %lld\n", cputime64_to_clock_t(user));
|
||||
+ seq_printf(sf, "system %lld\n", cputime64_to_clock_t(sys));
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
commit d2a799f795a5d5a69c9dc365c34f926e0649f840
|
||||
Author: Tejun Heo <tj@kernel.org>
|
||||
Date: Fri Mar 11 07:31:23 2016 -0500
|
||||
|
||||
sched: Implement interface for cgroup unified hierarchy
|
||||
|
||||
While the cpu controller doesn't have any functional problems, there
|
||||
are a couple interface issues which can be addressed in the v2
|
||||
interface.
|
||||
|
||||
* cpuacct being a separate controller. This separation is artificial
|
||||
and rather pointless as demonstrated by most use cases co-mounting
|
||||
the two controllers. It also forces certain information to be
|
||||
accounted twice.
|
||||
|
||||
* Use of different time units. Writable control knobs use
|
||||
microseconds, some stat fields use nanoseconds while other cpuacct
|
||||
stat fields use centiseconds.
|
||||
|
||||
* Control knobs which can't be used in the root cgroup still show up
|
||||
in the root.
|
||||
|
||||
* Control knob names and semantics aren't consistent with other
|
||||
controllers.
|
||||
|
||||
This patchset implements cpu controller's interface on the unified
|
||||
hierarchy which adheres to the controller file conventions described
|
||||
in Documentation/cgroups/unified-hierarchy.txt. Overall, the
|
||||
following changes are made.
|
||||
|
||||
* cpuacct is implicitly enabled and disabled by cpu and its information
|
||||
is reported through "cpu.stat" which now uses microseconds for all
|
||||
time durations. All time duration fields now have "_usec" appended
|
||||
to them for clarity. While this doesn't solve the double accounting
|
||||
immediately, once majority of users switch to v2, cpu can directly
|
||||
account and report the relevant stats and cpuacct can be disabled on
|
||||
the unified hierarchy.
|
||||
|
||||
Note that cpuacct.usage_percpu is currently not included in
|
||||
"cpu.stat". If this information is actually called for, it can be
|
||||
added later.
|
||||
|
||||
* "cpu.shares" is replaced with "cpu.weight" and operates on the
|
||||
standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000).
|
||||
The weight is scaled to scheduler weight so that 100 maps to 1024
|
||||
and the ratio relationship is preserved - if weight is W and its
|
||||
scaled value is S, W / 100 == S / 1024. While the mapped range is a
|
||||
bit smaller than the original scheduler weight range, the dead zones
|
||||
on both sides are relatively small and covers wider range than the
|
||||
nice value mappings. This file doesn't make sense in the root
|
||||
cgroup and isn't created on root.
|
||||
|
||||
* "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max"
|
||||
which contains both quota and period.
|
||||
|
||||
* "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by
|
||||
"cpu.rt.max" which contains both runtime and period.
|
||||
|
||||
v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for
|
||||
CFS bandwidth stats and also using raw division for u64. Use
|
||||
CONFIG_CFS_BANDWIDTH and do_div() instead.
|
||||
|
||||
The semantics of "cpu.rt.max" is not fully decided yet. Dropped
|
||||
for now.
|
||||
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
Cc: Ingo Molnar <mingo@redhat.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Li Zefan <lizefan@huawei.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
|
||||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
||||
index 0d34f35..5990efc 100644
|
||||
--- a/kernel/sched/core.c
|
||||
+++ b/kernel/sched/core.c
|
||||
@@ -8450,6 +8450,139 @@ static struct cftype cpu_legacy_files[] = {
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
+static int cpu_stats_show(struct seq_file *sf, void *v)
|
||||
+{
|
||||
+ cpuacct_cpu_stats_show(sf);
|
||||
+
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+ {
|
||||
+ struct task_group *tg = css_tg(seq_css(sf));
|
||||
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
|
||||
+ u64 throttled_usec;
|
||||
+
|
||||
+ throttled_usec = cfs_b->throttled_time;
|
||||
+ do_div(throttled_usec, NSEC_PER_USEC);
|
||||
+
|
||||
+ seq_printf(sf, "nr_periods %d\n"
|
||||
+ "nr_throttled %d\n"
|
||||
+ "throttled_usec %llu\n",
|
||||
+ cfs_b->nr_periods, cfs_b->nr_throttled,
|
||||
+ throttled_usec);
|
||||
+ }
|
||||
+#endif
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
|
||||
+ struct cftype *cft)
|
||||
+{
|
||||
+ struct task_group *tg = css_tg(css);
|
||||
+ u64 weight = scale_load_down(tg->shares);
|
||||
+
|
||||
+ return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
|
||||
+}
|
||||
+
|
||||
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
|
||||
+ struct cftype *cftype, u64 weight)
|
||||
+{
|
||||
+ /*
|
||||
+ * cgroup weight knobs should use the common MIN, DFL and MAX
|
||||
+ * values which are 1, 100 and 10000 respectively. While it loses
|
||||
+ * a bit of range on both ends, it maps pretty well onto the shares
|
||||
+ * value used by scheduler and the round-trip conversions preserve
|
||||
+ * the original value over the entire range.
|
||||
+ */
|
||||
+ if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
|
||||
+ return -ERANGE;
|
||||
+
|
||||
+ weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
|
||||
+
|
||||
+ return sched_group_set_shares(css_tg(css), scale_load(weight));
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
|
||||
+ long period, long quota)
|
||||
+{
|
||||
+ if (quota < 0)
|
||||
+ seq_puts(sf, "max");
|
||||
+ else
|
||||
+ seq_printf(sf, "%ld", quota);
|
||||
+
|
||||
+ seq_printf(sf, " %ld\n", period);
|
||||
+}
|
||||
+
|
||||
+/* caller should put the current value in *@periodp before calling */
|
||||
+static int __maybe_unused cpu_period_quota_parse(char *buf,
|
||||
+ u64 *periodp, u64 *quotap)
|
||||
+{
|
||||
+ char tok[21]; /* U64_MAX */
|
||||
+
|
||||
+ if (!sscanf(buf, "%s %llu", tok, periodp))
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ *periodp *= NSEC_PER_USEC;
|
||||
+
|
||||
+ if (sscanf(tok, "%llu", quotap))
|
||||
+ *quotap *= NSEC_PER_USEC;
|
||||
+ else if (!strcmp(tok, "max"))
|
||||
+ *quotap = RUNTIME_INF;
|
||||
+ else
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+static int cpu_max_show(struct seq_file *sf, void *v)
|
||||
+{
|
||||
+ struct task_group *tg = css_tg(seq_css(sf));
|
||||
+
|
||||
+ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
|
||||
+ char *buf, size_t nbytes, loff_t off)
|
||||
+{
|
||||
+ struct task_group *tg = css_tg(of_css(of));
|
||||
+ u64 period = tg_get_cfs_period(tg);
|
||||
+ u64 quota;
|
||||
+ int ret;
|
||||
+
|
||||
+ ret = cpu_period_quota_parse(buf, &period, &quota);
|
||||
+ if (!ret)
|
||||
+ ret = tg_set_cfs_bandwidth(tg, period, quota);
|
||||
+ return ret ?: nbytes;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+static struct cftype cpu_files[] = {
|
||||
+ {
|
||||
+ .name = "stat",
|
||||
+ .flags = CFTYPE_NOT_ON_ROOT,
|
||||
+ .seq_show = cpu_stats_show,
|
||||
+ },
|
||||
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
+ {
|
||||
+ .name = "weight",
|
||||
+ .flags = CFTYPE_NOT_ON_ROOT,
|
||||
+ .read_u64 = cpu_weight_read_u64,
|
||||
+ .write_u64 = cpu_weight_write_u64,
|
||||
+ },
|
||||
+#endif
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+ {
|
||||
+ .name = "max",
|
||||
+ .flags = CFTYPE_NOT_ON_ROOT,
|
||||
+ .seq_show = cpu_max_show,
|
||||
+ .write = cpu_max_write,
|
||||
+ },
|
||||
+#endif
|
||||
+ { } /* terminate */
|
||||
+};
|
||||
+
|
||||
struct cgroup_subsys cpu_cgrp_subsys = {
|
||||
.css_alloc = cpu_cgroup_css_alloc,
|
||||
.css_released = cpu_cgroup_css_released,
|
||||
@@ -8458,7 +8591,15 @@ struct cgroup_subsys cpu_cgrp_subsys = {
|
||||
.can_attach = cpu_cgroup_can_attach,
|
||||
.attach = cpu_cgroup_attach,
|
||||
.legacy_cftypes = cpu_legacy_files,
|
||||
+ .dfl_cftypes = cpu_files,
|
||||
.early_init = true,
|
||||
+#ifdef CONFIG_CGROUP_CPUACCT
|
||||
+ /*
|
||||
+ * cpuacct is enabled together with cpu on the unified hierarchy
|
||||
+ * and its stats are reported through "cpu.stat".
|
||||
+ */
|
||||
+ .depends_on = 1 << cpuacct_cgrp_id,
|
||||
+#endif
|
||||
};
|
||||
|
||||
#endif /* CONFIG_CGROUP_SCHED */
|
||||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
|
||||
index b99030a..a1a5a4b 100644
|
||||
--- a/kernel/sched/cpuacct.c
|
||||
+++ b/kernel/sched/cpuacct.c
|
||||
@@ -227,6 +227,30 @@ static struct cftype files[] = {
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
+/* used to print cpuacct stats in cpu.stat on the unified hierarchy */
|
||||
+void cpuacct_cpu_stats_show(struct seq_file *sf)
|
||||
+{
|
||||
+ struct cgroup_subsys_state *css;
|
||||
+ u64 usage, user, sys;
|
||||
+
|
||||
+ css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys);
|
||||
+
|
||||
+ usage = cpuusage_read(css, seq_cft(sf));
|
||||
+ cpuacct_stats_read(css_ca(css), &user, &sys);
|
||||
+
|
||||
+ user *= TICK_NSEC;
|
||||
+ sys *= TICK_NSEC;
|
||||
+ do_div(usage, NSEC_PER_USEC);
|
||||
+ do_div(user, NSEC_PER_USEC);
|
||||
+ do_div(sys, NSEC_PER_USEC);
|
||||
+
|
||||
+ seq_printf(sf, "usage_usec %llu\n"
|
||||
+ "user_usec %llu\n"
|
||||
+ "system_usec %llu\n", usage, user, sys);
|
||||
+
|
||||
+ css_put(css);
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* charge this task's execution time to its accounting group.
|
||||
*
|
||||
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
|
||||
index ba72807..ddf7af4 100644
|
||||
--- a/kernel/sched/cpuacct.h
|
||||
+++ b/kernel/sched/cpuacct.h
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
|
||||
extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
|
||||
+extern void cpuacct_cpu_stats_show(struct seq_file *sf);
|
||||
|
||||
#else
|
||||
|
||||
@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
|
||||
{
|
||||
}
|
||||
|
||||
+static inline void cpuacct_cpu_stats_show(struct seq_file *sf)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
#endif
|
@ -1,407 +0,0 @@
|
||||
commit 0d966df508ef4d6c0b1baae9e369f4fb0d3e10af
|
||||
Author: Tejun Heo <tj@kernel.org>
|
||||
Date: Fri Mar 11 07:31:23 2016 -0500
|
||||
|
||||
sched: Misc preps for cgroup unified hierarchy interface
|
||||
|
||||
Make the following changes in preparation for the cpu controller
|
||||
interface implementation for the unified hierarchy. This patch
|
||||
doesn't cause any functional differences.
|
||||
|
||||
* s/cpu_stats_show()/cpu_cfs_stats_show()/
|
||||
|
||||
* s/cpu_files/cpu_legacy_files/
|
||||
|
||||
* Separate out cpuacct_stats_read() from cpuacct_stats_show(). While
|
||||
at it, remove pointless cpuacct_stat_desc[] array.
|
||||
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
Cc: Ingo Molnar <mingo@redhat.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Li Zefan <lizefan@huawei.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
|
||||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
||||
index 97ee9ac..c148dfe 100644
|
||||
--- a/kernel/sched/core.c
|
||||
+++ b/kernel/sched/core.c
|
||||
@@ -8482,7 +8482,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
|
||||
return ret;
|
||||
}
|
||||
|
||||
-static int cpu_stats_show(struct seq_file *sf, void *v)
|
||||
+static int cpu_cfs_stats_show(struct seq_file *sf, void *v)
|
||||
{
|
||||
struct task_group *tg = css_tg(seq_css(sf));
|
||||
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
|
||||
@@ -8522,7 +8522,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
|
||||
}
|
||||
#endif /* CONFIG_RT_GROUP_SCHED */
|
||||
|
||||
-static struct cftype cpu_files[] = {
|
||||
+static struct cftype cpu_legacy_files[] = {
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
{
|
||||
.name = "shares",
|
||||
@@ -8543,7 +8543,7 @@ static struct cftype cpu_files[] = {
|
||||
},
|
||||
{
|
||||
.name = "stat",
|
||||
- .seq_show = cpu_stats_show,
|
||||
+ .seq_show = cpu_cfs_stats_show,
|
||||
},
|
||||
#endif
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
@@ -8568,7 +8568,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
|
||||
.fork = cpu_cgroup_fork,
|
||||
.can_attach = cpu_cgroup_can_attach,
|
||||
.attach = cpu_cgroup_attach,
|
||||
- .legacy_cftypes = cpu_files,
|
||||
+ .legacy_cftypes = cpu_legacy_files,
|
||||
.early_init = true,
|
||||
};
|
||||
|
||||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
|
||||
index 41f85c4..3eb9eda 100644
|
||||
--- a/kernel/sched/cpuacct.c
|
||||
+++ b/kernel/sched/cpuacct.c
|
||||
@@ -242,36 +242,33 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
|
||||
return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE);
|
||||
}
|
||||
|
||||
-static const char * const cpuacct_stat_desc[] = {
|
||||
- [CPUACCT_STAT_USER] = "user",
|
||||
- [CPUACCT_STAT_SYSTEM] = "system",
|
||||
-};
|
||||
-
|
||||
-static int cpuacct_stats_show(struct seq_file *sf, void *v)
|
||||
+static void cpuacct_stats_read(struct cpuacct *ca, u64 *userp, u64 *sysp)
|
||||
{
|
||||
- struct cpuacct *ca = css_ca(seq_css(sf));
|
||||
int cpu;
|
||||
- s64 val = 0;
|
||||
|
||||
+ *userp = 0;
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
|
||||
- val += kcpustat->cpustat[CPUTIME_USER];
|
||||
- val += kcpustat->cpustat[CPUTIME_NICE];
|
||||
+ *userp += kcpustat->cpustat[CPUTIME_USER];
|
||||
+ *userp += kcpustat->cpustat[CPUTIME_NICE];
|
||||
}
|
||||
- val = cputime64_to_clock_t(val);
|
||||
- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
|
||||
|
||||
- val = 0;
|
||||
+ *sysp = 0;
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
|
||||
- val += kcpustat->cpustat[CPUTIME_SYSTEM];
|
||||
- val += kcpustat->cpustat[CPUTIME_IRQ];
|
||||
- val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
|
||||
+ *sysp += kcpustat->cpustat[CPUTIME_SYSTEM];
|
||||
+ *sysp += kcpustat->cpustat[CPUTIME_IRQ];
|
||||
+ *sysp += kcpustat->cpustat[CPUTIME_SOFTIRQ];
|
||||
}
|
||||
+}
|
||||
|
||||
- val = cputime64_to_clock_t(val);
|
||||
- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
|
||||
+static int cpuacct_stats_show(struct seq_file *sf, void *v)
|
||||
+{
|
||||
+ cputime64_t user, sys;
|
||||
|
||||
+ cpuacct_stats_read(css_ca(seq_css(sf)), &user, &sys);
|
||||
+ seq_printf(sf, "user %lld\n", cputime64_to_clock_t(user));
|
||||
+ seq_printf(sf, "system %lld\n", cputime64_to_clock_t(sys));
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
commit ed6d93036ec930cb774da10b7c87f67905ce71f1
|
||||
Author: Tejun Heo <tj@kernel.org>
|
||||
Date: Fri Mar 11 07:31:23 2016 -0500
|
||||
|
||||
sched: Implement interface for cgroup unified hierarchy
|
||||
|
||||
While the cpu controller doesn't have any functional problems, there
|
||||
are a couple interface issues which can be addressed in the v2
|
||||
interface.
|
||||
|
||||
* cpuacct being a separate controller. This separation is artificial
|
||||
and rather pointless as demonstrated by most use cases co-mounting
|
||||
the two controllers. It also forces certain information to be
|
||||
accounted twice.
|
||||
|
||||
* Use of different time units. Writable control knobs use
|
||||
microseconds, some stat fields use nanoseconds while other cpuacct
|
||||
stat fields use centiseconds.
|
||||
|
||||
* Control knobs which can't be used in the root cgroup still show up
|
||||
in the root.
|
||||
|
||||
* Control knob names and semantics aren't consistent with other
|
||||
controllers.
|
||||
|
||||
This patchset implements cpu controller's interface on the unified
|
||||
hierarchy which adheres to the controller file conventions described
|
||||
in Documentation/cgroups/unified-hierarchy.txt. Overall, the
|
||||
following changes are made.
|
||||
|
||||
* cpuacct is implicitly enabled and disabled by cpu and its information
|
||||
is reported through "cpu.stat" which now uses microseconds for all
|
||||
time durations. All time duration fields now have "_usec" appended
|
||||
to them for clarity. While this doesn't solve the double accounting
|
||||
immediately, once majority of users switch to v2, cpu can directly
|
||||
account and report the relevant stats and cpuacct can be disabled on
|
||||
the unified hierarchy.
|
||||
|
||||
Note that cpuacct.usage_percpu is currently not included in
|
||||
"cpu.stat". If this information is actually called for, it can be
|
||||
added later.
|
||||
|
||||
* "cpu.shares" is replaced with "cpu.weight" and operates on the
|
||||
standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000).
|
||||
The weight is scaled to scheduler weight so that 100 maps to 1024
|
||||
and the ratio relationship is preserved - if weight is W and its
|
||||
scaled value is S, W / 100 == S / 1024. While the mapped range is a
|
||||
bit smaller than the original scheduler weight range, the dead zones
|
||||
on both sides are relatively small and covers wider range than the
|
||||
nice value mappings. This file doesn't make sense in the root
|
||||
cgroup and isn't created on root.
|
||||
|
||||
* "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max"
|
||||
which contains both quota and period.
|
||||
|
||||
* "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by
|
||||
"cpu.rt.max" which contains both runtime and period.
|
||||
|
||||
v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for
|
||||
CFS bandwidth stats and also using raw division for u64. Use
|
||||
CONFIG_CFS_BANDWIDTH and do_div() instead.
|
||||
|
||||
The semantics of "cpu.rt.max" is not fully decided yet. Dropped
|
||||
for now.
|
||||
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
Cc: Ingo Molnar <mingo@redhat.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Li Zefan <lizefan@huawei.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
|
||||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
||||
index c148dfe..7bba2c5 100644
|
||||
--- a/kernel/sched/core.c
|
||||
+++ b/kernel/sched/core.c
|
||||
@@ -8561,6 +8561,139 @@ static struct cftype cpu_legacy_files[] = {
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
+static int cpu_stats_show(struct seq_file *sf, void *v)
|
||||
+{
|
||||
+ cpuacct_cpu_stats_show(sf);
|
||||
+
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+ {
|
||||
+ struct task_group *tg = css_tg(seq_css(sf));
|
||||
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
|
||||
+ u64 throttled_usec;
|
||||
+
|
||||
+ throttled_usec = cfs_b->throttled_time;
|
||||
+ do_div(throttled_usec, NSEC_PER_USEC);
|
||||
+
|
||||
+ seq_printf(sf, "nr_periods %d\n"
|
||||
+ "nr_throttled %d\n"
|
||||
+ "throttled_usec %llu\n",
|
||||
+ cfs_b->nr_periods, cfs_b->nr_throttled,
|
||||
+ throttled_usec);
|
||||
+ }
|
||||
+#endif
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
|
||||
+ struct cftype *cft)
|
||||
+{
|
||||
+ struct task_group *tg = css_tg(css);
|
||||
+ u64 weight = scale_load_down(tg->shares);
|
||||
+
|
||||
+ return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
|
||||
+}
|
||||
+
|
||||
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
|
||||
+ struct cftype *cftype, u64 weight)
|
||||
+{
|
||||
+ /*
|
||||
+ * cgroup weight knobs should use the common MIN, DFL and MAX
|
||||
+ * values which are 1, 100 and 10000 respectively. While it loses
|
||||
+ * a bit of range on both ends, it maps pretty well onto the shares
|
||||
+ * value used by scheduler and the round-trip conversions preserve
|
||||
+ * the original value over the entire range.
|
||||
+ */
|
||||
+ if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
|
||||
+ return -ERANGE;
|
||||
+
|
||||
+ weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
|
||||
+
|
||||
+ return sched_group_set_shares(css_tg(css), scale_load(weight));
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
|
||||
+ long period, long quota)
|
||||
+{
|
||||
+ if (quota < 0)
|
||||
+ seq_puts(sf, "max");
|
||||
+ else
|
||||
+ seq_printf(sf, "%ld", quota);
|
||||
+
|
||||
+ seq_printf(sf, " %ld\n", period);
|
||||
+}
|
||||
+
|
||||
+/* caller should put the current value in *@periodp before calling */
|
||||
+static int __maybe_unused cpu_period_quota_parse(char *buf,
|
||||
+ u64 *periodp, u64 *quotap)
|
||||
+{
|
||||
+ char tok[21]; /* U64_MAX */
|
||||
+
|
||||
+ if (!sscanf(buf, "%s %llu", tok, periodp))
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ *periodp *= NSEC_PER_USEC;
|
||||
+
|
||||
+ if (sscanf(tok, "%llu", quotap))
|
||||
+ *quotap *= NSEC_PER_USEC;
|
||||
+ else if (!strcmp(tok, "max"))
|
||||
+ *quotap = RUNTIME_INF;
|
||||
+ else
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+static int cpu_max_show(struct seq_file *sf, void *v)
|
||||
+{
|
||||
+ struct task_group *tg = css_tg(seq_css(sf));
|
||||
+
|
||||
+ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
|
||||
+ char *buf, size_t nbytes, loff_t off)
|
||||
+{
|
||||
+ struct task_group *tg = css_tg(of_css(of));
|
||||
+ u64 period = tg_get_cfs_period(tg);
|
||||
+ u64 quota;
|
||||
+ int ret;
|
||||
+
|
||||
+ ret = cpu_period_quota_parse(buf, &period, &quota);
|
||||
+ if (!ret)
|
||||
+ ret = tg_set_cfs_bandwidth(tg, period, quota);
|
||||
+ return ret ?: nbytes;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+static struct cftype cpu_files[] = {
|
||||
+ {
|
||||
+ .name = "stat",
|
||||
+ .flags = CFTYPE_NOT_ON_ROOT,
|
||||
+ .seq_show = cpu_stats_show,
|
||||
+ },
|
||||
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
+ {
|
||||
+ .name = "weight",
|
||||
+ .flags = CFTYPE_NOT_ON_ROOT,
|
||||
+ .read_u64 = cpu_weight_read_u64,
|
||||
+ .write_u64 = cpu_weight_write_u64,
|
||||
+ },
|
||||
+#endif
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+ {
|
||||
+ .name = "max",
|
||||
+ .flags = CFTYPE_NOT_ON_ROOT,
|
||||
+ .seq_show = cpu_max_show,
|
||||
+ .write = cpu_max_write,
|
||||
+ },
|
||||
+#endif
|
||||
+ { } /* terminate */
|
||||
+};
|
||||
+
|
||||
struct cgroup_subsys cpu_cgrp_subsys = {
|
||||
.css_alloc = cpu_cgroup_css_alloc,
|
||||
.css_released = cpu_cgroup_css_released,
|
||||
@@ -8569,7 +8702,15 @@ struct cgroup_subsys cpu_cgrp_subsys = {
|
||||
.can_attach = cpu_cgroup_can_attach,
|
||||
.attach = cpu_cgroup_attach,
|
||||
.legacy_cftypes = cpu_legacy_files,
|
||||
+ .dfl_cftypes = cpu_files,
|
||||
.early_init = true,
|
||||
+#ifdef CONFIG_CGROUP_CPUACCT
|
||||
+ /*
|
||||
+ * cpuacct is enabled together with cpu on the unified hierarchy
|
||||
+ * and its stats are reported through "cpu.stat".
|
||||
+ */
|
||||
+ .depends_on = 1 << cpuacct_cgrp_id,
|
||||
+#endif
|
||||
};
|
||||
|
||||
#endif /* CONFIG_CGROUP_SCHED */
|
||||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
|
||||
index 3eb9eda..7a02d26 100644
|
||||
--- a/kernel/sched/cpuacct.c
|
||||
+++ b/kernel/sched/cpuacct.c
|
||||
@@ -305,6 +305,30 @@ static struct cftype files[] = {
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
+/* used to print cpuacct stats in cpu.stat on the unified hierarchy */
|
||||
+void cpuacct_cpu_stats_show(struct seq_file *sf)
|
||||
+{
|
||||
+ struct cgroup_subsys_state *css;
|
||||
+ u64 usage, user, sys;
|
||||
+
|
||||
+ css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys);
|
||||
+
|
||||
+ usage = cpuusage_read(css, seq_cft(sf));
|
||||
+ cpuacct_stats_read(css_ca(css), &user, &sys);
|
||||
+
|
||||
+ user *= TICK_NSEC;
|
||||
+ sys *= TICK_NSEC;
|
||||
+ do_div(usage, NSEC_PER_USEC);
|
||||
+ do_div(user, NSEC_PER_USEC);
|
||||
+ do_div(sys, NSEC_PER_USEC);
|
||||
+
|
||||
+ seq_printf(sf, "usage_usec %llu\n"
|
||||
+ "user_usec %llu\n"
|
||||
+ "system_usec %llu\n", usage, user, sys);
|
||||
+
|
||||
+ css_put(css);
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* charge this task's execution time to its accounting group.
|
||||
*
|
||||
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
|
||||
index ba72807..ddf7af4 100644
|
||||
--- a/kernel/sched/cpuacct.h
|
||||
+++ b/kernel/sched/cpuacct.h
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
|
||||
extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
|
||||
+extern void cpuacct_cpu_stats_show(struct seq_file *sf);
|
||||
|
||||
#else
|
||||
|
||||
@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
|
||||
{
|
||||
}
|
||||
|
||||
+static inline void cpuacct_cpu_stats_show(struct seq_file *sf)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
#endif
|
784
pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.9.patch
Normal file
784
pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.9.patch
Normal file
@ -0,0 +1,784 @@
|
||||
commit 280858b0bb3384b9ec06b455e196b453888bd6b8
|
||||
Author: Tejun Heo <tj@kernel.org>
|
||||
Date: Fri Mar 11 07:31:23 2016 -0500
|
||||
|
||||
sched: Misc preps for cgroup unified hierarchy interface
|
||||
|
||||
Make the following changes in preparation for the cpu controller
|
||||
interface implementation for the unified hierarchy. This patch
|
||||
doesn't cause any functional differences.
|
||||
|
||||
* s/cpu_stats_show()/cpu_cfs_stats_show()/
|
||||
|
||||
* s/cpu_files/cpu_legacy_files/
|
||||
|
||||
* Separate out cpuacct_stats_read() from cpuacct_stats_show(). While
|
||||
at it, make the @val array u64 for consistency.
|
||||
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
Cc: Ingo Molnar <mingo@redhat.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Li Zefan <lizefan@huawei.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
|
||||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
||||
index 154fd689fe02..57472485b79c 100644
|
||||
--- a/kernel/sched/core.c
|
||||
+++ b/kernel/sched/core.c
|
||||
@@ -8705,7 +8705,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
|
||||
return ret;
|
||||
}
|
||||
|
||||
-static int cpu_stats_show(struct seq_file *sf, void *v)
|
||||
+static int cpu_cfs_stats_show(struct seq_file *sf, void *v)
|
||||
{
|
||||
struct task_group *tg = css_tg(seq_css(sf));
|
||||
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
|
||||
@@ -8745,7 +8745,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
|
||||
}
|
||||
#endif /* CONFIG_RT_GROUP_SCHED */
|
||||
|
||||
-static struct cftype cpu_files[] = {
|
||||
+static struct cftype cpu_legacy_files[] = {
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
{
|
||||
.name = "shares",
|
||||
@@ -8766,7 +8766,7 @@ static struct cftype cpu_files[] = {
|
||||
},
|
||||
{
|
||||
.name = "stat",
|
||||
- .seq_show = cpu_stats_show,
|
||||
+ .seq_show = cpu_cfs_stats_show,
|
||||
},
|
||||
#endif
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
@@ -8791,7 +8791,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
|
||||
.fork = cpu_cgroup_fork,
|
||||
.can_attach = cpu_cgroup_can_attach,
|
||||
.attach = cpu_cgroup_attach,
|
||||
- .legacy_cftypes = cpu_files,
|
||||
+ .legacy_cftypes = cpu_legacy_files,
|
||||
.early_init = true,
|
||||
};
|
||||
|
||||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
|
||||
index bc0b309c3f19..d1e5dd0b3a64 100644
|
||||
--- a/kernel/sched/cpuacct.c
|
||||
+++ b/kernel/sched/cpuacct.c
|
||||
@@ -276,26 +276,33 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static int cpuacct_stats_show(struct seq_file *sf, void *v)
|
||||
+static void cpuacct_stats_read(struct cpuacct *ca,
|
||||
+ u64 (*val)[CPUACCT_STAT_NSTATS])
|
||||
{
|
||||
- struct cpuacct *ca = css_ca(seq_css(sf));
|
||||
- s64 val[CPUACCT_STAT_NSTATS];
|
||||
int cpu;
|
||||
- int stat;
|
||||
|
||||
- memset(val, 0, sizeof(val));
|
||||
+ memset(val, 0, sizeof(*val));
|
||||
+
|
||||
for_each_possible_cpu(cpu) {
|
||||
u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
|
||||
|
||||
- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
|
||||
- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
|
||||
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
|
||||
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
|
||||
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
|
||||
+ (*val)[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
|
||||
+ (*val)[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
|
||||
+ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
|
||||
+ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
|
||||
+ (*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
|
||||
}
|
||||
+}
|
||||
+
|
||||
+static int cpuacct_stats_show(struct seq_file *sf, void *v)
|
||||
+{
|
||||
+ u64 val[CPUACCT_STAT_NSTATS];
|
||||
+ int stat;
|
||||
+
|
||||
+ cpuacct_stats_read(css_ca(seq_css(sf)), &val);
|
||||
|
||||
for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
|
||||
- seq_printf(sf, "%s %lld\n",
|
||||
+ seq_printf(sf, "%s %llu\n",
|
||||
cpuacct_stat_desc[stat],
|
||||
cputime64_to_clock_t(val[stat]));
|
||||
}
|
||||
|
||||
commit 015cbdcb90034fd566d00de9d3d405613da3cd26
|
||||
Author: Tejun Heo <tj@kernel.org>
|
||||
Date: Fri Mar 11 07:31:23 2016 -0500
|
||||
|
||||
sched: Implement interface for cgroup unified hierarchy
|
||||
|
||||
While the cpu controller doesn't have any functional problems, there
|
||||
are a couple interface issues which can be addressed in the v2
|
||||
interface.
|
||||
|
||||
* cpuacct being a separate controller. This separation is artificial
|
||||
and rather pointless as demonstrated by most use cases co-mounting
|
||||
the two controllers. It also forces certain information to be
|
||||
accounted twice.
|
||||
|
||||
* Use of different time units. Writable control knobs use
|
||||
microseconds, some stat fields use nanoseconds while other cpuacct
|
||||
stat fields use centiseconds.
|
||||
|
||||
* Control knobs which can't be used in the root cgroup still show up
|
||||
in the root.
|
||||
|
||||
* Control knob names and semantics aren't consistent with other
|
||||
controllers.
|
||||
|
||||
This patchset implements cpu controller's interface on the unified
|
||||
hierarchy which adheres to the controller file conventions described
|
||||
in Documentation/cgroups/unified-hierarchy.txt. Overall, the
|
||||
following changes are made.
|
||||
|
||||
* cpuacct is implicitly enabled and disabled by cpu and its information
|
||||
is reported through "cpu.stat" which now uses microseconds for all
|
||||
time durations. All time duration fields now have "_usec" appended
|
||||
to them for clarity. While this doesn't solve the double accounting
|
||||
immediately, once majority of users switch to v2, cpu can directly
|
||||
account and report the relevant stats and cpuacct can be disabled on
|
||||
the unified hierarchy.
|
||||
|
||||
Note that cpuacct.usage_percpu is currently not included in
|
||||
"cpu.stat". If this information is actually called for, it can be
|
||||
added later.
|
||||
|
||||
* "cpu.shares" is replaced with "cpu.weight" and operates on the
|
||||
standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000).
|
||||
The weight is scaled to scheduler weight so that 100 maps to 1024
|
||||
and the ratio relationship is preserved - if weight is W and its
|
||||
scaled value is S, W / 100 == S / 1024. While the mapped range is a
|
||||
bit smaller than the original scheduler weight range, the dead zones
|
||||
on both sides are relatively small and it covers a wider range than the
|
||||
nice value mappings. This file doesn't make sense in the root
|
||||
cgroup and isn't created on root.
|
||||
|
||||
* "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max"
|
||||
which contains both quota and period.
|
||||
|
||||
* "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by
|
||||
"cpu.rt.max" which contains both runtime and period.
|
||||
|
||||
v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for
|
||||
CFS bandwidth stats and also using raw division for u64. Use
|
||||
CONFIG_CFS_BANDWIDTH and do_div() instead.
|
||||
|
||||
The semantics of "cpu.rt.max" is not fully decided yet. Dropped
|
||||
for now.
|
||||
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
Cc: Ingo Molnar <mingo@redhat.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Li Zefan <lizefan@huawei.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
|
||||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
||||
index 57472485b79c..c0ae869f51c4 100644
|
||||
--- a/kernel/sched/core.c
|
||||
+++ b/kernel/sched/core.c
|
||||
@@ -8784,6 +8784,139 @@ static struct cftype cpu_legacy_files[] = {
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
+static int cpu_stats_show(struct seq_file *sf, void *v)
|
||||
+{
|
||||
+ cpuacct_cpu_stats_show(sf);
|
||||
+
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+ {
|
||||
+ struct task_group *tg = css_tg(seq_css(sf));
|
||||
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
|
||||
+ u64 throttled_usec;
|
||||
+
|
||||
+ throttled_usec = cfs_b->throttled_time;
|
||||
+ do_div(throttled_usec, NSEC_PER_USEC);
|
||||
+
|
||||
+ seq_printf(sf, "nr_periods %d\n"
|
||||
+ "nr_throttled %d\n"
|
||||
+ "throttled_usec %llu\n",
|
||||
+ cfs_b->nr_periods, cfs_b->nr_throttled,
|
||||
+ throttled_usec);
|
||||
+ }
|
||||
+#endif
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
|
||||
+ struct cftype *cft)
|
||||
+{
|
||||
+ struct task_group *tg = css_tg(css);
|
||||
+ u64 weight = scale_load_down(tg->shares);
|
||||
+
|
||||
+ return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
|
||||
+}
|
||||
+
|
||||
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
|
||||
+ struct cftype *cftype, u64 weight)
|
||||
+{
|
||||
+ /*
|
||||
+ * cgroup weight knobs should use the common MIN, DFL and MAX
|
||||
+ * values which are 1, 100 and 10000 respectively. While it loses
|
||||
+ * a bit of range on both ends, it maps pretty well onto the shares
|
||||
+ * value used by scheduler and the round-trip conversions preserve
|
||||
+ * the original value over the entire range.
|
||||
+ */
|
||||
+ if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
|
||||
+ return -ERANGE;
|
||||
+
|
||||
+ weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
|
||||
+
|
||||
+ return sched_group_set_shares(css_tg(css), scale_load(weight));
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
|
||||
+ long period, long quota)
|
||||
+{
|
||||
+ if (quota < 0)
|
||||
+ seq_puts(sf, "max");
|
||||
+ else
|
||||
+ seq_printf(sf, "%ld", quota);
|
||||
+
|
||||
+ seq_printf(sf, " %ld\n", period);
|
||||
+}
|
||||
+
|
||||
+/* caller should put the current value in *@periodp before calling */
|
||||
+static int __maybe_unused cpu_period_quota_parse(char *buf,
|
||||
+ u64 *periodp, u64 *quotap)
|
||||
+{
|
||||
+ char tok[21]; /* U64_MAX */
|
||||
+
|
||||
+ if (!sscanf(buf, "%s %llu", tok, periodp))
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ *periodp *= NSEC_PER_USEC;
|
||||
+
|
||||
+ if (sscanf(tok, "%llu", quotap))
|
||||
+ *quotap *= NSEC_PER_USEC;
|
||||
+ else if (!strcmp(tok, "max"))
|
||||
+ *quotap = RUNTIME_INF;
|
||||
+ else
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+static int cpu_max_show(struct seq_file *sf, void *v)
|
||||
+{
|
||||
+ struct task_group *tg = css_tg(seq_css(sf));
|
||||
+
|
||||
+ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
|
||||
+ char *buf, size_t nbytes, loff_t off)
|
||||
+{
|
||||
+ struct task_group *tg = css_tg(of_css(of));
|
||||
+ u64 period = tg_get_cfs_period(tg);
|
||||
+ u64 quota;
|
||||
+ int ret;
|
||||
+
|
||||
+ ret = cpu_period_quota_parse(buf, &period, &quota);
|
||||
+ if (!ret)
|
||||
+ ret = tg_set_cfs_bandwidth(tg, period, quota);
|
||||
+ return ret ?: nbytes;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+static struct cftype cpu_files[] = {
|
||||
+ {
|
||||
+ .name = "stat",
|
||||
+ .flags = CFTYPE_NOT_ON_ROOT,
|
||||
+ .seq_show = cpu_stats_show,
|
||||
+ },
|
||||
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
+ {
|
||||
+ .name = "weight",
|
||||
+ .flags = CFTYPE_NOT_ON_ROOT,
|
||||
+ .read_u64 = cpu_weight_read_u64,
|
||||
+ .write_u64 = cpu_weight_write_u64,
|
||||
+ },
|
||||
+#endif
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+ {
|
||||
+ .name = "max",
|
||||
+ .flags = CFTYPE_NOT_ON_ROOT,
|
||||
+ .seq_show = cpu_max_show,
|
||||
+ .write = cpu_max_write,
|
||||
+ },
|
||||
+#endif
|
||||
+ { } /* terminate */
|
||||
+};
|
||||
+
|
||||
struct cgroup_subsys cpu_cgrp_subsys = {
|
||||
.css_alloc = cpu_cgroup_css_alloc,
|
||||
.css_released = cpu_cgroup_css_released,
|
||||
@@ -8792,7 +8925,15 @@ struct cgroup_subsys cpu_cgrp_subsys = {
|
||||
.can_attach = cpu_cgroup_can_attach,
|
||||
.attach = cpu_cgroup_attach,
|
||||
.legacy_cftypes = cpu_legacy_files,
|
||||
+ .dfl_cftypes = cpu_files,
|
||||
.early_init = true,
|
||||
+#ifdef CONFIG_CGROUP_CPUACCT
|
||||
+ /*
|
||||
+ * cpuacct is enabled together with cpu on the unified hierarchy
|
||||
+ * and its stats are reported through "cpu.stat".
|
||||
+ */
|
||||
+ .depends_on = 1 << cpuacct_cgrp_id,
|
||||
+#endif
|
||||
};
|
||||
|
||||
#endif /* CONFIG_CGROUP_SCHED */
|
||||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
|
||||
index d1e5dd0b3a64..57f390514c39 100644
|
||||
--- a/kernel/sched/cpuacct.c
|
||||
+++ b/kernel/sched/cpuacct.c
|
||||
@@ -347,6 +347,31 @@ static struct cftype files[] = {
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
+/* used to print cpuacct stats in cpu.stat on the unified hierarchy */
|
||||
+void cpuacct_cpu_stats_show(struct seq_file *sf)
|
||||
+{
|
||||
+ struct cgroup_subsys_state *css;
|
||||
+ u64 usage, val[CPUACCT_STAT_NSTATS];
|
||||
+
|
||||
+ css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys);
|
||||
+
|
||||
+ usage = cpuusage_read(css, seq_cft(sf));
|
||||
+ cpuacct_stats_read(css_ca(css), &val);
|
||||
+
|
||||
+ val[CPUACCT_STAT_USER] *= TICK_NSEC;
|
||||
+ val[CPUACCT_STAT_SYSTEM] *= TICK_NSEC;
|
||||
+ do_div(usage, NSEC_PER_USEC);
|
||||
+ do_div(val[CPUACCT_STAT_USER], NSEC_PER_USEC);
|
||||
+ do_div(val[CPUACCT_STAT_SYSTEM], NSEC_PER_USEC);
|
||||
+
|
||||
+ seq_printf(sf, "usage_usec %llu\n"
|
||||
+ "user_usec %llu\n"
|
||||
+ "system_usec %llu\n",
|
||||
+ usage, val[CPUACCT_STAT_USER], val[CPUACCT_STAT_SYSTEM]);
|
||||
+
|
||||
+ css_put(css);
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* charge this task's execution time to its accounting group.
|
||||
*
|
||||
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
|
||||
index ba72807c73d4..ddf7af466d35 100644
|
||||
--- a/kernel/sched/cpuacct.h
|
||||
+++ b/kernel/sched/cpuacct.h
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
|
||||
extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
|
||||
+extern void cpuacct_cpu_stats_show(struct seq_file *sf);
|
||||
|
||||
#else
|
||||
|
||||
@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
|
||||
{
|
||||
}
|
||||
|
||||
+static inline void cpuacct_cpu_stats_show(struct seq_file *sf)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
#endif
|
||||
|
||||
commit 5019fe3d7ec456b58d451ef06fe1f81d7d9f28a9
|
||||
Author: Tejun Heo <tj@kernel.org>
|
||||
Date: Fri Aug 5 12:41:01 2016 -0400
|
||||
|
||||
cgroup: add documentation regarding CPU controller cgroup v2 support
|
||||
|
||||
Signed-off-by: Tejun Heo <tj@kernel.org>
|
||||
|
||||
diff --git a/Documentation/cgroup-v2-cpu.txt b/Documentation/cgroup-v2-cpu.txt
|
||||
new file mode 100644
|
||||
index 000000000000..1ed7032d4472
|
||||
--- /dev/null
|
||||
+++ b/Documentation/cgroup-v2-cpu.txt
|
||||
@@ -0,0 +1,368 @@
|
||||
+
|
||||
+
|
||||
+CPU Controller on Control Group v2
|
||||
+
|
||||
+August, 2016 Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+
|
||||
+While most controllers have support for cgroup v2 now, the CPU
|
||||
+controller support is not upstream yet due to objections from the
|
||||
+scheduler maintainers on the basic designs of cgroup v2. This
|
||||
+document explains the current situation as well as an interim
|
||||
+solution, and details the disagreements and arguments. The latest
|
||||
+version of this document can be found at the following URL.
|
||||
+
|
||||
+ https://git.kernel.org/cgit/linux/kernel/git/tj/cgroup.git/tree/Documentation/cgroup-v2-cpu.txt?h=cgroup-v2-cpu
|
||||
+
|
||||
+This document was posted to the linux-kernel and cgroup mailing lists.
|
||||
+Unfortunately, no consensus was reached as of Oct, 2016. The thread
|
||||
+can be found at the following URL.
|
||||
+
|
||||
+ http://lkml.kernel.org/r/20160805170752.GK2542@mtj.duckdns.org
|
||||
+
|
||||
+
|
||||
+CONTENTS
|
||||
+
|
||||
+1. Current Situation and Interim Solution
|
||||
+2. Disagreements and Arguments
|
||||
+ 2-1. Contentious Restrictions
|
||||
+ 2-1-1. Process Granularity
|
||||
+ 2-1-2. No Internal Process Constraint
|
||||
+ 2-2. Impact on CPU Controller
|
||||
+ 2-2-1. Impact of Process Granularity
|
||||
+ 2-2-2. Impact of No Internal Process Constraint
|
||||
+ 2-3. Arguments for cgroup v2
|
||||
+3. Way Forward
|
||||
+4. References
|
||||
+
|
||||
+
|
||||
+1. Current Situation and Interim Solution
|
||||
+
|
||||
+All objections from the scheduler maintainers apply to cgroup v2 core
|
||||
+design, and there are no known objections to the specifics of the CPU
|
||||
+controller cgroup v2 interface. The only blocked part is changes to
|
||||
+expose the CPU controller interface on cgroup v2, which comprises the
|
||||
+following two patches:
|
||||
+
|
||||
+ [1] sched: Misc preps for cgroup unified hierarchy interface
|
||||
+ [2] sched: Implement interface for cgroup unified hierarchy
|
||||
+
|
||||
+The necessary changes are superficial and implement the interface
|
||||
+files on cgroup v2. The combined diffstat is as follows.
|
||||
+
|
||||
+ kernel/sched/core.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++--
|
||||
+ kernel/sched/cpuacct.c | 57 ++++++++++++------
|
||||
+ kernel/sched/cpuacct.h | 5 +
|
||||
+ 3 files changed, 189 insertions(+), 22 deletions(-)
|
||||
+
|
||||
+The patches are easy to apply and forward-port. The following git
|
||||
+branch will always carry the two patches on top of the latest release
|
||||
+of the upstream kernel.
|
||||
+
|
||||
+ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu
|
||||
+
|
||||
+There also are versioned branches going back to v4.4.
|
||||
+
|
||||
+ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu-$KERNEL_VER
|
||||
+
|
||||
+While it's difficult to tell whether the CPU controller support will
|
||||
+be merged, there are crucial resource control features in cgroup v2
|
||||
+that are only possible due to the design choices that are being
|
||||
+objected to, and every effort will be made to ease enabling the CPU
|
||||
+controller cgroup v2 support out-of-tree for parties which choose to.
|
||||
+
|
||||
+
|
||||
+2. Disagreements and Arguments
|
||||
+
|
||||
+There have been several lengthy discussion threads [3][4] on LKML
|
||||
+around the structural constraints of cgroup v2. The two that affect
|
||||
+the CPU controller are process granularity and no internal process
|
||||
+constraint. Both arise primarily from the need for common resource
|
||||
+domain definition across different resources.
|
||||
+
|
||||
+The common resource domain is a powerful concept in cgroup v2 that
|
||||
+allows controllers to make basic assumptions about the structural
|
||||
+organization of processes and controllers inside the cgroup hierarchy,
|
||||
+and thus solve problems spanning multiple types of resources. The
|
||||
+prime example for this is page cache writeback: dirty page cache is
|
||||
+regulated through throttling buffered writers based on memory
|
||||
+availability, and initiating batched write outs to the disk based on
|
||||
+IO capacity. Tracking and controlling writeback inside a cgroup thus
|
||||
+requires the direct cooperation of the memory and the IO controller.
|
||||
+
|
||||
+This easily extends to other areas, such as CPU cycles consumed while
|
||||
+performing memory reclaim or IO encryption.
|
||||
+
|
||||
+
|
||||
+2-1. Contentious Restrictions
|
||||
+
|
||||
+For controllers of different resources to work together, they must
|
||||
+agree on a common organization. This uniform model across controllers
|
||||
+imposes two contentious restrictions on the CPU controller: process
|
||||
+granularity and the no-internal-process constraint.
|
||||
+
|
||||
+
|
||||
+ 2-1-1. Process Granularity
|
||||
+
|
||||
+ For memory, because an address space is shared between all threads
|
||||
+ of a process, the terminal consumer is a process, not a thread.
|
||||
+ Separating the threads of a single process into different memory
|
||||
+ control domains doesn't make semantical sense. cgroup v2 ensures
|
||||
+ that all controllers can agree on the same organization by requiring
|
||||
+ that threads of the same process belong to the same cgroup.
|
||||
+
|
||||
+ There are other reasons to enforce process granularity. One
|
||||
+ important one is isolating system-level management operations from
|
||||
+ in-process application operations. The cgroup interface, being a
|
||||
+ virtual filesystem, is very unfit for multiple independent
|
||||
+ operations taking place at the same time as most operations have to
|
||||
+ be multi-step and there is no way to synchronize multiple accessors.
|
||||
+ See also [5] Documentation/cgroup-v2.txt, "R-2. Thread Granularity"
|
||||
+
|
||||
+
|
||||
+ 2-1-2. No Internal Process Constraint
|
||||
+
|
||||
+ cgroup v2 does not allow processes to belong to any cgroup which has
|
||||
+ child cgroups when resource controllers are enabled on it (the
|
||||
+ notable exception being the root cgroup itself). This is because,
|
||||
+ for some resources, a resource domain (cgroup) is not directly
|
||||
+ comparable to the terminal consumer (process/task) of said resource,
|
||||
+ and so putting the two into a sibling relationship isn't meaningful.
|
||||
+
|
||||
+ - Differing Control Parameters and Capabilities
|
||||
+
|
||||
+ A cgroup controller has different resource control parameters and
|
||||
+ capabilities from a terminal consumer, be that a task or process.
|
||||
+ There are a couple cases where a cgroup control knob can be mapped
|
||||
+ to a per-task or per-process API but they are exceptions and the
|
||||
+ mappings aren't obvious even in those cases.
|
||||
+
|
||||
+ For example, task priorities (also known as nice values) set
|
||||
+ through setpriority(2) are mapped to the CPU controller
|
||||
+ "cpu.shares" values. However, how exactly the two ranges map and
|
||||
+ even the fact that they map to each other at all are not obvious.
|
||||
+
|
||||
+ The situation gets further muddled when considering other resource
|
||||
+ types and control knobs. IO priorities set through ioprio_set(2)
|
||||
+ cannot be mapped to IO controller weights and most cgroup resource
|
||||
+ control knobs including the bandwidth control knobs of the CPU
|
||||
+ controller don't have counterparts in the terminal consumers.
|
||||
+
|
||||
+ - Anonymous Resource Consumption
|
||||
+
|
||||
+ For CPU, every time slice consumed from inside a cgroup, which
|
||||
+ comprises most but not all of consumed CPU time for the cgroup,
|
||||
+ can be clearly attributed to a specific task or process. Because
|
||||
+ these two types of entities are directly comparable as consumers
|
||||
+ of CPU time, it's theoretically possible to mix tasks and cgroups
|
||||
+ on the same tree levels and let them directly compete for the time
|
||||
+ quota available to their common ancestor.
|
||||
+
|
||||
+ However, the same can't be said for resource types like memory or
|
||||
+ IO: the memory consumed by the page cache, for example, can be
|
||||
+ tracked on a per-cgroup level, but due to mismatches in lifetimes
|
||||
+ of involved objects (page cache can persist long after processes
|
||||
+ are gone), shared usages and the implementation overhead of
|
||||
+ tracking persistent state, it can no longer be attributed to
|
||||
+ individual processes after instantiation. Consequently, any IO
|
||||
+ incurred by page cache writeback can be attributed to a cgroup,
|
||||
+ but not to the individual consumers inside the cgroup.
|
||||
+
|
||||
+ For memory and IO, this makes a resource domain (cgroup) an object
|
||||
+ of a fundamentally different type than a terminal consumer
|
||||
+ (process). A process can't be a first class object in the resource
|
||||
+ distribution graph as its total resource consumption can't be
|
||||
+ described without the containing resource domain.
|
||||
+
|
||||
+ Disallowing processes in internal cgroups avoids competition between
|
||||
+ cgroups and processes which cannot be meaningfully defined for these
|
||||
+ resources. All resource control takes place among cgroups and a
|
||||
+ terminal consumer interacts with the containing cgroup the same way
|
||||
+ it would with the system without cgroup.
|
||||
+
|
||||
+ Root cgroup is exempt from this constraint, which is in line with
|
||||
+ how root cgroup is handled in general - it's excluded from cgroup
|
||||
+ resource accounting and control.
|
||||
+
|
||||
+
|
||||
+Enforcing process granularity and no internal process constraint
|
||||
+allows all controllers to be on the same footing in terms of resource
|
||||
+distribution hierarchy.
|
||||
+
|
||||
+
|
||||
+2-2. Impact on CPU Controller
|
||||
+
|
||||
+As indicated earlier, the CPU controller's resource distribution graph
|
||||
+is the simplest. Every schedulable resource consumption can be
|
||||
+attributed to a specific task. In addition, for weight based control,
|
||||
+the per-task priority set through setpriority(2) can be translated to
|
||||
+and from a per-cgroup weight. As such, the CPU controller can treat a
|
||||
+task and a cgroup symmetrically, allowing support for any tree layout
|
||||
+of cgroups and tasks. Both process granularity and the no internal
|
||||
+process constraint restrict how the CPU controller can be used.
|
||||
+
|
||||
+
|
||||
+ 2-2-1. Impact of Process Granularity
|
||||
+
|
||||
+ Process granularity prevents tasks belonging to the same process from
|
||||
+ being assigned to different cgroups. It was pointed out [6] that this
|
||||
+ excludes the valid use case of hierarchical CPU distribution within
|
||||
+ processes.
|
||||
+
|
||||
+ To address this issue, the rgroup (resource group) [7][8][9]
|
||||
+ interface, an extension of the existing setpriority(2) API, was
|
||||
+ proposed, which is in line with other programmable priority
|
||||
+ mechanisms and eliminates the risk of in-application configuration
|
||||
+ and system configuration stepping on each other's toes.
|
||||
+ Unfortunately, the proposal quickly turned into discussions around
|
||||
+ cgroup v2 design decisions [4] and no consensus could be reached.
|
||||
+
|
||||
+
|
||||
+ 2-2-2. Impact of No Internal Process Constraint
|
||||
+
|
||||
+ The no internal process constraint disallows tasks from competing
|
||||
+ directly against cgroups. Here is an excerpt from Peter Zijlstra
|
||||
+ pointing out the issue [10] - R, L and A are cgroups; t1, t2, t3 and
|
||||
+ t4 are tasks:
|
||||
+
|
||||
+
|
||||
+ R
|
||||
+ / | \
|
||||
+ t1 t2 A
|
||||
+ / \
|
||||
+ t3 t4
|
||||
+
|
||||
+
|
||||
+ Is fundamentally different from:
|
||||
+
|
||||
+
|
||||
+ R
|
||||
+ / \
|
||||
+ L A
|
||||
+ / \ / \
|
||||
+ t1 t2 t3 t4
|
||||
+
|
||||
+
|
||||
+ Because if in the first hierarchy you add a task (t5) to R, all of
|
||||
+ its A will run at 1/4th of total bandwidth where before it had
|
||||
+ 1/3rd, whereas with the second example, if you add our t5 to L, A
|
||||
+ doesn't get any less bandwidth.
|
||||
+
|
||||
+
|
||||
+ It is true that the trees are semantically different from each other
|
||||
+ and the symmetric handling of tasks and cgroups is aesthetically
|
||||
+ pleasing. However, it isn't clear what the practical usefulness of
|
||||
+ a layout with direct competition between tasks and cgroups would be,
|
||||
+ considering that number and behavior of tasks are controlled by each
|
||||
+ application, and cgroups primarily deal with system level resource
|
||||
+ distribution; changes in the number of active threads would directly
|
||||
+ impact resource distribution. Real world use cases of such layouts
|
||||
+ could not be established during the discussions.
|
||||
+
|
||||
+
|
||||
+2-3. Arguments for cgroup v2
|
||||
+
|
||||
+There are strong demands for comprehensive hierarchical resource
|
||||
+control across all major resources, and establishing a common resource
|
||||
+hierarchy is an essential step. As with most engineering decisions,
|
||||
+common resource hierarchy definition comes with its trade-offs. With
|
||||
+cgroup v2, the trade-offs are in the form of structural constraints
|
||||
+which, among others, restrict the CPU controller's space of possible
|
||||
+configurations.
|
||||
+
|
||||
+However, even with the restrictions, cgroup v2, in combination with
|
||||
+rgroup, covers most of identified real world use cases while enabling
|
||||
+new important use cases of resource control across multiple resource
|
||||
+types that were fundamentally broken previously.
|
||||
+
|
||||
+Furthermore, for resource control, treating resource domains as
|
||||
+objects of a different type from terminal consumers has important
|
||||
+advantages - it can account for resource consumptions which are not
|
||||
+tied to any specific terminal consumer, be that a task or process, and
|
||||
+allows decoupling resource distribution controls from in-application
|
||||
+APIs. Even the CPU controller may benefit from it as the kernel can
|
||||
+consume a significant amount of CPU cycles in interrupt context or tasks
|
||||
+shared across multiple resource domains (e.g. softirq).
|
||||
+
|
||||
+Finally, it's important to note that enabling cgroup v2 support for
|
||||
+the CPU controller doesn't block use cases which require the features
|
||||
+which are not available on cgroup v2. Unlikely, but should anybody
|
||||
+actually rely on the CPU controller's symmetric handling of tasks and
|
||||
+cgroups, backward compatibility is and will be maintained by being
|
||||
+able to disconnect the controller from the cgroup v2 hierarchy and use
|
||||
+it standalone. This also holds for cpuset which is often used in
|
||||
+highly customized configurations which might be a poor fit for common
|
||||
+resource domains.
|
||||
+
|
||||
+The required changes are minimal, the benefits for the target use
|
||||
+cases are critical and obvious, and use cases which have to use v1 can
|
||||
+continue to do so.
|
||||
+
|
||||
+
|
||||
+3. Way Forward
|
||||
+
|
||||
+cgroup v2 primarily aims to solve the problem of comprehensive
|
||||
+hierarchical resource control across all major computing resources,
|
||||
+which is one of the core problems of modern server infrastructure
|
||||
+engineering. The trade-offs that cgroup v2 took are results of
|
||||
+pursuing that goal and gaining a better understanding of the nature of
|
||||
+resource control in the process.
|
||||
+
|
||||
+I believe that real world usages will prove cgroup v2's model right,
|
||||
+considering the crucial pieces of comprehensive resource control that
|
||||
+cannot be implemented without common resource domains. This is not to
|
||||
+say that cgroup v2 is fixed in stone and can't be updated; if there is
|
||||
+an approach which better serves both comprehensive resource control
|
||||
+and the CPU controller's flexibility, we will surely move towards
|
||||
+that. It goes without saying that discussions around such an approach
|
||||
+should consider practical aspects of resource control as a whole
|
||||
+rather than absolutely focusing on a particular controller.
|
||||
+
|
||||
+Until such consensus can be reached, the CPU controller cgroup v2
|
||||
+support will be maintained out of the mainline kernel in an easily
|
||||
+accessible form. If there is anything cgroup developers can do to
|
||||
+ease the pain, please feel free to contact us on the cgroup mailing
|
||||
+list at cgroups@vger.kernel.org.
|
||||
+
|
||||
+
|
||||
+4. References
|
||||
+
|
||||
+[1] http://lkml.kernel.org/r/20160105164834.GE5995@mtj.duckdns.org
|
||||
+ [PATCH 1/2] sched: Misc preps for cgroup unified hierarchy interface
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[2] http://lkml.kernel.org/r/20160105164852.GF5995@mtj.duckdns.org
|
||||
+ [PATCH 2/2] sched: Implement interface for cgroup unified hierarchy
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[3] http://lkml.kernel.org/r/1438641689-14655-4-git-send-email-tj@kernel.org
|
||||
+ [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[4] http://lkml.kernel.org/r/20160407064549.GH3430@twins.programming.kicks-ass.net
|
||||
+ Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP
|
||||
+ Peter Zijlstra <peterz@infradead.org>
|
||||
+
|
||||
+[5] https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/cgroup-v2.txt
|
||||
+ Control Group v2
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[6] http://lkml.kernel.org/r/CAPM31RJNy3jgG=DYe6GO=wyL4BPPxwUm1f2S6YXacQmo7viFZA@mail.gmail.com
|
||||
+ Re: [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy
|
||||
+ Paul Turner <pjt@google.com>
|
||||
+
|
||||
+[7] http://lkml.kernel.org/r/20160105154503.GC5995@mtj.duckdns.org
|
||||
+ [RFD] cgroup: thread granularity support for cpu controller
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[8] http://lkml.kernel.org/r/1457710888-31182-1-git-send-email-tj@kernel.org
|
||||
+ [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[9] http://lkml.kernel.org/r/20160311160522.GA24046@htj.duckdns.org
|
||||
+ Example program for PRIO_RGRP
|
||||
+ Tejun Heo <tj@kernel.org>
|
||||
+
|
||||
+[10] http://lkml.kernel.org/r/20160407082810.GN3430@twins.programming.kicks-ass.net
|
||||
+ Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource
|
||||
+ Peter Zijlstra <peterz@infradead.org>
|
@ -11716,10 +11716,7 @@ with pkgs;
|
||||
kernelPatches =
|
||||
[ kernelPatches.bridge_stp_helper
|
||||
kernelPatches.p9_fixes
|
||||
# See pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/README.md
|
||||
# when adding a new linux version
|
||||
# !!! 4.7 patch doesn't apply, 4.9 patch not up yet, will keep checking
|
||||
# kernelPatches.cpu-cgroup-v2."4.7"
|
||||
kernelPatches.cpu-cgroup-v2."4.9"
|
||||
kernelPatches.modinst_arg_list_too_long
|
||||
]
|
||||
++ lib.optionals ((platform.kernelArch or null) == "mips")
|
||||
@ -11733,10 +11730,7 @@ with pkgs;
|
||||
kernelPatches =
|
||||
[ kernelPatches.bridge_stp_helper
|
||||
kernelPatches.p9_fixes
|
||||
# See pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/README.md
|
||||
# when adding a new linux version
|
||||
# !!! 4.7 patch doesn't apply, 4.9 patch not up yet, will keep checking
|
||||
# kernelPatches.cpu-cgroup-v2."4.7"
|
||||
kernelPatches.cpu-cgroup-v2."4.10"
|
||||
kernelPatches.modinst_arg_list_too_long
|
||||
]
|
||||
++ lib.optionals ((platform.kernelArch or null) == "mips")
|
||||
@ -11752,8 +11746,7 @@ with pkgs;
|
||||
kernelPatches.p9_fixes
|
||||
# See pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/README.md
|
||||
# when adding a new linux version
|
||||
# !!! 4.7 patch doesn't apply, 4.9 patch not up yet, will keep checking
|
||||
# kernelPatches.cpu-cgroup-v2."4.7"
|
||||
kernelPatches.cpu-cgroup-v2."4.11"
|
||||
kernelPatches.modinst_arg_list_too_long
|
||||
]
|
||||
++ lib.optionals ((platform.kernelArch or null) == "mips")
|
||||
|
Loading…
Reference in New Issue
Block a user