Re: [PATCH 7/7] psi: cgroup support

2018-05-10 Thread Johannes Weiner
On Wed, May 09, 2018 at 01:07:36PM +0200, Peter Zijlstra wrote:
> On Mon, May 07, 2018 at 05:01:35PM -0400, Johannes Weiner wrote:
> > --- a/kernel/sched/psi.c
> > +++ b/kernel/sched/psi.c
> > @@ -260,6 +260,18 @@ void psi_task_change(struct task_struct *task, u64 
> > now, int clear, int set)
> > task->psi_flags |= set;
> >  
> > psi_group_update(&psi_system, cpu, now, clear, set);
> > +
> > +#ifdef CONFIG_CGROUPS
> > +   cgroup = task->cgroups->dfl_cgrp;
> > +   while (cgroup && (parent = cgroup_parent(cgroup))) {
> > +   struct psi_group *group;
> > +
> > +   group = cgroup_psi(cgroup);
> > +   psi_group_update(group, cpu, now, clear, set);
> > +
> > +   cgroup = parent;
> > +   }
> > +#endif
> >  }
> 
> TJ fixed needing that for stats at some point, why can't you do the
> same?

The stats deltas are all additive, so it's okay to delay flushing them
up the tree until right before somebody is trying to look at them.

With this, though, we are tracking time of an aggregate state composed
of child tasks, and that state might not be identical for you and all
your ancestors, so every time a task state changes we have to evaluate
and start/stop clocks on every level, because we cannot derive our
state from the state history of our child groups.

For example, say you have the following tree:

              root
             /
            A
          /   \
        A1     A2
 running=1     running=1

I.e. there is a running task in A1 and one in A2.

root, A, A1, and A2 are all PSI_NONE as nothing is stalled.

Now the task in A2 enters a memstall.

              root
             /
            A
          /   \
        A1     A2
 running=1     memstall=1

From the perspective of A2, the group is now fully blocked and starts
recording time in PSI_FULL.

From the perspective of A, it has a working group below it and a
stalled one, which would make it PSI_SOME, so it starts recording time
in PSI_SOME.

The root/system level likewise has to start the timer on PSI_SOME.

Now the task in A1 enters a memstall, and we have to propagate the
PSI_FULL state up A1 -> A -> root.

I'm not quite sure how we could make this lazy. Say we hadn't
propagated the state from A1 and A2 right away, and somebody is asking
about the averages for A. We could tell that A1 and A2 had been in
PSI_FULL recently, but we wouldn't know exactly if them being in these
states fully overlapped (all PSI_FULL), overlapped partially (some
PSI_FULL and some PSI_SOME), or didn't overlap at all (PSI_SOME).


Re: [PATCH 7/7] psi: cgroup support

2018-05-09 Thread Peter Zijlstra
On Mon, May 07, 2018 at 05:01:35PM -0400, Johannes Weiner wrote:
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -260,6 +260,18 @@ void psi_task_change(struct task_struct *task, u64 now, 
> int clear, int set)
>   task->psi_flags |= set;
>  
>   psi_group_update(&psi_system, cpu, now, clear, set);
> +
> +#ifdef CONFIG_CGROUPS
> +   cgroup = task->cgroups->dfl_cgrp;
> +   while (cgroup && (parent = cgroup_parent(cgroup))) {
> +   struct psi_group *group;
> +
> +   group = cgroup_psi(cgroup);
> +   psi_group_update(group, cpu, now, clear, set);
> +
> +   cgroup = parent;
> +   }
> +#endif
>  }

TJ fixed needing that for stats at some point, why can't you do the
same?


[PATCH 7/7] psi: cgroup support

2018-05-07 Thread Johannes Weiner
On a system that executes multiple cgrouped jobs and independent
workloads, we don't just care about the health of the overall system,
but also that of individual jobs, so that we can ensure individual job
health, fairness between jobs, or prioritize some jobs over others.

This patch implements pressure stall tracking for cgroups. In kernels
with CONFIG_PSI=y, cgroups will have cpu.pressure, memory.pressure,
and io.pressure files that track aggregate pressure stall times for
only the tasks inside the cgroup.

Signed-off-by: Johannes Weiner 
---
 Documentation/cgroup-v2.txt | 18 +
 include/linux/cgroup-defs.h |  4 ++
 include/linux/cgroup.h  | 15 +++
 include/linux/psi.h | 25 
 init/Kconfig|  4 ++
 kernel/cgroup/cgroup.c  | 45 -
 kernel/sched/psi.c  | 79 -
 7 files changed, 186 insertions(+), 4 deletions(-)

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 74cdeaed9f7a..a22879dba019 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -963,6 +963,12 @@ All time durations are in microseconds.
$PERIOD duration.  "max" for $MAX indicates no limit.  If only
one number is written, $MAX is updated.
 
+  cpu.pressure
+   A read-only nested-key file which exists on non-root cgroups.
+
+   Shows pressure stall information for CPU. See
+   Documentation/accounting/psi.txt for details.
+
 
 Memory
 --
@@ -1199,6 +1205,12 @@ PAGE_SIZE multiple when read back.
Swap usage hard limit.  If a cgroup's swap usage reaches this
limit, anonymous memory of the cgroup will not be swapped out.
 
+  memory.pressure
+   A read-only nested-key file which exists on non-root cgroups.
+
+   Shows pressure stall information for memory. See
+   Documentation/accounting/psi.txt for details.
+
 
 Usage Guidelines
 
@@ -1334,6 +1346,12 @@ IO Interface Files
 
  8:16 rbps=2097152 wbps=max riops=max wiops=max
 
+  io.pressure
+   A read-only nested-key file which exists on non-root cgroups.
+
+   Shows pressure stall information for IO. See
+   Documentation/accounting/psi.txt for details.
+
 
 Writeback
 ~
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index dc5b70449dc6..280f18da956a 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_CGROUPS
 
@@ -424,6 +425,9 @@ struct cgroup {
/* used to schedule release agent */
struct work_struct release_agent_work;
 
+   /* used to track pressure stalls */
+   struct psi_group psi;
+
/* used to store eBPF programs */
struct cgroup_bpf bpf;
 
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 473e0c0abb86..fd94c294c207 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -627,6 +627,11 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
pr_cont_kernfs_path(cgrp->kn);
 }
 
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+{
+   return &cgrp->psi;
+}
+
 static inline void cgroup_init_kthreadd(void)
 {
/*
@@ -680,6 +685,16 @@ static inline union kernfs_node_id 
*cgroup_get_kernfs_id(struct cgroup *cgrp)
return NULL;
 }
 
+static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
+{
+   return NULL;
+}
+
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+{
+   return NULL;
+}
+
 static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
   struct cgroup *ancestor)
 {
diff --git a/include/linux/psi.h b/include/linux/psi.h
index 371af1479699..05c3dae3e9c5 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -4,6 +4,9 @@
 #include 
 #include 
 
+struct seq_file;
+struct css_set;
+
 #ifdef CONFIG_PSI
 
 extern bool psi_disabled;
@@ -15,6 +18,14 @@ void psi_task_change(struct task_struct *task, u64 now, int 
clear, int set);
 void psi_memstall_enter(unsigned long *flags);
 void psi_memstall_leave(unsigned long *flags);
 
+int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
+
+#ifdef CONFIG_CGROUPS
+int psi_cgroup_alloc(struct cgroup *cgrp);
+void psi_cgroup_free(struct cgroup *cgrp);
+void cgroup_move_task(struct task_struct *p, struct css_set *to);
+#endif
+
 #else /* CONFIG_PSI */
 
 static inline void psi_init(void) {}
@@ -22,6 +33,20 @@ static inline void psi_init(void) {}
 static inline void psi_memstall_enter(unsigned long *flags) {}
 static inline void psi_memstall_leave(unsigned long *flags) {}
 
+#ifdef CONFIG_CGROUPS
+static inline int psi_cgroup_alloc(struct cgroup *cgrp)
+{
+   return 0;
+}
+static inline void psi_cgroup_free(struct cgroup *cgrp)
+{
+}
+static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
+{
+   rcu_assign_point