Re: [PATCH 3/8] net: consolidate memcg socket buffer tracking and accounting

2015-10-23 Thread Michal Hocko
On Thu 22-10-15 00:21:31, Johannes Weiner wrote:
> The tcp memory controller has extensive provisions for future memory
> accounting interfaces that won't materialize after all. Cut the code
> base down to what's actually used, now and in the likely future.
> 
> - There won't be any different protocol counters in the future, so a
>   direct sock->sk_memcg linkage is enough. This eliminates a lot of
>   callback maze and boilerplate code, and restores most of the socket
>   allocation code to pre-tcp_memcontrol state.
> 
> - There won't be a tcp control soft limit, so integrating the memcg
>   code into the global skmem limiting scheme complicates things
>   unnecessarily. Replace all that with simple and clear charge and
>   uncharge calls--hidden behind a jump label--to account skb memory.
> 
> - The previous jump label code was an elaborate state machine that
>   tracked the number of cgroups with an active socket limit in order
>   to enable the skmem tracking and accounting code only when actively
>   necessary. But this is overengineered: it was meant to protect the
>   people who never use this feature in the first place. Simply enable
>   the branches once when the first limit is set until the next reboot.
> 
> Signed-off-by: Johannes Weiner 

The changelog is certainly attractive. I have looked through the patch,
but my knowledge of the networking subsystem and its memory management
is close to zero, so I cannot really do a competent review.

Anyway, I support any simplification of the tcp kmem accounting. If the
networking people are OK with the changes, including the reduction in
functionality described by Vladimir, then I have no objections to this
being merged.

Thanks!

Re: [PATCH 3/8] net: consolidate memcg socket buffer tracking and accounting

2015-10-23 Thread Vladimir Davydov
On Thu, Oct 22, 2015 at 03:09:43PM -0400, Johannes Weiner wrote:
> On Thu, Oct 22, 2015 at 09:46:12PM +0300, Vladimir Davydov wrote:
> > On Thu, Oct 22, 2015 at 12:21:31AM -0400, Johannes Weiner wrote:
> > > The tcp memory controller has extensive provisions for future memory
> > > accounting interfaces that won't materialize after all. Cut the code
> > > base down to what's actually used, now and in the likely future.
> > > 
> > > - There won't be any different protocol counters in the future, so a
> > >   direct sock->sk_memcg linkage is enough. This eliminates a lot of
> > >   callback maze and boilerplate code, and restores most of the socket
> > >   allocation code to pre-tcp_memcontrol state.
> > > 
> > > - There won't be a tcp control soft limit, so integrating the memcg
> > 
> > In fact, the code is ready for the "soft" limit (I mean the min, pressure,
> > max tuple); it just lacks a knob.
> 
> Yeah, but that's not going to materialize if the entire interface for
> dedicated tcp throttling is considered obsolete.

Maybe it shouldn't be. My current understanding is that per-memcg tcp
window control is necessary, because:

 - We need to be able to protect a containerized workload from its
   growing network buffers. Using vmpressure notifications for that does
   not look reassuring to me.

 - We need a way to limit the network buffers of a particular
   container; otherwise it can fill the system-wide window, throttling
   other containers, which is unfair.

> 
> > > @@ -1136,9 +1090,6 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
> > >   if (!sk->sk_prot->memory_pressure)
> > >   return false;
> > >  
> > > - if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
> > > - return !!sk->sk_cgrp->memory_pressure;
> > > -
> > 
> > AFAIU, now we won't shrink the window on hitting the limit, i.e. this
> > patch subtly changes the behavior of the existing knobs, potentially
> > breaking them.
> 
> Hm, but there is no grace period in which something meaningful could
> happen with the window shrinking, is there? Any buffer allocation is
> still going to fail hard.

AFAIU, when we hit the limit, we not only throttle the socket that is
allocating but also try to release space reserved by other sockets.
After your patch we won't. This looks unfair to me.
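
For reference, the pre-patch helper looked roughly like this
(reconstructed from the hunk quoted above; the final global fallback
line is an assumption based on the surrounding code of that era):

static inline bool sk_under_memory_pressure(const struct sock *sk)
{
	if (!sk->sk_prot->memory_pressure)
		return false;

	/* Removed by this patch: per-memcg pressure signal */
	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
		return !!sk->sk_cgrp->memory_pressure;

	/* Global protocol pressure flag, e.g. tcp_memory_pressure */
	return !!*sk->sk_prot->memory_pressure;
}

Callers such as tcp_should_expand_sndbuf() consult this helper to
decide whether a socket may grow its send buffer, which is how pressure
raised by one socket used to moderate the windows of the other sockets
in the same memcg.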

Thanks,
Vladimir


Re: [PATCH 3/8] net: consolidate memcg socket buffer tracking and accounting

2015-10-22 Thread Johannes Weiner
On Thu, Oct 22, 2015 at 09:46:12PM +0300, Vladimir Davydov wrote:
> On Thu, Oct 22, 2015 at 12:21:31AM -0400, Johannes Weiner wrote:
> > The tcp memory controller has extensive provisions for future memory
> > accounting interfaces that won't materialize after all. Cut the code
> > base down to what's actually used, now and in the likely future.
> > 
> > - There won't be any different protocol counters in the future, so a
> >   direct sock->sk_memcg linkage is enough. This eliminates a lot of
> >   callback maze and boilerplate code, and restores most of the socket
> >   allocation code to pre-tcp_memcontrol state.
> > 
> > - There won't be a tcp control soft limit, so integrating the memcg
> 
> In fact, the code is ready for the "soft" limit (I mean the min, pressure,
> max tuple); it just lacks a knob.

Yeah, but that's not going to materialize if the entire interface for
dedicated tcp throttling is considered obsolete.

> > @@ -1136,9 +1090,6 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
> > if (!sk->sk_prot->memory_pressure)
> > return false;
> >  
> > -   if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
> > -   return !!sk->sk_cgrp->memory_pressure;
> > -
> 
> AFAIU, now we won't shrink the window on hitting the limit, i.e. this
> patch subtly changes the behavior of the existing knobs, potentially
> breaking them.

Hm, but there is no grace period in which something meaningful could
happen with the window shrinking, is there? Any buffer allocation is
still going to fail hard.

I don't see how this would change anything in practice.


Re: [PATCH 3/8] net: consolidate memcg socket buffer tracking and accounting

2015-10-22 Thread Vladimir Davydov
On Thu, Oct 22, 2015 at 12:21:31AM -0400, Johannes Weiner wrote:
> The tcp memory controller has extensive provisions for future memory
> accounting interfaces that won't materialize after all. Cut the code
> base down to what's actually used, now and in the likely future.
> 
> - There won't be any different protocol counters in the future, so a
>   direct sock->sk_memcg linkage is enough. This eliminates a lot of
>   callback maze and boilerplate code, and restores most of the socket
>   allocation code to pre-tcp_memcontrol state.
> 
> - There won't be a tcp control soft limit, so integrating the memcg

In fact, the code is ready for the "soft" limit (I mean the min, pressure,
max tuple); it just lacks a knob.
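
(For reference, a hedged sketch of those tuple semantics, mirroring the
global tcp_mem sysctl and the sysctl_mem[3] field of the cg_proto
struct removed below; the helper names are hypothetical and the
low/pressure hysteresis is ignored:)

/* mem[0]: low watermark, no regulation below it;
 * mem[1]: pressure threshold, start moderating buffers above it;
 * mem[2]: hard limit, allocations beyond it are suppressed. */
static bool skmem_under_pressure(long allocated, const long mem[3])
{
	return allocated > mem[1];
}

static bool skmem_over_limit(long allocated, const long mem[3])
{
	return allocated > mem[2];
}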

>   code into the global skmem limiting scheme complicates things
>   unnecessarily. Replace all that with simple and clear charge and
>   uncharge calls--hidden behind a jump label--to account skb memory.
> 
> - The previous jump label code was an elaborate state machine that
>   tracked the number of cgroups with an active socket limit in order
>   to enable the skmem tracking and accounting code only when actively
>   necessary. But this is overengineered: it was meant to protect the
>   people who never use this feature in the first place. Simply enable
>   the branches once when the first limit is set until the next reboot.
> 
...
> @@ -1136,9 +1090,6 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
>   if (!sk->sk_prot->memory_pressure)
>   return false;
>  
> - if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
> - return !!sk->sk_cgrp->memory_pressure;
> -

AFAIU, now we won't shrink the window on hitting the limit, i.e. this
patch subtly changes the behavior of the existing knobs, potentially
breaking them.

Thanks,
Vladimir


[PATCH 3/8] net: consolidate memcg socket buffer tracking and accounting

2015-10-21 Thread Johannes Weiner
The tcp memory controller has extensive provisions for future memory
accounting interfaces that won't materialize after all. Cut the code
base down to what's actually used, now and in the likely future.

- There won't be any different protocol counters in the future, so a
  direct sock->sk_memcg linkage is enough. This eliminates a lot of
  callback maze and boilerplate code, and restores most of the socket
  allocation code to pre-tcp_memcontrol state.

- There won't be a tcp control soft limit, so integrating the memcg
  code into the global skmem limiting scheme complicates things
  unnecessarily. Replace all that with simple and clear charge and
  uncharge calls--hidden behind a jump label--to account skb memory.

- The previous jump label code was an elaborate state machine that
  tracked the number of cgroups with an active socket limit in order
  to enable the skmem tracking and accounting code only when actively
  necessary. But this is overengineered: it was meant to protect the
  people who never use this feature in the first place. Simply enable
  the branches once when the first limit is set until the next reboot.
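
A minimal sketch of that one-way static branch, using the jump label
API of the time (the key name matches the declarations added later in
this patch; the enable-side helper name is hypothetical, shown only
for illustration):

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(mem_cgroup_sockets);

/* Hypothetical enable site: flipped when the first socket memory
 * limit is set, and never disabled again. No refcounting state
 * machine; one enable lasts until the next reboot. */
static void mem_cgroup_enable_skmem_accounting(void)
{
	static_branch_enable(&mem_cgroup_sockets);
}

/* Fast path: compiles to a patched no-op until the key is enabled. */
static inline bool mem_cgroup_do_sockets(void)
{
	return static_branch_unlikely(&mem_cgroup_sockets);
}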

Signed-off-by: Johannes Weiner 
---
 include/linux/memcontrol.h   |  64 ---
 include/net/sock.h   | 135 +++
 include/net/tcp.h|   3 -
 include/net/tcp_memcontrol.h |   7 ---
 mm/memcontrol.c  | 101 +++--
 net/core/sock.c  |  78 ++-
 net/ipv4/sysctl_net_ipv4.c   |   1 -
 net/ipv4/tcp.c   |   3 +-
 net/ipv4/tcp_ipv4.c  |   9 +--
 net/ipv4/tcp_memcontrol.c| 147 +++
 net/ipv4/tcp_output.c|   6 +-
 net/ipv6/tcp_ipv6.c  |   3 -
 12 files changed, 136 insertions(+), 421 deletions(-)
 delete mode 100644 include/net/tcp_memcontrol.h

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 19ff87b..5b72f83 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -85,34 +85,6 @@ enum mem_cgroup_events_target {
MEM_CGROUP_NTARGETS,
 };
 
-/*
- * Bits in struct cg_proto.flags
- */
-enum cg_proto_flags {
-   /* Currently active and new sockets should be assigned to cgroups */
-   MEMCG_SOCK_ACTIVE,
-   /* It was ever activated; we must disarm static keys on destruction */
-   MEMCG_SOCK_ACTIVATED,
-};
-
-struct cg_proto {
-   struct page_counter memory_allocated;   /* Current allocated memory. */
-   struct percpu_counter   sockets_allocated;  /* Current number of sockets. */
-   int memory_pressure;
-   longsysctl_mem[3];
-   unsigned long   flags;
-   /*
-* memcg field is used to find which memcg we belong directly
-* Each memcg struct can hold more than one cg_proto, so container_of
-* won't really cut.
-*
-* The elegant solution would be having an inverse function to
-* proto_cgroup in struct proto, but that means polluting the structure
-* for everybody, instead of just for memcg users.
-*/
-   struct mem_cgroup   *memcg;
-};
-
 #ifdef CONFIG_MEMCG
 struct mem_cgroup_stat_cpu {
long count[MEM_CGROUP_STAT_NSTATS];
@@ -185,8 +157,15 @@ struct mem_cgroup {
 
/* Accounted resources */
struct page_counter memory;
+
+   /*
+* Legacy non-resource counters. In unified hierarchy, all
+* memory is accounted and limited through memcg->memory.
+* Consumer breakdown happens in the statistics.
+*/
struct page_counter memsw;
struct page_counter kmem;
+   struct page_counter skmem;
 
/* Normal memory consumption range */
unsigned long low;
@@ -246,9 +225,6 @@ struct mem_cgroup {
 */
struct mem_cgroup_stat_cpu __percpu *stat;
 
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
-   struct cg_proto tcp_mem;
-#endif
 #if defined(CONFIG_MEMCG_KMEM)
 /* Index in the kmem_cache->memcg_params.memcg_caches array */
int kmemcg_id;
@@ -676,12 +652,6 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 }
 #endif /* CONFIG_MEMCG */
 
-enum {
-   UNDER_LIMIT,
-   SOFT_LIMIT,
-   OVER_LIMIT,
-};
-
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
@@ -707,15 +677,35 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
 
 struct sock;
 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
+extern struct static_key_false mem_cgroup_sockets;
+static inline bool mem_cgroup_do_sockets(void)
+{
+   return static_branch_unlikely(&mem_cgroup_sockets);
+}
 void sock_update_memcg(struct sock *sk);
 void sock_release_memcg(struct sock *sk);
+bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
+void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
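
A hedged sketch of how a charge site could use this API in the socket
core (an assumption based on the declarations above, not a verbatim
excerpt of the patch; the *_example helper names are hypothetical):

/* Charge nr_pages of skb memory to the socket's memcg. The static
 * branch keeps this a no-op on systems that never set a limit. */
static int sk_memcg_charge_example(struct sock *sk, unsigned int nr_pages)
{
	if (mem_cgroup_do_sockets() && sk->sk_memcg &&
	    !mem_cgroup_charge_skmem(sk->sk_memcg, nr_pages))
		return -ENOMEM;	/* suppress the allocation */
	return 0;
}

/* The matching uncharge when the buffer memory is released. */
static void sk_memcg_uncharge_example(struct sock *sk, unsigned int nr_pages)
{
	if (mem_cgroup_do_sockets() && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, nr_pages);
}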