On 25.08.2017 18:38, Andrey Ryabinin wrote:
> If the kmem limit on a memcg is reached, we go into memory reclaim
> and reclaim everything we can, including page cache and anon.
> Reclaiming page cache or anon won't help, since we only need to
> lower kmem usage. This patch fixes the problem by avoiding
> non-kmem reclaim when the kmem limit is hit.
> 

Can't there be a situation where some object in anon memory or the page cache
(indirectly) holds some object in kmem?

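Just to check that I read the changelog right, here is the idea reduced to a
toy user-space model. Everything below (memcg_model, model_reclaim,
model_try_charge_kmem) is invented for illustration; only the RECLAIM_KMEM
value mirrors MEM_CGROUP_RECLAIM_KMEM from the patch, it is not the kernel
code:

/* Toy model: a kmem charge that hits the limit triggers reclaim which
 * is allowed to touch only kernel objects, never page cache / anon. */
#include <stdbool.h>
#include <stdio.h>

#define RECLAIM_KMEM (1 << 2)	/* mirrors MEM_CGROUP_RECLAIM_KMEM */

struct memcg_model {
	long kmem_usage, kmem_limit;	/* kernel-object pages and their cap */
	long lru_pages, slab_objects;	/* user pages vs. reclaimable slab   */
};

static long model_reclaim(struct memcg_model *m, int flags)
{
	long freed = 0;

	/* page cache / anon is only shrunk when this is NOT a kmem-only
	 * reclaim: freeing it cannot lower kmem usage */
	if (!(flags & RECLAIM_KMEM) && m->lru_pages) {
		m->lru_pages--;
		freed++;
	}
	/* slab reclaim is the only thing that helps against the kmem limit */
	if (m->slab_objects) {
		m->slab_objects--;
		m->kmem_usage--;
		freed++;
	}
	return freed;
}

static bool model_try_charge_kmem(struct memcg_model *m)
{
	while (m->kmem_usage + 1 > m->kmem_limit)
		if (!model_reclaim(m, RECLAIM_KMEM))
			return false;	/* nothing reclaimable left, fail */
	m->kmem_usage++;
	return true;
}

int main(void)
{
	struct memcg_model m = {
		.kmem_usage = 10, .kmem_limit = 10,
		.lru_pages = 100, .slab_objects = 5,
	};

	printf("charge %s, lru pages left untouched: %ld\n",
	       model_try_charge_kmem(&m) ? "ok" : "failed", m.lru_pages);
	return 0;
}

If that model is right, then the question above is exactly about the case
where freeing an LRU page is a precondition for freeing the slab object that
actually counts against kmem.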
> https://jira.sw.ru/browse/PSBM-69226
> Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
> ---
>  include/linux/memcontrol.h | 10 ++++++++++
>  include/linux/swap.h       |  2 +-
>  mm/memcontrol.c            | 30 ++++++++++++++++--------------
>  mm/vmscan.c                | 31 ++++++++++++++++++++++++-------
>  4 files changed, 51 insertions(+), 22 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 1a52e58ab7de..1d6bc80c4c90 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -45,6 +45,16 @@ struct mem_cgroup_reclaim_cookie {
>       unsigned int generation;
>  };
>  
> +/*
> + * Reclaim flags for mem_cgroup_hierarchical_reclaim
> + */
> +#define MEM_CGROUP_RECLAIM_NOSWAP_BIT        0x0
> +#define MEM_CGROUP_RECLAIM_NOSWAP    (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
> +#define MEM_CGROUP_RECLAIM_SHRINK_BIT        0x1
> +#define MEM_CGROUP_RECLAIM_SHRINK    (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
> +#define MEM_CGROUP_RECLAIM_KMEM_BIT  0x2
> +#define MEM_CGROUP_RECLAIM_KMEM              (1 << MEM_CGROUP_RECLAIM_KMEM_BIT)
> +
>  #ifdef CONFIG_MEMCG
>  int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
>                         gfp_t gfp_mask, struct mem_cgroup **memcgp);
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index bd162f9bef0d..bd47451ec95a 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -324,7 +324,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>  extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
>  extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
>                                                 unsigned long nr_pages,
> -                                               gfp_t gfp_mask, bool noswap);
> +                                               gfp_t gfp_mask, int flags);
>  extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
>                                               gfp_t gfp_mask, bool noswap,
>                                               struct zone *zone,
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 97824e281d7a..f9a5f3819a31 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -511,16 +511,6 @@ enum res_type {
>  #define OOM_CONTROL          (0)
>  
>  /*
> - * Reclaim flags for mem_cgroup_hierarchical_reclaim
> - */
> -#define MEM_CGROUP_RECLAIM_NOSWAP_BIT        0x0
> -#define MEM_CGROUP_RECLAIM_NOSWAP    (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
> -#define MEM_CGROUP_RECLAIM_SHRINK_BIT        0x1
> -#define MEM_CGROUP_RECLAIM_SHRINK    (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
> -#define MEM_CGROUP_RECLAIM_KMEM_BIT  0x2
> -#define MEM_CGROUP_RECLAIM_KMEM              (1 << MEM_CGROUP_RECLAIM_KMEM_BIT)
> -
> -/*
>   * The memcg_create_mutex will be held whenever a new cgroup is created.
>  * As a consequence, any change that needs to protect against new child cgroups
>   * appearing has to hold it as well.
> @@ -2137,7 +2127,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
>               if (loop)
>                       drain_all_stock_async(memcg);
>               total += try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
> -                                                   gfp_mask, noswap);
> +                                                   gfp_mask, flags);
>               if (test_thread_flag(TIF_MEMDIE) ||
>                   fatal_signal_pending(current))
>                       return 1;
> @@ -2150,6 +2140,16 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
>                       break;
>               if (mem_cgroup_margin(memcg, flags & MEM_CGROUP_RECLAIM_KMEM))
>                       break;
> +
> +             /*
> +              * Try harder to reclaim dcache. dcache reclaim may
> +              * temporarily fail due to dcache->dlock being held
> +              * by someone else. We must try harder to avoid premature
> +              * slab allocation failures.
> +              */
> +             if (flags & MEM_CGROUP_RECLAIM_KMEM &&
> +                 page_counter_read(&memcg->dcache))
> +                     continue;
>               /*
>                * If nothing was reclaimed after two attempts, there
>                * may be no reclaimable pages in this hierarchy.
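The retry logic in this hunk reads easier to me when spelled out as a
standalone condition. A minimal sketch, assuming the flag value from the
patch; should_retry_reclaim() and the dcache_usage parameter are invented
here, they only name what the loop checks:

/* Toy model of the retry decision in mem_cgroup_reclaim(). */
#include <stdbool.h>

#define MEM_CGROUP_RECLAIM_KMEM	(1 << 2)	/* as defined by the patch */

static bool should_retry_reclaim(int flags, unsigned long dcache_usage,
				 int failed_attempts)
{
	/*
	 * On a kmem-limit hit, keep retrying while the group still has
	 * dcache charged to it: a pass that freed nothing may only mean
	 * the dentry lock was contended, not that nothing is reclaimable.
	 */
	if ((flags & MEM_CGROUP_RECLAIM_KMEM) && dcache_usage)
		return true;

	/* otherwise, give up after two passes with no progress */
	return failed_attempts < 2;
}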
> @@ -2778,11 +2778,13 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
>       struct mem_cgroup *mem_over_limit;
>       struct page_counter *counter;
>       unsigned long nr_reclaimed;
> -     unsigned long flags = 0;
> +     unsigned long flags;
>  
>       if (mem_cgroup_is_root(memcg))
>               goto done;
>  retry:
> +     flags = 0;
> +
>       if (consume_stock(memcg, nr_pages)) {
>               if (!kmem_charge)
>                       goto done;
> @@ -4138,7 +4140,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
>                       return -EINTR;
>  
>               progress = try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
> -                                                     GFP_KERNEL, false);
> +                                                     GFP_KERNEL, 0);
>               if (!progress) {
>                       nr_retries--;
>                       /* maybe some writeback is necessary */
> @@ -4573,7 +4575,7 @@ static int mem_cgroup_high_write(struct cgroup *cont, struct cftype *cft,
>       usage = page_counter_read(&memcg->memory);
>       if (usage > nr_pages)
>               try_to_free_mem_cgroup_pages(memcg, usage - nr_pages,
> -                                          GFP_KERNEL, false);
> +                                          GFP_KERNEL, 0);
>       return 0;
>  }
>  
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 277bd37bd430..a5db5940bb1e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -88,6 +88,9 @@ struct scan_control {
>       /* Scan (total_size >> priority) pages at once */
>       int priority;
>  
> +     /* Reclaim only slab */
> +     bool slab_only;
> +
>       /*
>        * The memory cgroup that hit its limit and as a result is the
>        * primary target of this reclaim invocation.
> @@ -2346,6 +2349,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc,
>       struct reclaim_state *reclaim_state = current->reclaim_state;
>       unsigned long nr_reclaimed, nr_scanned;
>       gfp_t slab_gfp = sc->gfp_mask;
> +     bool slab_only = sc->slab_only;
>  
>       /* Disable fs-related IO for direct reclaim */
>       if (!sc->target_mem_cgroup &&
> @@ -2372,14 +2376,24 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc,
>                       if (!sc->may_thrash && mem_cgroup_low(root, memcg))
>                               continue;
>  
> -                     lruvec = mem_cgroup_zone_lruvec(zone, memcg);
>                       scanned = sc->nr_scanned;
> -                     shrink_lruvec(lruvec, sc, &lru_pages);
> -                     zone_lru_pages += lru_pages;
>  
> -                     if (memcg && is_classzone)
> +                     if (!slab_only) {
> +                             lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> +                             shrink_lruvec(lruvec, sc, &lru_pages);
> +                             zone_lru_pages += lru_pages;
> +                     }
> +
> +                     if (memcg && is_classzone) {
>                               shrink_slab(slab_gfp, zone_to_nid(zone),
>                                           memcg, sc->priority, false);
> +                             if (reclaim_state) {
> +                                     sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> +                                     sc->nr_scanned += reclaim_state->reclaimed_slab;
> +                                     reclaim_state->reclaimed_slab = 0;
> +                             }
> +
> +                     }
>  
>                       /*
>                        * Direct reclaim and kswapd have to scan all memory
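If I follow the reshuffling here, the per-memcg loop body now reduces to the
gating below. Again a simplified stand-alone model: model_sc,
model_shrink_lruvec and model_shrink_slab are invented for illustration, and
only the slab_only check plus the reclaimed_slab accounting mirror the patch:

#include <stdbool.h>

struct model_sc {
	bool slab_only;				/* set on a kmem-limit hit    */
	unsigned long nr_reclaimed, nr_scanned;
	unsigned long reclaimed_slab;		/* filled by "slab shrinking" */
};

/* stand-ins for shrink_lruvec() / shrink_slab() */
static unsigned long model_shrink_lruvec(struct model_sc *sc)
{
	(void)sc;
	return 1;
}

static void model_shrink_slab(struct model_sc *sc)
{
	sc->reclaimed_slab += 1;
}

static void model_shrink_memcg(struct model_sc *sc, bool is_classzone)
{
	/* on a pure kmem-limit hit, page cache / anon is not touched at all */
	if (!sc->slab_only)
		sc->nr_reclaimed += model_shrink_lruvec(sc);

	if (is_classzone) {
		model_shrink_slab(sc);
		/* slab results come back via reclaim_state; fold them in */
		sc->nr_reclaimed += sc->reclaimed_slab;
		sc->nr_scanned += sc->reclaimed_slab;
		sc->reclaimed_slab = 0;
	}
}

The point, if I get it, is that reclaimed slab pages are only reported through
current->reclaim_state, which is why try_to_free_mem_cgroup_pages() now
installs one before calling do_try_to_free_pages().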
> @@ -2902,15 +2916,17 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
>  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>                                          unsigned long nr_pages,
>                                          gfp_t gfp_mask,
> -                                        bool noswap)
> +                                        int flags)
>  {
>       struct zonelist *zonelist;
>       unsigned long nr_reclaimed;
> +     struct reclaim_state reclaim_state = { 0 };
>       int nid;
>       struct scan_control sc = {
>               .may_writepage = !laptop_mode,
>               .may_unmap = 1,
> -             .may_swap = !noswap,
> +             .may_swap = !(flags & MEM_CGROUP_RECLAIM_NOSWAP),
> +             .slab_only = flags & MEM_CGROUP_RECLAIM_KMEM,
>               .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
>               .order = 0,
>               .priority = DEF_PRIORITY,
> @@ -2933,10 +2949,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>                                           sc.may_writepage,
>                                           sc.gfp_mask);
>  
> +     current->reclaim_state = &reclaim_state;
>       current->flags |= PF_MEMALLOC | PF_MEMCG_RECLAIM;
>       nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
>       current->flags &= ~(PF_MEMALLOC | PF_MEMCG_RECLAIM);
> -
> +     current->reclaim_state = NULL;
>       trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
>  
>       return nr_reclaimed;
> 