25.08.2017 18:38, Andrey Ryabinin пишет: > If kmem limit on memcg reached, we go into memory reclaim, > and reclaim everything we can, including page cache and anon. > Reclaiming page cache or anon won't help since we need to lower > only kmem usage. This patch fixes the problem by avoiding > non-kmem reclaim on hitting the kmem limit. >
Can't there be a situation where some object in anon mem or page cache holds some object in kmem (indirectly)? > https://jira.sw.ru/browse/PSBM-69226 > Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com> > --- > include/linux/memcontrol.h | 10 ++++++++++ > include/linux/swap.h | 2 +- > mm/memcontrol.c | 30 ++++++++++++++++-------------- > mm/vmscan.c | 31 ++++++++++++++++++++++++------- > 4 files changed, 51 insertions(+), 22 deletions(-) > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 1a52e58ab7de..1d6bc80c4c90 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -45,6 +45,16 @@ struct mem_cgroup_reclaim_cookie { > unsigned int generation; > }; > > +/* > + * Reclaim flags for mem_cgroup_hierarchical_reclaim > + */ > +#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 > +#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) > +#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 > +#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) > +#define MEM_CGROUP_RECLAIM_KMEM_BIT 0x2 > +#define MEM_CGROUP_RECLAIM_KMEM (1 << MEM_CGROUP_RECLAIM_KMEM_BIT) > + > #ifdef CONFIG_MEMCG > int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, > gfp_t gfp_mask, struct mem_cgroup **memcgp); > diff --git a/include/linux/swap.h b/include/linux/swap.h > index bd162f9bef0d..bd47451ec95a 100644 > --- a/include/linux/swap.h > +++ b/include/linux/swap.h > @@ -324,7 +324,7 @@ extern unsigned long try_to_free_pages(struct zonelist > *zonelist, int order, > extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); > extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, > unsigned long nr_pages, > - gfp_t gfp_mask, bool noswap); > + gfp_t gfp_mask, int flags); > extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, > gfp_t gfp_mask, bool noswap, > struct zone *zone, > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 97824e281d7a..f9a5f3819a31 
100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -511,16 +511,6 @@ enum res_type { > #define OOM_CONTROL (0) > > /* > - * Reclaim flags for mem_cgroup_hierarchical_reclaim > - */ > -#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 > -#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) > -#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 > -#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) > -#define MEM_CGROUP_RECLAIM_KMEM_BIT 0x2 > -#define MEM_CGROUP_RECLAIM_KMEM (1 << MEM_CGROUP_RECLAIM_KMEM_BIT) > - > -/* > * The memcg_create_mutex will be held whenever a new cgroup is created. > * As a consequence, any change that needs to protect against new child > cgroups > * appearing has to hold it as well. > @@ -2137,7 +2127,7 @@ static unsigned long mem_cgroup_reclaim(struct > mem_cgroup *memcg, > if (loop) > drain_all_stock_async(memcg); > total += try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX, > - gfp_mask, noswap); > + gfp_mask, flags); > if (test_thread_flag(TIF_MEMDIE) || > fatal_signal_pending(current)) > return 1; > @@ -2150,6 +2140,16 @@ static unsigned long mem_cgroup_reclaim(struct > mem_cgroup *memcg, > break; > if (mem_cgroup_margin(memcg, flags & MEM_CGROUP_RECLAIM_KMEM)) > break; > + > + /* > + * Try harder to reclaim dcache. dcache reclaim may > + * temporarily fail due to dcache->dlock being held > + * by someone else. We must try harder to avoid premature > + * slab allocation failures. > + */ > + if (flags & MEM_CGROUP_RECLAIM_KMEM && > + page_counter_read(&memcg->dcache)) > + continue; > /* > * If nothing was reclaimed after two attempts, there > * may be no reclaimable pages in this hierarchy. 
> @@ -2778,11 +2778,13 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t > gfp_mask, bool kmem_charge > struct mem_cgroup *mem_over_limit; > struct page_counter *counter; > unsigned long nr_reclaimed; > - unsigned long flags = 0; > + unsigned long flags; > > if (mem_cgroup_is_root(memcg)) > goto done; > retry: > + flags = 0; > + > if (consume_stock(memcg, nr_pages)) { > if (!kmem_charge) > goto done; > @@ -4138,7 +4140,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup > *memcg) > return -EINTR; > > progress = try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX, > - GFP_KERNEL, false); > + GFP_KERNEL, 0); > if (!progress) { > nr_retries--; > /* maybe some writeback is necessary */ > @@ -4573,7 +4575,7 @@ static int mem_cgroup_high_write(struct cgroup *cont, > struct cftype *cft, > usage = page_counter_read(&memcg->memory); > if (usage > nr_pages) > try_to_free_mem_cgroup_pages(memcg, usage - nr_pages, > - GFP_KERNEL, false); > + GFP_KERNEL, 0); > return 0; > } > > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 277bd37bd430..a5db5940bb1e 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -88,6 +88,9 @@ struct scan_control { > /* Scan (total_size >> priority) pages at once */ > int priority; > > + /* Reclaim only slab */ > + bool slab_only; > + > /* > * The memory cgroup that hit its limit and as a result is the > * primary target of this reclaim invocation. 
> @@ -2346,6 +2349,7 @@ static void shrink_zone(struct zone *zone, struct > scan_control *sc, > struct reclaim_state *reclaim_state = current->reclaim_state; > unsigned long nr_reclaimed, nr_scanned; > gfp_t slab_gfp = sc->gfp_mask; > + bool slab_only = sc->slab_only; > > /* Disable fs-related IO for direct reclaim */ > if (!sc->target_mem_cgroup && > @@ -2372,14 +2376,24 @@ static void shrink_zone(struct zone *zone, struct > scan_control *sc, > if (!sc->may_thrash && mem_cgroup_low(root, memcg)) > continue; > > - lruvec = mem_cgroup_zone_lruvec(zone, memcg); > scanned = sc->nr_scanned; > - shrink_lruvec(lruvec, sc, &lru_pages); > - zone_lru_pages += lru_pages; > > - if (memcg && is_classzone) > + if (!slab_only) { > + lruvec = mem_cgroup_zone_lruvec(zone, memcg); > + shrink_lruvec(lruvec, sc, &lru_pages); > + zone_lru_pages += lru_pages; > + } > + > + if (memcg && is_classzone) { > shrink_slab(slab_gfp, zone_to_nid(zone), > memcg, sc->priority, false); > + if (reclaim_state) { > + sc->nr_reclaimed += reclaim_state->reclaimed_slab; > + sc->nr_scanned += reclaim_state->reclaimed_slab; > + reclaim_state->reclaimed_slab = 0; > + } > + > + } > > /* > * Direct reclaim and kswapd have to scan all memory > @@ -2902,15 +2916,17 @@ unsigned long mem_cgroup_shrink_node_zone(struct > mem_cgroup *memcg, > unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, > unsigned long nr_pages, > gfp_t gfp_mask, > - bool noswap) > + int flags) > { > struct zonelist *zonelist; > unsigned long nr_reclaimed; > + struct reclaim_state reclaim_state = { 0 }; > int nid; > struct scan_control sc = { > .may_writepage = !laptop_mode, > .may_unmap = 1, > - .may_swap = !noswap, > + .may_swap = !(flags & MEM_CGROUP_RECLAIM_NOSWAP), > + .slab_only = flags & MEM_CGROUP_RECLAIM_KMEM, > .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), > .order = 0, > .priority = DEF_PRIORITY, > @@ -2933,10 +2949,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct > mem_cgroup *memcg, > 
sc.may_writepage, > sc.gfp_mask); > > + current->reclaim_state = &reclaim_state; > current->flags |= PF_MEMALLOC | PF_MEMCG_RECLAIM; > nr_reclaimed = do_try_to_free_pages(zonelist, &sc); > current->flags &= ~(PF_MEMALLOC | PF_MEMCG_RECLAIM); > - > + current->reclaim_state = NULL; > trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); > > return nr_reclaimed; > _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel