This patch makes the direct reclaim path shrink slab not only on global
memory pressure, but also when the user memory limit of a memcg is
reached. To achieve that, it makes shrink_slab() walk over the memcg
hierarchy and run shrinkers marked as memcg-aware on the target memcg
and all its descendants. The memcg to scan is passed in the
shrink_control structure; memcg-unaware shrinkers are still called only
on global memory pressure, with memcg set to NULL. It is up to each
shrinker how to organize the objects it is responsible for to achieve
per-memcg reclaim (see the sketch below).
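
For illustration only, here is a minimal sketch of what a memcg-aware
shrinker could look like on top of this change. The my_cache_* helpers
are hypothetical stand-ins for a cache that keeps its objects on
per-memcg, per-node lists; only the SHRINKER_MEMCG_AWARE flag and the
memcg/nid fields of shrink_control come from this patch.

  #include <linux/shrinker.h>
  #include <linux/memcontrol.h>

  /* Hypothetical helpers provided by the cache itself. */
  unsigned long my_cache_nr_objects(struct mem_cgroup *memcg, int nid);
  unsigned long my_cache_free_objects(struct mem_cgroup *memcg, int nid,
                                      unsigned long nr_to_scan);

  static unsigned long my_cache_count(struct shrinker *shrink,
                                      struct shrink_control *sc)
  {
          /*
           * sc->memcg is NULL on global pressure; otherwise it is the
           * memcg (the target or one of its descendants) to scan.
           */
          return my_cache_nr_objects(sc->memcg, sc->nid);
  }

  static unsigned long my_cache_scan(struct shrinker *shrink,
                                     struct shrink_control *sc)
  {
          return my_cache_free_objects(sc->memcg, sc->nid,
                                       sc->nr_to_scan);
  }

  static struct shrinker my_cache_shrinker = {
          .count_objects  = my_cache_count,
          .scan_objects   = my_cache_scan,
          .seeks          = DEFAULT_SEEKS,
          .flags          = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
  };

  /* register_shrinker(&my_cache_shrinker); at init time */

Shrinkers that do not set SHRINKER_MEMCG_AWARE keep working as before:
they are invoked only on global pressure, with sc->memcg == NULL.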

Signed-off-by: Vladimir Davydov <[email protected]>
---
 include/linux/memcontrol.h |   22 +++++++++++
 include/linux/shrinker.h   |   10 ++++-
 mm/memcontrol.c            |   46 ++++++++++++++++++++++-
 mm/vmscan.c                |   87 ++++++++++++++++++++++++++++++++++----------
 4 files changed, 143 insertions(+), 22 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 19df5d857411..c4e64d0e318d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -68,6 +68,9 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
 struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
 
+unsigned long mem_cgroup_zone_reclaimable_pages(struct zone *zone,
+                                               struct mem_cgroup *memcg);
+
 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
                                  struct mem_cgroup *memcg);
 bool task_in_mem_cgroup(struct task_struct *task,
@@ -251,6 +254,12 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
        return &zone->lruvec;
 }
 
+static inline unsigned long mem_cgroup_zone_reclaimable_pages(struct zone *zone,
+                                                       struct mem_cgroup *memcg)
+{
+       return 0;
+}
+
 static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 {
        return NULL;
@@ -421,6 +430,9 @@ static inline bool memcg_kmem_enabled(void)
        return static_key_false(&memcg_kmem_enabled_key);
 }
 
+bool memcg_kmem_is_active(struct mem_cgroup *memcg);
+bool memcg_kmem_is_active_subtree(struct mem_cgroup *memcg);
+
 /*
  * In general, we'll do everything in our power to not incur in any overhead
  * for non-memcg users for the kmem functions. Not even a function call, if we
@@ -549,6 +561,16 @@ static inline bool memcg_kmem_enabled(void)
        return false;
 }
 
+static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+{
+       return false;
+}
+
+static inline bool memcg_kmem_is_active_subtree(struct mem_cgroup *memcg)
+{
+       return false;
+}
+
 static inline bool
 memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
 {
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 68c097077ef0..ab79b174bfbe 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -20,8 +20,15 @@ struct shrink_control {
 
        /* shrink from these nodes */
        nodemask_t nodes_to_scan;
+
+       /* shrink from this memory cgroup hierarchy (if not NULL) */
+       struct mem_cgroup *target_mem_cgroup;
+
        /* current node being shrunk (for NUMA aware shrinkers) */
        int nid;
+
+       /* current memcg being shrunk (for memcg aware shrinkers) */
+       struct mem_cgroup *memcg;
 };
 
 #define SHRINK_STOP (~0UL)
@@ -63,7 +70,8 @@ struct shrinker {
 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
 
 /* Flags */
-#define SHRINKER_NUMA_AWARE (1 << 0)
+#define SHRINKER_NUMA_AWARE    (1 << 0)
+#define SHRINKER_MEMCG_AWARE   (1 << 1)
 
 extern int register_shrinker(struct shrinker *);
 extern void unregister_shrinker(struct shrinker *);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9431024e490c..7361bd8b720a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -391,7 +391,7 @@ static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
        set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
 }
 
-static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+bool memcg_kmem_is_active(struct mem_cgroup *memcg)
 {
        return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
 }
@@ -1411,6 +1411,31 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
        VM_BUG_ON((long)(*lru_size) < 0);
 }
 
+unsigned long mem_cgroup_zone_reclaimable_pages(struct zone *zone,
+                                               struct mem_cgroup *memcg)
+{
+       unsigned long nr = 0;
+       unsigned int lru_mask;
+       struct mem_cgroup *iter;
+
+       lru_mask = LRU_ALL_FILE;
+       if (get_nr_swap_pages() > 0)
+               lru_mask |= LRU_ALL_ANON;
+
+       iter = memcg;
+       do {
+               struct mem_cgroup_per_zone *mz;
+               enum lru_list lru;
+
+               mz = mem_cgroup_zone_zoneinfo(iter, zone);
+               for_each_lru(lru)
+                       if (BIT(lru) & lru_mask)
+                               nr += mz->lru_size[lru];
+       } while ((iter = mem_cgroup_iter(memcg, iter, NULL)) != NULL);
+
+       return nr;
+}
+
 /*
  * Checks whether given mem is same or in the root_mem_cgroup's
  * hierarchy subtree
@@ -2786,6 +2811,25 @@ static DEFINE_MUTEX(memcg_slab_mutex);
 
 static DEFINE_MUTEX(activate_kmem_mutex);
 
+/*
+ * Returns true if the given cgroup or any of its descendants has kmem
+ * accounting enabled.
+ */
+bool memcg_kmem_is_active_subtree(struct mem_cgroup *memcg)
+{
+       struct mem_cgroup *iter;
+
+       iter = memcg;
+       do {
+               if (memcg_kmem_is_active(iter)) {
+                       mem_cgroup_iter_break(memcg, iter);
+                       return true;
+               }
+       } while ((iter = mem_cgroup_iter(memcg, iter, NULL)) != NULL);
+
+       return false;
+}
+
 static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
 {
        return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b672e2c6becc..041d0e41a5a4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -340,6 +340,26 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
        return freed;
 }
 
+static unsigned long
+run_shrinker(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+            unsigned long nr_pages_scanned, unsigned long lru_pages)
+{
+       unsigned long freed = 0;
+
+       if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
+               shrinkctl->nid = 0;
+               return shrink_slab_node(shrinkctl, shrinker,
+                                       nr_pages_scanned, lru_pages);
+       }
+
+       for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+               if (node_online(shrinkctl->nid))
+                       freed += shrink_slab_node(shrinkctl, shrinker,
+                                                 nr_pages_scanned, lru_pages);
+       }
+       return freed;
+}
+
 /*
  * Call the shrink functions to age shrinkable caches
  *
@@ -381,20 +401,34 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
        }
 
        list_for_each_entry(shrinker, &shrinker_list, list) {
-               if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
-                       shrinkctl->nid = 0;
-                       freed += shrink_slab_node(shrinkctl, shrinker,
-                                       nr_pages_scanned, lru_pages);
+               /*
+                * Call memcg-unaware shrinkers only on global pressure.
+                */
+               if (!(shrinker->flags & SHRINKER_MEMCG_AWARE)) {
+                       if (!shrinkctl->target_mem_cgroup) {
+                               shrinkctl->memcg = NULL;
+                               freed += run_shrinker(shrinkctl, shrinker,
+                                               nr_pages_scanned, lru_pages);
+                       }
                        continue;
                }
 
-               for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
-                       if (node_online(shrinkctl->nid))
-                               freed += shrink_slab_node(shrinkctl, shrinker,
+               /*
+                * For memcg-aware shrinkers iterate over the target memcg
+                * hierarchy and run the shrinker on each kmem-active memcg
+                * found in the hierarchy.
+                */
+               shrinkctl->memcg = shrinkctl->target_mem_cgroup;
+               do {
+                       if (!shrinkctl->memcg ||
+                           memcg_kmem_is_active(shrinkctl->memcg))
+                               freed += run_shrinker(shrinkctl, shrinker,
                                                nr_pages_scanned, lru_pages);
-
-               }
+               } while ((shrinkctl->memcg =
+                         mem_cgroup_iter(shrinkctl->target_mem_cgroup,
+                                         shrinkctl->memcg, NULL)) != NULL);
        }
+
        up_read(&shrinker_rwsem);
 out:
        cond_resched();
@@ -2381,6 +2415,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
        gfp_t orig_mask;
        struct shrink_control shrink = {
                .gfp_mask = sc->gfp_mask,
+               .target_mem_cgroup = sc->target_mem_cgroup,
        };
        enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
        bool reclaimable = false;
@@ -2400,17 +2435,22 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                                        gfp_zone(sc->gfp_mask), sc->nodemask) {
                if (!populated_zone(zone))
                        continue;
+
+               if (global_reclaim(sc) &&
+                   !cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+                       continue;
+
+               lru_pages += global_reclaim(sc) ?
+                               zone_reclaimable_pages(zone) :
+                               mem_cgroup_zone_reclaimable_pages(zone,
+                                               sc->target_mem_cgroup);
+               node_set(zone_to_nid(zone), shrink.nodes_to_scan);
+
                /*
                 * Take care memory controller reclaiming has small influence
                 * to global LRU.
                 */
                if (global_reclaim(sc)) {
-                       if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-                               continue;
-
-                       lru_pages += zone_reclaimable_pages(zone);
-                       node_set(zone_to_nid(zone), shrink.nodes_to_scan);
-
                        if (sc->priority != DEF_PRIORITY &&
                            !zone_reclaimable(zone))
                                continue;       /* Let kswapd poll it */
@@ -2458,12 +2498,11 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
        }
 
        /*
-        * Don't shrink slabs when reclaiming memory from over limit cgroups
-        * but do shrink slab at least once when aborting reclaim for
-        * compaction to avoid unevenly scanning file/anon LRU pages over slab
-        * pages.
+        * Shrink slabs at least once when aborting reclaim for compaction
+        * to avoid unevenly scanning file/anon LRU pages over slab pages.
         */
-       if (global_reclaim(sc)) {
+       if (global_reclaim(sc) ||
+           memcg_kmem_is_active_subtree(sc->target_mem_cgroup)) {
                shrink_slab(&shrink, sc->nr_scanned, lru_pages);
                if (reclaim_state) {
                        sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -2765,6 +2804,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
        struct zonelist *zonelist;
        unsigned long nr_reclaimed;
        int nid;
+       struct reclaim_state reclaim_state;
        struct scan_control sc = {
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2785,6 +2825,10 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 
        zonelist = NODE_DATA(nid)->node_zonelists;
 
+       lockdep_set_current_reclaim_state(sc.gfp_mask);
+       reclaim_state.reclaimed_slab = 0;
+       current->reclaim_state = &reclaim_state;
+
        trace_mm_vmscan_memcg_reclaim_begin(0,
                                            sc.may_writepage,
                                            sc.gfp_mask);
@@ -2793,6 +2837,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 
        trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
 
+       current->reclaim_state = NULL;
+       lockdep_clear_current_reclaim_state();
+
        return nr_reclaimed;
 }
 #endif
-- 
1.7.10.4
