Update the utility functions __mem_cgroup_insert_exceeded() and
__mem_cgroup_remove_exceeded() to allow adding cgroups to, and removing
them from, the new red-black tree that tracks the cgroups that exceed
their toptier memory limits.

Also update the function mem_cgroup_largest_soft_limit_node()
to allow returning the cgroup that has the largest excess usage
of toptier memory.

Signed-off-by: Tim Chen <tim.c.c...@linux.intel.com>
---
 include/linux/memcontrol.h |   9 +++
 mm/memcontrol.c            | 152 +++++++++++++++++++++++++++----------
 2 files changed, 122 insertions(+), 39 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 609d8590950c..0ed8ddfd5436 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -124,6 +124,15 @@ struct mem_cgroup_per_node {
        unsigned long           usage_in_excess;/* Set to the value by which */
                                                /* the soft limit is exceeded*/
        bool                    on_tree;
+
+       struct rb_node          toptier_tree_node;       /* RB tree node */
+       unsigned long           toptier_usage_in_excess; /* Set to the value by which */
+                                                        /* the soft limit is exceeded*/
+       bool                    on_toptier_tree;
+
+       bool                    congested;      /* memcg has many dirty pages */
+                                               /* backed by a congested BDI */
+
        struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
                                                /* use container_of        */
 };
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 90a78ff3fca8..8a7648b79635 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -616,24 +616,44 @@ soft_limit_tree_from_page(struct page *page, enum node_states type)
 
 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
                                         struct mem_cgroup_tree_per_node *mctz,
-                                        unsigned long new_usage_in_excess)
+                                        unsigned long new_usage_in_excess,
+                                        enum node_states type)
 {
        struct rb_node **p = &mctz->rb_root.rb_node;
-       struct rb_node *parent = NULL;
+       struct rb_node *parent = NULL, *mz_tree_node;
        struct mem_cgroup_per_node *mz_node;
-       bool rightmost = true;
+       bool rightmost = true, *mz_on_tree;
+       unsigned long usage_in_excess, *mz_usage_in_excess;
 
-       if (mz->on_tree)
+       if (type == N_TOPTIER) {
+               mz_usage_in_excess = &mz->toptier_usage_in_excess;
+               mz_tree_node = &mz->toptier_tree_node;
+               mz_on_tree = &mz->on_toptier_tree;
+       } else {
+               mz_usage_in_excess = &mz->usage_in_excess;
+               mz_tree_node = &mz->tree_node;
+               mz_on_tree = &mz->on_tree;
+       }
+
+       if (*mz_on_tree)
                return;
 
-       mz->usage_in_excess = new_usage_in_excess;
-       if (!mz->usage_in_excess)
+       if (!new_usage_in_excess)
                return;
+
        while (*p) {
                parent = *p;
-               mz_node = rb_entry(parent, struct mem_cgroup_per_node,
+               if (type == N_TOPTIER) {
+                       mz_node = rb_entry(parent, struct mem_cgroup_per_node,
+                                       toptier_tree_node);
+                       usage_in_excess = mz_node->toptier_usage_in_excess;
+               } else {
+                       mz_node = rb_entry(parent, struct mem_cgroup_per_node,
                                        tree_node);
-               if (mz->usage_in_excess < mz_node->usage_in_excess) {
+                       usage_in_excess = mz_node->usage_in_excess;
+               }
+
+               if (new_usage_in_excess < usage_in_excess) {
                        p = &(*p)->rb_left;
                        rightmost = false;
                } else {
@@ -642,33 +662,47 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
        }
 
        if (rightmost)
-               mctz->rb_rightmost = &mz->tree_node;
+               mctz->rb_rightmost = mz_tree_node;
 
-       rb_link_node(&mz->tree_node, parent, p);
-       rb_insert_color(&mz->tree_node, &mctz->rb_root);
-       mz->on_tree = true;
+       rb_link_node(mz_tree_node, parent, p);
+       rb_insert_color(mz_tree_node, &mctz->rb_root);
+       *mz_usage_in_excess = new_usage_in_excess;
+       *mz_on_tree = true;
 }
 
 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
-                                        struct mem_cgroup_tree_per_node *mctz)
+                                        struct mem_cgroup_tree_per_node *mctz,
+                                        enum node_states type)
 {
-       if (!mz->on_tree)
+       bool *mz_on_tree;
+       struct rb_node *mz_tree_node;
+
+       if (type == N_TOPTIER) {
+               mz_tree_node = &mz->toptier_tree_node;
+               mz_on_tree = &mz->on_toptier_tree;
+       } else {
+               mz_tree_node = &mz->tree_node;
+               mz_on_tree = &mz->on_tree;
+       }
+
+       if (!(*mz_on_tree))
                return;
 
-       if (&mz->tree_node == mctz->rb_rightmost)
-               mctz->rb_rightmost = rb_prev(&mz->tree_node);
+       if (mz_tree_node == mctz->rb_rightmost)
+               mctz->rb_rightmost = rb_prev(mz_tree_node);
 
-       rb_erase(&mz->tree_node, &mctz->rb_root);
-       mz->on_tree = false;
+       rb_erase(mz_tree_node, &mctz->rb_root);
+       *mz_on_tree = false;
 }
 
 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
-                                      struct mem_cgroup_tree_per_node *mctz)
+                                      struct mem_cgroup_tree_per_node *mctz,
+                                      enum node_states type)
 {
        unsigned long flags;
 
        spin_lock_irqsave(&mctz->lock, flags);
-       __mem_cgroup_remove_exceeded(mz, mctz);
+       __mem_cgroup_remove_exceeded(mz, mctz, type);
        spin_unlock_irqrestore(&mctz->lock, flags);
 }
 
@@ -696,13 +730,18 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg, enum node_state
        return excess;
 }
 
-static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+static void mem_cgroup_update_tree(struct mem_cgroup *bottom_memcg, struct page *page)
 {
        unsigned long excess;
        struct mem_cgroup_per_node *mz;
        struct mem_cgroup_tree_per_node *mctz;
+       enum node_states type = N_MEMORY;
+       struct mem_cgroup *memcg;
+
+repeat_toptier:
+       memcg = bottom_memcg;
+       mctz = soft_limit_tree_from_page(page, type);
 
-       mctz = soft_limit_tree_from_page(page, N_MEMORY);
        if (!mctz)
                return;
        /*
@@ -710,27 +749,37 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
         * because their event counter is not touched.
         */
        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+               bool on_tree;
+
                mz = mem_cgroup_page_nodeinfo(memcg, page);
-               excess = soft_limit_excess(memcg, N_MEMORY);
+               excess = soft_limit_excess(memcg, type);
+
+               on_tree = (type == N_MEMORY) ? mz->on_tree: mz->on_toptier_tree;
                /*
                 * We have to update the tree if mz is on RB-tree or
                 * mem is over its softlimit.
                 */
-               if (excess || mz->on_tree) {
+               if (excess || on_tree) {
                        unsigned long flags;
 
                        spin_lock_irqsave(&mctz->lock, flags);
                        /* if on-tree, remove it */
-                       if (mz->on_tree)
-                               __mem_cgroup_remove_exceeded(mz, mctz);
+                       if (on_tree)
+                               __mem_cgroup_remove_exceeded(mz, mctz, type);
+
                        /*
                         * Insert again. mz->usage_in_excess will be updated.
                         * If excess is 0, no tree ops.
                         */
-                       __mem_cgroup_insert_exceeded(mz, mctz, excess);
+                       __mem_cgroup_insert_exceeded(mz, mctz, excess, type);
+
                        spin_unlock_irqrestore(&mctz->lock, flags);
                }
        }
+       if (type == N_MEMORY) {
+               type = N_TOPTIER;
+               goto repeat_toptier;
+       }
 }
 
 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
@@ -743,12 +792,16 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
                mz = mem_cgroup_nodeinfo(memcg, nid);
                mctz = soft_limit_tree_node(nid, N_MEMORY);
                if (mctz)
-                       mem_cgroup_remove_exceeded(mz, mctz);
+                       mem_cgroup_remove_exceeded(mz, mctz, N_MEMORY);
+               mctz = soft_limit_tree_node(nid, N_TOPTIER);
+               if (mctz)
+                       mem_cgroup_remove_exceeded(mz, mctz, N_TOPTIER);
        }
 }
 
 static struct mem_cgroup_per_node *
-__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz,
+                                    enum node_states type)
 {
        struct mem_cgroup_per_node *mz;
 
@@ -757,15 +810,19 @@ __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
        if (!mctz->rb_rightmost)
                goto done;              /* Nothing to reclaim from */
 
-       mz = rb_entry(mctz->rb_rightmost,
+       if (type == N_TOPTIER)
+               mz = rb_entry(mctz->rb_rightmost,
+                     struct mem_cgroup_per_node, toptier_tree_node);
+       else
+               mz = rb_entry(mctz->rb_rightmost,
                      struct mem_cgroup_per_node, tree_node);
        /*
         * Remove the node now but someone else can add it back,
         * we will to add it back at the end of reclaim to its correct
         * position in the tree.
         */
-       __mem_cgroup_remove_exceeded(mz, mctz);
-       if (!soft_limit_excess(mz->memcg, N_MEMORY) ||
+       __mem_cgroup_remove_exceeded(mz, mctz, type);
+       if (!soft_limit_excess(mz->memcg, type) ||
            !css_tryget(&mz->memcg->css))
                goto retry;
 done:
@@ -773,12 +830,13 @@ __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 }
 
 static struct mem_cgroup_per_node *
-mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz,
+                                  enum node_states type)
 {
        struct mem_cgroup_per_node *mz;
 
        spin_lock_irq(&mctz->lock);
-       mz = __mem_cgroup_largest_soft_limit_node(mctz);
+       mz = __mem_cgroup_largest_soft_limit_node(mctz, type);
        spin_unlock_irq(&mctz->lock);
        return mz;
 }
@@ -3472,7 +3530,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
        struct mem_cgroup_per_node *mz, *next_mz = NULL;
        unsigned long reclaimed;
        int loop = 0;
-       struct mem_cgroup_tree_per_node *mctz;
+       struct mem_cgroup_tree_per_node *mctz, *mctz_sibling;
        unsigned long excess;
        unsigned long nr_scanned;
        int migration_nid;
@@ -3481,6 +3539,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                return 0;
 
        mctz = soft_limit_tree_node(pgdat->node_id, N_MEMORY);
+       mctz_sibling = soft_limit_tree_node(pgdat->node_id, N_TOPTIER);
 
        /*
         * Do not even bother to check the largest node if the root
@@ -3516,7 +3575,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                if (next_mz)
                        mz = next_mz;
                else
-                       mz = mem_cgroup_largest_soft_limit_node(mctz);
+                       mz = mem_cgroup_largest_soft_limit_node(mctz, N_MEMORY);
                if (!mz)
                        break;
 
@@ -3526,7 +3585,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                nr_reclaimed += reclaimed;
                *total_scanned += nr_scanned;
                spin_lock_irq(&mctz->lock);
-               __mem_cgroup_remove_exceeded(mz, mctz);
+               __mem_cgroup_remove_exceeded(mz, mctz, N_MEMORY);
 
                /*
                 * If we failed to reclaim anything from this memory cgroup
@@ -3534,7 +3593,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                 */
                next_mz = NULL;
                if (!reclaimed)
-                       next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
+                       next_mz =
+                          __mem_cgroup_largest_soft_limit_node(mctz, N_MEMORY);
 
                excess = soft_limit_excess(mz->memcg, N_MEMORY);
                /*
@@ -3546,8 +3606,20 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                 * term TODO.
                 */
                /* If excess == 0, no tree ops */
-               __mem_cgroup_insert_exceeded(mz, mctz, excess);
+               __mem_cgroup_insert_exceeded(mz, mctz, excess, N_MEMORY);
                spin_unlock_irq(&mctz->lock);
+
+               /* re-sort mz in the N_TOPTIER tree too, under its own lock */
+               if (mctz_sibling) {
+                       spin_lock_irq(&mctz_sibling->lock);
+                       __mem_cgroup_remove_exceeded(mz, mctz_sibling,
+                                                    N_TOPTIER);
+                       excess = soft_limit_excess(mz->memcg, N_TOPTIER);
+                       __mem_cgroup_insert_exceeded(mz, mctz_sibling, excess,
+                                                    N_TOPTIER);
+                       spin_unlock_irq(&mctz_sibling->lock);
+               }
+
                css_put(&mz->memcg->css);
                loop++;
                /*
@@ -5312,6 +5384,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
        lruvec_init(&pn->lruvec);
        pn->usage_in_excess = 0;
        pn->on_tree = false;
+       pn->toptier_usage_in_excess = 0;
+       pn->on_toptier_tree = false;
        pn->memcg = memcg;
 
        memcg->nodeinfo[node] = pn;
-- 
2.20.1

Reply via email to