While using memory control cgroup, page-migration under it works as following.
==
 1. uncharge all refs at try to unmap.
 2. charge regs again remove_migration_ptes()
==
This is simple but has following problems.
==
 The page is uncharged and chaged back again if *mapped*.
    - This means that cgroup before migraion can be different from one after
      migraion
    - If page is not mapped but charged as page cache, charge is just ignored
      (because not mapped, it will not be uncharged before migration)
      This is memory leak.
==
This is bad.
And migration can migrate *not mapped* pages in future by migration-by-kernel
driven by memory-unplug and defragment-by-migration at el.

This patch tries to keep memory cgroup at page migration by increasing
one refcnt during it. 3 functions are added.
 mem_cgroup_prepare_migration() --- increase refcnt of page->page_cgroup
 mem_cgroup_end_migration()     --- decrease refcnt of page->page_cgroup
 mem_cgroup_page_migration() --- copy page->page_cgroup from old page to
                                 new page.

Obviously, mem_cgroup_isolate_pages() and this page migration, which
copies page_cgroup from old page to new page, has race.

There seem to be  3 ways for avoiding this race.
 A. take mem_group->lock while mem_cgroup_page_migration().
 B. isolate pc from mem_cgroup's LRU when we isolate page from zone's LRU.
 C. ignore non-LRU page at mem_cgroup_isolate_pages(). 

This patch uses method (C.) and modifes mem_cgroup_isolate_pages() igonres
!PageLRU pages.

Tested and worked well in ia64/NUMA box.

Signed-off-by: KAMEZAWA Hiroyuki <[EMAIL PROTECTED]>

---
 include/linux/memcontrol.h |   22 +++++++++++++++
 mm/memcontrol.c            |   62 ++++++++++++++++++++++++++++++++++++++++++---
 mm/migrate.c               |   13 +++++++--
 3 files changed, 90 insertions(+), 7 deletions(-)

Index: linux-2.6.23-rc8-mm2/include/linux/memcontrol.h
===================================================================
--- linux-2.6.23-rc8-mm2.orig/include/linux/memcontrol.h
+++ linux-2.6.23-rc8-mm2/include/linux/memcontrol.h
@@ -43,8 +43,14 @@ extern unsigned long mem_cgroup_isolate_
                                        struct mem_cgroup *mem_cont,
                                        int active);
 extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
+
 extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
                                        gfp_t gfp_mask);
+/* For handling page migration in proper way */
+extern void mem_cgroup_prepare_migration(struct page *page);
+extern void mem_cgroup_end_migration(struct page *page);
+extern void mem_cgroup_page_migration(struct page *newpage, struct page *page);
+
 
 static inline struct mem_cgroup *mm_cgroup(const struct mm_struct *mm)
 {
@@ -107,6 +113,22 @@ static inline struct mem_cgroup *mm_cgro
        return NULL;
 }
 
+/* For page migration */
+static inline void mem_cgroup_prepare_migration(struct page *page)
+{
+       return;
+}
+
+static inline void mem_cgroup_end_migration(struct mem_cgroup *cgroup)
+{
+       return;
+}
+
+static inline void
+mem_cgroup_page_migration(struct page *newpage, struct page *page)
+{
+       return;
+}
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
Index: linux-2.6.23-rc8-mm2/mm/memcontrol.c
===================================================================
--- linux-2.6.23-rc8-mm2.orig/mm/memcontrol.c
+++ linux-2.6.23-rc8-mm2/mm/memcontrol.c
@@ -198,7 +198,7 @@ unsigned long mem_cgroup_isolate_pages(u
        unsigned long scan;
        LIST_HEAD(pc_list);
        struct list_head *src;
-       struct page_cgroup *pc;
+       struct page_cgroup *pc, *tmp;
 
        if (active)
                src = &mem_cont->active_list;
@@ -206,8 +206,10 @@ unsigned long mem_cgroup_isolate_pages(u
                src = &mem_cont->inactive_list;
 
        spin_lock(&mem_cont->lru_lock);
-       for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
-               pc = list_entry(src->prev, struct page_cgroup, lru);
+       scan = 0;
+       list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
+               if (scan++ >= nr_to_scan)
+                       break;
                page = pc->page;
                VM_BUG_ON(!pc);
 
@@ -225,9 +227,14 @@ unsigned long mem_cgroup_isolate_pages(u
                /*
                 * Reclaim, per zone
                 * TODO: make the active/inactive lists per zone
+                * !PageLRU page can be found under us while migration.
+                * just ignore it.
                 */
-               if (page_zone(page) != z)
+               if (page_zone(page) != z || !PageLRU(page)) {
+                       /* Skip this */
+                       /* Don't decrease scan here for avoiding dead lock */
                        continue;
+               }
 
                /*
                 * Check if the meta page went away from under us
@@ -417,8 +424,14 @@ void mem_cgroup_uncharge(struct page_cgr
                return;
 
        if (atomic_dec_and_test(&pc->ref_cnt)) {
+retry:
                page = pc->page;
                lock_page_cgroup(page);
+               /* migration occur ? */
+               if (page_get_page_cgroup(page) != pc) {
+                       unlock_page_cgroup(page);
+                       goto retry;
+               }
                mem = pc->mem_cgroup;
                css_put(&mem->css);
                page_assign_page_cgroup(page, NULL);
@@ -432,6 +445,47 @@ void mem_cgroup_uncharge(struct page_cgr
        }
 }
 
+void mem_cgroup_prepare_migration(struct page *page)
+{
+       struct page_cgroup *pc;
+       lock_page_cgroup(page);
+       pc = page_get_page_cgroup(page);
+       if (pc)
+               atomic_inc(&pc->ref_cnt);
+       unlock_page_cgroup(page);
+       return;
+}
+
+void mem_cgroup_end_migration(struct page *page)
+{
+       struct page_cgroup *pc = page_get_page_cgroup(page);
+       mem_cgroup_uncharge(pc);
+}
+
+void mem_cgroup_page_migration(struct page *newpage, struct page *page)
+{
+       struct page_cgroup *pc;
+       /*
+       * We already keep one reference to paage->cgroup.
+       * and both pages are guaranteed to be locked under page migration.
+       */
+       lock_page_cgroup(page);
+       lock_page_cgroup(newpage);
+       pc = page_get_page_cgroup(page);
+       if (!pc)
+               goto unlock_out;
+       page_assign_page_cgroup(page, NULL);
+       pc->page = newpage;
+       page_assign_page_cgroup(newpage, pc);
+
+unlock_out:
+       unlock_page_cgroup(newpage);
+       unlock_page_cgroup(page);
+       return;
+}
+
+
+
 int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
 {
        *tmp = memparse(buf, &buf);
Index: linux-2.6.23-rc8-mm2/mm/migrate.c
===================================================================
--- linux-2.6.23-rc8-mm2.orig/mm/migrate.c
+++ linux-2.6.23-rc8-mm2/mm/migrate.c
@@ -598,9 +598,10 @@ static int move_to_new_page(struct page 
        else
                rc = fallback_migrate_page(mapping, newpage, page);
 
-       if (!rc)
+       if (!rc) {
+               mem_cgroup_page_migration(newpage, page);
                remove_migration_ptes(page, newpage);
-       else
+       } else
                newpage->mapping = NULL;
 
        unlock_page(newpage);
@@ -651,6 +652,8 @@ static int unmap_and_move(new_page_t get
                rcu_read_lock();
                rcu_locked = 1;
        }
+       mem_cgroup_prepare_migration(page);
+
        /*
         * This is a corner case handling.
         * When a new swap-cache is read into, it is linked to LRU
@@ -666,8 +669,12 @@ static int unmap_and_move(new_page_t get
        if (!page_mapped(page))
                rc = move_to_new_page(newpage, page);
 
-       if (rc)
+       if (rc) {
                remove_migration_ptes(page, page);
+               mem_cgroup_end_migration(page);
+       } else
+               mem_cgroup_end_migration(newpage);
+
 rcu_unlock:
        if (rcu_locked)
                rcu_read_unlock();

_______________________________________________
Containers mailing list
[EMAIL PROTECTED]
https://lists.linux-foundation.org/mailman/listinfo/containers

_______________________________________________
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel

Reply via email to