From: Zi Yan <z...@nvidia.com>

alloc_contig_range() worked at MAX_ORDER-1 granularity to avoid merging
pageblocks with different migratetypes. It might unnecessarily convert
extra pageblocks at the beginning and at the end of the range. Change
alloc_contig_range() to work at pageblock granularity.

It is done by restoring pageblock types and split >pageblock_order free
pages after isolating at MAX_ORDER-1 granularity and migrating pages
away at pageblock granularity. The reason for this process is that
during isolation, some pages, either free or in-use, might have >pageblock
sizes and isolating part of them can cause free accounting issues.
Restoring the migratetypes of the pageblocks not in the interesting
range later is much easier.

Signed-off-by: Zi Yan <z...@nvidia.com>
---
 mm/page_alloc.c | 175 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 155 insertions(+), 20 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 812cf557b20f..6ed506234efa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8862,8 +8862,8 @@ void *__init alloc_large_system_hash(const char 
*tablename,
 #ifdef CONFIG_CONTIG_ALLOC
 static unsigned long pfn_max_align_down(unsigned long pfn)
 {
-       return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
-                            pageblock_nr_pages) - 1);
+       return ALIGN_DOWN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
+                                    pageblock_nr_pages));
 }
 
 static unsigned long pfn_max_align_up(unsigned long pfn)
@@ -8952,6 +8952,52 @@ static int __alloc_contig_migrate_range(struct 
compact_control *cc,
        return 0;
 }
 
+static inline int save_migratetypes(unsigned char *migratetypes,
+                               unsigned long start_pfn, unsigned long end_pfn)
+{
+       unsigned long pfn = start_pfn;
+       int num = 0;
+
+       while (pfn < end_pfn) {
+               migratetypes[num] = get_pageblock_migratetype(pfn_to_page(pfn));
+               num++;
+               pfn += pageblock_nr_pages;
+       }
+       return num;
+}
+
+static inline int restore_migratetypes(unsigned char *migratetypes,
+                               unsigned long start_pfn, unsigned long end_pfn)
+{
+       unsigned long pfn = start_pfn;
+       int num = 0;
+
+       while (pfn < end_pfn) {
+               set_pageblock_migratetype(pfn_to_page(pfn), migratetypes[num]);
+               num++;
+               pfn += pageblock_nr_pages;
+       }
+       return num;
+}
+
+static inline void split_free_page_into_pageblocks(struct page *free_page,
+                               int order, struct zone *zone)
+{
+       unsigned long pfn;
+
+       spin_lock(&zone->lock);
+       del_page_from_free_list(free_page, zone, order);
+       for (pfn = page_to_pfn(free_page);
+            pfn < page_to_pfn(free_page) + (1UL << order);
+            pfn += pageblock_nr_pages) {
+               int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
+
+               __free_one_page(pfn_to_page(pfn), pfn, zone, pageblock_order,
+                               mt, FPI_NONE);
+       }
+       spin_unlock(&zone->lock);
+}
+
 /**
  * alloc_contig_range() -- tries to allocate given range of pages
  * @start:     start PFN to allocate
@@ -8977,8 +9023,15 @@ int alloc_contig_range(unsigned long start, unsigned 
long end,
                       unsigned migratetype, gfp_t gfp_mask)
 {
        unsigned long outer_start, outer_end;
+       unsigned long isolate_start = pfn_max_align_down(start);
+       unsigned long isolate_end = pfn_max_align_up(end);
+       unsigned long alloc_start = ALIGN_DOWN(start, pageblock_nr_pages);
+       unsigned long alloc_end = ALIGN(end, pageblock_nr_pages);
+       unsigned long num_pageblock_to_save;
        unsigned int order;
        int ret = 0;
+       unsigned char *saved_mt;
+       int num;
 
        struct compact_control cc = {
                .nr_migratepages = 0,
@@ -8992,11 +9045,30 @@ int alloc_contig_range(unsigned long start, unsigned 
long end,
        };
        INIT_LIST_HEAD(&cc.migratepages);
 
+       /*
+        * TODO: make MIGRATE_ISOLATE a standalone bit to avoid overwriting
+        * the exiting migratetype. Then, we will not need the save and restore
+        * process here.
+        */
+
+       /* Save the migratepages of the pageblocks before start and after end */
+       num_pageblock_to_save = (alloc_start - isolate_start) / 
pageblock_nr_pages
+                               + (isolate_end - alloc_end) / 
pageblock_nr_pages;
+       saved_mt =
+               kmalloc_array(num_pageblock_to_save,
+                             sizeof(unsigned char), GFP_KERNEL);
+       if (!saved_mt)
+               return -ENOMEM;
+
+       num = save_migratetypes(saved_mt, isolate_start, alloc_start);
+
+       num = save_migratetypes(&saved_mt[num], alloc_end, isolate_end);
+
        /*
         * What we do here is we mark all pageblocks in range as
         * MIGRATE_ISOLATE.  Because pageblock and max order pages may
         * have different sizes, and due to the way page allocator
-        * work, we align the range to biggest of the two pages so
+        * work, we align the isolation range to biggest of the two so
         * that page allocator won't try to merge buddies from
         * different pageblocks and change MIGRATE_ISOLATE to some
         * other migration type.
@@ -9006,6 +9078,20 @@ int alloc_contig_range(unsigned long start, unsigned 
long end,
         * we are interested in).  This will put all the pages in
         * range back to page allocator as MIGRATE_ISOLATE.
         *
+        * Afterwards, we restore the migratetypes of the pageblocks not
+        * in range, split free pages spanning outside the range,
+        * and put split free pages (at pageblock_order) to the right
+        * migratetype list.
+        *
+        * NOTE: the above approach is used because it can cause free
+        * page accounting issues during isolation, if a page, either
+        * free or in-use, contains multiple pageblocks and we only
+        * isolate a subset of them. For example, if only the second
+        * pageblock is isolated from a page with 2 pageblocks, after
+        * the page is free, it will be put in the first pageblock
+        * migratetype list instead of having 2 pageblocks in two
+        * separate migratetype lists.
+        *
         * When this is done, we take the pages in range from page
         * allocator removing them from the buddy system.  This way
         * page allocator will never consider using them.
@@ -9016,10 +9102,10 @@ int alloc_contig_range(unsigned long start, unsigned 
long end,
         * put back to page allocator so that buddy can use them.
         */
 
-       ret = start_isolate_page_range(start, end, pfn_max_align_down(start),
-                                      pfn_max_align_up(end), migratetype, 0);
+       ret = start_isolate_page_range(start, end, isolate_start, isolate_end,
+                               migratetype, 0);
        if (ret)
-               return ret;
+               goto done;
 
        drain_all_pages(cc.zone);
 
@@ -9055,6 +9141,19 @@ int alloc_contig_range(unsigned long start, unsigned 
long end,
         * isolated thus they won't get removed from buddy.
         */
 
+       /*
+        * Restore migratetypes of pageblocks outside [start, end)
+        * TODO: remove it when MIGRATE_ISOLATE becomes a standalone bit
+        */
+
+       num = restore_migratetypes(saved_mt, isolate_start, alloc_start);
+
+       num = restore_migratetypes(&saved_mt[num], alloc_end, isolate_end);
+
+       /*
+        * Split free page spanning [isolate_start, alloc_start) and put the
+        * pageblocks in the right migratetype lists.
+        */
        order = 0;
        outer_start = start;
        while (!PageBuddy(pfn_to_page(outer_start))) {
@@ -9069,37 +9168,73 @@ int alloc_contig_range(unsigned long start, unsigned 
long end,
                order = buddy_order(pfn_to_page(outer_start));
 
                /*
-                * outer_start page could be small order buddy page and
-                * it doesn't include start page. Adjust outer_start
-                * in this case to report failed page properly
-                * on tracepoint in test_pages_isolated()
+                * split the free page has start page and put the pageblocks
+                * in the right migratetype list
                 */
-               if (outer_start + (1UL << order) <= start)
-                       outer_start = start;
+               if (outer_start + (1UL << order) > start) {
+                       struct page *free_page = pfn_to_page(outer_start);
+
+                       split_free_page_into_pageblocks(free_page, order, 
cc.zone);
+               }
+       }
+
+       /*
+        * Split free page spanning [alloc_end, isolate_end) and put the
+        * pageblocks in the right migratetype list
+        */
+       for (outer_end = alloc_end; outer_end < isolate_end;) {
+               unsigned long begin_pfn = outer_end;
+
+               order = 0;
+               while (!PageBuddy(pfn_to_page(outer_end))) {
+                       if (++order >= MAX_ORDER) {
+                               outer_end = begin_pfn;
+                               break;
+                       }
+                       outer_end &= ~0UL << order;
+               }
+
+               if (outer_end != begin_pfn) {
+                       order = buddy_order(pfn_to_page(outer_end));
+
+                       /*
+                        * split the free page has start page and put the 
pageblocks
+                        * in the right migratetype list
+                        */
+                       VM_BUG_ON(outer_end + (1UL << order) <= begin_pfn);
+                       {
+                               struct page *free_page = pfn_to_page(outer_end);
+
+                               split_free_page_into_pageblocks(free_page, 
order, cc.zone);
+                       }
+                       outer_end += 1UL << order;
+               } else
+                       outer_end = begin_pfn + 1;
        }
 
        /* Make sure the range is really isolated. */
-       if (test_pages_isolated(outer_start, end, 0)) {
+       if (test_pages_isolated(alloc_start, alloc_end, 0)) {
                ret = -EBUSY;
                goto done;
        }
 
        /* Grab isolated pages from freelists. */
-       outer_end = isolate_freepages_range(&cc, outer_start, end);
+       outer_end = isolate_freepages_range(&cc, alloc_start, alloc_end);
        if (!outer_end) {
                ret = -EBUSY;
                goto done;
        }
 
        /* Free head and tail (if any) */
-       if (start != outer_start)
-               free_contig_range(outer_start, start - outer_start);
-       if (end != outer_end)
-               free_contig_range(end, outer_end - end);
+       if (start != alloc_start)
+               free_contig_range(alloc_start, start - alloc_start);
+       if (end != alloc_end)
+               free_contig_range(end, alloc_end - end);
 
 done:
-       undo_isolate_page_range(pfn_max_align_down(start),
-                               pfn_max_align_up(end), migratetype);
+       kfree(saved_mt);
+       undo_isolate_page_range(alloc_start,
+                               alloc_end, migratetype);
        return ret;
 }
 EXPORT_SYMBOL(alloc_contig_range);
-- 
2.34.1

Reply via email to