From: Zi Yan <z...@nvidia.com>

alloc_contig_range() worked at MAX_ORDER-1 granularity to avoid merging
pageblocks with different migratetypes. It might unnecessarily convert
extra pageblocks at the beginning and at the end of the range. Change
alloc_contig_range() to work at pageblock granularity.

Special handling is needed for free pages and in-use pages that cross the
boundaries of the range specified by alloc_contig_range(), because these
partially isolated pages cause free page accounting issues. The free
pages will be split and freed into separate migratetype lists; the
in-use pages will be migrated first, and the resulting free pages will
then be split in the same way.

Signed-off-by: Zi Yan <z...@nvidia.com>
---
 include/linux/page-isolation.h |   2 +-
 mm/internal.h                  |   3 +
 mm/memory_hotplug.c            |   3 +-
 mm/page_alloc.c                | 235 +++++++++++++++++++++++++--------
 mm/page_isolation.c            |  33 ++++-
 5 files changed, 211 insertions(+), 65 deletions(-)

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 4ef7be6def83..78ff940cc169 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -54,7 +54,7 @@ int move_freepages_block(struct zone *zone, struct page *page,
  */
 int
 start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-                        unsigned migratetype, int flags);
+                        unsigned migratetype, int flags, gfp_t gfp_flags);
 
 /*
  * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
diff --git a/mm/internal.h b/mm/internal.h
index 0d240e876831..509cbdc25992 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -319,6 +319,9 @@ isolate_freepages_range(struct compact_control *cc,
 int
 isolate_migratepages_range(struct compact_control *cc,
                           unsigned long low_pfn, unsigned long end_pfn);
+
+int
+isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, int 
isolate_before_boundary);
 #endif
 int find_suitable_fallback(struct free_area *area, unsigned int order,
                        int migratetype, bool only_stealable, bool *can_steal);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ce68098832aa..82406d2f3e46 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1863,7 +1863,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned 
long nr_pages,
        /* set above range as isolated */
        ret = start_isolate_page_range(start_pfn, end_pfn,
                                       MIGRATE_MOVABLE,
-                                      MEMORY_OFFLINE | REPORT_FAILURE);
+                                      MEMORY_OFFLINE | REPORT_FAILURE,
+                                      GFP_USER | __GFP_MOVABLE | 
__GFP_RETRY_MAYFAIL);
        if (ret) {
                reason = "failure to isolate range";
                goto failed_removal_pcplists_disabled;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 62ef78f3d771..7a4fa21aea5c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8985,7 +8985,7 @@ static inline void alloc_contig_dump_pages(struct 
list_head *page_list)
 #endif
 
 /* [start, end) must belong to a single zone. */
-static int __alloc_contig_migrate_range(struct compact_control *cc,
+int __alloc_contig_migrate_range(struct compact_control *cc,
                                        unsigned long start, unsigned long end)
 {
        /* This function is based on compact_zone() from compaction.c. */
@@ -9043,6 +9043,167 @@ static int __alloc_contig_migrate_range(struct 
compact_control *cc,
        return 0;
 }
 
+/**
+ * split_free_page() -- split a free page at split_pfn_offset
+ * @free_page:         the original free page
+ * @order:             the order of the page
+ * @split_pfn_offset:  split offset within the page, in pages
+ *
+ * It is used when the free page crosses two pageblocks with different
+ * migratetypes at split_pfn_offset within the page. The split free page will
+ * be put into separate migratetype lists afterwards. Otherwise, the function
+ * achieves nothing.
+ *
+ * @split_pfn_offset does not have to be a power of two. Each of the two parts
+ * ([0, split_pfn_offset) and [split_pfn_offset, 1 << order)) is handed back
+ * to __free_one_page() as a sequence of naturally aligned power-of-two chunks,
+ * each freed with the migratetype of the pageblock it starts in.
+ *
+ * The zone lock is taken internally, so the caller must not hold it.
+ */
+static inline void split_free_page(struct page *free_page,
+                               int order, unsigned long split_pfn_offset)
+{
+       struct zone *zone = page_zone(free_page);
+       unsigned long free_page_pfn = page_to_pfn(free_page);
+       unsigned long pfn;
+       unsigned long flags;
+       int free_page_order;
+
+       /*
+        * Nothing to split if the offset is not strictly inside the page;
+        * it also keeps __fls() below away from a zero argument.
+        */
+       if (split_pfn_offset == 0 || split_pfn_offset >= (1UL << order))
+               return;
+
+       /*
+        * NOTE(review): the zone free page counters are not adjusted around
+        * del_page_from_free_list(); confirm that re-freeing through
+        * __free_one_page() does not double count the non-isolated part.
+        */
+       spin_lock_irqsave(&zone->lock, flags);
+       del_page_from_free_list(free_page, zone, order);
+       for (pfn = free_page_pfn;
+            pfn < free_page_pfn + (1UL << order);) {
+               int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
+
+               /*
+                * Free the largest chunk that is both naturally aligned at
+                * pfn and no bigger than what is left of the current part.
+                * order_base_2() must not be used here: it rounds up and
+                * would free past the end of the part for offsets that are
+                * not a power of two.
+                */
+               free_page_order = min_t(int, pfn ? __ffs(pfn) : order,
+                                       __fls(split_pfn_offset));
+               __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
+                               mt, FPI_NONE);
+               pfn += 1UL << free_page_order;
+               split_pfn_offset -= (1UL << free_page_order);
+               /* we have done the first part, now switch to second part */
+               if (split_pfn_offset == 0)
+                       split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
+       }
+       spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/**
+ * isolate_single_pageblock() -- tries to isolate a pageblock that might be
+ * within a free or in-use page.
+ * @boundary_pfn:              pageblock-aligned pfn that a page might cross
+ * @gfp_flags:                 GFP flags used for migrating pages
+ * @isolate_before_boundary:   isolate the pageblock before (1) or after (0)
+ *                             the boundary_pfn
+ *
+ * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
+ * pageblock. When not all pageblocks within a page are isolated at the same
+ * time, free page accounting can go wrong. For example, in the case of
+ * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pageblocks.
+ * [        MAX_ORDER-1          ]
+ * [  pageblock0  |  pageblock1  ]
+ * When either pageblock is isolated, if it is a free page, the page is not
+ * split into separate migratetype lists, which it is supposed to be; if it is
+ * an in-use page and freed later, __free_one_page() does not split the free
+ * page either. The function handles this by splitting the free page or
+ * migrating the in-use page then splitting the free page.
+ *
+ * Return: 0 on success. -EBUSY if a compound page crossing @boundary_pfn
+ * cannot be migrated, or if the page freed by the migration cannot be found
+ * in the buddy allocator; in both cases the pageblock gets its original
+ * migratetype back.
+ */
+int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags,
+                       int isolate_before_boundary)
+{
+       unsigned char saved_mt;
+       /*
+        * scan at max(MAX_ORDER_NR_PAGES, pageblock_nr_pages) aligned range to
+        * avoid isolating pageblocks belonging to a bigger free or in-use page
+        */
+       unsigned long start_pfn = pfn_max_align_down(boundary_pfn);
+       unsigned long isolated_pageblock_pfn;
+       unsigned long pfn;
+
+       VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));
+
+       if (isolate_before_boundary)
+               isolated_pageblock_pfn = boundary_pfn - pageblock_nr_pages;
+       else
+               isolated_pageblock_pfn = boundary_pfn;
+
+       /* remember the migratetype so that it can be restored on failure */
+       saved_mt = get_pageblock_migratetype(pfn_to_page(isolated_pageblock_pfn));
+       set_pageblock_migratetype(pfn_to_page(isolated_pageblock_pfn),
+                                 MIGRATE_ISOLATE);
+
+       for (pfn = start_pfn; pfn < boundary_pfn;) {
+               struct page *page = pfn_to_page(pfn);
+
+               /*
+                * start_pfn is max(MAX_ORDER_NR_PAGES, pageblock_nr_pages)
+                * aligned, if there is any free pages in [start_pfn,
+                * boundary_pfn), its head page will always be in the range.
+                */
+               if (PageBuddy(page)) {
+                       int order = buddy_order(page);
+
+                       if (pfn + (1UL << order) > boundary_pfn)
+                               split_free_page(page, order, boundary_pfn - pfn);
+                       pfn += (1UL << order);
+                       continue;
+               }
+               /*
+                * migrate compound pages then let the free page handling code
+                * above do the rest
+                */
+               if (PageHuge(page) || PageTransCompound(page)) {
+                       /*
+                        * page can be a tail page (e.g. of a hugetlb page that
+                        * starts before start_pfn), so take the size from the
+                        * head page.
+                        */
+                       struct page *head = compound_head(page);
+                       unsigned long head_pfn = page_to_pfn(head);
+                       unsigned long nr_pages = compound_nr(head);
+
+                       /* only pages that really cross the boundary matter */
+                       if (head_pfn + nr_pages > boundary_pfn) {
+                               unsigned long outer_pfn = pfn;
+                               int order = 0;
+                               int ret;
+                               struct compact_control cc = {
+                                       .nr_migratepages = 0,
+                                       .order = -1,
+                                       .zone = page_zone(head),
+                                       .mode = MIGRATE_SYNC,
+                                       .ignore_skip_hint = true,
+                                       .no_set_skip_hint = true,
+                                       .gfp_mask = current_gfp_context(gfp_flags),
+                                       .alloc_contig = true,
+                               };
+
+                               INIT_LIST_HEAD(&cc.migratepages);
+
+                               ret = __alloc_contig_migrate_range(&cc, head_pfn,
+                                                       head_pfn + nr_pages);
+                               if (ret)
+                                       goto failed;
+                               /*
+                                * reset pfn, let the free page handling code
+                                * above split the free page to the right
+                                * migratetype list.
+                                *
+                                * head_pfn is not used here as a hugetlb page
+                                * order can be bigger than MAX_ORDER-1, but
+                                * after it is freed, the free page order is
+                                * not. Search from pfn, order 0 upwards, for
+                                * the head of the free page that contains pfn.
+                                */
+                               while (!PageBuddy(pfn_to_page(outer_pfn))) {
+                                       /* stop if we cannot find the free page */
+                                       if (++order >= MAX_ORDER)
+                                               goto failed;
+                                       outer_pfn &= ~0UL << order;
+                               }
+                               pfn = outer_pfn;
+                               continue;
+                       }
+                       /* skip the whole compound page, even from a tail page */
+                       pfn = head_pfn + nr_pages;
+                       continue;
+               }
+
+               pfn++;
+       }
+       return 0;
+
+failed:
+       /* restore the original migratetype */
+       set_pageblock_migratetype(pfn_to_page(isolated_pageblock_pfn), saved_mt);
+       return -EBUSY;
+}
+
+
 /**
  * alloc_contig_range() -- tries to allocate given range of pages
  * @start:     start PFN to allocate
@@ -9067,8 +9228,9 @@ static int __alloc_contig_migrate_range(struct 
compact_control *cc,
 int alloc_contig_range(unsigned long start, unsigned long end,
                       unsigned migratetype, gfp_t gfp_mask)
 {
-       unsigned long outer_start, outer_end;
-       unsigned int order;
+       unsigned long outer_end;
+       unsigned long alloc_start = ALIGN_DOWN(start, pageblock_nr_pages);
+       unsigned long alloc_end = ALIGN(end, pageblock_nr_pages);
        int ret = 0;
 
        struct compact_control cc = {
@@ -9087,14 +9249,11 @@ int alloc_contig_range(unsigned long start, unsigned 
long end,
         * What we do here is we mark all pageblocks in range as
         * MIGRATE_ISOLATE.  Because pageblock and max order pages may
         * have different sizes, and due to the way page allocator
-        * work, we align the range to biggest of the two pages so
-        * that page allocator won't try to merge buddies from
-        * different pageblocks and change MIGRATE_ISOLATE to some
-        * other migration type.
+        * work, start_isolate_page_range() has special handlings for this.
         *
         * Once the pageblocks are marked as MIGRATE_ISOLATE, we
         * migrate the pages from an unaligned range (ie. pages that
-        * we are interested in).  This will put all the pages in
+        * we are interested in). This will put all the pages in
         * range back to page allocator as MIGRATE_ISOLATE.
         *
         * When this is done, we take the pages in range from page
@@ -9107,9 +9266,9 @@ int alloc_contig_range(unsigned long start, unsigned long 
end,
         * put back to page allocator so that buddy can use them.
         */
 
-       ret = start_isolate_page_range(start, end, migratetype, 0);
+       ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
        if (ret)
-               return ret;
+               goto done;
 
        drain_all_pages(cc.zone);
 
@@ -9128,68 +9287,28 @@ int alloc_contig_range(unsigned long start, unsigned 
long end,
                goto done;
        ret = 0;
 
-       /*
-        * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
-        * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
-        * more, all pages in [start, end) are free in page allocator.
-        * What we are going to do is to allocate all pages from
-        * [start, end) (that is remove them from page allocator).
-        *
-        * The only problem is that pages at the beginning and at the
-        * end of interesting range may be not aligned with pages that
-        * page allocator holds, ie. they can be part of higher order
-        * pages.  Because of this, we reserve the bigger range and
-        * once this is done free the pages we are not interested in.
-        *
-        * We don't have to hold zone->lock here because the pages are
-        * isolated thus they won't get removed from buddy.
-        */
-
-       order = 0;
-       outer_start = start;
-       while (!PageBuddy(pfn_to_page(outer_start))) {
-               if (++order >= MAX_ORDER) {
-                       outer_start = start;
-                       break;
-               }
-               outer_start &= ~0UL << order;
-       }
-
-       if (outer_start != start) {
-               order = buddy_order(pfn_to_page(outer_start));
-
-               /*
-                * outer_start page could be small order buddy page and
-                * it doesn't include start page. Adjust outer_start
-                * in this case to report failed page properly
-                * on tracepoint in test_pages_isolated()
-                */
-               if (outer_start + (1UL << order) <= start)
-                       outer_start = start;
-       }
-
        /* Make sure the range is really isolated. */
-       if (test_pages_isolated(outer_start, end, 0)) {
+       if (test_pages_isolated(alloc_start, alloc_end, 0)) {
                ret = -EBUSY;
                goto done;
        }
 
        /* Grab isolated pages from freelists. */
-       outer_end = isolate_freepages_range(&cc, outer_start, end);
+       outer_end = isolate_freepages_range(&cc, alloc_start, alloc_end);
        if (!outer_end) {
                ret = -EBUSY;
                goto done;
        }
 
        /* Free head and tail (if any) */
-       if (start != outer_start)
-               free_contig_range(outer_start, start - outer_start);
-       if (end != outer_end)
-               free_contig_range(end, outer_end - end);
+       if (start != alloc_start)
+               free_contig_range(alloc_start, start - alloc_start);
+       if (end != alloc_end)
+               free_contig_range(end, alloc_end - end);
 
 done:
-       undo_isolate_page_range(pfn_max_align_down(start),
-                               pfn_max_align_up(end), migratetype);
+       undo_isolate_page_range(alloc_start,
+                               alloc_end, migratetype);
        return ret;
 }
 EXPORT_SYMBOL(alloc_contig_range);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 64d093ab83ec..0256d5e1032c 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -285,6 +285,8 @@ __first_valid_page(unsigned long pfn, unsigned long 
nr_pages)
  *                                      and PageOffline() pages.
  *                     REPORT_FAILURE - report details about the failure to
  *                     isolate the range
+ * @gfp_flags:         GFP flags used for migrating pages that sit across the
+ *                     range boundaries.
  *
  * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
  * the range will never be allocated. Any free pages and pages freed in the
@@ -293,6 +295,10 @@ __first_valid_page(unsigned long pfn, unsigned long 
nr_pages)
  * pages in the range finally, the caller have to free all pages in the range.
  * test_page_isolated() can be used for test it.
  *
+ * The function first tries to isolate the pageblocks at the beginning and end
+ * of the range, since there might be pages across the range boundaries.
+ * Afterwards, it isolates the rest of the range.
+ *
  * There is no high level synchronization mechanism that prevents two threads
  * from trying to isolate overlapping ranges. If this happens, one thread
  * will notice pageblocks in the overlapping range already set to isolate.
@@ -313,21 +319,38 @@ __first_valid_page(unsigned long pfn, unsigned long 
nr_pages)
  * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
  */
 int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-                            unsigned migratetype, int flags)
+                            unsigned migratetype, int flags, gfp_t gfp_flags)
 {
        unsigned long pfn;
        struct page *page;
+       /* isolation is done at page block granularity */
+       unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
+       unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
+       int ret;
 
-       unsigned long isolate_start = pfn_max_align_down(start_pfn);
-       unsigned long isolate_end = pfn_max_align_up(end_pfn);
+       /* isolate [isolate_start, isolate_start + pageblock_nr_pages] 
pageblock */
+       ret = isolate_single_pageblock(isolate_start, gfp_flags, 0);
+       if (ret)
+               return ret;
+
+       /* isolate [isolate_end - pageblock_nr_pages, isolate_end] pageblock */
+       ret = isolate_single_pageblock(isolate_end, gfp_flags, 1);
+       if (ret) {
+               unset_migratetype_isolate(pfn_to_page(isolate_start), 
migratetype);
+               return ret;
+       }
 
-       for (pfn = isolate_start;
-            pfn < isolate_end;
+       /* skip isolated pageblocks at the beginning and end */
+       for (pfn = isolate_start + pageblock_nr_pages;
+            pfn < isolate_end - pageblock_nr_pages;
             pfn += pageblock_nr_pages) {
                page = __first_valid_page(pfn, pageblock_nr_pages);
                if (page && set_migratetype_isolate(page, migratetype, flags,
                                        start_pfn, end_pfn)) {
                        undo_isolate_page_range(isolate_start, pfn, 
migratetype);
+                       unset_migratetype_isolate(
+                               pfn_to_page(isolate_end - pageblock_nr_pages),
+                               migratetype);
                        return -EBUSY;
                }
        }
-- 
2.34.1

Reply via email to