From: Zi Yan <z...@nvidia.com>

When generating physically contiguous memory, we may want to move a THP to a location currently occupied by 512 base pages. The exchange pages mechanism does not yet support exchanging a THP with 512 base pages, so instead we split the THP and exchange the 512 base pages individually. This increases the chance of creating a large contiguous region. A split THP can be promoted back after all 512 pages have been moved to the destination, or if none of its subpages has been moved. In-place THP promotion will be introduced later in this patch series.
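
In rough outline, the fallback added to defrag_address_range() below works as sketched here (a simplified illustration only; dst_needs_base_pages is a placeholder for the destination-side checks done in the actual code, and zone locking, statistics, and error handling are omitted):

	if (PageTransHuge(scan_page) && dst_needs_base_pages) {
		/* split the source THP into 512 base pages */
		get_page(scan_page);
		lock_page(scan_page);
		ret = split_huge_page(scan_page);
		unlock_page(scan_page);
		put_page(scan_page);
		if (!ret)
			/* rescan and exchange/move the base pages one by one */
			goto restart;
	}
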
Signed-off-by: Zi Yan <z...@nvidia.com>
---
 mm/internal.h   |   4 ++
 mm/mem_defrag.c | 155 +++++++++++++++++++++++++++++++++++++-----------
 mm/page_alloc.c |  45 ++++++++++++++
 3 files changed, 168 insertions(+), 36 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 4fe8d1a4d7bb..70a6ef603e5b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -574,6 +574,10 @@ void expand(struct zone *zone, struct page *page,
 		int low, int high, struct free_area *area,
 		int migratetype);
 
+int expand_free_page(struct zone *zone, struct page *buddy_head,
+		struct page *page, int buddy_order, int page_order,
+		struct free_area *area, int migratetype);
+
 void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
 		unsigned int alloc_flags);
 
diff --git a/mm/mem_defrag.c b/mm/mem_defrag.c
index 414909e1c19c..4d458b125c95 100644
--- a/mm/mem_defrag.c
+++ b/mm/mem_defrag.c
@@ -643,6 +643,15 @@ static void exchange_free(struct page *freepage, unsigned long data)
 	head->num_freepages++;
 }
 
+static bool page_can_migrate(struct page *page)
+{
+	if (PageAnon(page))
+		return true;
+	if (page_mapping(page))
+		return true;
+	return false;
+}
+
 int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long start_addr, unsigned long end_addr,
 		struct page *anchor_page, unsigned long page_vaddr,
@@ -655,6 +664,7 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
 	int not_present = 0;
 	bool src_thp = false;
 
+restart:
 	for (scan_address = start_addr; scan_address < end_addr;
 	     scan_address += page_size) {
 		struct page *scan_page;
@@ -683,6 +693,8 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
 		if ((scan_page == compound_head(scan_page)) &&
 			PageTransHuge(scan_page) && !PageHuge(scan_page))
 			src_thp = true;
+		else
+			src_thp = false;
 
 		/* Allow THPs */
 		if (PageCompound(scan_page) && !src_thp) {
@@ -720,13 +732,17 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 
 retry_defrag:
-		/* migrate */
-		if (PageBuddy(dest_page)) {
+		/* free pages */
+		if (page_count(dest_page) == 0 && dest_page->mapping == NULL) {
+			int buddy_page_order = 0;
+			unsigned long pfn = page_to_pfn(dest_page);
+			unsigned long buddy_pfn;
+			struct page *buddy = dest_page;
 			struct zone *zone = page_zone(dest_page);
 			spinlock_t *zone_lock = &zone->lock;
 			unsigned long zone_lock_flags;
 			unsigned long free_page_order = 0;
-			int err = 0;
+			int err = 0, expand_err = 0;
 			struct exchange_alloc_head exchange_alloc_head = {0};
 			int migratetype = get_pageblock_migratetype(dest_page);
 
@@ -734,32 +750,77 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
 			INIT_LIST_HEAD(&exchange_alloc_head.freelist);
 			INIT_LIST_HEAD(&exchange_alloc_head.migratepage_list);
 
-			count_vm_events(MEM_DEFRAG_DST_FREE_PAGES, 1<<scan_page_order);
+			/* not managed pages */
+			if (!dest_page->flags) {
+				failed += 1;
+				defrag_stats->dst_out_of_bound_failed += 1;
+				defrag_stats->not_defrag_vpn = scan_address + page_size;
+				goto quit_defrag;
+			}
+			/* spill order-0 pages to buddy allocator from pcplist */
+			if (!PageBuddy(dest_page) && !page_drained) {
+				drain_all_pages(zone);
+				page_drained = 1;
+				goto retry_defrag;
+			}
 
 			/* lock page_zone(dest_page)->lock */
 			spin_lock_irqsave(zone_lock, zone_lock_flags);
 
-			if (!PageBuddy(dest_page)) {
+			while (!PageBuddy(buddy) && buddy_page_order < MAX_ORDER) {
+				buddy_pfn = pfn & ~((1<<buddy_page_order) - 1);
+				buddy = dest_page - (pfn - buddy_pfn);
+				buddy_page_order++;
+			}
+			if (!PageBuddy(buddy)) {
 				err = -EINVAL;
 				goto freepage_isolate_fail;
 			}
 
-			free_page_order = page_order(dest_page);
+			count_vm_events(MEM_DEFRAG_DST_FREE_PAGES, 1<<scan_page_order);
 
-			/* fail early if not enough free pages */
-			if (free_page_order < scan_page_order) {
+			free_page_order = page_order(buddy);
+
+			/* caught some transient-state page */
+			if (free_page_order < buddy_page_order) {
 				err = -ENOMEM;
 				goto freepage_isolate_fail;
 			}
 
+			/* fail early if not enough free pages */
+			if (free_page_order < scan_page_order) {
+				int ret;
+
+				spin_unlock_irqrestore(zone_lock, zone_lock_flags);
+
+				if (is_huge_zero_page(scan_page)) {
+					err = -ENOMEM;
+					goto freepage_isolate_fail_unlocked;
+				}
+				get_page(scan_page);
+				lock_page(scan_page);
+				ret = split_huge_page(scan_page);
+				unlock_page(scan_page);
+				put_page(scan_page);
+				if (ret) {
+					err = -ENOMEM;
+					goto freepage_isolate_fail_unlocked;
+				} else {
+					goto restart;
+				}
+			}
+
 			/* __isolate_free_page() */
-			err = isolate_free_page_no_wmark(dest_page, free_page_order);
+			err = isolate_free_page_no_wmark(buddy, free_page_order);
 			if (!err)
 				goto freepage_isolate_fail;
 
-			expand(zone, dest_page, scan_page_order, free_page_order,
+			expand_err = expand_free_page(zone, buddy, dest_page,
+				free_page_order, scan_page_order,
 				&(zone->free_area[free_page_order]),
 				migratetype);
+			if (expand_err)
+				goto freepage_isolate_fail;
 
 			if (!is_migrate_isolate(migratetype))
 				__mod_zone_freepage_state(zone,
 						-(1UL << scan_page_order),
@@ -778,7 +839,7 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
 
 freepage_isolate_fail:
 			spin_unlock_irqrestore(zone_lock, zone_lock_flags);
-
+freepage_isolate_fail_unlocked:
 			if (err < 0) {
 				failed += (page_size/PAGE_SIZE);
 				defrag_stats->dst_isolate_free_failed += (page_size/PAGE_SIZE);
@@ -844,6 +905,8 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
 		if ((dest_page == compound_head(dest_page)) &&
 			PageTransHuge(dest_page) && !PageHuge(dest_page))
 			dst_thp = true;
+		else
+			dst_thp = false;
 
 		if (PageCompound(dest_page) && !dst_thp) {
 			failed += get_contig_page_size(dest_page);
@@ -854,37 +917,56 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 
 		if (src_thp != dst_thp) {
-			failed += get_contig_page_size(scan_page);
-			if (src_thp && !dst_thp)
-				defrag_stats->src_thp_dst_not_failed +=
-					page_size/PAGE_SIZE;
-			else /* !src_thp && dst_thp */
-				defrag_stats->dst_thp_src_not_failed +=
-					page_size/PAGE_SIZE;
+			if (src_thp && !dst_thp) {
+				int ret;
+
+				if (!page_can_migrate(dest_page)) {
+					failed += get_contig_page_size(scan_page);
+					defrag_stats->not_defrag_vpn = scan_address + page_size;
+					goto quit_defrag;
+				}
+				get_page(scan_page);
+				lock_page(scan_page);
+				if (!PageCompound(scan_page) || is_huge_zero_page(scan_page)) {
+					ret = 0;
+					src_thp = false;
+					goto split_src_done;
+				}
+				ret = split_huge_page(scan_page);
+split_src_done:
+				unlock_page(scan_page);
+				put_page(scan_page);
+				if (ret)
+					defrag_stats->src_thp_dst_not_failed += page_size/PAGE_SIZE;
+				else
+					goto restart;
+			} else {/* !src_thp && dst_thp */
+				int ret;
+
+				get_page(dest_page);
+				lock_page(dest_page);
+				if (!PageCompound(dest_page) || is_huge_zero_page(dest_page)) {
+					ret = 0;
+					dst_thp = false;
+					goto split_dst_done;
+				}
+				ret = split_huge_page(dest_page);
+split_dst_done:
+				unlock_page(dest_page);
+				put_page(dest_page);
+				if (ret)
+					defrag_stats->dst_thp_src_not_failed += page_size/PAGE_SIZE;
+				else
+					goto retry_defrag;
+			}
+
+			failed += get_contig_page_size(scan_page);
 
 			defrag_stats->not_defrag_vpn = scan_address + page_size;
 			goto quit_defrag;
 			/*continue;*/
 		}
 
-		/* free page on pcplist */
-		if (page_count(dest_page) == 0) {
-			/* not managed pages */
-			if (!dest_page->flags) {
-				failed += 1;
-				defrag_stats->dst_out_of_bound_failed += 1;
-
-				defrag_stats->not_defrag_vpn = scan_address + page_size;
-				goto quit_defrag;
-			}
-			/* spill order-0 pages to buddy allocator from pcplist */
-			if (!page_drained) {
-				drain_all_pages(NULL);
-				page_drained = 1;
-				goto retry_defrag;
-			}
-		}
-
 		if (PageAnon(dest_page)) {
 			count_vm_events(MEM_DEFRAG_DST_ANON_PAGES,
 					1<<scan_page_order);
@@ -895,6 +977,7 @@ int defrag_address_range(struct mm_struct *mm, struct vm_area_struct *vma,
 						1<<scan_page_order);
 				failed += 1<<scan_page_order;
 				defrag_stats->dst_anon_failed += 1<<scan_page_order;
+				/*print_page_stats(dest_page, "anonymous page");*/
 			}
 		} else if (page_mapping(dest_page)) {
 			count_vm_events(MEM_DEFRAG_DST_FILE_PAGES,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a35605e0924a..9ba2cdc320f2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1855,6 +1855,51 @@ inline void expand(struct zone *zone, struct page *page,
 	}
 }
 
+inline int expand_free_page(struct zone *zone, struct page *buddy_head,
+	struct page *page, int buddy_order, int page_order, struct free_area *area,
+	int migratetype)
+{
+	unsigned long size = 1 << buddy_order;
+
+	if (!(page >= buddy_head && page < (buddy_head + (1<<buddy_order)))) {
+		int mapcount = PageSlab(buddy_head) ? 0 : page_mapcount(buddy_head);
+
+		mapcount = PageSlab(page) ? 0 : page_mapcount(page);
+		__free_one_page(buddy_head, page_to_pfn(buddy_head), zone, buddy_order,
+			migratetype);
+		return -EINVAL;
+	}
+
+	while (buddy_order > page_order) {
+		struct page *page_to_free;
+
+		area--;
+		buddy_order--;
+		size >>= 1;
+
+		if (page < (buddy_head + size))
+			page_to_free = buddy_head + size;
+		else {
+			page_to_free = buddy_head;
+			buddy_head = buddy_head + size;
+		}
+
+		/*
+		 * Mark as guard pages (or page), that will allow to
+		 * merge back to allocator when buddy will be freed.
+		 * Corresponding page table entries will not be touched,
+		 * pages will stay not present in virtual address space
+		 */
+		if (set_page_guard(zone, page_to_free, buddy_order, migratetype))
+			continue;
+
+		list_add(&page_to_free->lru, &area->free_list[migratetype]);
+		area->nr_free++;
+		set_page_order(page_to_free, buddy_order);
+	}
+	return 0;
+}
+
 static void check_new_page_bad(struct page *page)
 {
 	const char *bad_reason = NULL;
-- 
2.20.1