Technically I think this patch should be earlier in the series. As I
understand it, patch 1 allows DEVICE_COHERENT pages to be inserted into
the page tables and therefore makes it possible for page table walkers
to see non-LRU pages, so the walkers should be taught to handle them
before that can happen.
Some more comments below:

Alex Sierra <alex.sie...@amd.com> writes:

> With DEVICE_COHERENT, we'll soon have vm_normal_pages() return
> device-managed anonymous pages that are not LRU pages. Although they
> behave like normal pages for purposes of mapping in CPU page tables
> and for COW, they do not support LRU lists, NUMA migration or THP.
>
> We also introduced a FOLL_LRU flag that adds the same behaviour to
> follow_page and related APIs, to allow callers to specify that they
> expect to put pages on an LRU list.

This means that by default GUP can return non-LRU pages. I didn't see
anywhere that would be a problem, but I didn't check everything. Did
you check this, or is there some other reason I've missed that makes
this not a problem? (See the second sketch below for the FOLL_LRU
semantics I'm assuming.)

[...]

> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index a4e5eaf3eb01..eb3cfd679800 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -627,7 +627,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  			goto out;
>  		}
>  		page = vm_normal_page(vma, address, pteval);
> -		if (unlikely(!page)) {
> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>  			result = SCAN_PAGE_NULL;
>  			goto out;
>  		}
> @@ -1276,7 +1276,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>  			writable = true;
>
>  		page = vm_normal_page(vma, _address, pteval);
> -		if (unlikely(!page)) {
> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>  			result = SCAN_PAGE_NULL;
>  			goto out_unmap;
>  		}
> @@ -1484,7 +1484,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>  		goto abort;
>
>  	page = vm_normal_page(vma, addr, *pte);
> -
> +	if (page && is_zone_device_page(page))
> +		page = NULL;
>  	/*
>  	 * Note that uprobe, debugger, or MAP_PRIVATE may change the
>  	 * page table, but the new page will not be a subpage of hpage.
> @@ -1502,6 +1503,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>  		if (pte_none(*pte))
>  			continue;
>  		page = vm_normal_page(vma, addr, *pte);
> +		if (page && is_zone_device_page(page))
> +			goto abort;

Are either of these two cases actually possible? DEVICE_COHERENT
doesn't currently support THP, so if I'm understanding correctly we
couldn't have a pte-mapped DEVICE_COHERENT THP, right? Assuming that's
the case, I think WARN_ON_ONCE() would be better (see the first sketch
below).

Otherwise I think everything else looks reasonable.
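For concreteness, this is roughly what I had in mind for the two
collapse_pte_mapped_thp() hunks. Untested, and only valid if the
assumption above (no pte-mapped DEVICE_COHERENT THP) holds:

	page = vm_normal_page(vma, addr, *pte);
	/* DEVICE_COHERENT doesn't support THP, so this shouldn't happen */
	if (WARN_ON_ONCE(page && is_zone_device_page(page)))
		page = NULL;

and:

		page = vm_normal_page(vma, addr, *pte);
		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
			goto abort;

WARN_ON_ONCE() returns the value of the condition, so the error paths
stay exactly as in your patch; we just get a loud warning if the
"impossible" case ever shows up.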
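And on the GUP question above, the FOLL_LRU semantics I'm assuming are
roughly the below, applied as a filter after the page lookup. This is a
hypothetical sketch only -- the gup.c hunk isn't quoted here, and the
exact placement in follow_page_pte() is my guess:

	page = vm_normal_page(vma, address, pte);
	/*
	 * The caller asked for LRU pages only, so filter out
	 * device-managed (non-LRU) pages.
	 */
	if (page && (flags & FOLL_LRU) && is_zone_device_page(page))
		page = NULL;

If that matches the actual patch, then any GUP caller that doesn't pass
FOLL_LRU can now see DEVICE_COHERENT pages, which is what prompted the
question.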
>  		page_remove_rmap(page, vma, false);
>  	}
>
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 063a48eeb5ee..f16056efca21 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -474,7 +474,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
>  	do {
>  		cond_resched();
>  		page = follow_page(vma, addr,
> -				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
> +				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU);
>  		if (IS_ERR_OR_NULL(page))
>  			break;
>  		if (PageKsm(page))
> @@ -559,7 +559,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
>  	if (!vma)
>  		goto out;
>
> -	page = follow_page(vma, addr, FOLL_GET);
> +	page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
>  	if (IS_ERR_OR_NULL(page))
>  		goto out;
>  	if (PageAnon(page)) {
> @@ -2288,7 +2288,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>  	while (ksm_scan.address < vma->vm_end) {
>  		if (ksm_test_exit(mm))
>  			break;
> -		*page = follow_page(vma, ksm_scan.address, FOLL_GET);
> +		*page = follow_page(vma, ksm_scan.address, FOLL_GET | FOLL_LRU);
>  		if (IS_ERR_OR_NULL(*page)) {
>  			ksm_scan.address += PAGE_SIZE;
>  			cond_resched();
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 1873616a37d2..e9c24c834e98 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -413,7 +413,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  			continue;
>
>  		page = vm_normal_page(vma, addr, ptent);
> -		if (!page)
> +		if (!page || is_zone_device_page(page))
>  			continue;
>
>  		/*
> @@ -628,7 +628,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>  		}
>
>  		page = vm_normal_page(vma, addr, ptent);
> -		if (!page)
> +		if (!page || is_zone_device_page(page))
>  			continue;
>
>  		/*
> diff --git a/mm/memory.c b/mm/memory.c
> index 76e3af9639d9..571a26805ee1 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -621,6 +621,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
>  		if (is_zero_pfn(pfn))
>  			return NULL;
>  		if (pte_devmap(pte))
> +/*
> + * NOTE: New users of ZONE_DEVICE will not set pte_devmap() and will have
> + * refcounts incremented on their struct pages when they are inserted into
> + * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set
> + * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is
> + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
> + */
>  			return NULL;
>
>  	print_bad_pte(vma, addr, pte, NULL);
> @@ -4422,7 +4429,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>  	pte = pte_modify(old_pte, vma->vm_page_prot);
>
>  	page = vm_normal_page(vma, vmf->address, pte);
> -	if (!page)
> +	if (!page || is_zone_device_page(page))
>  		goto out_map;
>
>  	/* TODO: handle PTE-mapped THP */
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 8c74107a2b15..e32edbecb0cd 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -528,7 +528,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
>  		if (!pte_present(*pte))
>  			continue;
>  		page = vm_normal_page(vma, addr, *pte);
> -		if (!page)
> +		if (!page || is_zone_device_page(page))
>  			continue;
>  		/*
>  		 * vm_normal_page() filters out zero pages, but there might
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 6c31ee1e1c9b..c5d50e96ecd7 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1611,7 +1611,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
>  		goto out;
>
>  	/* FOLL_DUMP to ignore special (like zero) pages */
> -	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
> +	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>
>  	err = PTR_ERR(page);
>  	if (IS_ERR(page))
> @@ -1802,7 +1802,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
>  		goto set_status;
>
>  	/* FOLL_DUMP to ignore special (like zero) pages */
> -	page = follow_page(vma, addr, FOLL_DUMP);
> +	page = follow_page(vma, addr, FOLL_DUMP | FOLL_LRU);
>
>  	err = PTR_ERR(page);
>  	if (IS_ERR(page))
> diff --git a/mm/mlock.c b/mm/mlock.c
> index 716caf851043..b14e929084cc 100644
> --- a/mm/mlock.c
> +++ b/mm/mlock.c
> @@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
>  		if (!pte_present(*pte))
>  			continue;
>  		page = vm_normal_page(vma, addr, *pte);
> -		if (!page)
> +		if (!page || is_zone_device_page(page))
>  			continue;
>  		if (PageTransCompound(page))
>  			continue;
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index b69ce7a7b2b7..a6f3587ea29a 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -91,7 +91,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>  				continue;
>
>  			page = vm_normal_page(vma, addr, oldpte);
> -			if (!page || PageKsm(page))
> +			if (!page || is_zone_device_page(page) || PageKsm(page))
>  				continue;
>
>  			/* Also skip shared copy-on-write pages */
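One last, minor thought, feel free to ignore: most of these call sites
now open-code the same vm_normal_page()-then-is_zone_device_page()
pattern. A trivial wrapper might reduce the churn and make the intent
clearer at each call site. This is just a sketch -- the name and
placement are made up:

/*
 * Hypothetical helper: same as vm_normal_page(), but returns NULL for
 * device-managed pages so callers that can only deal with LRU pages
 * don't each need an explicit is_zone_device_page() check.
 */
static inline struct page *vm_normal_lru_page(struct vm_area_struct *vma,
					      unsigned long addr, pte_t pte)
{
	struct page *page = vm_normal_page(vma, addr, pte);

	if (page && is_zone_device_page(page))
		return NULL;
	return page;
}

The open-coded checks are fine too though.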