Technically I think this patch should be earlier in the series. As I
understand it, patch 1 allows DEVICE_COHERENT pages to be inserted into
the page tables and therefore makes it possible for page table walkers
to see non-LRU pages, so the walkers should be taught to handle them
before that can happen.
Some more comments below:

Alex Sierra <alex.sie...@amd.com> writes:

> With DEVICE_COHERENT, we'll soon have vm_normal_pages() return
> device-managed anonymous pages that are not LRU pages. Although they
> behave like normal pages for purposes of mapping in CPU page tables
> and for COW, they do not support LRU lists, NUMA migration or THP.
>
> We also introduced a FOLL_LRU flag that adds the same behaviour to
> follow_page and related APIs, to allow callers to specify that they
> expect to put pages on an LRU list.

This means that by default GUP can return non-LRU pages. I didn't see
anywhere that would be a problem, but I didn't check everything. Did
you check this, or is there some other reason I've missed that makes
this not a problem? (See the second sketch below for the FOLL_LRU
semantics I'm assuming.)

[...]

> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index a4e5eaf3eb01..eb3cfd679800 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -627,7 +627,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  			goto out;
>  		}
>  		page = vm_normal_page(vma, address, pteval);
> -		if (unlikely(!page)) {
> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>  			result = SCAN_PAGE_NULL;
>  			goto out;
>  		}
> @@ -1276,7 +1276,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>  			writable = true;
>
>  		page = vm_normal_page(vma, _address, pteval);
> -		if (unlikely(!page)) {
> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>  			result = SCAN_PAGE_NULL;
>  			goto out_unmap;
>  		}
> @@ -1484,7 +1484,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>  		goto abort;
>
>  	page = vm_normal_page(vma, addr, *pte);
> -
> +	if (page && is_zone_device_page(page))
> +		page = NULL;
>  	/*
>  	 * Note that uprobe, debugger, or MAP_PRIVATE may change the
>  	 * page table, but the new page will not be a subpage of hpage.
> @@ -1502,6 +1503,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>  		if (pte_none(*pte))
>  			continue;
>  		page = vm_normal_page(vma, addr, *pte);
> +		if (page && is_zone_device_page(page))
> +			goto abort;

Are either of these two cases actually possible? DEVICE_COHERENT
doesn't currently support THP, so if I'm understanding correctly we
couldn't have a pte-mapped DEVICE_COHERENT THP, right? Assuming that's
the case, I think WARN_ON_ONCE() would be better (see the first sketch
below).

Otherwise I think everything else looks reasonable.
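For concreteness, this is roughly what I had in mind for the two
collapse_pte_mapped_thp() hunks. Untested, and only valid if the
assumption above (no pte-mapped DEVICE_COHERENT THP) holds:

	page = vm_normal_page(vma, addr, *pte);
	/* DEVICE_COHERENT doesn't support THP, so this shouldn't happen */
	if (WARN_ON_ONCE(page && is_zone_device_page(page)))
		page = NULL;

and:

		page = vm_normal_page(vma, addr, *pte);
		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
			goto abort;

WARN_ON_ONCE() returns the value of the condition, so the error paths
stay exactly as in your patch; we just get a loud warning if the
"impossible" case ever shows up.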
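And on the GUP question above, the FOLL_LRU semantics I'm assuming are
roughly the below, applied as a filter after the page lookup. This is a
hypothetical sketch only -- the gup.c hunk isn't quoted here, and the
exact placement in follow_page_pte() is my guess:

	page = vm_normal_page(vma, address, pte);
	/*
	 * The caller asked for LRU pages only, so filter out
	 * device-managed (non-LRU) pages.
	 */
	if (page && (flags & FOLL_LRU) && is_zone_device_page(page))
		page = NULL;

If that matches the actual patch, then any GUP caller that doesn't pass
FOLL_LRU can now see DEVICE_COHERENT pages, which is what prompted the
question.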
>  		page_remove_rmap(page, vma, false);
>  	}
>
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 063a48eeb5ee..f16056efca21 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -474,7 +474,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
>  	do {
>  		cond_resched();
>  		page = follow_page(vma, addr,
> -				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
> +				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU);
>  		if (IS_ERR_OR_NULL(page))
>  			break;
>  		if (PageKsm(page))
> @@ -559,7 +559,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
>  	if (!vma)
>  		goto out;
>
> -	page = follow_page(vma, addr, FOLL_GET);
> +	page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
>  	if (IS_ERR_OR_NULL(page))
>  		goto out;
>  	if (PageAnon(page)) {
> @@ -2288,7 +2288,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>  	while (ksm_scan.address < vma->vm_end) {
>  		if (ksm_test_exit(mm))
>  			break;
> -		*page = follow_page(vma, ksm_scan.address, FOLL_GET);
> +		*page = follow_page(vma, ksm_scan.address, FOLL_GET | FOLL_LRU);
>  		if (IS_ERR_OR_NULL(*page)) {
>  			ksm_scan.address += PAGE_SIZE;
>  			cond_resched();
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 1873616a37d2..e9c24c834e98 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -413,7 +413,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  			continue;
>
>  		page = vm_normal_page(vma, addr, ptent);
> -		if (!page)
> +		if (!page || is_zone_device_page(page))
>  			continue;
>
>  		/*
> @@ -628,7 +628,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>  		}
>
>  		page = vm_normal_page(vma, addr, ptent);
> -		if (!page)
> +		if (!page || is_zone_device_page(page))
>  			continue;
>
>  		/*
> diff --git a/mm/memory.c b/mm/memory.c
> index 76e3af9639d9..571a26805ee1 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -621,6 +621,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
>  		if (is_zero_pfn(pfn))
>  			return NULL;
>  		if (pte_devmap(pte))
> +/*
> + * NOTE: New users of ZONE_DEVICE will not set pte_devmap() and will have
> + * refcounts incremented on their struct pages when they are inserted into
> + * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set
> + * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is
> + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
> + */
>  			return NULL;
>
>  	print_bad_pte(vma, addr, pte, NULL);
> @@ -4422,7 +4429,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>  	pte = pte_modify(old_pte, vma->vm_page_prot);
>
>  	page = vm_normal_page(vma, vmf->address, pte);
> -	if (!page)
> +	if (!page || is_zone_device_page(page))
>  		goto out_map;
>
>  	/* TODO: handle PTE-mapped THP */
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 8c74107a2b15..e32edbecb0cd 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -528,7 +528,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
>  		if (!pte_present(*pte))
>  			continue;
>  		page = vm_normal_page(vma, addr, *pte);
> -		if (!page)
> +		if (!page || is_zone_device_page(page))
>  			continue;
>  		/*
>  		 * vm_normal_page() filters out zero pages, but there might
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 6c31ee1e1c9b..c5d50e96ecd7 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1611,7 +1611,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
>  		goto out;
>
>  	/* FOLL_DUMP to ignore special (like zero) pages */
> -	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
> +	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>
>  	err = PTR_ERR(page);
>  	if (IS_ERR(page))
> @@ -1802,7 +1802,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
>  		goto set_status;
>
>  	/* FOLL_DUMP to ignore special (like zero) pages */
> -	page = follow_page(vma, addr, FOLL_DUMP);
> +	page = follow_page(vma, addr, FOLL_DUMP | FOLL_LRU);
>
>  	err = PTR_ERR(page);
>  	if (IS_ERR(page))
> diff --git a/mm/mlock.c b/mm/mlock.c
> index 716caf851043..b14e929084cc 100644
> --- a/mm/mlock.c
> +++ b/mm/mlock.c
> @@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
>  		if (!pte_present(*pte))
>  			continue;
>  		page = vm_normal_page(vma, addr, *pte);
> -		if (!page)
> +		if (!page || is_zone_device_page(page))
>  			continue;
>  		if (PageTransCompound(page))
>  			continue;
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index b69ce7a7b2b7..a6f3587ea29a 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -91,7 +91,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>  				continue;
>
>  			page = vm_normal_page(vma, addr, oldpte);
> -			if (!page || PageKsm(page))
> +			if (!page || is_zone_device_page(page) || PageKsm(page))
>  				continue;
>
>  			/* Also skip shared copy-on-write pages */
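One last, minor thought, feel free to ignore: most of these call sites
now open-code the same vm_normal_page()-then-is_zone_device_page()
pattern. A trivial wrapper might reduce the churn and make the intent
clearer at each call site. This is just a sketch -- the name and
placement are made up:

/*
 * Hypothetical helper: same as vm_normal_page(), but returns NULL for
 * device-managed pages so callers that can only deal with LRU pages
 * don't each need an explicit is_zone_device_page() check.
 */
static inline struct page *vm_normal_lru_page(struct vm_area_struct *vma,
					      unsigned long addr, pte_t pte)
{
	struct page *page = vm_normal_page(vma, addr, pte);

	if (page && is_zone_device_page(page))
		return NULL;
	return page;
}

The open-coded checks are fine too though.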