* Mike Rapoport <[email protected]> [251125 13:39]:
> From: "Mike Rapoport (Microsoft)" <[email protected]>
> 
> When userspace resolves a page fault in a shmem VMA with UFFDIO_CONTINUE
> it needs to get a folio that already exists in the pagecache backing
> that VMA.
> 
> Instead of using shmem_get_folio() for that, add a get_folio() method to
> 'struct vm_operations_struct' that will return a folio if it exists in
> the VMA's pagecache at given pgoff.
> 
> Implement get_folio() method for shmem and slightly refactor
> userfaultfd's mfill_atomic() and mfill_atomic_pte_continue() to support
> this new API.
> 
> Signed-off-by: Mike Rapoport (Microsoft) <[email protected]>
Reviewed-by: Liam R. Howlett <[email protected]>

> ---
>  include/linux/mm.h |  9 ++++++++
>  mm/shmem.c         | 18 ++++++++++++++++
>  mm/userfaultfd.c   | 52 +++++++++++++++++++++++++++++-----------------
>  3 files changed, 60 insertions(+), 19 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 7c79b3369b82..c8647707d75b 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -690,6 +690,15 @@ struct vm_operations_struct {
>  	struct page *(*find_normal_page)(struct vm_area_struct *vma,
>  					 unsigned long addr);
>  #endif /* CONFIG_FIND_NORMAL_PAGE */
> +#ifdef CONFIG_USERFAULTFD
> +	/*
> +	 * Called by userfault to resolve UFFDIO_CONTINUE request.
> +	 * Should return the folio found at pgoff in the VMA's pagecache if it
> +	 * exists or ERR_PTR otherwise.
> +	 * The returned folio is locked and with reference held.
> +	 */
> +	struct folio *(*get_folio)(struct inode *inode, pgoff_t pgoff);
> +#endif
>  };
>  
>  #ifdef CONFIG_NUMA_BALANCING
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 58701d14dd96..e16c7c8c3e1e 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -3263,6 +3263,18 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
>  		shmem_inode_unacct_blocks(inode, 1);
>  	return ret;
>  }
> +
> +static struct folio *shmem_get_folio_noalloc(struct inode *inode, pgoff_t pgoff)
> +{
> +	struct folio *folio;
> +	int err;
> +
> +	err = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
> +	if (err)
> +		return ERR_PTR(err);
> +
> +	return folio;
> +}
>  #endif /* CONFIG_USERFAULTFD */
>  
>  #ifdef CONFIG_TMPFS
> @@ -5295,6 +5307,9 @@ static const struct vm_operations_struct shmem_vm_ops = {
>  	.set_policy     = shmem_set_policy,
>  	.get_policy     = shmem_get_policy,
>  #endif
> +#ifdef CONFIG_USERFAULTFD
> +	.get_folio	= shmem_get_folio_noalloc,
> +#endif
>  };
>  
>  static const struct vm_operations_struct shmem_anon_vm_ops = {
> @@ -5304,6 +5319,9 @@ static const struct vm_operations_struct shmem_anon_vm_ops = {
>  	.set_policy     = shmem_set_policy,
>  	.get_policy     = shmem_get_policy,
>  #endif
> +#ifdef CONFIG_USERFAULTFD
> +	.get_folio	= shmem_get_folio_noalloc,
> +#endif
>  };
>  
>  int shmem_init_fs_context(struct fs_context *fc)
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 8dc964389b0d..9f0f879b603a 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -388,15 +388,12 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
>  	struct page *page;
>  	int ret;
>  
> -	ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
> +	folio = dst_vma->vm_ops->get_folio(inode, pgoff);
>  	/* Our caller expects us to return -EFAULT if we failed to find folio */
> -	if (ret == -ENOENT)
> -		ret = -EFAULT;
> -	if (ret)
> -		goto out;
> -	if (!folio) {
> -		ret = -EFAULT;
> -		goto out;
> +	if (IS_ERR_OR_NULL(folio)) {
> +		if (PTR_ERR(folio) == -ENOENT || !folio)
> +			return -EFAULT;
> +		return PTR_ERR(folio);
>  	}
>  
>  	page = folio_file_page(folio, pgoff);
> @@ -411,13 +408,12 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
>  		goto out_release;
>  
>  	folio_unlock(folio);
> -	ret = 0;
> -out:
> -	return ret;
> +	return 0;
> +
>  out_release:
>  	folio_unlock(folio);
>  	folio_put(folio);
> -	goto out;
> +	return ret;

I really like this part.

>  }
>  
>  /* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
> @@ -694,6 +690,15 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
>  	return err;
>  }
>  
> +static __always_inline bool vma_can_mfill_atomic(struct vm_area_struct *vma,
> +						 uffd_flags_t flags)
> +{
> +	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
> +		return vma->vm_ops && vma->vm_ops->get_folio;
> +
> +	return vma_is_anonymous(vma) || vma_is_shmem(vma);
> +}
> +
>  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  					    unsigned long dst_start,
>  					    unsigned long src_start,
> @@ -766,10 +771,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  		return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
>  					    src_start, len, flags);
>  
> -	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> -		goto out_unlock;
> -	if (!vma_is_shmem(dst_vma) &&
> -	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
> +	if (!vma_can_mfill_atomic(dst_vma, flags))
>  		goto out_unlock;
>  
>  	while (src_addr < src_start + len) {
> @@ -1985,9 +1987,21 @@ bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags,
>  	if (vma->vm_flags & VM_DROPPABLE)
>  		return false;
>  
> -	if ((vm_flags & VM_UFFD_MINOR) &&
> -	    (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
> -		return false;
> +	if (vm_flags & VM_UFFD_MINOR) {
> +		/*
> +		 * If only MINOR mode is requested and we can request an
> +		 * existing folio from VMA's page cache, allow it
> +		 */
> +		if (vm_flags == VM_UFFD_MINOR && vma->vm_ops &&
> +		    vma->vm_ops->get_folio)
> +			return true;
> +		/*
> +		 * Only hugetlb and shmem can support MINOR mode in combination
> +		 * with other modes
> +		 */
> +		if (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))
> +			return false;
> +	}
>  
> 	/*
> 	 * If wp async enabled, and WP is the only mode enabled, allow any
> -- 
> 2.50.1
> 
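Not something this series needs to address, but to double-check my reading of
the new hook's contract (folio comes back locked with a reference held,
ERR_PTR on failure, and the CONTINUE path translates -ENOENT into -EFAULT):
I'd expect an implementation for an ordinary pagecache-backed mapping to boil
down to roughly the sketch below. The function name and the not-uptodate
handling are my own guesses, for illustration only.

#include <linux/pagemap.h>
#include <linux/err.h>

/* Hypothetical ->get_folio() sketch, not part of this patch. */
static struct folio *example_get_folio(struct inode *inode, pgoff_t pgoff)
{
	struct folio *folio;

	/* Locked folio with a reference held, or ERR_PTR(-ENOENT) if absent. */
	folio = filemap_lock_folio(inode->i_mapping, pgoff);
	if (IS_ERR(folio))
		return folio;

	/*
	 * Guess: refuse folios that exist but are not uptodate, similar in
	 * spirit to what SGP_NOALLOC does for shmem.
	 */
	if (!folio_test_uptodate(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-EIO);
	}

	return folio;
}

If that matches the intent, then the ERR_PTR convention makes the error
handling in mfill_atomic_pte_continue() much cleaner than the old
shmem_get_folio() call, so no complaints from me.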

