From: Nadav Amit <[email protected]>

In order to use userfaultfd with io-uring, there are two options for
extensions: either support userfaultfd ioctls in io-uring, or provide
similar functionality through the "write" interface. The latter approach
seems more compelling, as it does not require io-uring changes and keeps all
the userfaultfd logic where it should be. In addition, it makes it possible
to provide asynchronous completions by performing the copying/zeroing in the
faulting thread (which will be done in a later patch).

This patch enhances the userfaultfd API to provide a write interface that
performs operations similar to the copy/zero ioctls. The lower bits of the
position (those below PAGE_SHIFT) are used to encode the required
operation: zero/copy/wake/write-protect. In the case of zeroing, the
source data is ignored and only the length is used to determine
the size of the region that needs to be zeroed.

Cc: Jens Axboe <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Peter Xu <[email protected]>
Cc: Alexander Viro <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Nadav Amit <[email protected]>
---
 fs/userfaultfd.c                 | 96 +++++++++++++++++++++++++++++++-
 include/uapi/linux/userfaultfd.h | 14 ++++-
 2 files changed, 107 insertions(+), 3 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 7bbee2a00d37..eae6ac303951 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1140,6 +1140,34 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
 
 static const struct file_operations userfaultfd_fops;
 
+/*
+ * Same steps as anon_inode_getfd(), open-coded so that FMODE_PWRITE can be
+ * set on the new file before it is installed in the descriptor table.
+ */
+static int userfaultfd_getfd(const char *name,
+                             const struct file_operations *fops,
+                             void *priv, int flags)
+{
+       struct file *filp;
+       int fd;
+
+       fd = get_unused_fd_flags(flags);
+       if (fd < 0)
+               return fd;
+
+       filp = anon_inode_getfile(name, fops, priv, flags);
+       if (IS_ERR(filp)) {
+               /* Release the reserved descriptor number on failure. */
+               put_unused_fd(fd);
+               return PTR_ERR(filp);
+       }
+
+       filp->f_mode |= FMODE_PWRITE;
+       fd_install(fd, filp);
+
+       return fd;
+}
+
 static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
                                  struct userfaultfd_ctx *new,
                                  struct uffd_msg *msg)
@@ -1161,7 +1189,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
                task_unlock(current);
        }
 
-       fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new,
+       fd = userfaultfd_getfd("[userfaultfd]", &userfaultfd_fops, new,
                              O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS));
 
        if (files != NULL) {
@@ -1496,6 +1524,69 @@ static __always_inline int validate_range(struct mm_struct *mm,
        return 0;
 }
 
+/*
+ * Write interface: the page-aligned part of the file position is the
+ * destination address; the bits below PAGE_SHIFT carry UFFDIO_WRITE_MODE_*.
+ */
+static ssize_t userfaultfd_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+       const unsigned long wr_modes = UFFDIO_WRITE_MODE_DONTWAKE |
+                                      UFFDIO_WRITE_MODE_WP;
+       struct file *file = iocb->ki_filp;
+       struct userfaultfd_wake_range range;
+       struct userfaultfd_ctx *ctx = file->private_data;
+       size_t len = iov_iter_count(from);
+       __u64 dst = iocb->ki_pos & PAGE_MASK;
+       unsigned long mode = iocb->ki_pos & ~PAGE_MASK;
+       bool zeropage = mode & UFFDIO_WRITE_MODE_ZEROPAGE;
+       __s64 ret = -EINVAL;
+
+       /* The length is user-controlled: an empty write must not BUG(). */
+       if (!len)
+               return 0;
+
+       if (mode & ~(wr_modes | UFFDIO_WRITE_MODE_ZEROPAGE))
+               goto out;
+
+       mode &= wr_modes;
+
+       /*
+        * NOTE(review): only rejects zeropage when DONTWAKE and WP are both
+        * set, although mfill_zeropage() is not passed the mode - confirm.
+        */
+       if (zeropage && mode == wr_modes)
+               goto out;
+
+       ret = -EAGAIN;
+       if (READ_ONCE(ctx->mmap_changing))
+               goto out;
+
+       ret = validate_range(ctx->mm, &dst, len);
+       if (ret)
+               goto out;
+
+       ret = -ESRCH;
+       if (!mmget_not_zero(ctx->mm))
+               goto out;
+
+       if (zeropage)
+               ret = mfill_zeropage(ctx->mm, dst, from, &ctx->mmap_changing);
+       else
+               ret = mcopy_atomic(ctx->mm, dst, from, &ctx->mmap_changing,
+                                  mode);
+       mmput(ctx->mm);
+       if (ret < 0)
+               goto out;
+
+       range.len = ret;        /* len == 0 would wake all */
+       if (!(mode & UFFDIO_WRITE_MODE_DONTWAKE)) {
+               range.start = dst;
+               wake_userfault(ctx, &range);
+       }
+out:
+       return ret;
+}
+
 static inline bool vma_can_userfault(struct vm_area_struct *vma,
                                     unsigned long vm_flags)
 {
@@ -2197,6 +2288,7 @@ static const struct file_operations userfaultfd_fops = {
        .release        = userfaultfd_release,
        .poll           = userfaultfd_poll,
        .read_iter      = userfaultfd_read_iter,
+       .write_iter     = userfaultfd_write_iter,
        .unlocked_ioctl = userfaultfd_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
        .llseek         = noop_llseek,
@@ -2248,7 +2340,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
 
        ctx->files = get_files_struct(current);
 
-       fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
+       fd = userfaultfd_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
                              O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
        if (fd < 0) {
                mmdrop(ctx->mm);
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 4eeba4235afe..943e50b41742 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -28,7 +28,8 @@
                           UFFD_FEATURE_MISSING_SHMEM |         \
                           UFFD_FEATURE_SIGBUS |                \
                           UFFD_FEATURE_THREAD_ID |             \
-                          UFFD_FEATURE_POLL)
+                          UFFD_FEATURE_POLL |                  \
+                          UFFD_FEATURE_WRITE)
 
 #define UFFD_API_IOCTLS                                \
        ((__u64)1 << _UFFDIO_REGISTER |         \
@@ -177,6 +178,9 @@ struct uffdio_api {
         * UFFD_FEATURE_POLL polls upon page-fault if the feature is requested
         * instead of descheduling. This feature should only be enabled for
         * low-latency handlers and when CPUs are not overcomitted.
+        *
+        * UFFD_FEATURE_WRITE allows using the write interface for copying and
+        * zeroing pages, in addition to the ioctl interface.
         */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP         (1<<0)
 #define UFFD_FEATURE_EVENT_FORK                        (1<<1)
@@ -188,6 +192,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_SIGBUS                    (1<<7)
 #define UFFD_FEATURE_THREAD_ID                 (1<<8)
 #define UFFD_FEATURE_POLL                      (1<<9)
+#define UFFD_FEATURE_WRITE                     (1<<10)
        __u64 features;
 
        __u64 ioctls;
@@ -264,4 +269,11 @@ struct uffdio_writeprotect {
        __u64 mode;
 };
 
+/*
+ * Write modes for the write interface (file position bits below PAGE_SHIFT).
+ */
+#define UFFDIO_WRITE_MODE_DONTWAKE             UFFDIO_COPY_MODE_DONTWAKE
+#define UFFDIO_WRITE_MODE_WP                   UFFDIO_COPY_MODE_WP
+#define UFFDIO_WRITE_MODE_ZEROPAGE             ((__u64)1<<2)
+
 #endif /* _LINUX_USERFAULTFD_H */
-- 
2.25.1

Reply via email to