The commit is pushed to "branch-rh7-3.10.0-327.3.1-vz7.10.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-327.3.1.el7
------>
commit 553598cabaa1339a7096fc03cddbb91bd0193ff4
Author: Dmitry Monakhov <dmonak...@openvz.org>
Date:   Fri Dec 25 12:45:30 2015 +0400

    fs: kernel direct aio
    
    This is a port of 2f3ecb6 ("fs: kernel direct aio")
    onto rebased kernel (based on 3.10.0-327.3.1.el7).
    
        fs: kernel direct aio
    
        Port of 95-diff-kernel-direct-aio-combined from
        https://jira.sw.ru/browse/PSBM-18169
    
        Signed-off-by: Maxim Patlasov <mpatla...@parallels.com>
    
    https://jira.sw.ru/browse/PSBM-42312
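    
    The resulting kernel-side API: allocate a kiocb with aio_kernel_alloc(),
    attach an iov_iter and a completion callback with aio_kernel_init_iter()
    and aio_kernel_init_callback(), then hand it off with aio_kernel_submit().
    A minimal caller sketch (struct my_req, my_read_done() and the request
    lifetime handling are hypothetical, not part of this patch):
    
        /* Runs from aio_complete(); res is bytes transferred or -errno. */
        static void my_read_done(u64 user_data, long res)
        {
                struct my_req *req = (struct my_req *)(unsigned long)user_data;
    
                /* the kiocb is freed by aio_complete(); req->iter may go now */
                req->result = res;
                complete(&req->done);
        }
    
        static int my_submit_read(struct file *filp, struct my_req *req,
                                  loff_t off)
        {
                struct kiocb *iocb = aio_kernel_alloc(GFP_NOIO);
    
                if (!iocb)
                        return -ENOMEM;
    
                /* req->iter must stay valid until my_read_done() is called */
                iov_iter_init_bvec(&req->iter, req->bvec, req->nr_segs,
                                   bvec_length(req->bvec, req->nr_segs), 0);
                aio_kernel_init_iter(iocb, filp, IOCB_CMD_READ_ITER,
                                     &req->iter, off);
                aio_kernel_init_callback(iocb, my_read_done,
                                         (u64)(unsigned long)req);
    
                /* on failure the iocb is freed inside aio_kernel_submit() */
                return aio_kernel_submit(iocb);
        }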
    
    Signed-off-by: Dmitry Monakhov <dmonak...@openvz.org>
---
 fs/aio.c                     | 140 +++++++++++
 fs/ceph/file.c               |  10 +-
 fs/cifs/file.c               |   7 +-
 fs/fuse/file.c               |  12 +-
 include/linux/aio.h          |  15 ++
 include/linux/blk_types.h    |   8 +
 include/linux/fs.h           | 142 ++++++++++-
 include/uapi/linux/aio_abi.h |   2 +
 mm/Makefile                  |   3 +-
 mm/filemap.c                 | 563 ++++++++++++++-----------------------------
 mm/iov-iter.c                | 474 ++++++++++++++++++++++++++++++++++++
 11 files changed, 972 insertions(+), 404 deletions(-)
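
A filesystem opts in to the new interface by supplying ->read_iter and
->write_iter in its file_operations (aio_read_iter()/aio_write_iter()
return -EINVAL otherwise); filesystems that go through the page cache
can forward straight to the new generic helpers.  A minimal sketch,
assuming a hypothetical "myfs" (not part of this patch):

        static ssize_t myfs_read_iter(struct kiocb *iocb, struct iov_iter *iter,
                                      loff_t pos)
        {
                return generic_file_read_iter(iocb, iter, pos);
        }

        static ssize_t myfs_write_iter(struct kiocb *iocb, struct iov_iter *iter,
                                       loff_t pos)
        {
                return generic_file_write_iter(iocb, iter, pos);
        }

        static const struct file_operations myfs_fops = {
                .read_iter      = myfs_read_iter,       /* IOCB_CMD_READ_ITER */
                .write_iter     = myfs_write_iter,      /* IOCB_CMD_WRITE_ITER */
                /* remaining methods elided */
        };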

diff --git a/fs/aio.c b/fs/aio.c
index 8427423..8ec32e2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -936,6 +936,10 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
                atomic_set(&iocb->ki_users, 0);
                wake_up_process(iocb->ki_obj.tsk);
                return;
+       } else if (is_kernel_kiocb(iocb)) {
+               iocb->ki_obj.complete(iocb->ki_user_data, res);
+               aio_kernel_free(iocb);
+               return;
        }
 
        /*
@@ -1377,6 +1381,51 @@ static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb)
        return 0;
 }
 
+static ssize_t aio_read_iter(struct kiocb *iocb)
+{
+       struct file *file = iocb->ki_filp;
+       ssize_t ret;
+
+       if (unlikely(!is_kernel_kiocb(iocb)))
+               return -EINVAL;
+
+       if (unlikely(!(file->f_mode & FMODE_READ)))
+               return -EBADF;
+
+       ret = security_file_permission(file, MAY_READ);
+       if (unlikely(ret))
+               return ret;
+
+       if (!file->f_op->read_iter)
+               return -EINVAL;
+
+       return file->f_op->read_iter(iocb, iocb->ki_iter, iocb->ki_pos);
+}
+
+static ssize_t aio_write_iter(struct kiocb *iocb)
+{
+       struct file *file = iocb->ki_filp;
+       ssize_t ret;
+
+       if (unlikely(!is_kernel_kiocb(iocb)))
+               return -EINVAL;
+
+       if (unlikely(!(file->f_mode & FMODE_WRITE)))
+               return -EBADF;
+
+       ret = security_file_permission(file, MAY_WRITE);
+       if (unlikely(ret))
+               return ret;
+
+       if (!file->f_op->write_iter)
+               return -EINVAL;
+
+       file_start_write(file);
+       ret = file->f_op->write_iter(iocb, iocb->ki_iter, iocb->ki_pos);
+       file_end_write(file);
+       return ret;
+}
+
 /*
  * aio_setup_iocb:
  *     Performs the initial checks and aio retry method
@@ -1428,6 +1477,14 @@ rw_common:
                ret = aio_rw_vect_retry(req, rw, rw_op);
                break;
 
+       case IOCB_CMD_READ_ITER:
+               ret = aio_read_iter(req);
+               break;
+
+       case IOCB_CMD_WRITE_ITER:
+               ret = aio_write_iter(req);
+               break;
+
        case IOCB_CMD_FDSYNC:
                if (!file->f_op->aio_fsync)
                        return -EINVAL;
@@ -1462,6 +1519,89 @@ rw_common:
        return 0;
 }
 
+/*
+ * This allocates an iocb that will be used to submit and track completion of
+ * an IO that is issued from kernel space.
+ *
+ * The caller is expected to call the appropriate aio_kernel_init_() functions
+ * and then call aio_kernel_submit().  From that point forward progress is
+ * guaranteed by the file system aio method.  Eventually the caller's
+ * completion callback will be called.
+ *
+ * These iocbs are special.  They don't have a context, we don't limit the
+ * number pending, they can't be canceled, and can't be retried.  In the short
+ * term callers need to be careful to call only the new ops, which never add
+ * retry support.  In the long term
+ * retry-based AIO should be removed.
+ */
+struct kiocb *aio_kernel_alloc(gfp_t gfp)
+{
+       struct kiocb *iocb = kzalloc(sizeof(struct kiocb), gfp);
+       if (iocb)
+               iocb->ki_ctx = (void *)-1;
+       return iocb;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_alloc);
+
+void aio_kernel_free(struct kiocb *iocb)
+{
+       kfree(iocb);
+}
+EXPORT_SYMBOL_GPL(aio_kernel_free);
+
+/*
+ * The iter count must be set before calling here.  Some filesystems use
+ * iocb->ki_left as an indicator of the size of an IO.
+ */
+void aio_kernel_init_iter(struct kiocb *iocb, struct file *filp,
+                         unsigned short op, struct iov_iter *iter, loff_t off)
+{
+       iocb->ki_filp = filp;
+       iocb->ki_iter = iter;
+       iocb->ki_opcode = op;
+       iocb->ki_pos = off;
+       iocb->ki_nbytes = iov_iter_count(iter);
+       iocb->ki_left = iocb->ki_nbytes;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_init_iter);
+
+void aio_kernel_init_callback(struct kiocb *iocb,
+                             void (*complete)(u64 user_data, long res),
+                             u64 user_data)
+{
+       iocb->ki_obj.complete = complete;
+       iocb->ki_user_data = user_data;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_init_callback);
+
+/*
+ * The iocb is our responsibility once this is called.  The caller must not
+ * reference it.  This comes from aio_setup_iocb() modifying the iocb.
+ *
+ * Callers must be prepared for their iocb completion callback to be called the
+ * moment they enter this function.  The completion callback may be called from
+ * any context.
+ *
+ * Returns: 0: the iocb completion callback will be called with the op result
+ * negative errno: the operation was not submitted and the iocb was freed
+ */
+int aio_kernel_submit(struct kiocb *iocb)
+{
+       int ret;
+
+       BUG_ON(!is_kernel_kiocb(iocb));
+       BUG_ON(!iocb->ki_obj.complete);
+       BUG_ON(!iocb->ki_filp);
+
+       ret = aio_run_iocb(iocb, 0);
+
+       if (ret)
+               aio_kernel_free(iocb);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_submit);
+
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
                         struct iocb *iocb, bool compat)
 {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 1655236..ccc51a4 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -806,8 +806,9 @@ static ssize_t inline_to_iov(struct kiocb *iocb, struct iov_iter *i,
                        zero_user_segment(inline_page, inline_len, end);
 
                while (left) {
-                       void __user *udata = i->iov->iov_base + i->iov_offset;
-                       size_t n = min(i->iov->iov_len - i->iov_offset, left);
+                       struct iovec *iov = iov_iter_iovec(i);
+                       void __user *udata = iov->iov_base + i->iov_offset;
+                       size_t n = min(iov->iov_len - i->iov_offset, left);
 
                        if (__copy_to_user(udata, kdata, n)) {
                                ret = -EFAULT;
@@ -824,8 +825,9 @@ static ssize_t inline_to_iov(struct kiocb *iocb, struct iov_iter *i,
                size_t left = min_t(loff_t, iocb->ki_pos + len, i_size) - pos;
 
                while (left) {
-                       void __user *udata = i->iov->iov_base + i->iov_offset;
-                       size_t n = min(i->iov->iov_len - i->iov_offset, left);
+                       struct iovec *iov = iov_iter_iovec(i);
+                       void __user *udata = iov->iov_base + i->iov_offset;
+                       size_t n = min(iov->iov_len - i->iov_offset, left);
 
                        if (__clear_user(udata, n)) {
                                ret = -EFAULT;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0bc0fad..401fa67 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2435,8 +2435,9 @@ wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
        save_len = cur_len;
        for (i = 0; i < nr_pages; i++) {
                bytes = min_t(const size_t, cur_len, PAGE_SIZE);
-               copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from);
+               copied = iov_iter_copy_from_user(wdata->pages[i], from, 0, bytes);
                cur_len -= copied;
+               iov_iter_advance(from, copied);
                /*
                 * If we didn't copy as much as we expected, then that
                 * may mean we trod into an unmapped area. Stop copying
@@ -2865,8 +2866,10 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
        for (i = 0; i < rdata->nr_pages; i++) {
                struct page *page = rdata->pages[i];
                size_t copy = min_t(size_t, remaining, PAGE_SIZE);
-               size_t written = copy_page_to_iter(page, 0, copy, iter);
+               size_t written = iov_iter_copy_to_user(page, iter, 0, copy);
+
                remaining -= written;
+               iov_iter_advance(iter, written);
                if (written < copy && iov_iter_count(iter) > 0)
                        break;
        }
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 8f16755..f432b70 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1518,7 +1518,12 @@ static inline void fuse_page_descs_length_init(struct fuse_req *req,
 
 static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
 {
-       return (unsigned long)ii->iov->iov_base + ii->iov_offset;
+       struct iovec *iov;
+
+       BUG_ON(!iov_iter_has_iovec(ii));
+       iov = (struct iovec *)ii->data;
+
+       return (unsigned long)iov->iov_base + ii->iov_offset;
 }
 
 static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
@@ -2637,8 +2642,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
                kaddr = kmap(page);
 
                while (todo) {
-                       char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
-                       size_t iov_len = ii.iov->iov_len - ii.iov_offset;
+                       struct iovec *iiov = (struct iovec *)ii.data;
+                       char __user *uaddr = iiov->iov_base + ii.iov_offset;
+                       size_t iov_len = iiov->iov_len - ii.iov_offset;
                        size_t copy = min(todo, iov_len);
                        size_t left;
 
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 161aa0c..0aa7dd3 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -42,6 +42,7 @@ struct kiocb {
        union {
                void __user             *user;
                struct task_struct      *tsk;
+               void                    (*complete)(u64 user_data, long res);
        } ki_obj;
 
        __u64                   ki_user_data;   /* user's data for completion */
@@ -66,6 +67,7 @@ struct kiocb {
         * this is the underlying eventfd context to deliver events to.
         */
        struct eventfd_ctx      *ki_eventfd;
+       struct iov_iter         *ki_iter;
 };
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
@@ -73,6 +75,11 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb)
        return kiocb->ki_ctx == NULL;
 }
 
+static inline bool is_kernel_kiocb(struct kiocb *kiocb)
+{
+       return kiocb->ki_ctx == (void *)-1;
+}
+
 static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
 {
        *kiocb = (struct kiocb) {
@@ -93,6 +100,14 @@ extern void exit_aio(struct mm_struct *mm);
 extern long do_io_submit(aio_context_t ctx_id, long nr,
                         struct iocb __user *__user *iocbpp, bool compat);
 void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
+struct kiocb *aio_kernel_alloc(gfp_t gfp);
+void aio_kernel_free(struct kiocb *iocb);
+void aio_kernel_init_iter(struct kiocb *iocb, struct file *filp,
+                         unsigned short op, struct iov_iter *iter, loff_t off);
+void aio_kernel_init_callback(struct kiocb *iocb,
+                             void (*complete)(u64 user_data, long res),
+                             u64 user_data);
+int aio_kernel_submit(struct kiocb *iocb);
 #else
 static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; }
 static inline void aio_put_req(struct kiocb *iocb) { }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index e3c8bfb..1251977 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -28,6 +28,14 @@ struct bio_vec {
        unsigned int    bv_offset;
 };
 
+static inline ssize_t bvec_length(const struct bio_vec *bvec, unsigned long nr)
+{
+       ssize_t bytes = 0;
+       while (nr--)
+               bytes += (bvec++)->bv_len;
+       return bytes;
+}
+
 /*
  * RHEL7 auxillary shadow structure used to extend 'struct bio' without
  * breaking RHEL kABI -- bio_init_aux() must be used to set bio->bio_aux
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2f26ee8..9e6f777 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -318,35 +318,138 @@ struct address_space;
 struct writeback_control;
 
 struct iov_iter {
-       const struct iovec *iov;
+       struct iov_iter_ops *ops;
+       unsigned long data;
        unsigned long nr_segs;
        size_t iov_offset;
        size_t count;
 };
 
-size_t iov_iter_copy_from_user_atomic(struct page *page,
-               struct iov_iter *i, unsigned long offset, size_t bytes);
-size_t iov_iter_copy_from_user(struct page *page,
-               struct iov_iter *i, unsigned long offset, size_t bytes);
-void iov_iter_advance(struct iov_iter *i, size_t bytes);
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
-size_t iov_iter_single_seg_count(const struct iov_iter *i);
-size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
-                        struct iov_iter *i);
-size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
-                       struct iov_iter *i);
+struct iov_iter_ops {
+       size_t (*ii_copy_to_user_atomic)(struct page *, struct iov_iter *,
+                                        unsigned long, size_t);
+       size_t (*ii_copy_to_user)(struct page *, struct iov_iter *,
+                                 unsigned long, size_t);
+       size_t (*ii_copy_from_user_atomic)(struct page *, struct iov_iter *,
+                                          unsigned long, size_t);
+       size_t (*ii_copy_from_user)(struct page *, struct iov_iter *,
+                                         unsigned long, size_t);
+       void (*ii_advance)(struct iov_iter *, size_t);
+       int (*ii_fault_in_readable)(struct iov_iter *, size_t);
+       size_t (*ii_single_seg_count)(const struct iov_iter *);
+       int (*ii_shorten)(struct iov_iter *, size_t);
+};
+
+static inline size_t iov_iter_copy_to_user_atomic(struct page *page,
+                struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+       return i->ops->ii_copy_to_user_atomic(page, i, offset, bytes);
+}
+static inline size_t iov_iter_copy_to_user(struct page *page,
+               struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+       return i->ops->ii_copy_to_user(page, i, offset, bytes);
+}
+static inline size_t iov_iter_copy_from_user_atomic(struct page *page,
+                struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+       return i->ops->ii_copy_from_user_atomic(page, i, offset, bytes);
+}
+static inline size_t iov_iter_copy_from_user(struct page *page,
+               struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+       return i->ops->ii_copy_from_user(page, i, offset, bytes);
+}
+static inline void iov_iter_advance(struct iov_iter *i, size_t bytes)
+{
+       return i->ops->ii_advance(i, bytes);
+}
+static inline int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+       return i->ops->ii_fault_in_readable(i, bytes);
+}
+static inline size_t iov_iter_single_seg_count(const struct iov_iter *i)
+{
+       return i->ops->ii_single_seg_count(i);
+}
+static inline int iov_iter_shorten(struct iov_iter *i, size_t count)
+{
+       return i->ops->ii_shorten(i, count);
+}
+
+extern struct iov_iter_ops ii_bvec_ops;
+
+struct bio_vec;
+static inline void iov_iter_init_bvec(struct iov_iter *i,
+                                     struct bio_vec *bvec,
+                                     unsigned long nr_segs,
+                                     size_t count, size_t written)
+{
+       i->ops = &ii_bvec_ops;
+       i->data = (unsigned long)bvec;
+       i->nr_segs = nr_segs;
+       i->iov_offset = 0;
+       i->count = count + written;
+
+       iov_iter_advance(i, written);
+}
+static inline int iov_iter_has_bvec(struct iov_iter *i)
+{
+       return i->ops == &ii_bvec_ops;
+}
+static inline struct bio_vec *iov_iter_bvec(struct iov_iter *i)
+{
+       BUG_ON(!iov_iter_has_bvec(i));
+       return (struct bio_vec *)i->data;
+}
+
+extern struct iov_iter_ops ii_page_ops;
+
+static inline void iov_iter_init_page(struct iov_iter *i,
+                                     struct page *page,
+                                     size_t count, size_t written)
+{
+       i->ops = &ii_page_ops;
+       i->data = (unsigned long)page;
+       i->nr_segs = 1;
+       i->iov_offset = 0;
+       i->count = count + written;
+
+       iov_iter_advance(i, written);
+}
+static inline int iov_iter_has_page(struct iov_iter *i)
+{
+       return i->ops == &ii_page_ops;
+}
+static inline struct page *iov_iter_page(struct iov_iter *i)
+{
+       BUG_ON(!iov_iter_has_page(i));
+       return (struct page *)i->data;
+}
+
+extern struct iov_iter_ops ii_iovec_ops;
 
 static inline void iov_iter_init(struct iov_iter *i,
                        const struct iovec *iov, unsigned long nr_segs,
                        size_t count, size_t written)
 {
-       i->iov = iov;
+       i->ops = &ii_iovec_ops;
+       i->data = (unsigned long)iov;
        i->nr_segs = nr_segs;
        i->iov_offset = 0;
        i->count = count + written;
 
        iov_iter_advance(i, written);
 }
+static inline int iov_iter_has_iovec(const struct iov_iter *i)
+{
+       return i->ops == &ii_iovec_ops;
+}
+static inline struct iovec *iov_iter_iovec(struct iov_iter *i)
+{
+       BUG_ON(!iov_iter_has_iovec(i));
+       return (struct iovec *)i->data;
+}
 
 static inline size_t iov_iter_count(struct iov_iter *i)
 {
@@ -408,6 +511,10 @@ struct address_space_operations {
        void (*freepage)(struct page *);
        ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
                        loff_t offset, unsigned long nr_segs);
+       ssize_t (*direct_IO_bvec)(int, struct kiocb *, struct bio_vec *bvec,
+                       loff_t offset, unsigned long bvec_len);
+       ssize_t (*direct_IO_page)(int, struct kiocb *, struct page *page,
+                       loff_t offset);
        int (*get_xip_mem)(struct address_space *, pgoff_t, int,
                                                void **, unsigned long *);
        /*
@@ -1665,7 +1772,9 @@ struct file_operations {
        ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
        ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
        ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+       ssize_t (*read_iter) (struct kiocb *, struct iov_iter *, loff_t);
        ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+       ssize_t (*write_iter) (struct kiocb *, struct iov_iter *, loff_t);
        int (*readdir) (struct file *, void *, filldir_t);
        unsigned int (*poll) (struct file *, struct poll_table_struct *);
        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
@@ -2702,13 +2811,20 @@ extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr,
 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
 extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *, loff_t);
 extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long,
                loff_t *);
+extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *, loff_t *);
 extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *, loff_t);
 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
                unsigned long *, loff_t, loff_t *, size_t, size_t);
+extern ssize_t generic_file_direct_write_iter(struct kiocb *, struct iov_iter *,
+               loff_t, loff_t *, size_t);
 extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
                unsigned long, loff_t, loff_t *, size_t, ssize_t);
+extern ssize_t generic_file_buffered_write_iter(struct kiocb *, struct iov_iter *,
+               loff_t, loff_t *, ssize_t);
 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
 extern int generic_segment_checks(const struct iovec *iov,
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index bb2554f..22ce4bd 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -44,6 +44,8 @@ enum {
        IOCB_CMD_NOOP = 6,
        IOCB_CMD_PREADV = 7,
        IOCB_CMD_PWRITEV = 8,
+       IOCB_CMD_READ_ITER = 9,
+       IOCB_CMD_WRITE_ITER = 10,
 };
 
 /*
diff --git a/mm/Makefile b/mm/Makefile
index 204a614..4c3899b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -20,7 +20,8 @@ obj-y                 := filemap.o mempool.o oom_kill.o fadvise.o \
                           util.o mmzone.o vmstat.o backing-dev.o \
                           mm_init.o mmu_context.o percpu.o slab_common.o \
                           compaction.o balloon_compaction.o \
-                          interval_tree.o list_lru.o workingset.o oom_group.o $(mmu-y)
+                          interval_tree.o list_lru.o workingset.o oom_group.o \
+                          iov-iter.o $(mmu-y)
 
 obj-y += init-mm.o
 
diff --git a/mm/filemap.c b/mm/filemap.c
index ad2939d..605b5d3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1439,162 +1439,6 @@ static void shrink_readahead_size_eio(struct file *filp,
        ra->ra_pages /= 4;
 }
 
-size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
-                        struct iov_iter *i)
-{
-       size_t skip, copy, left, wanted;
-       const struct iovec *iov;
-       char __user *buf;
-       void *kaddr, *from;
-
-       if (unlikely(bytes > i->count))
-               bytes = i->count;
-
-       if (unlikely(!bytes))
-               return 0;
-
-       wanted = bytes;
-       iov = i->iov;
-       skip = i->iov_offset;
-       buf = iov->iov_base + skip;
-       copy = min(bytes, iov->iov_len - skip);
-
-       if (!fault_in_pages_writeable(buf, copy)) {
-               kaddr = kmap_atomic(page);
-               from = kaddr + offset;
-
-               /* first chunk, usually the only one */
-               left = __copy_to_user_inatomic(buf, from, copy);
-               copy -= left;
-               skip += copy;
-               from += copy;
-               bytes -= copy;
-
-               while (unlikely(!left && bytes)) {
-                       iov++;
-                       buf = iov->iov_base;
-                       copy = min(bytes, iov->iov_len);
-                       left = __copy_to_user_inatomic(buf, from, copy);
-                       copy -= left;
-                       skip = copy;
-                       from += copy;
-                       bytes -= copy;
-               }
-               if (likely(!bytes)) {
-                       kunmap_atomic(kaddr);
-                       goto done;
-               }
-               offset = from - kaddr;
-               buf += copy;
-               kunmap_atomic(kaddr);
-               copy = min(bytes, iov->iov_len - skip);
-       }
-       /* Too bad - revert to non-atomic kmap */
-       kaddr = kmap(page);
-       from = kaddr + offset;
-       left = __copy_to_user(buf, from, copy);
-       copy -= left;
-       skip += copy;
-       from += copy;
-       bytes -= copy;
-       while (unlikely(!left && bytes)) {
-               iov++;
-               buf = iov->iov_base;
-               copy = min(bytes, iov->iov_len);
-               left = __copy_to_user(buf, from, copy);
-               copy -= left;
-               skip = copy;
-               from += copy;
-               bytes -= copy;
-       }
-       kunmap(page);
-done:
-       i->count -= wanted - bytes;
-       i->nr_segs -= iov - i->iov;
-       i->iov = iov;
-       i->iov_offset = skip;
-       return wanted - bytes;
-}
-EXPORT_SYMBOL(copy_page_to_iter);
-
-size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
-                          struct iov_iter *i)
-{
-       size_t skip, copy, left, wanted;
-       const struct iovec *iov;
-       char __user *buf;
-       void *kaddr, *to;
-
-       if (unlikely(bytes > i->count))
-               bytes = i->count;
-
-       if (unlikely(!bytes))
-               return 0;
-
-       wanted = bytes;
-       iov = i->iov;
-       skip = i->iov_offset;
-       buf = iov->iov_base + skip;
-       copy = min(bytes, iov->iov_len - skip);
-
-       if (!fault_in_pages_readable(buf, copy)) {
-               kaddr = kmap_atomic(page);
-               to = kaddr + offset;
-
-               /* first chunk, usually the only one */
-               left = __copy_from_user_inatomic(to, buf, copy);
-               copy -= left;
-               skip += copy;
-               to += copy;
-               bytes -= copy;
-
-               while (unlikely(!left && bytes)) {
-                       iov++;
-                       buf = iov->iov_base;
-                       copy = min(bytes, iov->iov_len);
-                       left = __copy_from_user_inatomic(to, buf, copy);
-                       copy -= left;
-                       skip = copy;
-                       to += copy;
-                       bytes -= copy;
-               }
-               if (likely(!bytes)) {
-                       kunmap_atomic(kaddr);
-                       goto done;
-               }
-               offset = to - kaddr;
-               buf += copy;
-               kunmap_atomic(kaddr);
-               copy = min(bytes, iov->iov_len - skip);
-       }
-       /* Too bad - revert to non-atomic kmap */
-       kaddr = kmap(page);
-       to = kaddr + offset;
-       left = __copy_from_user(to, buf, copy);
-       copy -= left;
-       skip += copy;
-       to += copy;
-       bytes -= copy;
-       while (unlikely(!left && bytes)) {
-               iov++;
-               buf = iov->iov_base;
-               copy = min(bytes, iov->iov_len);
-               left = __copy_from_user(to, buf, copy);
-               copy -= left;
-               skip = copy;
-               to += copy;
-               bytes -= copy;
-       }
-       kunmap(page);
-done:
-       i->count -= wanted - bytes;
-       i->nr_segs -= iov - i->iov;
-       i->iov = iov;
-       i->iov_offset = skip;
-       return wanted - bytes;
-}
-EXPORT_SYMBOL(copy_page_from_iter);
-
 /**
  * do_generic_file_read - generic file read routine
  * @filp:      the file to read
@@ -1912,31 +1756,60 @@ int generic_segment_checks(const struct iovec *iov,
 }
 EXPORT_SYMBOL(generic_segment_checks);
 
+static ssize_t mapping_direct_IO(struct address_space *mapping, int rw,
+                                struct kiocb *iocb, struct iov_iter *iter,
+                                loff_t pos)
+{
+       if (iov_iter_has_iovec(iter))
+               return mapping->a_ops->direct_IO(rw, iocb, iov_iter_iovec(iter),
+                                                pos, iter->nr_segs);
+       else if (iov_iter_has_bvec(iter))
+               return mapping->a_ops->direct_IO_bvec(rw, iocb,
+                                                     iov_iter_bvec(iter), pos,
+                                                     iter->nr_segs);
+       else if (iov_iter_has_page(iter))
+               return mapping->a_ops->direct_IO_page(rw, iocb,
+                                                     iov_iter_page(iter), pos);
+       else
+               BUG();
+}
+
+static int file_read_iter_actor(read_descriptor_t *desc, struct page *page,
+                               unsigned long offset, unsigned long size)
+{
+       struct iov_iter *iter = desc->arg.data;
+       unsigned long copied = 0;
+
+       if (size > desc->count)
+               size = desc->count;
+
+       copied = iov_iter_copy_to_user(page, iter, offset, size);
+       if (copied < size)
+               desc->error = -EFAULT;
+
+       iov_iter_advance(iter, copied);
+       desc->count -= copied;
+       desc->written += copied;
+
+       return copied;
+}
+
+
 /**
- * generic_file_aio_read - generic filesystem read routine
+ * generic_file_read_iter - generic filesystem read routine
  * @iocb:      kernel I/O control block
- * @iov:       io vector request
- * @nr_segs:   number of segments in the iovec
+ * @iov_iter:  memory vector
  * @pos:       current file position
- *
- * This is the "read()" routine for all filesystems
- * that can use the page cache directly.
  */
 ssize_t
-generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-               unsigned long nr_segs, loff_t pos)
+generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
 {
        struct file *filp = iocb->ki_filp;
-       ssize_t retval;
-       unsigned long seg = 0;
-       size_t count;
+       read_descriptor_t desc;
+       ssize_t retval = 0;
+       size_t count = iov_iter_count(iter);
        loff_t *ppos = &iocb->ki_pos;
 
-       count = 0;
-       retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
-       if (retval)
-               return retval;
-
        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
        if (filp->f_flags & O_DIRECT) {
                loff_t size;
@@ -1950,10 +1823,10 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                size = i_size_read(inode);
                if (pos < size) {
                        retval = filemap_write_and_wait_range(mapping, pos,
-                                       pos + iov_length(iov, nr_segs) - 1);
+                                       pos + count - 1);
                        if (!retval) {
-                               retval = mapping->a_ops->direct_IO(READ, iocb,
-                                                       iov, pos, nr_segs);
+                               retval = mapping_direct_IO(mapping, READ,
+                                                          iocb, iter, pos);
                        }
                        if (retval > 0) {
                                *ppos = pos + retval;
@@ -1975,42 +1848,49 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                }
        }
 
-       count = retval;
-       for (seg = 0; seg < nr_segs; seg++) {
-               read_descriptor_t desc;
-               loff_t offset = 0;
+       iov_iter_advance(iter, retval);
 
-               /*
-                * If we did a short DIO read we need to skip the section of the
-                * iov that we've already read data into.
-                */
-               if (count) {
-                       if (count > iov[seg].iov_len) {
-                               count -= iov[seg].iov_len;
-                               continue;
-                       }
-                       offset = count;
-                       count = 0;
-               }
+       desc.written = 0;
+       desc.arg.data = iter;
+       desc.count = count;
+       desc.error = 0;
+       do_generic_file_read(filp, ppos, &desc, file_read_iter_actor);
 
-               desc.written = 0;
-               desc.arg.buf = iov[seg].iov_base + offset;
-               desc.count = iov[seg].iov_len - offset;
-               if (desc.count == 0)
-                       continue;
-               desc.error = 0;
-               do_generic_file_read(filp, ppos, &desc, file_read_actor);
-               retval += desc.written;
-               if (desc.error) {
-                       retval = retval ?: desc.error;
-                       break;
-               }
-               if (desc.count > 0)
-                       break;
-       }
+       retval += desc.written;
+       if (desc.error && !retval)
+               retval = desc.error;
 out:
        return retval;
 }
+EXPORT_SYMBOL(generic_file_read_iter);
+
+/**
+ * generic_file_aio_read - generic filesystem read routine
+ * @iocb:      kernel I/O control block
+ * @iov:       io vector request
+ * @nr_segs:   number of segments in the iovec
+ * @pos:       current file position
+ *
+ * This is the "read()" routine for all filesystems
+ * that can use the page cache directly.
+ */
+ssize_t
+generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+               unsigned long nr_segs, loff_t pos)
+{
+       struct iov_iter iter;
+       int ret;
+       size_t count;
+
+       count = 0;
+       ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
+       if (ret)
+               return ret;
+
+       iov_iter_init(&iter, iov, nr_segs, count, 0);
+
+       return generic_file_read_iter(iocb, &iter, pos);
+}
 EXPORT_SYMBOL(generic_file_aio_read);
 
 #ifdef CONFIG_MMU
@@ -2477,150 +2357,6 @@ struct page *read_cache_page(struct address_space *mapping,
 }
 EXPORT_SYMBOL(read_cache_page);
 
-static size_t __iovec_copy_from_user_inatomic(char *vaddr,
-                       const struct iovec *iov, size_t base, size_t bytes)
-{
-       size_t copied = 0, left = 0;
-
-       while (bytes) {
-               char __user *buf = iov->iov_base + base;
-               int copy = min(bytes, iov->iov_len - base);
-
-               base = 0;
-               left = __copy_from_user_inatomic(vaddr, buf, copy);
-               copied += copy;
-               bytes -= copy;
-               vaddr += copy;
-               iov++;
-
-               if (unlikely(left))
-                       break;
-       }
-       return copied - left;
-}
-
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were successfully copied.  If a fault is encountered then return the number of
- * bytes which were copied.
- */
-size_t iov_iter_copy_from_user_atomic(struct page *page,
-               struct iov_iter *i, unsigned long offset, size_t bytes)
-{
-       char *kaddr;
-       size_t copied;
-
-       BUG_ON(!in_atomic());
-       kaddr = kmap_atomic(page);
-       if (likely(i->nr_segs == 1)) {
-               int left;
-               char __user *buf = i->iov->iov_base + i->iov_offset;
-               left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
-               copied = bytes - left;
-       } else {
-               copied = __iovec_copy_from_user_inatomic(kaddr + offset,
-                                               i->iov, i->iov_offset, bytes);
-       }
-       kunmap_atomic(kaddr);
-
-       return copied;
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
-
-/*
- * This has the same sideeffects and return value as
- * iov_iter_copy_from_user_atomic().
- * The difference is that it attempts to resolve faults.
- * Page must not be locked.
- */
-size_t iov_iter_copy_from_user(struct page *page,
-               struct iov_iter *i, unsigned long offset, size_t bytes)
-{
-       char *kaddr;
-       size_t copied;
-
-       kaddr = kmap(page);
-       if (likely(i->nr_segs == 1)) {
-               int left;
-               char __user *buf = i->iov->iov_base + i->iov_offset;
-               left = __copy_from_user(kaddr + offset, buf, bytes);
-               copied = bytes - left;
-       } else {
-               copied = __iovec_copy_from_user_inatomic(kaddr + offset,
-                                               i->iov, i->iov_offset, bytes);
-       }
-       kunmap(page);
-       return copied;
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user);
-
-void iov_iter_advance(struct iov_iter *i, size_t bytes)
-{
-       BUG_ON(i->count < bytes);
-
-       if (likely(i->nr_segs == 1)) {
-               i->iov_offset += bytes;
-               i->count -= bytes;
-       } else {
-               const struct iovec *iov = i->iov;
-               size_t base = i->iov_offset;
-               unsigned long nr_segs = i->nr_segs;
-
-               /*
-                * The !iov->iov_len check ensures we skip over unlikely
-                * zero-length segments (without overruning the iovec).
-                */
-               while (bytes || unlikely(i->count && !iov->iov_len)) {
-                       int copy;
-
-                       copy = min(bytes, iov->iov_len - base);
-                       BUG_ON(!i->count || i->count < copy);
-                       i->count -= copy;
-                       bytes -= copy;
-                       base += copy;
-                       if (iov->iov_len == base) {
-                               iov++;
-                               nr_segs--;
-                               base = 0;
-                       }
-               }
-               i->iov = iov;
-               i->iov_offset = base;
-               i->nr_segs = nr_segs;
-       }
-}
-EXPORT_SYMBOL(iov_iter_advance);
-
-/*
- * Fault in the first iovec of the given iov_iter, to a maximum length
- * of bytes. Returns 0 on success, or non-zero if the memory could not be
- * accessed (ie. because it is an invalid address).
- *
- * writev-intensive code may want this to prefault several iovecs -- that
- * would be possible (callers must not rely on the fact that _only_ the
- * first iovec will be faulted with the current implementation).
- */
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
-{
-       char __user *buf = i->iov->iov_base + i->iov_offset;
-       bytes = min(bytes, i->iov->iov_len - i->iov_offset);
-       return fault_in_pages_readable(buf, bytes);
-}
-EXPORT_SYMBOL(iov_iter_fault_in_readable);
-
-/*
- * Return the count of just the current iov_iter segment.
- */
-size_t iov_iter_single_seg_count(const struct iov_iter *i)
-{
-       const struct iovec *iov = i->iov;
-       if (i->nr_segs == 1)
-               return i->count;
-       else
-               return min(i->count, iov->iov_len - i->iov_offset);
-}
-EXPORT_SYMBOL(iov_iter_single_seg_count);
-
 /*
  * Performs necessary checks before doing a write
  *
@@ -2726,9 +2462,8 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
 EXPORT_SYMBOL(pagecache_write_end);
 
 ssize_t
-generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
-               unsigned long *nr_segs, loff_t pos, loff_t *ppos,
-               size_t count, size_t ocount)
+generic_file_direct_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+               loff_t pos, loff_t *ppos, size_t count)
 {
        struct file     *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
@@ -2737,10 +2472,13 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
        size_t          write_len;
        pgoff_t         end;
 
-       if (count != ocount)
-               *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
+       if (count != iov_iter_count(iter)) {
+               written = iov_iter_shorten(iter, count);
+               if (written)
+                       goto out;
+       }
 
-       write_len = iov_length(iov, *nr_segs);
+       write_len = count;
        end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
 
        written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
@@ -2767,7 +2505,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
                }
        }
 
-       written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+       written = mapping_direct_IO(mapping, WRITE, iocb, iter, pos);
 
        /*
         * Finally, try again to invalidate clean pages which might have been
@@ -2793,6 +2531,23 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 out:
        return written;
 }
+EXPORT_SYMBOL(generic_file_direct_write_iter);
+
+ssize_t
+generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
+               unsigned long *nr_segs, loff_t pos, loff_t *ppos,
+               size_t count, size_t ocount)
+{
+       struct iov_iter iter;
+       ssize_t ret;
+
+       iov_iter_init(&iter, iov, *nr_segs, ocount, 0);
+       ret = generic_file_direct_write_iter(iocb, &iter, pos, ppos, count);
+       /* generic_file_direct_write_iter() might have shortened the vec */
+       if (*nr_segs != iter.nr_segs)
+               *nr_segs = iter.nr_segs;
+       return ret;
+}
 EXPORT_SYMBOL(generic_file_direct_write);
 
 /*
@@ -2926,18 +2681,15 @@ again:
 }
 
 ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
-               unsigned long nr_segs, loff_t pos, loff_t *ppos,
-               size_t count, ssize_t written)
+generic_file_buffered_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+               loff_t pos, loff_t *ppos, ssize_t written)
 {
        struct file *file = iocb->ki_filp;
        ssize_t status;
-       struct iov_iter i;
 
        virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
 
-       iov_iter_init(&i, iov, nr_segs, count, written);
-       status = generic_perform_write(file, &i, pos);
+       status = generic_perform_write(file, iter, pos);
 
        if (likely(status >= 0)) {
                written += status;
@@ -2946,13 +2698,24 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
        
        return written ? written : status;
 }
+EXPORT_SYMBOL(generic_file_buffered_write_iter);
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+               unsigned long nr_segs, loff_t pos, loff_t *ppos,
+               size_t count, ssize_t written)
+{
+       struct iov_iter iter;
+       iov_iter_init(&iter, iov, nr_segs, count, written);
+       return generic_file_buffered_write_iter(iocb, &iter, pos, ppos,
+                                               written);
+}
 EXPORT_SYMBOL(generic_file_buffered_write);
 
 /**
  * __generic_file_aio_write - write data to a file
  * @iocb:      IO state structure (file, offset, etc.)
- * @iov:       vector with data to write
- * @nr_segs:   number of segments in the vector
+ * @iter:      iov_iter specifying memory to write
  * @ppos:      position where to write
  *
  * This function does all the work needed for actually writing data to a
@@ -2967,24 +2730,18 @@ EXPORT_SYMBOL(generic_file_buffered_write);
  * A caller has to handle it. This is mainly due to the fact that we want to
  * avoid syncing under i_mutex.
  */
-ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                                unsigned long nr_segs, loff_t *ppos)
+ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+                                 loff_t *ppos)
 {
        struct file *file = iocb->ki_filp;
        struct address_space * mapping = file->f_mapping;
-       size_t ocount;          /* original count */
        size_t count;           /* after file limit checks */
        struct inode    *inode = mapping->host;
        loff_t          pos;
        ssize_t         written;
        ssize_t         err;
 
-       ocount = 0;
-       err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
-       if (err)
-               return err;
-
-       count = ocount;
+       count = iov_iter_count(iter);
        pos = *ppos;
 
        /* We can write back this queue in page reclaim */
@@ -3011,8 +2768,8 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                loff_t endbyte;
                ssize_t written_buffered;
 
-               written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
-                                                       ppos, count, ocount);
+               written = generic_file_direct_write_iter(iocb, iter, pos,
+                                                        ppos, count);
                if (written < 0 || written == count)
                        goto out;
                /*
@@ -3021,9 +2778,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                 */
                pos += written;
                count -= written;
-               written_buffered = generic_file_buffered_write(iocb, iov,
-                                               nr_segs, pos, ppos, count,
-                                               written);
+               iov_iter_advance(iter, written);
+               written_buffered = generic_file_buffered_write_iter(iocb, iter,
+                                               pos, ppos, written);
                /*
                 * If generic_file_buffered_write() retuned a synchronous error
                 * then we want to return the number of bytes which were
@@ -3055,13 +2812,57 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                         */
                }
        } else {
-               written = generic_file_buffered_write(iocb, iov, nr_segs,
-                               pos, ppos, count, written);
+               iter->count = count;
+               written = generic_file_buffered_write_iter(iocb, iter,
+                               pos, ppos, written);
        }
 out:
        current->backing_dev_info = NULL;
        return written ? written : err;
 }
+EXPORT_SYMBOL(__generic_file_write_iter);
+
+ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+                               loff_t pos)
+{
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file->f_mapping->host;
+       ssize_t ret;
+
+       mutex_lock(&inode->i_mutex);
+       ret = __generic_file_write_iter(iocb, iter, &iocb->ki_pos);
+       mutex_unlock(&inode->i_mutex);
+
+       if (ret > 0 || ret == -EIOCBQUEUED) {
+               ssize_t err;
+
+               err = generic_write_sync(file, pos, ret);
+               if (err < 0 && ret > 0)
+                       ret = err;
+       }
+       return ret;
+}
+EXPORT_SYMBOL(generic_file_write_iter);
+
+ssize_t
+__generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+                        unsigned long nr_segs, loff_t *ppos)
+{
+       struct iov_iter iter;
+       size_t count;
+       int ret;
+
+       count = 0;
+       ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
+       if (ret)
+               goto out;
+
+       iov_iter_init(&iter, iov, nr_segs, count, 0);
+
+       ret = __generic_file_write_iter(iocb, &iter, ppos);
+out:
+       return ret;
+}
 EXPORT_SYMBOL(__generic_file_aio_write);
 
 /**
diff --git a/mm/iov-iter.c b/mm/iov-iter.c
new file mode 100644
index 0000000..e6fc15a
--- /dev/null
+++ b/mm/iov-iter.c
@@ -0,0 +1,474 @@
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+#include <linux/hardirq.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+
+static size_t __iovec_copy_to_user_inatomic(char *vaddr,
+                       const struct iovec *iov, size_t base, size_t bytes)
+{
+       size_t copied = 0, left = 0;
+
+       while (bytes) {
+               char __user *buf = iov->iov_base + base;
+               int copy = min(bytes, iov->iov_len - base);
+
+               base = 0;
+               left = __copy_to_user_inatomic(buf, vaddr, copy);
+               copied += copy;
+               bytes -= copy;
+               vaddr += copy;
+               iov++;
+
+               if (unlikely(left))
+                       break;
+       }
+       return copied - left;
+}
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were successfully copied.  If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+static size_t ii_iovec_copy_to_user_atomic(struct page *page,
+               struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+       struct iovec *iov = (struct iovec *)i->data;
+       char *kaddr;
+       size_t copied;
+
+       BUG_ON(!in_atomic());
+       kaddr = kmap_atomic(page);
+       if (likely(i->nr_segs == 1)) {
+               int left;
+               char __user *buf = iov->iov_base + i->iov_offset;
+               left = __copy_to_user_inatomic(buf, kaddr + offset, bytes);
+               copied = bytes - left;
+       } else {
+               copied = __iovec_copy_to_user_inatomic(kaddr + offset,
+                                               iov, i->iov_offset, bytes);
+       }
+       kunmap_atomic(kaddr);
+
+       return copied;
+}
+
+/*
+ * This has the same side effects and return value as
+ * ii_iovec_copy_to_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * Page must not be locked.
+ */
+static size_t ii_iovec_copy_to_user(struct page *page,
+               struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+       struct iovec *iov = (struct iovec *)i->data;
+       char *kaddr;
+       size_t copied;
+
+       kaddr = kmap(page);
+       if (likely(i->nr_segs == 1)) {
+               int left;
+               char __user *buf = iov->iov_base + i->iov_offset;
+               left = copy_to_user(buf, kaddr + offset, bytes);
+               copied = bytes - left;
+       } else {
+               copied = __iovec_copy_to_user_inatomic(kaddr + offset,
+                                               iov, i->iov_offset, bytes);
+       }
+       kunmap(page);
+       return copied;
+}
+
+
+static size_t __iovec_copy_from_user_inatomic(char *vaddr,
+                       const struct iovec *iov, size_t base, size_t bytes)
+{
+       size_t copied = 0, left = 0;
+
+       while (bytes) {
+               char __user *buf = iov->iov_base + base;
+               int copy = min(bytes, iov->iov_len - base);
+
+               base = 0;
+               left = __copy_from_user_inatomic(vaddr, buf, copy);
+               copied += copy;
+               bytes -= copy;
+               vaddr += copy;
+               iov++;
+
+               if (unlikely(left))
+                       break;
+       }
+       return copied - left;
+}
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were successfully copied.  If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+static size_t ii_iovec_copy_from_user_atomic(struct page *page,
+               struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+       struct iovec *iov = (struct iovec *)i->data;
+       char *kaddr;
+       size_t copied;
+
+       BUG_ON(!in_atomic());
+       kaddr = kmap_atomic(page);
+       if (likely(i->nr_segs == 1)) {
+               int left;
+               char __user *buf = iov->iov_base + i->iov_offset;
+               left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+               copied = bytes - left;
+       } else {
+               copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+                                               iov, i->iov_offset, bytes);
+       }
+       kunmap_atomic(kaddr);
+
+       return copied;
+}
+
+/*
+ * This has the same side effects and return value as
+ * ii_iovec_copy_from_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * Page must not be locked.
+ */
+static size_t ii_iovec_copy_from_user(struct page *page,
+               struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+       struct iovec *iov = (struct iovec *)i->data;
+       char *kaddr;
+       size_t copied;
+
+       kaddr = kmap(page);
+       if (likely(i->nr_segs == 1)) {
+               int left;
+               char __user *buf = iov->iov_base + i->iov_offset;
+               left = __copy_from_user(kaddr + offset, buf, bytes);
+               copied = bytes - left;
+       } else {
+               copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+                                               iov, i->iov_offset, bytes);
+       }
+       kunmap(page);
+       return copied;
+}
+
+static void ii_iovec_advance(struct iov_iter *i, size_t bytes)
+{
+       BUG_ON(i->count < bytes);
+
+       if (likely(i->nr_segs == 1)) {
+               i->iov_offset += bytes;
+               i->count -= bytes;
+       } else {
+               struct iovec *iov = (struct iovec *)i->data;
+               size_t base = i->iov_offset;
+               unsigned long nr_segs = i->nr_segs;
+
+               /*
+                * The !iov->iov_len check ensures we skip over unlikely
+                * zero-length segments (without overrunning the iovec).
+                */
+               while (bytes || unlikely(i->count && !iov->iov_len)) {
+                       int copy;
+
+                       copy = min(bytes, iov->iov_len - base);
+                       BUG_ON(!i->count || i->count < copy);
+                       i->count -= copy;
+                       bytes -= copy;
+                       base += copy;
+                       if (iov->iov_len == base) {
+                               iov++;
+                               nr_segs--;
+                               base = 0;
+                       }
+               }
+               i->data = (unsigned long)iov;
+               i->iov_offset = base;
+               i->nr_segs = nr_segs;
+       }
+}
+
+/*
+ * Fault in the first iovec of the given iov_iter, to a maximum length
+ * of bytes. Returns 0 on success, or non-zero if the memory could not be
+ * accessed (ie. because it is an invalid address).
+ *
+ * writev-intensive code may want this to prefault several iovecs -- that
+ * would be possible (callers must not rely on the fact that _only_ the
+ * first iovec will be faulted with the current implementation).
+ */
+static int ii_iovec_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+       struct iovec *iov = (struct iovec *)i->data;
+       char __user *buf = iov->iov_base + i->iov_offset;
+       bytes = min(bytes, iov->iov_len - i->iov_offset);
+       return fault_in_pages_readable(buf, bytes);
+}
+
+/*
+ * Return the count of just the current iov_iter segment.
+ */
+static size_t ii_iovec_single_seg_count(const struct iov_iter *i)
+{
+       struct iovec *iov = (struct iovec *)i->data;
+       if (i->nr_segs == 1)
+               return i->count;
+       else
+               return min(i->count, iov->iov_len - i->iov_offset);
+}
+
+static int ii_iovec_shorten(struct iov_iter *i, size_t count)
+{
+       struct iovec *iov = (struct iovec *)i->data;
+       i->nr_segs = iov_shorten(iov, i->nr_segs, count);
+       return 0;
+}
+
+struct iov_iter_ops ii_iovec_ops = {
+       .ii_copy_to_user_atomic = ii_iovec_copy_to_user_atomic,
+       .ii_copy_to_user = ii_iovec_copy_to_user,
+       .ii_copy_from_user_atomic = ii_iovec_copy_from_user_atomic,
+       .ii_copy_from_user = ii_iovec_copy_from_user,
+       .ii_advance = ii_iovec_advance,
+       .ii_fault_in_readable = ii_iovec_fault_in_readable,
+       .ii_single_seg_count = ii_iovec_single_seg_count,
+       .ii_shorten = ii_iovec_shorten,
+};
+EXPORT_SYMBOL(ii_iovec_ops);
+
+/*
+ * As an easily verifiable first pass, we implement all the methods that
+ * copy data to and from bvec pages with one function.  We implement it
+ * all with kmap_atomic().
+ */
+static size_t bvec_copy_tofrom_page(struct iov_iter *iter, struct page *page,
+                                   unsigned long page_offset, size_t bytes,
+                                   int topage)
+{
+       struct bio_vec *bvec = (struct bio_vec *)iter->data;
+       size_t bvec_offset = iter->iov_offset;
+       size_t remaining = bytes;
+       void *bvec_map;
+       void *page_map;
+       size_t copy;
+
+       page_map = kmap_atomic(page);
+
+       BUG_ON(bytes > iter->count);
+       while (remaining) {
+               BUG_ON(bvec->bv_len == 0);
+               BUG_ON(bvec_offset >= bvec->bv_len);
+               copy = min(remaining, bvec->bv_len - bvec_offset);
+               bvec_map = kmap_atomic(bvec->bv_page);
+               if (topage)
+                       memcpy(page_map + page_offset,
+                              bvec_map + bvec->bv_offset + bvec_offset,
+                              copy);
+               else
+                       memcpy(bvec_map + bvec->bv_offset + bvec_offset,
+                              page_map + page_offset,
+                              copy);
+               kunmap_atomic(bvec_map);
+               remaining -= copy;
+               bvec_offset += copy;
+               page_offset += copy;
+               if (bvec_offset == bvec->bv_len) {
+                       bvec_offset = 0;
+                       bvec++;
+               }
+       }
+
+       kunmap_atomic(page_map);
+
+       return bytes;
+}
+
+size_t ii_bvec_copy_to_user_atomic(struct page *page, struct iov_iter *i,
+                                  unsigned long offset, size_t bytes)
+{
+       return bvec_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_bvec_copy_to_user(struct page *page, struct iov_iter *i,
+                                  unsigned long offset, size_t bytes)
+{
+       return bvec_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_bvec_copy_from_user_atomic(struct page *page, struct iov_iter *i,
+                                    unsigned long offset, size_t bytes)
+{
+       return bvec_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+size_t ii_bvec_copy_from_user(struct page *page, struct iov_iter *i,
+                             unsigned long offset, size_t bytes)
+{
+       return bvec_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+
+/*
+ * bio_vecs have a stricter structure than iovecs that might have
+ * come from userspace.  There are no zero length bio_vec elements.
+ */
+void ii_bvec_advance(struct iov_iter *i, size_t bytes)
+{
+       struct bio_vec *bvec = (struct bio_vec *)i->data;
+       size_t offset = i->iov_offset;
+       size_t delta;
+
+       BUG_ON(i->count < bytes);
+       while (bytes) {
+               BUG_ON(bvec->bv_len == 0);
+               BUG_ON(bvec->bv_len <= offset);
+               delta = min(bytes, bvec->bv_len - offset);
+               offset += delta;
+               i->count -= delta;
+               bytes -= delta;
+               if (offset == bvec->bv_len) {
+                       bvec++;
+                       offset = 0;
+               }
+       }
+
+       i->data = (unsigned long)bvec;
+       i->iov_offset = offset;
+}
+
+/*
+ * pages pointed to by bio_vecs are always pinned.
+ */
+int ii_bvec_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+       return 0;
+}
+
+size_t ii_bvec_single_seg_count(const struct iov_iter *i)
+{
+       const struct bio_vec *bvec = (struct bio_vec *)i->data;
+       if (i->nr_segs == 1)
+               return i->count;
+       else
+               return min(i->count, bvec->bv_len - i->iov_offset);
+}
+
+static int ii_bvec_shorten(struct iov_iter *i, size_t count)
+{
+       return -EINVAL;
+}
+
+struct iov_iter_ops ii_bvec_ops = {
+       .ii_copy_to_user_atomic = ii_bvec_copy_to_user_atomic,
+       .ii_copy_to_user = ii_bvec_copy_to_user,
+       .ii_copy_from_user_atomic = ii_bvec_copy_from_user_atomic,
+       .ii_copy_from_user = ii_bvec_copy_from_user,
+       .ii_advance = ii_bvec_advance,
+       .ii_fault_in_readable = ii_bvec_fault_in_readable,
+       .ii_single_seg_count = ii_bvec_single_seg_count,
+       .ii_shorten = ii_bvec_shorten,
+};
+EXPORT_SYMBOL(ii_bvec_ops);
+
+/* Functions to get on with single page */
+
+static size_t page_copy_tofrom_page(struct iov_iter *iter, struct page *page,
+                                   unsigned long page_offset, size_t bytes,
+                                   int topage)
+{
+       struct page *ipage = (struct page *)iter->data;
+       size_t ipage_offset = iter->iov_offset;
+       void *ipage_map;
+       void *page_map;
+
+       BUG_ON(bytes > iter->count);
+       BUG_ON(bytes > PAGE_SIZE - ipage_offset);
+       BUG_ON(ipage_offset >= PAGE_SIZE);
+
+       page_map = kmap_atomic(page);
+       ipage_map = kmap_atomic(ipage);
+
+       if (topage)
+               memcpy(page_map + page_offset,
+                      ipage_map + ipage_offset,
+                      bytes);
+       else
+               memcpy(ipage_map + ipage_offset,
+                      page_map + page_offset,
+                      bytes);
+
+       kunmap_atomic(ipage_map);
+       kunmap_atomic(page_map);
+
+       return bytes;
+}
+
+size_t ii_page_copy_to_user_atomic(struct page *page, struct iov_iter *i,
+                                  unsigned long offset, size_t bytes)
+{
+       return page_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_page_copy_to_user(struct page *page, struct iov_iter *i,
+                                  unsigned long offset, size_t bytes)
+{
+       return page_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_page_copy_from_user_atomic(struct page *page, struct iov_iter *i,
+                                    unsigned long offset, size_t bytes)
+{
+       return page_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+size_t ii_page_copy_from_user(struct page *page, struct iov_iter *i,
+                             unsigned long offset, size_t bytes)
+{
+       return page_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+
+void ii_page_advance(struct iov_iter *i, size_t bytes)
+{
+       BUG_ON(i->count < bytes);
+       BUG_ON(i->iov_offset >= PAGE_SIZE);
+       BUG_ON(bytes > PAGE_SIZE - i->iov_offset);
+
+       i->iov_offset += bytes;
+       i->count      -= bytes;
+}
+
+/*
+ * pages pointed to by bio_vecs are always pinned.
+ */
+int ii_page_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+       return 0;
+}
+
+size_t ii_page_single_seg_count(const struct iov_iter *i)
+{
+       BUG_ON(i->nr_segs != 1);
+
+       return i->count;
+}
+
+static int ii_page_shorten(struct iov_iter *i, size_t count)
+{
+       return -EINVAL;
+}
+
+struct iov_iter_ops ii_page_ops = {
+       .ii_copy_to_user_atomic = ii_page_copy_to_user_atomic,
+       .ii_copy_to_user = ii_page_copy_to_user,
+       .ii_copy_from_user_atomic = ii_page_copy_from_user_atomic,
+       .ii_copy_from_user = ii_page_copy_from_user,
+       .ii_advance = ii_page_advance,
+       .ii_fault_in_readable = ii_page_fault_in_readable,
+       .ii_single_seg_count = ii_page_single_seg_count,
+       .ii_shorten = ii_page_shorten,
+};
+EXPORT_SYMBOL(ii_page_ops);