The commit is pushed to "branch-rh7-3.10.0-327.3.1-vz7.10.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-327.3.1.el7
------>
commit 553598cabaa1339a7096fc03cddbb91bd0193ff4
Author: Dmitry Monakhov <dmonak...@openvz.org>
Date:   Fri Dec 25 12:45:30 2015 +0400
fs: kernel direct aio This is a port of 2f3ecb6 ("fs: kernel direct aio") onto rebased kernel (based on 3.10.0-327.3.1.el7). fs: kernel direct aio Port of 95-diff-kernel-direct-aio-combined from https://jira.sw.ru/browse/PSBM-18169 Signed-off-by: Maxim Patlasov <mpatla...@parallels.com> https://jira.sw.ru/browse/PSBM-42312 Signed-off-by: Dmitry Monakhov <dmonak...@openvz.org> --- fs/aio.c | 140 +++++++++++ fs/ceph/file.c | 10 +- fs/cifs/file.c | 7 +- fs/fuse/file.c | 12 +- include/linux/aio.h | 15 ++ include/linux/blk_types.h | 8 + include/linux/fs.h | 142 ++++++++++- include/uapi/linux/aio_abi.h | 2 + mm/Makefile | 3 +- mm/filemap.c | 563 ++++++++++++++----------------------------- mm/iov-iter.c | 474 ++++++++++++++++++++++++++++++++++++ 11 files changed, 972 insertions(+), 404 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 8427423..8ec32e2 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -936,6 +936,10 @@ void aio_complete(struct kiocb *iocb, long res, long res2) atomic_set(&iocb->ki_users, 0); wake_up_process(iocb->ki_obj.tsk); return; + } else if (is_kernel_kiocb(iocb)) { + iocb->ki_obj.complete(iocb->ki_user_data, res); + aio_kernel_free(iocb); + return; } /* @@ -1377,6 +1381,51 @@ static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) return 0; } +static ssize_t aio_read_iter(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + ssize_t ret; + + if (unlikely(!is_kernel_kiocb(iocb))) + return -EINVAL; + + if (unlikely(!(file->f_mode & FMODE_READ))) + return -EBADF; + + ret = security_file_permission(file, MAY_READ); + if (unlikely(ret)) + return ret; + + if (!file->f_op->read_iter) + return -EINVAL; + + return file->f_op->read_iter(iocb, iocb->ki_iter, iocb->ki_pos); +} + +static ssize_t aio_write_iter(struct kiocb *iocb) +{ + struct file *file = iocb->ki_filp; + ssize_t ret; + + if (unlikely(!is_kernel_kiocb(iocb))) + return -EINVAL; + + if (unlikely(!(file->f_mode & FMODE_WRITE))) + return -EBADF; + + ret = security_file_permission(file, 
MAY_WRITE); + if (unlikely(ret)) + return ret; + + if (!file->f_op->write_iter) + return -EINVAL; + + file_start_write(file); + ret = file->f_op->write_iter(iocb, iocb->ki_iter, iocb->ki_pos); + file_end_write(file); + return ret; +} + /* * aio_setup_iocb: * Performs the initial checks and aio retry method @@ -1428,6 +1477,14 @@ rw_common: ret = aio_rw_vect_retry(req, rw, rw_op); break; + case IOCB_CMD_READ_ITER: + ret = aio_read_iter(req); + break; + + case IOCB_CMD_WRITE_ITER: + ret = aio_write_iter(req); + break; + case IOCB_CMD_FDSYNC: if (!file->f_op->aio_fsync) return -EINVAL; @@ -1462,6 +1519,89 @@ rw_common: return 0; } +/* + * This allocates an iocb that will be used to submit and track completion of + * an IO that is issued from kernel space. + * + * The caller is expected to call the appropriate aio_kernel_init_() functions + * and then call aio_kernel_submit(). From that point forward progress is + * guaranteed by the file system aio method. Eventually the caller's + * completion callback will be called. + * + * These iocbs are special. They don't have a context, we don't limit the + * number pending, they can't be canceled, and can't be retried. In the short + * term callers need to be careful not to call operations which might retry by + * only calling new ops which never add retry support. In the long term + * retry-based AIO should be removed. + */ +struct kiocb *aio_kernel_alloc(gfp_t gfp) +{ + struct kiocb *iocb = kzalloc(sizeof(struct kiocb), gfp); + if (iocb) + iocb->ki_ctx = (void *)-1; + return iocb; +} +EXPORT_SYMBOL_GPL(aio_kernel_alloc); + +void aio_kernel_free(struct kiocb *iocb) +{ + kfree(iocb); +} +EXPORT_SYMBOL_GPL(aio_kernel_free); + +/* + * The iter count must be set before calling here. Some filesystems uses + * iocb->ki_left as an indicator of the size of an IO. 
+ */ +void aio_kernel_init_iter(struct kiocb *iocb, struct file *filp, + unsigned short op, struct iov_iter *iter, loff_t off) +{ + iocb->ki_filp = filp; + iocb->ki_iter = iter; + iocb->ki_opcode = op; + iocb->ki_pos = off; + iocb->ki_nbytes = iov_iter_count(iter); + iocb->ki_left = iocb->ki_nbytes; +} +EXPORT_SYMBOL_GPL(aio_kernel_init_iter); + +void aio_kernel_init_callback(struct kiocb *iocb, + void (*complete)(u64 user_data, long res), + u64 user_data) +{ + iocb->ki_obj.complete = complete; + iocb->ki_user_data = user_data; +} +EXPORT_SYMBOL_GPL(aio_kernel_init_callback); + +/* + * The iocb is our responsibility once this is called. The caller must not + * reference it. This comes from aio_setup_iocb() modifying the iocb. + * + * Callers must be prepared for their iocb completion callback to be called the + * moment they enter this function. The completion callback may be called from + * any context. + * + * Returns: 0: the iocb completion callback will be called with the op result + * negative errno: the operation was not submitted and the iocb was freed + */ +int aio_kernel_submit(struct kiocb *iocb) +{ + int ret; + + BUG_ON(!is_kernel_kiocb(iocb)); + BUG_ON(!iocb->ki_obj.complete); + BUG_ON(!iocb->ki_filp); + + ret = aio_run_iocb(iocb, 0); + + if (ret) + aio_kernel_free(iocb); + + return ret; +} +EXPORT_SYMBOL_GPL(aio_kernel_submit); + static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, struct iocb *iocb, bool compat) { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 1655236..ccc51a4 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -806,8 +806,9 @@ static ssize_t inline_to_iov(struct kiocb *iocb, struct iov_iter *i, zero_user_segment(inline_page, inline_len, end); while (left) { - void __user *udata = i->iov->iov_base + i->iov_offset; - size_t n = min(i->iov->iov_len - i->iov_offset, left); + struct iovec *iov = iov_iter_iovec(i); + void __user *udata = iov->iov_base + i->iov_offset; + size_t n = min(iov->iov_len - 
i->iov_offset, left); if (__copy_to_user(udata, kdata, n)) { ret = -EFAULT; @@ -824,8 +825,9 @@ static ssize_t inline_to_iov(struct kiocb *iocb, struct iov_iter *i, size_t left = min_t(loff_t, iocb->ki_pos + len, i_size) - pos; while (left) { - void __user *udata = i->iov->iov_base + i->iov_offset; - size_t n = min(i->iov->iov_len - i->iov_offset, left); + struct iovec *iov = iov_iter_iovec(i); + void __user *udata = iov->iov_base + i->iov_offset; + size_t n = min(iov->iov_len - i->iov_offset, left); if (__clear_user(udata, n)) { ret = -EFAULT; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0bc0fad..401fa67 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2435,8 +2435,9 @@ wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from, save_len = cur_len; for (i = 0; i < nr_pages; i++) { bytes = min_t(const size_t, cur_len, PAGE_SIZE); - copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from); + copied = iov_iter_copy_from_user(wdata->pages[i], from, 0, bytes); cur_len -= copied; + iov_iter_advance(from, copied); /* * If we didn't copy as much as we expected, then that * may mean we trod into an unmapped area. 
Stop copying @@ -2865,8 +2866,10 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) for (i = 0; i < rdata->nr_pages; i++) { struct page *page = rdata->pages[i]; size_t copy = min_t(size_t, remaining, PAGE_SIZE); - size_t written = copy_page_to_iter(page, 0, copy, iter); + size_t written = iov_iter_copy_to_user(page, iter, 0, copy); + remaining -= written; + iov_iter_advance(iter, written); if (written < copy && iov_iter_count(iter) > 0) break; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8f16755..f432b70 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1518,7 +1518,12 @@ static inline void fuse_page_descs_length_init(struct fuse_req *req, static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) { - return (unsigned long)ii->iov->iov_base + ii->iov_offset; + struct iovec *iov; + + BUG_ON(!iov_iter_has_iovec(ii)); + iov = (struct iovec *)ii->data; + + return (unsigned long)iov->iov_base + ii->iov_offset; } static inline size_t fuse_get_frag_size(const struct iov_iter *ii, @@ -2637,8 +2642,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, kaddr = kmap(page); while (todo) { - char __user *uaddr = ii.iov->iov_base + ii.iov_offset; - size_t iov_len = ii.iov->iov_len - ii.iov_offset; + struct iovec *iiov = (struct iovec *)ii.data; + char __user *uaddr = iiov->iov_base + ii.iov_offset; + size_t iov_len = iiov->iov_len - ii.iov_offset; size_t copy = min(todo, iov_len); size_t left; diff --git a/include/linux/aio.h b/include/linux/aio.h index 161aa0c..0aa7dd3 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -42,6 +42,7 @@ struct kiocb { union { void __user *user; struct task_struct *tsk; + void (*complete)(u64 user_data, long res); } ki_obj; __u64 ki_user_data; /* user's data for completion */ @@ -66,6 +67,7 @@ struct kiocb { * this is the underlying eventfd context to deliver events to. 
*/ struct eventfd_ctx *ki_eventfd; + struct iov_iter *ki_iter; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) @@ -73,6 +75,11 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb) return kiocb->ki_ctx == NULL; } +static inline bool is_kernel_kiocb(struct kiocb *kiocb) +{ + return kiocb->ki_ctx == (void *)-1; +} + static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) { *kiocb = (struct kiocb) { @@ -93,6 +100,14 @@ extern void exit_aio(struct mm_struct *mm); extern long do_io_submit(aio_context_t ctx_id, long nr, struct iocb __user *__user *iocbpp, bool compat); void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); +struct kiocb *aio_kernel_alloc(gfp_t gfp); +void aio_kernel_free(struct kiocb *iocb); +void aio_kernel_init_iter(struct kiocb *iocb, struct file *filp, + unsigned short op, struct iov_iter *iter, loff_t off); +void aio_kernel_init_callback(struct kiocb *iocb, + void (*complete)(u64 user_data, long res), + u64 user_data); +int aio_kernel_submit(struct kiocb *iocb); #else static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; } static inline void aio_put_req(struct kiocb *iocb) { } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index e3c8bfb..1251977 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -28,6 +28,14 @@ struct bio_vec { unsigned int bv_offset; }; +static inline ssize_t bvec_length(const struct bio_vec *bvec, unsigned long nr) +{ + ssize_t bytes = 0; + while (nr--) + bytes += (bvec++)->bv_len; + return bytes; +} + /* * RHEL7 auxillary shadow structure used to extend 'struct bio' without * breaking RHEL kABI -- bio_init_aux() must be used to set bio->bio_aux diff --git a/include/linux/fs.h b/include/linux/fs.h index 2f26ee8..9e6f777 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -318,35 +318,138 @@ struct address_space; struct writeback_control; struct iov_iter { - const struct iovec *iov; + struct iov_iter_ops *ops; + 
unsigned long data; unsigned long nr_segs; size_t iov_offset; size_t count; }; -size_t iov_iter_copy_from_user_atomic(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes); -size_t iov_iter_copy_from_user(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes); -void iov_iter_advance(struct iov_iter *i, size_t bytes); -int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); -size_t iov_iter_single_seg_count(const struct iov_iter *i); -size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i); -size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i); +struct iov_iter_ops { + size_t (*ii_copy_to_user_atomic)(struct page *, struct iov_iter *, + unsigned long, size_t); + size_t (*ii_copy_to_user)(struct page *, struct iov_iter *, + unsigned long, size_t); + size_t (*ii_copy_from_user_atomic)(struct page *, struct iov_iter *, + unsigned long, size_t); + size_t (*ii_copy_from_user)(struct page *, struct iov_iter *, + unsigned long, size_t); + void (*ii_advance)(struct iov_iter *, size_t); + int (*ii_fault_in_readable)(struct iov_iter *, size_t); + size_t (*ii_single_seg_count)(const struct iov_iter *); + int (*ii_shorten)(struct iov_iter *, size_t); +}; + +static inline size_t iov_iter_copy_to_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + return i->ops->ii_copy_to_user_atomic(page, i, offset, bytes); +} +static inline size_t iov_iter_copy_to_user(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + return i->ops->ii_copy_to_user(page, i, offset, bytes); +} +static inline size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + return i->ops->ii_copy_from_user_atomic(page, i, offset, bytes); +} +static inline size_t iov_iter_copy_from_user(struct page *page, + struct iov_iter *i, unsigned long offset, 
size_t bytes) +{ + return i->ops->ii_copy_from_user(page, i, offset, bytes); +} +static inline void iov_iter_advance(struct iov_iter *i, size_t bytes) +{ + return i->ops->ii_advance(i, bytes); +} +static inline int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) +{ + return i->ops->ii_fault_in_readable(i, bytes); +} +static inline size_t iov_iter_single_seg_count(const struct iov_iter *i) +{ + return i->ops->ii_single_seg_count(i); +} +static inline int iov_iter_shorten(struct iov_iter *i, size_t count) +{ + return i->ops->ii_shorten(i, count); +} + +extern struct iov_iter_ops ii_bvec_ops; + +struct bio_vec; +static inline void iov_iter_init_bvec(struct iov_iter *i, + struct bio_vec *bvec, + unsigned long nr_segs, + size_t count, size_t written) +{ + i->ops = &ii_bvec_ops; + i->data = (unsigned long)bvec; + i->nr_segs = nr_segs; + i->iov_offset = 0; + i->count = count + written; + + iov_iter_advance(i, written); +} +static inline int iov_iter_has_bvec(struct iov_iter *i) +{ + return i->ops == &ii_bvec_ops; +} +static inline struct bio_vec *iov_iter_bvec(struct iov_iter *i) +{ + BUG_ON(!iov_iter_has_bvec(i)); + return (struct bio_vec *)i->data; +} + +extern struct iov_iter_ops ii_page_ops; + +static inline void iov_iter_init_page(struct iov_iter *i, + struct page *page, + size_t count, size_t written) +{ + i->ops = &ii_page_ops; + i->data = (unsigned long)page; + i->nr_segs = 1; + i->iov_offset = 0; + i->count = count + written; + + iov_iter_advance(i, written); +} +static inline int iov_iter_has_page(struct iov_iter *i) +{ + return i->ops == &ii_page_ops; +} +static inline struct page *iov_iter_page(struct iov_iter *i) +{ + BUG_ON(!iov_iter_has_page(i)); + return (struct page *)i->data; +} + +extern struct iov_iter_ops ii_iovec_ops; static inline void iov_iter_init(struct iov_iter *i, const struct iovec *iov, unsigned long nr_segs, size_t count, size_t written) { - i->iov = iov; + i->ops = &ii_iovec_ops; + i->data = (unsigned long)iov; i->nr_segs = 
nr_segs; i->iov_offset = 0; i->count = count + written; iov_iter_advance(i, written); } +static inline int iov_iter_has_iovec(const struct iov_iter *i) +{ + return i->ops == &ii_iovec_ops; +} +static inline struct iovec *iov_iter_iovec(struct iov_iter *i) +{ + BUG_ON(!iov_iter_has_iovec(i)); + return (struct iovec *)i->data; +} static inline size_t iov_iter_count(struct iov_iter *i) { @@ -408,6 +511,10 @@ struct address_space_operations { void (*freepage)(struct page *); ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, loff_t offset, unsigned long nr_segs); + ssize_t (*direct_IO_bvec)(int, struct kiocb *, struct bio_vec *bvec, + loff_t offset, unsigned long bvec_len); + ssize_t (*direct_IO_page)(int, struct kiocb *, struct page *page, + loff_t offset); int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, unsigned long *); /* @@ -1665,7 +1772,9 @@ struct file_operations { ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); + ssize_t (*read_iter) (struct kiocb *, struct iov_iter *, loff_t); ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); + ssize_t (*write_iter) (struct kiocb *, struct iov_iter *, loff_t); int (*readdir) (struct file *, void *, filldir_t); unsigned int (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); @@ -2702,13 +2811,20 @@ extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); +extern ssize_t generic_file_read_iter(struct kiocb 
*, struct iov_iter *, loff_t); extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t *); +extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *, loff_t *); extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); +extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *, loff_t); extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, unsigned long *, loff_t, loff_t *, size_t, size_t); +extern ssize_t generic_file_direct_write_iter(struct kiocb *, struct iov_iter *, + loff_t, loff_t *, size_t); extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, unsigned long, loff_t, loff_t *, size_t, ssize_t); +extern ssize_t generic_file_buffered_write_iter(struct kiocb *, struct iov_iter *, + loff_t, loff_t *, ssize_t); extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); extern int generic_segment_checks(const struct iovec *iov, diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index bb2554f..22ce4bd 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -44,6 +44,8 @@ enum { IOCB_CMD_NOOP = 6, IOCB_CMD_PREADV = 7, IOCB_CMD_PWRITEV = 8, + IOCB_CMD_READ_ITER = 9, + IOCB_CMD_WRITE_ITER = 10, }; /* diff --git a/mm/Makefile b/mm/Makefile index 204a614..4c3899b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -20,7 +20,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o balloon_compaction.o \ - interval_tree.o list_lru.o workingset.o oom_group.o $(mmu-y) + interval_tree.o list_lru.o workingset.o oom_group.o \ + iov-iter.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/filemap.c b/mm/filemap.c index ad2939d..605b5d3 100644 
--- a/mm/filemap.c +++ b/mm/filemap.c @@ -1439,162 +1439,6 @@ static void shrink_readahead_size_eio(struct file *filp, ra->ra_pages /= 4; } -size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - size_t skip, copy, left, wanted; - const struct iovec *iov; - char __user *buf; - void *kaddr, *from; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - wanted = bytes; - iov = i->iov; - skip = i->iov_offset; - buf = iov->iov_base + skip; - copy = min(bytes, iov->iov_len - skip); - - if (!fault_in_pages_writeable(buf, copy)) { - kaddr = kmap_atomic(page); - from = kaddr + offset; - - /* first chunk, usually the only one */ - left = __copy_to_user_inatomic(buf, from, copy); - copy -= left; - skip += copy; - from += copy; - bytes -= copy; - - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_to_user_inatomic(buf, from, copy); - copy -= left; - skip = copy; - from += copy; - bytes -= copy; - } - if (likely(!bytes)) { - kunmap_atomic(kaddr); - goto done; - } - offset = from - kaddr; - buf += copy; - kunmap_atomic(kaddr); - copy = min(bytes, iov->iov_len - skip); - } - /* Too bad - revert to non-atomic kmap */ - kaddr = kmap(page); - from = kaddr + offset; - left = __copy_to_user(buf, from, copy); - copy -= left; - skip += copy; - from += copy; - bytes -= copy; - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_to_user(buf, from, copy); - copy -= left; - skip = copy; - from += copy; - bytes -= copy; - } - kunmap(page); -done: - i->count -= wanted - bytes; - i->nr_segs -= iov - i->iov; - i->iov = iov; - i->iov_offset = skip; - return wanted - bytes; -} -EXPORT_SYMBOL(copy_page_to_iter); - -size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - size_t skip, copy, left, wanted; - const struct iovec *iov; - 
char __user *buf; - void *kaddr, *to; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - wanted = bytes; - iov = i->iov; - skip = i->iov_offset; - buf = iov->iov_base + skip; - copy = min(bytes, iov->iov_len - skip); - - if (!fault_in_pages_readable(buf, copy)) { - kaddr = kmap_atomic(page); - to = kaddr + offset; - - /* first chunk, usually the only one */ - left = __copy_from_user_inatomic(to, buf, copy); - copy -= left; - skip += copy; - to += copy; - bytes -= copy; - - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_from_user_inatomic(to, buf, copy); - copy -= left; - skip = copy; - to += copy; - bytes -= copy; - } - if (likely(!bytes)) { - kunmap_atomic(kaddr); - goto done; - } - offset = to - kaddr; - buf += copy; - kunmap_atomic(kaddr); - copy = min(bytes, iov->iov_len - skip); - } - /* Too bad - revert to non-atomic kmap */ - kaddr = kmap(page); - to = kaddr + offset; - left = __copy_from_user(to, buf, copy); - copy -= left; - skip += copy; - to += copy; - bytes -= copy; - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_from_user(to, buf, copy); - copy -= left; - skip = copy; - to += copy; - bytes -= copy; - } - kunmap(page); -done: - i->count -= wanted - bytes; - i->nr_segs -= iov - i->iov; - i->iov = iov; - i->iov_offset = skip; - return wanted - bytes; -} -EXPORT_SYMBOL(copy_page_from_iter); - /** * do_generic_file_read - generic file read routine * @filp: the file to read @@ -1912,31 +1756,60 @@ int generic_segment_checks(const struct iovec *iov, } EXPORT_SYMBOL(generic_segment_checks); +static ssize_t mapping_direct_IO(struct address_space *mapping, int rw, + struct kiocb *iocb, struct iov_iter *iter, + loff_t pos) +{ + if (iov_iter_has_iovec(iter)) + return mapping->a_ops->direct_IO(rw, iocb, iov_iter_iovec(iter), + pos, iter->nr_segs); + else if 
(iov_iter_has_bvec(iter)) + return mapping->a_ops->direct_IO_bvec(rw, iocb, + iov_iter_bvec(iter), pos, + iter->nr_segs); + else if (iov_iter_has_page(iter)) + return mapping->a_ops->direct_IO_page(rw, iocb, + iov_iter_page(iter), pos); + else + BUG(); +} + +static int file_read_iter_actor(read_descriptor_t *desc, struct page *page, + unsigned long offset, unsigned long size) +{ + struct iov_iter *iter = desc->arg.data; + unsigned long copied = 0; + + if (size > desc->count) + size = desc->count; + + copied = iov_iter_copy_to_user(page, iter, offset, size); + if (copied < size) + desc->error = -EFAULT; + + iov_iter_advance(iter, copied); + desc->count -= copied; + desc->written += copied; + + return copied; +} + + /** - * generic_file_aio_read - generic filesystem read routine + * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block - * @iov: io vector request - * @nr_segs: number of segments in the iovec + * @iov_iter: memory vector * @pos: current file position - * - * This is the "read()" routine for all filesystems - * that can use the page cache directly. 
*/ ssize_t -generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) { struct file *filp = iocb->ki_filp; - ssize_t retval; - unsigned long seg = 0; - size_t count; + read_descriptor_t desc; + ssize_t retval = 0; + size_t count = iov_iter_count(iter); loff_t *ppos = &iocb->ki_pos; - count = 0; - retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); - if (retval) - return retval; - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { loff_t size; @@ -1950,10 +1823,10 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, size = i_size_read(inode); if (pos < size) { retval = filemap_write_and_wait_range(mapping, pos, - pos + iov_length(iov, nr_segs) - 1); + pos + count - 1); if (!retval) { - retval = mapping->a_ops->direct_IO(READ, iocb, - iov, pos, nr_segs); + retval = mapping_direct_IO(mapping, READ, + iocb, iter, pos); } if (retval > 0) { *ppos = pos + retval; @@ -1975,42 +1848,49 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, } } - count = retval; - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; - loff_t offset = 0; + iov_iter_advance(iter, retval); - /* - * If we did a short DIO read we need to skip the section of the - * iov that we've already read data into. 
- */ - if (count) { - if (count > iov[seg].iov_len) { - count -= iov[seg].iov_len; - continue; - } - offset = count; - count = 0; - } + desc.written = 0; + desc.arg.data = iter; + desc.count = count; + desc.error = 0; + do_generic_file_read(filp, ppos, &desc, file_read_iter_actor); - desc.written = 0; - desc.arg.buf = iov[seg].iov_base + offset; - desc.count = iov[seg].iov_len - offset; - if (desc.count == 0) - continue; - desc.error = 0; - do_generic_file_read(filp, ppos, &desc, file_read_actor); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - if (desc.count > 0) - break; - } + retval += desc.written; + if (desc.error && !retval) + retval = desc.error; out: return retval; } +EXPORT_SYMBOL(generic_file_read_iter); + +/** + * generic_file_aio_read - generic filesystem read routine + * @iocb: kernel I/O control block + * @iov: io vector request + * @nr_segs: number of segments in the iovec + * @pos: current file position + * + * This is the "read()" routine for all filesystems + * that can use the page cache directly. 
+ */ +ssize_t +generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct iov_iter iter; + int ret; + size_t count; + + count = 0; + ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); + if (ret) + return ret; + + iov_iter_init(&iter, iov, nr_segs, count, 0); + + return generic_file_read_iter(iocb, &iter, pos); +} EXPORT_SYMBOL(generic_file_aio_read); #ifdef CONFIG_MMU @@ -2477,150 +2357,6 @@ struct page *read_cache_page(struct address_space *mapping, } EXPORT_SYMBOL(read_cache_page); -static size_t __iovec_copy_from_user_inatomic(char *vaddr, - const struct iovec *iov, size_t base, size_t bytes) -{ - size_t copied = 0, left = 0; - - while (bytes) { - char __user *buf = iov->iov_base + base; - int copy = min(bytes, iov->iov_len - base); - - base = 0; - left = __copy_from_user_inatomic(vaddr, buf, copy); - copied += copy; - bytes -= copy; - vaddr += copy; - iov++; - - if (unlikely(left)) - break; - } - return copied - left; -} - -/* - * Copy as much as we can into the page and return the number of bytes which - * were successfully copied. If a fault is encountered then return the number of - * bytes which were copied. - */ -size_t iov_iter_copy_from_user_atomic(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t copied; - - BUG_ON(!in_atomic()); - kaddr = kmap_atomic(page); - if (likely(i->nr_segs == 1)) { - int left; - char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); - copied = bytes - left; - } else { - copied = __iovec_copy_from_user_inatomic(kaddr + offset, - i->iov, i->iov_offset, bytes); - } - kunmap_atomic(kaddr); - - return copied; -} -EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); - -/* - * This has the same sideeffects and return value as - * iov_iter_copy_from_user_atomic(). - * The difference is that it attempts to resolve faults. - * Page must not be locked. 
- */ -size_t iov_iter_copy_from_user(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t copied; - - kaddr = kmap(page); - if (likely(i->nr_segs == 1)) { - int left; - char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user(kaddr + offset, buf, bytes); - copied = bytes - left; - } else { - copied = __iovec_copy_from_user_inatomic(kaddr + offset, - i->iov, i->iov_offset, bytes); - } - kunmap(page); - return copied; -} -EXPORT_SYMBOL(iov_iter_copy_from_user); - -void iov_iter_advance(struct iov_iter *i, size_t bytes) -{ - BUG_ON(i->count < bytes); - - if (likely(i->nr_segs == 1)) { - i->iov_offset += bytes; - i->count -= bytes; - } else { - const struct iovec *iov = i->iov; - size_t base = i->iov_offset; - unsigned long nr_segs = i->nr_segs; - - /* - * The !iov->iov_len check ensures we skip over unlikely - * zero-length segments (without overruning the iovec). - */ - while (bytes || unlikely(i->count && !iov->iov_len)) { - int copy; - - copy = min(bytes, iov->iov_len - base); - BUG_ON(!i->count || i->count < copy); - i->count -= copy; - bytes -= copy; - base += copy; - if (iov->iov_len == base) { - iov++; - nr_segs--; - base = 0; - } - } - i->iov = iov; - i->iov_offset = base; - i->nr_segs = nr_segs; - } -} -EXPORT_SYMBOL(iov_iter_advance); - -/* - * Fault in the first iovec of the given iov_iter, to a maximum length - * of bytes. Returns 0 on success, or non-zero if the memory could not be - * accessed (ie. because it is an invalid address). - * - * writev-intensive code may want this to prefault several iovecs -- that - * would be possible (callers must not rely on the fact that _only_ the - * first iovec will be faulted with the current implementation). 
- */ -int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) -{ - char __user *buf = i->iov->iov_base + i->iov_offset; - bytes = min(bytes, i->iov->iov_len - i->iov_offset); - return fault_in_pages_readable(buf, bytes); -} -EXPORT_SYMBOL(iov_iter_fault_in_readable); - -/* - * Return the count of just the current iov_iter segment. - */ -size_t iov_iter_single_seg_count(const struct iov_iter *i) -{ - const struct iovec *iov = i->iov; - if (i->nr_segs == 1) - return i->count; - else - return min(i->count, iov->iov_len - i->iov_offset); -} -EXPORT_SYMBOL(iov_iter_single_seg_count); - /* * Performs necessary checks before doing a write * @@ -2726,9 +2462,8 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, EXPORT_SYMBOL(pagecache_write_end); ssize_t -generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long *nr_segs, loff_t pos, loff_t *ppos, - size_t count, size_t ocount) +generic_file_direct_write_iter(struct kiocb *iocb, struct iov_iter *iter, + loff_t pos, loff_t *ppos, size_t count) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -2737,10 +2472,13 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, size_t write_len; pgoff_t end; - if (count != ocount) - *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); + if (count != iov_iter_count(iter)) { + written = iov_iter_shorten(iter, count); + if (written) + goto out; + } - write_len = iov_length(iov, *nr_segs); + write_len = count; end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); @@ -2767,7 +2505,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } } - written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); + written = mapping_direct_IO(mapping, WRITE, iocb, iter, pos); /* * Finally, try again to invalidate clean pages which might have been @@ -2793,6 +2531,23 @@ 
generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, out: return written; } +EXPORT_SYMBOL(generic_file_direct_write_iter); + +ssize_t +generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long *nr_segs, loff_t pos, loff_t *ppos, + size_t count, size_t ocount) +{ + struct iov_iter iter; + ssize_t ret; + + iov_iter_init(&iter, iov, *nr_segs, ocount, 0); + ret = generic_file_direct_write_iter(iocb, &iter, pos, ppos, count); + /* generic_file_direct_write_iter() might have shortened the vec */ + if (*nr_segs != iter.nr_segs) + *nr_segs = iter.nr_segs; + return ret; +} EXPORT_SYMBOL(generic_file_direct_write); /* @@ -2926,18 +2681,15 @@ again: } ssize_t -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, loff_t *ppos, - size_t count, ssize_t written) +generic_file_buffered_write_iter(struct kiocb *iocb, struct iov_iter *iter, + loff_t pos, loff_t *ppos, ssize_t written) { struct file *file = iocb->ki_filp; ssize_t status; - struct iov_iter i; virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL); - iov_iter_init(&i, iov, nr_segs, count, written); - status = generic_perform_write(file, &i, pos); + status = generic_perform_write(file, iter, pos); if (likely(status >= 0)) { written += status; @@ -2946,13 +2698,24 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, return written ? written : status; } +EXPORT_SYMBOL(generic_file_buffered_write_iter); + +ssize_t +generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, loff_t *ppos, + size_t count, ssize_t written) +{ + struct iov_iter iter; + iov_iter_init(&iter, iov, nr_segs, count, written); + return generic_file_buffered_write_iter(iocb, &iter, pos, ppos, + written); +} EXPORT_SYMBOL(generic_file_buffered_write); /** * __generic_file_aio_write - write data to a file * @iocb: IO state structure (file, offset, etc.) 
- * @iov: vector with data to write - * @nr_segs: number of segments in the vector + * @iter: iov_iter specifying memory to write * @ppos: position where to write * * This function does all the work needed for actually writing data to a @@ -2967,24 +2730,18 @@ EXPORT_SYMBOL(generic_file_buffered_write); * A caller has to handle it. This is mainly due to the fact that we want to * avoid syncing under i_mutex. */ -ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *iter, + loff_t *ppos) { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; - size_t ocount; /* original count */ size_t count; /* after file limit checks */ struct inode *inode = mapping->host; loff_t pos; ssize_t written; ssize_t err; - ocount = 0; - err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (err) - return err; - - count = ocount; + count = iov_iter_count(iter); pos = *ppos; /* We can write back this queue in page reclaim */ @@ -3011,8 +2768,8 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, loff_t endbyte; ssize_t written_buffered; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, - ppos, count, ocount); + written = generic_file_direct_write_iter(iocb, iter, pos, + ppos, count); if (written < 0 || written == count) goto out; /* @@ -3021,9 +2778,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, */ pos += written; count -= written; - written_buffered = generic_file_buffered_write(iocb, iov, - nr_segs, pos, ppos, count, - written); + iov_iter_advance(iter, written); + written_buffered = generic_file_buffered_write_iter(iocb, iter, + pos, ppos, written); /* * If generic_file_buffered_write() retuned a synchronous error * then we want to return the number of bytes which were @@ -3055,13 +2812,57 @@ ssize_t 
__generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, */ } } else { - written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, ppos, count, written); + iter->count = count; + written = generic_file_buffered_write_iter(iocb, iter, + pos, ppos, written); } out: current->backing_dev_info = NULL; return written ? written : err; } +EXPORT_SYMBOL(__generic_file_write_iter); + +ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *iter, + loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + mutex_lock(&inode->i_mutex); + ret = __generic_file_write_iter(iocb, iter, &iocb->ki_pos); + mutex_unlock(&inode->i_mutex); + + if (ret > 0 || ret == -EIOCBQUEUED) { + ssize_t err; + + err = generic_write_sync(file, pos, ret); + if (err < 0 && ret > 0) + ret = err; + } + return ret; +} +EXPORT_SYMBOL(generic_file_write_iter); + +ssize_t +__generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct iov_iter iter; + size_t count; + int ret; + + count = 0; + ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); + if (ret) + goto out; + + iov_iter_init(&iter, iov, nr_segs, count, 0); + + ret = __generic_file_write_iter(iocb, &iter, ppos); +out: + return ret; +} EXPORT_SYMBOL(__generic_file_aio_write); /** diff --git a/mm/iov-iter.c b/mm/iov-iter.c new file mode 100644 index 0000000..e6fc15a --- /dev/null +++ b/mm/iov-iter.c @@ -0,0 +1,474 @@ +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/uio.h> +#include <linux/hardirq.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/bio.h> + +static size_t __iovec_copy_to_user_inatomic(char *vaddr, + const struct iovec *iov, size_t base, size_t bytes) +{ + size_t copied = 0, left = 0; + + while (bytes) { + char __user *buf = iov->iov_base + base; + int copy = min(bytes, iov->iov_len - base); + + base = 0; + 
left = __copy_to_user_inatomic(buf, vaddr, copy); + copied += copy; + bytes -= copy; + vaddr += copy; + iov++; + + if (unlikely(left)) + break; + } + return copied - left; +} + +/* + * Copy as much as we can from the page to userspace and return the number of + * bytes which were successfully copied. If a fault is encountered then return + * the number of bytes which were copied. + */ +static size_t ii_iovec_copy_to_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + struct iovec *iov = (struct iovec *)i->data; + char *kaddr; + size_t copied; + + BUG_ON(!in_atomic()); + kaddr = kmap_atomic(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = iov->iov_base + i->iov_offset; + left = __copy_to_user_inatomic(buf, kaddr + offset, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_to_user_inatomic(kaddr + offset, + iov, i->iov_offset, bytes); + } + kunmap_atomic(kaddr); + + return copied; +} + +/* + * This has the same side effects and return value as + * ii_iovec_copy_to_user_atomic(). + * The difference is that it attempts to resolve faults. + * Page must not be locked. 
+ */ +static size_t ii_iovec_copy_to_user(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + struct iovec *iov = (struct iovec *)i->data; + char *kaddr; + size_t copied; + + kaddr = kmap(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = iov->iov_base + i->iov_offset; + left = copy_to_user(buf, kaddr + offset, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_to_user_inatomic(kaddr + offset, + iov, i->iov_offset, bytes); + } + kunmap(page); + return copied; +} + + +static size_t __iovec_copy_from_user_inatomic(char *vaddr, + const struct iovec *iov, size_t base, size_t bytes) +{ + size_t copied = 0, left = 0; + + while (bytes) { + char __user *buf = iov->iov_base + base; + int copy = min(bytes, iov->iov_len - base); + + base = 0; + left = __copy_from_user_inatomic(vaddr, buf, copy); + copied += copy; + bytes -= copy; + vaddr += copy; + iov++; + + if (unlikely(left)) + break; + } + return copied - left; +} + +/* + * Copy as much as we can into the page and return the number of bytes which + * were successfully copied. If a fault is encountered then return the number of + * bytes which were copied. + */ +static size_t ii_iovec_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + struct iovec *iov = (struct iovec *)i->data; + char *kaddr; + size_t copied; + + BUG_ON(!in_atomic()); + kaddr = kmap_atomic(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = iov->iov_base + i->iov_offset; + left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + iov, i->iov_offset, bytes); + } + kunmap_atomic(kaddr); + + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); + +/* + * This has the same side effects and return value as + * ii_iovec_copy_from_user_atomic(). + * The difference is that it attempts to resolve faults. 
+ * Page must not be locked. + */ +static size_t ii_iovec_copy_from_user(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + struct iovec *iov = (struct iovec *)i->data; + char *kaddr; + size_t copied; + + kaddr = kmap(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = iov->iov_base + i->iov_offset; + left = __copy_from_user(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + iov, i->iov_offset, bytes); + } + kunmap(page); + return copied; +} + +static void ii_iovec_advance(struct iov_iter *i, size_t bytes) +{ + BUG_ON(i->count < bytes); + + if (likely(i->nr_segs == 1)) { + i->iov_offset += bytes; + i->count -= bytes; + } else { + struct iovec *iov = (struct iovec *)i->data; + size_t base = i->iov_offset; + unsigned long nr_segs = i->nr_segs; + + /* + * The !iov->iov_len check ensures we skip over unlikely + * zero-length segments (without overrunning the iovec). + */ + while (bytes || unlikely(i->count && !iov->iov_len)) { + int copy; + + copy = min(bytes, iov->iov_len - base); + BUG_ON(!i->count || i->count < copy); + i->count -= copy; + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + nr_segs--; + base = 0; + } + } + i->data = (unsigned long)iov; + i->iov_offset = base; + i->nr_segs = nr_segs; + } +} + +/* + * Fault in the first iovec of the given iov_iter, to a maximum length + * of bytes. Returns 0 on success, or non-zero if the memory could not be + * accessed (i.e. because it is an invalid address). + * + * writev-intensive code may want this to prefault several iovecs -- that + * would be possible (callers must not rely on the fact that _only_ the + * first iovec will be faulted with the current implementation). 
+ */ +static int ii_iovec_fault_in_readable(struct iov_iter *i, size_t bytes) +{ + struct iovec *iov = (struct iovec *)i->data; + char __user *buf = iov->iov_base + i->iov_offset; + bytes = min(bytes, iov->iov_len - i->iov_offset); + return fault_in_pages_readable(buf, bytes); +} + +/* + * Return the count of just the current iov_iter segment. + */ +static size_t ii_iovec_single_seg_count(const struct iov_iter *i) +{ + struct iovec *iov = (struct iovec *)i->data; + if (i->nr_segs == 1) + return i->count; + else + return min(i->count, iov->iov_len - i->iov_offset); +} + +static int ii_iovec_shorten(struct iov_iter *i, size_t count) +{ + struct iovec *iov = (struct iovec *)i->data; + i->nr_segs = iov_shorten(iov, i->nr_segs, count); + return 0; +} + +struct iov_iter_ops ii_iovec_ops = { + .ii_copy_to_user_atomic = ii_iovec_copy_to_user_atomic, + .ii_copy_to_user = ii_iovec_copy_to_user, + .ii_copy_from_user_atomic = ii_iovec_copy_from_user_atomic, + .ii_copy_from_user = ii_iovec_copy_from_user, + .ii_advance = ii_iovec_advance, + .ii_fault_in_readable = ii_iovec_fault_in_readable, + .ii_single_seg_count = ii_iovec_single_seg_count, + .ii_shorten = ii_iovec_shorten, +}; +EXPORT_SYMBOL(ii_iovec_ops); + +/* + * As an easily verifiable first pass, we implement all the methods that + * copy data to and from bvec pages with one function. We implement it + * all with kmap_atomic(). 
+ */ +static size_t bvec_copy_tofrom_page(struct iov_iter *iter, struct page *page, + unsigned long page_offset, size_t bytes, + int topage) +{ + struct bio_vec *bvec = (struct bio_vec *)iter->data; + size_t bvec_offset = iter->iov_offset; + size_t remaining = bytes; + void *bvec_map; + void *page_map; + size_t copy; + + page_map = kmap_atomic(page); + + BUG_ON(bytes > iter->count); + while (remaining) { + BUG_ON(bvec->bv_len == 0); + BUG_ON(bvec_offset >= bvec->bv_len); + copy = min(remaining, bvec->bv_len - bvec_offset); + bvec_map = kmap_atomic(bvec->bv_page); + if (topage) + memcpy(page_map + page_offset, + bvec_map + bvec->bv_offset + bvec_offset, + copy); + else + memcpy(bvec_map + bvec->bv_offset + bvec_offset, + page_map + page_offset, + copy); + kunmap_atomic(bvec_map); + remaining -= copy; + bvec_offset += copy; + page_offset += copy; + if (bvec_offset == bvec->bv_len) { + bvec_offset = 0; + bvec++; + } + } + + kunmap_atomic(page_map); + + return bytes; +} + +size_t ii_bvec_copy_to_user_atomic(struct page *page, struct iov_iter *i, + unsigned long offset, size_t bytes) +{ + return bvec_copy_tofrom_page(i, page, offset, bytes, 0); +} +size_t ii_bvec_copy_to_user(struct page *page, struct iov_iter *i, + unsigned long offset, size_t bytes) +{ + return bvec_copy_tofrom_page(i, page, offset, bytes, 0); +} +size_t ii_bvec_copy_from_user_atomic(struct page *page, struct iov_iter *i, + unsigned long offset, size_t bytes) +{ + return bvec_copy_tofrom_page(i, page, offset, bytes, 1); +} +size_t ii_bvec_copy_from_user(struct page *page, struct iov_iter *i, + unsigned long offset, size_t bytes) +{ + return bvec_copy_tofrom_page(i, page, offset, bytes, 1); +} + +/* + * bio_vecs have a stricter structure than iovecs that might have + * come from userspace. There are no zero length bio_vec elements. 
+ */ +void ii_bvec_advance(struct iov_iter *i, size_t bytes) +{ + struct bio_vec *bvec = (struct bio_vec *)i->data; + size_t offset = i->iov_offset; + size_t delta; + + BUG_ON(i->count < bytes); + while (bytes) { + BUG_ON(bvec->bv_len == 0); + BUG_ON(bvec->bv_len <= offset); + delta = min(bytes, bvec->bv_len - offset); + offset += delta; + i->count -= delta; + bytes -= delta; + if (offset == bvec->bv_len) { + bvec++; + offset = 0; + } + } + + i->data = (unsigned long)bvec; + i->iov_offset = offset; +} + +/* + * pages pointed to by bio_vecs are always pinned. + */ +int ii_bvec_fault_in_readable(struct iov_iter *i, size_t bytes) +{ + return 0; +} + +size_t ii_bvec_single_seg_count(const struct iov_iter *i) +{ + const struct bio_vec *bvec = (struct bio_vec *)i->data; + if (i->nr_segs == 1) + return i->count; + else + return min(i->count, bvec->bv_len - i->iov_offset); +} + +static int ii_bvec_shorten(struct iov_iter *i, size_t count) +{ + return -EINVAL; +} + +struct iov_iter_ops ii_bvec_ops = { + .ii_copy_to_user_atomic = ii_bvec_copy_to_user_atomic, + .ii_copy_to_user = ii_bvec_copy_to_user, + .ii_copy_from_user_atomic = ii_bvec_copy_from_user_atomic, + .ii_copy_from_user = ii_bvec_copy_from_user, + .ii_advance = ii_bvec_advance, + .ii_fault_in_readable = ii_bvec_fault_in_readable, + .ii_single_seg_count = ii_bvec_single_seg_count, + .ii_shorten = ii_bvec_shorten, +}; +EXPORT_SYMBOL(ii_bvec_ops); + +/* Functions to get on with single page */ + +static size_t page_copy_tofrom_page(struct iov_iter *iter, struct page *page, + unsigned long page_offset, size_t bytes, + int topage) +{ + struct page *ipage = (struct page *)iter->data; + size_t ipage_offset = iter->iov_offset; + void *ipage_map; + void *page_map; + + BUG_ON(bytes > iter->count); + BUG_ON(bytes > PAGE_SIZE - ipage_offset); + BUG_ON(ipage_offset >= PAGE_SIZE); + + page_map = kmap_atomic(page); + ipage_map = kmap_atomic(ipage); + + if (topage) + memcpy(page_map + page_offset, + ipage_map + ipage_offset, + 
bytes); + else + memcpy(ipage_map + ipage_offset, + page_map + page_offset, + bytes); + + kunmap_atomic(ipage_map); + kunmap_atomic(page_map); + + return bytes; +} + +size_t ii_page_copy_to_user_atomic(struct page *page, struct iov_iter *i, + unsigned long offset, size_t bytes) +{ + return page_copy_tofrom_page(i, page, offset, bytes, 0); +} +size_t ii_page_copy_to_user(struct page *page, struct iov_iter *i, + unsigned long offset, size_t bytes) +{ + return page_copy_tofrom_page(i, page, offset, bytes, 0); +} +size_t ii_page_copy_from_user_atomic(struct page *page, struct iov_iter *i, + unsigned long offset, size_t bytes) +{ + return page_copy_tofrom_page(i, page, offset, bytes, 1); +} +size_t ii_page_copy_from_user(struct page *page, struct iov_iter *i, + unsigned long offset, size_t bytes) +{ + return page_copy_tofrom_page(i, page, offset, bytes, 1); +} + +void ii_page_advance(struct iov_iter *i, size_t bytes) +{ + BUG_ON(i->count < bytes); + BUG_ON(i->iov_offset >= PAGE_SIZE); + BUG_ON(bytes > PAGE_SIZE - i->iov_offset); + + i->iov_offset += bytes; + i->count -= bytes; +} + +/* + * pages pointed to by bio_vecs are always pinned. 
+ */ +int ii_page_fault_in_readable(struct iov_iter *i, size_t bytes) +{ + return 0; +} + +size_t ii_page_single_seg_count(const struct iov_iter *i) +{ + BUG_ON(i->nr_segs != 1); + + return i->count; +} + +static int ii_page_shorten(struct iov_iter *i, size_t count) +{ + return -EINVAL; +} + +struct iov_iter_ops ii_page_ops = { + .ii_copy_to_user_atomic = ii_page_copy_to_user_atomic, + .ii_copy_to_user = ii_page_copy_to_user, + .ii_copy_from_user_atomic = ii_page_copy_from_user_atomic, + .ii_copy_from_user = ii_page_copy_from_user, + .ii_advance = ii_page_advance, + .ii_fault_in_readable = ii_page_fault_in_readable, + .ii_single_seg_count = ii_page_single_seg_count, + .ii_shorten = ii_page_shorten, +}; +EXPORT_SYMBOL(ii_page_ops); _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel