On Wed, 2017-01-25 at 08:32 -0500, Jeff Layton wrote:
> Currently, iov_iter_get_pages_alloc will only ever operate on the first
> vector that iterate_all_kinds hands back. Most of the callers, however,
> would like as long a run of pages as possible, to allow for fewer but
> larger I/Os.
> 
> When the previous vector ends on a page boundary and the current one
> begins on one, we can continue to add more pages.
> 
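As an illustration (a made-up layout, just for this description): assume
buf0 and buf1 are page-aligned char * user buffers. The scan in the new
iov_iter_pvec_size() below would count the first two segments of this
array but not the third, so a single call can cover three pages:

	struct iovec iov[] = {
		/* ends on a page boundary; the initial offset doesn't matter */
		{ .iov_base = buf0 + 100, .iov_len = PAGE_SIZE - 100 },
		/* starts and ends page aligned, so it coalesces */
		{ .iov_base = buf1,       .iov_len = 2 * PAGE_SIZE },
		/* starts mid-page, so it is not counted */
		{ .iov_base = buf1 + 10,  .iov_len = 50 },
	};
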
> Change the function to first scan the iov_iter to see how large an
> array of pages could be built from the current position, up to the
> maxsize passed in. Then allocate an array of that size and start
> filling it in.
> 
> The main impetus for this patch is to rip out a swath of code in ceph
> that tries to do this but that doesn't handle ITER_BVEC correctly.
> 
> NFS also uses this function, so this change allows the client to do
> larger I/Os when userland passes down an array of page-aligned iovecs
> in an O_DIRECT request. It should also make splice writes into an
> O_DIRECT file on NFS use larger I/Os, since those are now done by
> passing down an ITER_BVEC representing the data to be written.
> 
> I believe the other callers (lustre and 9p) may also benefit from this
> change, but I don't have a great way to verify that.
> 
> Signed-off-by: Jeff Layton <[email protected]>
> ---
>  lib/iov_iter.c | 142 +++++++++++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 113 insertions(+), 29 deletions(-)
> 
> diff --git a/lib/iov_iter.c b/lib/iov_iter.c
> index e68604ae3ced..c87ba154371a 100644
> --- a/lib/iov_iter.c
> +++ b/lib/iov_iter.c
> @@ -883,6 +883,59 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
>  }
>  EXPORT_SYMBOL(iov_iter_gap_alignment);
>  
> +/**
> + * iov_iter_pvec_size - find length of page aligned iovecs in iov_iter
> + * @i: iov_iter in which to find the size
> + * @maxsize: maximum size to return
> + *
> + * Some filesystems can stitch together multiple iovecs into a single page
> + * vector when both the previous tail and current base are page aligned. This
> + * function determines how much of the remaining iov_iter can be stuffed into
> + * a single pagevec, up to the provided maxsize value.
> + */
> +static size_t iov_iter_pvec_size(const struct iov_iter *i, size_t maxsize)
> +{
> +     size_t size = min(iov_iter_count(i), maxsize);
> +     size_t pv_size = 0;
> +     bool contig = false, first = true;
> +
> +     if (!size)
> +             return 0;
> +
> +     /* Pipes are naturally aligned for this */
> +     if (unlikely(i->type & ITER_PIPE))
> +             return size;
> +
> +     /*
> +      * An iov can be page vectored when the current base and previous
> +      * tail are both page aligned. Note that we don't require that the
> +      * initial base in the first iovec also be page aligned.
> +      */
> +     iterate_all_kinds(i, size, v,
> +             ({
> +              if (first || (contig && PAGE_ALIGNED(v.iov_base))) {
> +                     pv_size += v.iov_len;
> +                     first = false;
> +                     contig = PAGE_ALIGNED(v.iov_base + v.iov_len);
> +              }; 0;
> +              }),
> +             ({
> +              if (first || (contig && v.bv_offset == 0)) {
> +                     pv_size += v.bv_len;
> +                     first = false;
> +                     contig = PAGE_ALIGNED(v.bv_offset + v.bv_len);
> +              }
> +              }),
> +             ({
> +              if (first || (contig && PAGE_ALIGNED(v.iov_base))) {
> +                     pv_size += v.iov_len;
> +                     first = false;
> +                     contig = PAGE_ALIGNED(v.iov_base + v.iov_len);
> +              }
> +              }))
> +     return pv_size;
> +}
> +
>  static inline size_t __pipe_get_pages(struct iov_iter *i,
>                               size_t maxsize,
>                               struct page **pages,
> @@ -1006,47 +1059,78 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
>  }
>  
>  ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
> -                struct page ***pages, size_t maxsize,
> -                size_t *start)
> +                struct page ***ppages, size_t maxsize,
> +                size_t *pstart)
>  {
> -     struct page **p;
> -
> -     if (maxsize > i->count)
> -             maxsize = i->count;
> +     struct page **p, **pc;
> +     size_t start = 0;
> +     ssize_t len = 0;
> +     int npages, res = 0;
> +     bool first = true;
>  
>       if (unlikely(i->type & ITER_PIPE))
> -             return pipe_get_pages_alloc(i, pages, maxsize, start);
> +             return pipe_get_pages_alloc(i, ppages, maxsize, pstart);
> +
> +     maxsize = iov_iter_pvec_size(i, maxsize);
> +     npages = DIV_ROUND_UP(maxsize, PAGE_SIZE);

Bah, the above is wrong when the offset into the first page is non-zero.
I'll fix it -- this probably also points to the need for some tests for
this code. I'll see if I can cook something up for xfstests.
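
Something like this, perhaps (just a sketch, and first_page_offset() is a
hypothetical helper, not something in this patch):

	/*
	 * Sketch only: add the offset of the current position within its
	 * first page before rounding up, so that e.g. a PAGE_SIZE request
	 * starting at offset 100 gets two page slots rather than one. For
	 * an iovec iterator, first_page_offset() would boil down to
	 * ((unsigned long)i->iov->iov_base + i->iov_offset) & (PAGE_SIZE - 1).
	 */
	maxsize = iov_iter_pvec_size(i, maxsize);
	npages = DIV_ROUND_UP(first_page_offset(i) + maxsize, PAGE_SIZE);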

> +     p = get_pages_array(npages);
> +     if (!p)
> +             return -ENOMEM;
> +
> +     pc = p;
>       iterate_all_kinds(i, maxsize, v, ({
>               unsigned long addr = (unsigned long)v.iov_base;
> -             size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
> +             size_t slen = v.iov_len;
>               int n;
> -             int res;
>  
> -             addr &= ~(PAGE_SIZE - 1);
> -             n = DIV_ROUND_UP(len, PAGE_SIZE);
> -             p = get_pages_array(n);
> -             if (!p)
> -                     return -ENOMEM;
> -             res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
> -             if (unlikely(res < 0)) {
> -                     kvfree(p);
> -                     return res;
> +             if (first) {
> +                     start = addr & (PAGE_SIZE - 1);
> +                     slen += start;
> +                     first = false;
>               }
> -             *pages = p;
> -             return (res == n ? len : res * PAGE_SIZE) - *start;
> +
> +             n = DIV_ROUND_UP(slen, PAGE_SIZE);
> +             if (pc + n > p + npages) {
> +                     /* Did something change the iov array?!? */
> +                     res = -EFAULT;
> +                     goto out;
> +             }
> +             addr &= ~(PAGE_SIZE - 1);
> +             res = get_user_pages_fast(addr, n,
> +                                       (i->type & WRITE) != WRITE, pc);
> +             if (unlikely(res < 0))
> +                     goto out;
> +             len += (res == n ? slen : res * PAGE_SIZE) - start;
> +             pc += res;
>       0;}),({
> -             /* can't be more than PAGE_SIZE */
> -             *start = v.bv_offset;
> -             *pages = p = get_pages_array(1);
> -             if (!p)
> -                     return -ENOMEM;
> -             get_page(*p = v.bv_page);
> -             return v.bv_len;
> +             /* bio_vecs are limited to a single page each */
> +             if (first) {
> +                     start = v.bv_offset;
> +                     first = false;
> +             }
> +             get_page(*pc = v.bv_page);
> +             len += v.bv_len;
> +             ++pc;
> +             BUG_ON(pc > p + npages);
>       }),({
> -             return -EFAULT;
> +             /* FIXME: should we handle this case? */
> +             res = -EFAULT;
> +             goto out;
>       })
>       )
> -     return 0;
> +out:
> +     if (unlikely(res < 0)) {
> +             struct page **i;
> +
> +             for (i = p; i < pc; i++)
> +                     put_page(*i);
> +             kvfree(p);
> +             return res;
> +     }
> +
> +     *ppages = p;
> +     *pstart = start;
> +     return len;
>  }
>  EXPORT_SYMBOL(iov_iter_get_pages_alloc);
>  
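
For anyone wiring this up in a filesystem, here's roughly how a caller is
expected to consume the result; the function name and flow below are
illustrative only, not lifted from ceph or NFS:

	static ssize_t demo_direct_io(struct iov_iter *iter, size_t maxsize)
	{
		struct page **pages;
		size_t start;
		ssize_t bytes;
		int i, npages;

		/* Take a reference on as many pages as the iterator allows. */
		bytes = iov_iter_get_pages_alloc(iter, &pages, maxsize, &start);
		if (bytes < 0)
			return bytes;

		npages = DIV_ROUND_UP(bytes + start, PAGE_SIZE);

		/* ... build and submit a single I/O covering npages pages ... */

		/* Drop the page references and free the array we were handed. */
		for (i = 0; i < npages; i++)
			put_page(pages[i]);
		kvfree(pages);
		return bytes;
	}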
