On Wed,  3 Feb 2021 15:47:56 -0500
Daniel Jordan <daniel.m.jor...@oracle.com> wrote:

> Pinning one 4K page at a time is inefficient, so do it in batches of 512
> instead.  This is just an optimization with no functional change
> intended, and in particular the driver still calls iommu_map() with the
> largest physically contiguous range possible.
> 
> Add two fields to vfio_batch to remember where to start between calls to
> vfio_pin_pages_remote(), and use vfio_batch_unpin() to drop any pages
> remaining in the batch in case of error.
> 
> qemu pins pages for guests around 8% faster on my test system, a
> two-node Broadwell server with 128G memory per node.  The qemu process
> was bound to one node with its allocations constrained there as well.
> 
>                              base               test
>           guest              ----------------   ----------------
>        mem (GB)   speedup    avg sec    (std)   avg sec    (std)
>               1      7.4%       0.61   (0.00)      0.56   (0.00)
>               2      8.3%       0.93   (0.00)      0.85   (0.00)
>               4      8.4%       1.46   (0.00)      1.34   (0.00)
>               8      8.6%       2.54   (0.01)      2.32   (0.00)
>              16      8.3%       4.66   (0.00)      4.27   (0.01)
>              32      8.3%       8.94   (0.01)      8.20   (0.01)
>              64      8.2%      17.47   (0.01)     16.04   (0.03)
>             120      8.5%      32.45   (0.13)     29.69   (0.01)
> 
> perf diff confirms less time spent in pin_user_pages ("pup").  Here are
> the top ten functions:
> 
>              Baseline  Delta Abs  Symbol
> 
>                78.63%     +6.64%  clear_page_erms
>                 1.50%     -1.50%  __gup_longterm_locked
>                 1.27%     -0.78%  __get_user_pages
>                           +0.76%  kvm_zap_rmapp.constprop.0
>                 0.54%     -0.53%  vmacache_find
>                 0.55%     -0.51%  get_pfnblock_flags_mask
>                 0.48%     -0.48%  __get_user_pages_remote
>                           +0.39%  slot_rmap_walk_next
>                           +0.32%  vfio_pin_map_dma
>                           +0.26%  kvm_handle_hva_range
>                 ...
> 
> Suggested-by: Matthew Wilcox (Oracle) <wi...@infradead.org>
> Signed-off-by: Daniel Jordan <daniel.m.jor...@oracle.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 133 +++++++++++++++++++++-----------
>  1 file changed, 88 insertions(+), 45 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index c26c1a4697e5..ac59bfc4e332 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -101,6 +101,8 @@ struct vfio_batch {
>       struct page             **pages;        /* for pin_user_pages_remote */
>       struct page             *fallback_page; /* if pages alloc fails */
>       int                     capacity;       /* length of pages array */
> +     int                     size;           /* of batch currently */
> +     int                     offset;         /* of next entry in pages */
>  };
>  
>  struct vfio_group {
> @@ -425,6 +427,9 @@ static int put_pfn(unsigned long pfn, int prot)
>  
>  static void vfio_batch_init(struct vfio_batch *batch)
>  {
> +     batch->size = 0;
> +     batch->offset = 0;
> +
>       if (unlikely(disable_hugepages))
>               goto fallback;
>  
> @@ -440,6 +445,17 @@ static void vfio_batch_init(struct vfio_batch *batch)
>       batch->capacity = 1;
>  }
>  
> +static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
> +{
> +     while (batch->size) {
> +             unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);
> +
> +             put_pfn(pfn, dma->prot);
> +             batch->offset++;
> +             batch->size--;
> +     }
> +}
> +
>  static void vfio_batch_fini(struct vfio_batch *batch)
>  {
>       if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
> @@ -526,65 +542,88 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>                                 long npage, unsigned long *pfn_base,
>                                 unsigned long limit, struct vfio_batch *batch)
>  {
> -     unsigned long pfn = 0;
> +     unsigned long pfn;
> +     struct mm_struct *mm = current->mm;
>       long ret, pinned = 0, lock_acct = 0;
>       bool rsvd;
>       dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
>  
>       /* This code path is only user initiated */
> -     if (!current->mm)
> +     if (!mm)
>               return -ENODEV;
>  
> -     ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, pfn_base,
> -                          batch->pages);
> -     if (ret < 0)
> -             return ret;
> -
> -     pinned++;
> -     rsvd = is_invalid_reserved_pfn(*pfn_base);
> -
> -     /*
> -      * Reserved pages aren't counted against the user, externally pinned
> -      * pages are already counted against the user.
> -      */
> -     if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> -             if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
> -                     put_pfn(*pfn_base, dma->prot);
> -                     pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> -                                     limit << PAGE_SHIFT);
> -                     return -ENOMEM;
> -             }
> -             lock_acct++;
> +     if (batch->size) {
> +             /* Leftover pages in batch from an earlier call. */
> +             *pfn_base = page_to_pfn(batch->pages[batch->offset]);
> +             pfn = *pfn_base;
> +             rsvd = is_invalid_reserved_pfn(*pfn_base);
> +     } else {
> +             *pfn_base = 0;
>       }
>  
> -     if (unlikely(disable_hugepages))
> -             goto out;
> +     while (npage) {
> +             if (!batch->size) {
> +                     /* Empty batch, so refill it. */
> +                     long req_pages = min_t(long, npage, batch->capacity);
>  
> -     /* Lock all the consecutive pages from pfn_base */
> -     for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
> -          pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
> -             ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, &pfn,
> -                                  batch->pages);
> -             if (ret < 0)
> -                     break;
> +                     ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
> +                                          &pfn, batch->pages);
> +                     if (ret < 0)
> +                             return ret;


This might not be the first batch we fill, so an error here would leak
the pages pinned by earlier iterations of the outer loop.  I think this
needs to unwind rather than return directly.
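
Perhaps something like the below (an untested sketch; it reuses the
function's existing unpin_out error path, which already drops the pages
counted in 'pinned' and now calls vfio_batch_unpin()):

			ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
					     &pfn, batch->pages);
			if (ret < 0)
				goto unpin_out;	/* unwind earlier batches too */

batch->size is zero when a refill fails, so vfio_batch_unpin() is a
no-op there and the put_pfn() loop releases everything pinned so far.

Series looks good otherwise.  Thanks,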

Alex 


>  
> -             if (pfn != *pfn_base + pinned ||
> -                 rsvd != is_invalid_reserved_pfn(pfn)) {
> -                     put_pfn(pfn, dma->prot);
> -                     break;
> +                     batch->size = ret;
> +                     batch->offset = 0;
> +
> +                     if (!*pfn_base) {
> +                             *pfn_base = pfn;
> +                             rsvd = is_invalid_reserved_pfn(*pfn_base);
> +                     }
>               }
>  
> -             if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> -                     if (!dma->lock_cap &&
> -                         current->mm->locked_vm + lock_acct + 1 > limit) {
> -                             put_pfn(pfn, dma->prot);
> -                             pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> -                                     __func__, limit << PAGE_SHIFT);
> -                             ret = -ENOMEM;
> -                             goto unpin_out;
> +             /*
> +              * pfn is preset for the first iteration of this inner loop and
> +              * updated at the end to handle a VM_PFNMAP pfn.  In that case,
> +              * batch->pages isn't valid (there's no struct page), so allow
> +              * batch->pages to be touched only when there's more than one
> +              * pfn to check, which guarantees the pfns are from a
> +              * !VM_PFNMAP vma.
> +              */
> +             while (true) {
> +                     if (pfn != *pfn_base + pinned ||
> +                         rsvd != is_invalid_reserved_pfn(pfn))
> +                             goto out;
> +
> +                     /*
> +                      * Reserved pages aren't counted against the user,
> +                      * externally pinned pages are already counted against
> +                      * the user.
> +                      */
> +                     if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> +                             if (!dma->lock_cap &&
> +                                 mm->locked_vm + lock_acct + 1 > limit) {
> +                                     pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> +                                             __func__, limit << PAGE_SHIFT);
> +                                     ret = -ENOMEM;
> +                                     goto unpin_out;
> +                             }
> +                             lock_acct++;
>                       }
> -                     lock_acct++;
> +
> +                     pinned++;
> +                     npage--;
> +                     vaddr += PAGE_SIZE;
> +                     iova += PAGE_SIZE;
> +                     batch->offset++;
> +                     batch->size--;
> +
> +                     if (!batch->size)
> +                             break;
> +
> +                     pfn = page_to_pfn(batch->pages[batch->offset]);
>               }
> +
> +             if (unlikely(disable_hugepages))
> +                     break;
>       }
>  
>  out:
> @@ -596,6 +635,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>                       for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
>                               put_pfn(pfn, dma->prot);
>               }
> +             vfio_batch_unpin(batch, dma);
>  
>               return ret;
>       }
> @@ -1305,6 +1345,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
>               if (ret) {
>                       vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
>                                               npage, true);
> +                     vfio_batch_unpin(&batch, dma);
>                       break;
>               }
>  
> @@ -1546,11 +1587,13 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>                       ret = iommu_map(domain->domain, iova, phys,
>                                       size, dma->prot | domain->prot);
>                       if (ret) {
> -                             if (!dma->iommu_mapped)
> +                             if (!dma->iommu_mapped) {
>                                       vfio_unpin_pages_remote(dma, iova,
>                                                       phys >> PAGE_SHIFT,
>                                                       size >> PAGE_SHIFT,
>                                                       true);
> +                                     vfio_batch_unpin(&batch, dma);
> +                             }
>                               goto unwind;
>                       }
>  
