Hi Alex,

On 06/04/2017 16:43, Alex Williamson wrote:
> On Thu, 6 Apr 2017 10:23:59 +0200
> Auger Eric <eric.au...@redhat.com> wrote:
> 
>> Hi Alex,
>>
>> On 03/04/2017 22:02, Alex Williamson wrote:
>>> If the mmap_sem is contended then the vfio type1 IOMMU backend will
>>> defer locked page accounting updates to a workqueue task.  This has
>>> a few problems and depending on which side the user tries to play,
>>> they might be over-penalized for unmaps that haven't yet been
>>> accounted, or able to race the workqueue to enter more mappings
>>> than they're allowed.  It's not entirely clear what motivated this
>>> workqueue mechanism in the original vfio design, but it seems to
>>> introduce more problems than it solves, so remove it and update the
>>> callers to allow for failure.  We can also now recheck the limit
>>> under write lock to make sure we don't exceed it.
>>>
>>> Cc: sta...@vger.kernel.org
>>> Signed-off-by: Alex Williamson <alex.william...@redhat.com>
>>> ---
>>>
>>> Sergio had proposed a QEMU workaround for this:
>>> https://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg00244.html
>>> Clearly the bug is in the kernel and I'm more inclined to fix it via
>>> stable releases.  I also considered adding a flag in the type1 info
>>> structure to indicate synchronous lock accounting, but then second
>>> guessed using that to advertise the defect, especially if the workaround
>>> is only to pause and try again.  Comments/suggestions?  Thanks,
>>>
>>> Alex
>>>
>>>  drivers/vfio/vfio_iommu_type1.c |   99 ++++++++++++++++++---------------------
>>>  1 file changed, 45 insertions(+), 54 deletions(-)
>>>
>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>> index 32d2633092a3..d93a88748d14 100644
>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>> @@ -246,69 +246,45 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
>>>     return ret;
>>>  }
>>>  
>>> -struct vwork {
>>> -   struct mm_struct        *mm;
>>> -   long                    npage;
>>> -   struct work_struct      work;
>>> -};
>>> -
>>> -/* delayed decrement/increment for locked_vm */
>>> -static void vfio_lock_acct_bg(struct work_struct *work)
>>> +static int vfio_lock_acct(struct task_struct *task, long npage)
>>>  {
>>> -   struct vwork *vwork = container_of(work, struct vwork, work);
>>> -   struct mm_struct *mm;
>>> -
>>> -   mm = vwork->mm;
>>> -   down_write(&mm->mmap_sem);
>>> -   mm->locked_vm += vwork->npage;
>>> -   up_write(&mm->mmap_sem);
>>> -   mmput(mm);
>>> -   kfree(vwork);
>>> -}
>>> -
>>> -static void vfio_lock_acct(struct task_struct *task, long npage)
>>> -{
>>> -   struct vwork *vwork;
>>>     struct mm_struct *mm;
>>>     bool is_current;
>>> +   int ret;
>>>  
>>>     if (!npage)
>>> -           return;
>>> +           return 0;
>>>  
>>>     is_current = (task->mm == current->mm);
>>>  
>>>     mm = is_current ? task->mm : get_task_mm(task);
>>>     if (!mm)
>>> -           return; /* process exited */
>>> +           return -ESRCH; /* process exited */
>>>  
>>> -   if (down_write_trylock(&mm->mmap_sem)) {
>>> +   ret = down_write_killable(&mm->mmap_sem);
>>> +   if (ret)  
>> don't you miss mmput(mm) if (!is_current)?
> 
> Yes!  I'm going to change this to if (!ret) {...
>  
>>> +           return ret;
> 
> Remove this
> 
>>> +
>>> +   if (npage < 0) {
>>>             mm->locked_vm += npage;
>>> -           up_write(&mm->mmap_sem);
>>> -           if (!is_current)
>>> -                   mmput(mm);
>>> -           return;
>>> -   }
>>> +   } else {
>>> +           unsigned long limit;
>>> +
>>> +           limit = is_current ? rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT :
>>> +                           task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>>>  
>>> -   if (is_current) {
>>> -           mm = get_task_mm(task);
>>> -           if (!mm)
>>> -                   return;
>>> +           if (mm->locked_vm + npage <= limit)
>>> +                   mm->locked_vm += npage;
>>> +           else
>>> +                   ret = -ENOMEM;
>>>     }
>>>  
>>> -   /*
>>> -    * Couldn't get mmap_sem lock, so must setup to update
>>> -    * mm->locked_vm later. If locked_vm were atomic, we
>>> -    * wouldn't need this silliness
>>> -    */
>>> -   vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
>>> -   if (WARN_ON(!vwork)) {
>>> +   up_write(&mm->mmap_sem);
> 
> And end the (!ret) branch here }
> 
> So if we don't get mmap_sem, we skip to here, mmput, and return the
> error.
> 
>>> +
>>> +   if (!is_current)
>>>             mmput(mm);
>>> -           return;
>>> -   }
>>> -   INIT_WORK(&vwork->work, vfio_lock_acct_bg);
>>> -   vwork->mm = mm;
>>> -   vwork->npage = npage;
>>> -   schedule_work(&vwork->work);
>>> +
>>> +   return ret;
>>>  }
>>>  
>>>  /*
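Just to check that I follow the rework (moving the update under the
"if (!ret)" branch so that the mmput() is always reached), the function
would then look roughly like below? Untested sketch, only to confirm my
understanding:

static int vfio_lock_acct(struct task_struct *task, long npage)
{
	struct mm_struct *mm;
	bool is_current;
	int ret;

	if (!npage)
		return 0;

	is_current = (task->mm == current->mm);

	mm = is_current ? task->mm : get_task_mm(task);
	if (!mm)
		return -ESRCH; /* process exited */

	ret = down_write_killable(&mm->mmap_sem);
	if (!ret) {
		if (npage < 0) {
			mm->locked_vm += npage;
		} else {
			unsigned long limit;

			limit = is_current ? rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT :
					task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;

			/* recheck the limit under the write lock */
			if (mm->locked_vm + npage <= limit)
				mm->locked_vm += npage;
			else
				ret = -ENOMEM;
		}

		up_write(&mm->mmap_sem);
	}

	/* now reached even if down_write_killable() fails or we hit -ENOMEM */
	if (!is_current)
		mmput(mm);

	return ret;
}
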
>>> @@ -405,7 +381,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>>>  static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>>>                               long npage, unsigned long *pfn_base)
>>>  {
>>> -   unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>>> +   unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>>>     bool lock_cap = capable(CAP_IPC_LOCK);
>>>     long ret, pinned = 0, lock_acct = 0;
>>>     bool rsvd;
>>> @@ -442,8 +418,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>>>     /* Lock all the consecutive pages from pfn_base */
>>>     for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
>>>          pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
>>> -           unsigned long pfn = 0;
>>> -
>>>             ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
>>>             if (ret)
>>>                     break;
>>> @@ -460,14 +434,25 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>>>                             put_pfn(pfn, dma->prot);
>>>                             pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>>>                                     __func__, limit << PAGE_SHIFT);
>>> -                           break;
>>> +                           ret = -ENOMEM;
>>> +                           goto unpin_out;
>>>                     }
>>>                     lock_acct++;
>>>             }
>>>     }
>>>  
>>>  out:
>>> -   vfio_lock_acct(current, lock_acct);
>>> +   ret = vfio_lock_acct(current, lock_acct);
>>> +
>>> +unpin_out:
>>> +   if (ret) {
>>> +           if (!rsvd) {
>>> +                   for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
>>> +                           put_pfn(pfn, dma->prot);
>>> +           }
>>> +
>>> +           return ret;
>>> +   }  
>> not especially related to that patch, but for my understanding I have
>> some questions:
>> - in case vfio_pin_pages_remote() attempts to pin n pages and, when
>> scanning the pfns associated with [vaddr, vaddr+n], we get an error
>> (vaddr_get_pfn, pfn discontinuity, pfn is resv), we break and lock_acct
>> the previously scanned pages without error. We have not pinned all the
>> pages, so why don't we report an error? The returned value is the number
>> of pinned pages but is never checked against the target, if I am not wrong.
> 
> vfio_pin_pages_remote() returns a base pfn and number of pages, which
> matches how we call iommu_map().  It doesn't take a list of pages, it
> takes an iova and physical address base and range.  Therefore
> vfio_pin_pages_remote() cannot continue past a discontinuity; we need
> to break the DMA chunk and map it through the iommu at that point.
> It's up to the caller of this function to iterate through the full
> range, see vfio_pin_map_dma().
Hum I missed "dma->size += npage << PAGE_SHIFT;"
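For my own notes, the caller pattern is then something like this
(abridged from my reading of vfio_pin_map_dma(), not the exact code):

	while (size) {
		/* pin the longest contiguous run from the current offset;
		 * stops at an error, a discontinuity or a rsvd page */
		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
					      size >> PAGE_SHIFT, &pfn);
		if (npage <= 0) {
			ret = (int)npage;
			break;
		}

		/* map that single contiguous chunk through the iommu */
		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
				     dma->prot);
		if (ret)
			break;	/* (unpin of this chunk elided here) */

		size -= npage << PAGE_SHIFT;
		dma->size += npage << PAGE_SHIFT;
	}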

>  
>> - Also during this scan we do not break if we discover the pfn is
>> externally pinned, right? Can't it occur?
> 
> Correct, and yes it can occur.  We don't account for the page in that
> case as it's already accounted for via the external mapping, but we do
> increase the page reference count so that both the external pinning
> and the iommu pinning each hold a reference.
ok
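So if I read the pinning loop right, the accounting test is something
like the below (paraphrasing the existing check, so maybe not exact):

		/*
		 * A page that is already externally pinned was charged to
		 * locked_vm when the external pin happened, so skip the
		 * accounting here; the extra reference from vaddr_get_pfn()
		 * is still taken, so both pinnings hold their own reference.
		 */
		if (!rsvd && !vfio_find_vpfn(dma, iova))
			lock_acct++;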
>  
>> - Couldn't we handle the rsvd case at the beginning? If the pfn_base is
>> rsvd then we won't do anything, if I get it right.
> 
> Can we necessarily assume that if any page within a range is rsvd that
> they all are?  I don't know.  What tricks might the user play with
> their address space to put reserved areas and non-reserved areas
> together to bypass lock limits?
Yes makes sense.

Thank you for the explanations!

Eric
> 
>>>  
>>>     return pinned;
>>>  }
>>> @@ -522,8 +507,14 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
>>>             goto pin_page_exit;
>>>     }
>>>  
>>> -   if (!rsvd && do_accounting)
>>> -           vfio_lock_acct(dma->task, 1);
>>> +   if (!rsvd && do_accounting) {
>>> +           ret = vfio_lock_acct(dma->task, 1);
>>> +           if (ret) {
>>> +                   put_pfn(*pfn_base, dma->prot);
>>> +                   goto pin_page_exit;
>>> +           }
>>> +   }
>>> +
>>>     ret = 1;  
>> not especially related to this patch, but as the returned value is not
>> "standard", i.e. 0 on success and <0 on failure, it might be helpful to
>> add kernel doc
> 
> Or better yet, make it more standard, which looks pretty easy to do.
> Thanks for the review.
> 
> Alex
> 
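Making it more standard sounds good to me. Just to illustrate the idea
(a sketch only, variable names recalled from the call site in
vfio_iommu_type1_pin_pages(), so possibly not exact): with a 0/-errno
convention the caller could simply do

	ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
				     do_accounting);
	if (ret)
		goto pin_unwind;
	/* success now means exactly one page pinned at phys_pfn[i] */

and no special kernel doc for the return convention would be needed.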
