Re: [PATCH v2 06/11] mm/hmm: improve driver API to work and wait over a range v2

2019-03-28 Thread Ira Weiny
On Thu, Mar 28, 2019 at 08:56:54PM -0400, Jerome Glisse wrote:
> On Thu, Mar 28, 2019 at 09:12:21AM -0700, Ira Weiny wrote:
> > On Mon, Mar 25, 2019 at 10:40:06AM -0400, Jerome Glisse wrote:
> > > From: Jérôme Glisse 
> > > 

[snip]

> > > +/*
> > > + * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
> > > + *
> > > + * When waiting for mmu notifiers we need some kind of timeout, otherwise we
> > > + * could potentially wait forever; 1000ms, i.e. 1s, already sounds like a
> > > + * long time to wait.
> > > + */
> > > +#define HMM_RANGE_DEFAULT_TIMEOUT 1000
> > > +
> > >  /* This is a temporary helper to avoid merge conflict between trees. */
> > > +static inline bool hmm_vma_range_done(struct hmm_range *range)
> > > +{
> > > + bool ret = hmm_range_valid(range);
> > > +
> > > + hmm_range_unregister(range);
> > > + return ret;
> > > +}
> > > +
> > >  static inline int hmm_vma_fault(struct hmm_range *range, bool block)
> > >  {
> > > - long ret = hmm_range_fault(range, block);
> > > - if (ret == -EBUSY)
> > > - ret = -EAGAIN;
> > > - else if (ret == -EAGAIN)
> > > - ret = -EBUSY;
> > > - return ret < 0 ? ret : 0;
> > > + long ret;
> > > +
> > > + ret = hmm_range_register(range, range->vma->vm_mm,
> > > +  range->start, range->end);
> > > + if (ret)
> > > + return (int)ret;
> > > +
> > > + if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
> > > + up_read(&range->vma->vm_mm->mmap_sem);
> > > + return -EAGAIN;
> > > + }
> > > +
> > > + ret = hmm_range_fault(range, block);
> > > + if (ret <= 0) {
> > > + if (ret == -EBUSY || !ret) {
> > > + up_read(&range->vma->vm_mm->mmap_sem);
> > > + ret = -EBUSY;
> > > + } else if (ret == -EAGAIN)
> > > + ret = -EBUSY;
> > > + hmm_range_unregister(range);
> > > + return ret;
> > > + }
> > > + return 0;
> > 
> > Is hmm_vma_fault() also temporary, to keep the nouveau driver working?  It
> > looks like it to me.
> > 
> > This and hmm_vma_range_done() above are part of the old interface which is in
> > the Documentation, correct?  As stated above, we should probably change that
> > documentation with this patch to ensure no new users of these 2 functions
> > appear.
> 
> OK, I will update the documentation. Note that I already posted patches that
> use this new API; see the ODP RDMA link in the cover letter.
> 

Thanks, and sorry for my previous email on this patch.  After looking more
closely I see that this is the old interface, but that was not clear, and I
have not had time to follow the previous threads.  I'm finding time to do so
now...

Sorry,
Ira
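
For readers following along, a minimal sketch of the fault path that the
temporary hmm_vma_fault() helper above wraps. The driver-side name
my_driver_fault() is hypothetical and the error handling is simplified; the
real helper also juggles mmap_sem as shown in the quoted diff.

#include <linux/hmm.h>

static int my_driver_fault(struct hmm_range *range, struct mm_struct *mm,
                           bool block)
{
        long ret;

        /* Register the range so invalidations are tracked against it. */
        ret = hmm_range_register(range, mm, range->start, range->end);
        if (ret)
                return (int)ret;

        /* Wait out any invalidation already in flight for this range. */
        if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
                hmm_range_unregister(range);
                return -EAGAIN;         /* mm is dying or we timed out */
        }

        /* Fault pages in; a return <= 0 means nothing usable was faulted. */
        ret = hmm_range_fault(range, block);
        if (ret <= 0) {
                hmm_range_unregister(range);
                return ret ? (int)ret : -EBUSY;
        }

        /* range->pfns is now populated; the caller programs the device. */
        return 0;
}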



Re: [PATCH v2 06/11] mm/hmm: improve driver API to work and wait over a range v2

2019-03-28 Thread Jerome Glisse
On Thu, Mar 28, 2019 at 09:12:21AM -0700, Ira Weiny wrote:
> On Mon, Mar 25, 2019 at 10:40:06AM -0400, Jerome Glisse wrote:
> > From: Jérôme Glisse 
> > 
> > A common use case for an HMM mirror is a user trying to mirror a range
> > that gets invalidated by some core mm event before they can program the
> > hardware. Instead of having the user retry right away, provide a
> > completion mechanism that lets them wait for any active invalidation
> > affecting the range.
> > 
> > This also changes how hmm_range_snapshot() and hmm_range_fault() work:
> > they no longer rely on the vma, so that we can drop the mmap_sem while
> > waiting and look up the vma again on retry.
> > 
> > Changes since v1:
> > - squashed: Dan Carpenter: potential deadlock in nonblocking code
> > 
> > Signed-off-by: Jérôme Glisse 
> > Reviewed-by: Ralph Campbell 
> > Cc: Andrew Morton 
> > Cc: John Hubbard 
> > Cc: Dan Williams 
> > Cc: Dan Carpenter 
> > Cc: Matthew Wilcox 
> > ---
> >  include/linux/hmm.h | 208 ++---
> >  mm/hmm.c            | 528 +---
> >  2 files changed, 428 insertions(+), 308 deletions(-)
> > 
> > diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> > index e9afd23c2eac..79671036cb5f 100644
> > --- a/include/linux/hmm.h
> > +++ b/include/linux/hmm.h
> > @@ -77,8 +77,34 @@
> >  #include <linux/migrate.h>
> >  #include <linux/memremap.h>
> >  #include <linux/completion.h>
> > +#include <linux/mmu_notifier.h>
> >  
> > -struct hmm;
> > +
> > +/*
> > + * struct hmm - HMM per mm struct
> > + *
> > + * @mm: mm struct this HMM struct is bound to
> > + * @lock: lock protecting the ranges list
> > + * @ranges: list of ranges being snapshotted
> > + * @mirrors: list of mirrors for this mm
> > + * @mmu_notifier: mmu notifier to track updates to CPU page table
> > + * @mirrors_sem: read/write semaphore protecting the mirrors list
> > + * @wq: wait queue for users waiting on a range invalidation
> > + * @notifiers: count of active mmu notifiers
> > + * @dead: is the mm dead?
> > + */
> > +struct hmm {
> > +    struct mm_struct        *mm;
> > +    struct kref             kref;
> > +    struct mutex            lock;
> > +    struct list_head        ranges;
> > +    struct list_head        mirrors;
> > +    struct mmu_notifier     mmu_notifier;
> > +    struct rw_semaphore     mirrors_sem;
> > +    wait_queue_head_t       wq;
> > +    long                    notifiers;
> > +    bool                    dead;
> > +};
> >  
> >  /*
> >   * hmm_pfn_flag_e - HMM flag enums
> > @@ -155,6 +181,38 @@ struct hmm_range {
> >  bool                    valid;
> >  };
> >  
> > +/*
> > + * hmm_range_wait_until_valid() - wait for range to be valid
> > + * @range: range affected by invalidation to wait on
> > + * @timeout: timeout for the wait in ms (i.e. abort the wait after that
> > + *           period of time)
> > + * Returns: true if the range is valid, false otherwise.
> > + */
> > +static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
> > + unsigned long timeout)
> > +{
> > +   /* Check if the mm is dead? */
> > +   if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) {
> > +   range->valid = false;
> > +   return false;
> > +   }
> > +   if (range->valid)
> > +   return true;
> > +   wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead,
> > +  msecs_to_jiffies(timeout));
> > +   /* Return current valid status just in case we get lucky */
> > +   return range->valid;
> > +}
> > +
> > +/*
> > + * hmm_range_valid() - test if a range is valid or not
> > + * @range: range
> > + * Returns: true if the range is valid, false otherwise.
> > + */
> > +static inline bool hmm_range_valid(struct hmm_range *range)
> > +{
> > +   return range->valid;
> > +}
> > +
> >  /*
> >   * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn
> >   * @range: range use to decode HMM pfn value
> > @@ -357,51 +415,133 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
> >  
> >  
> >  /*
> > - * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
> > - * driver lock that serializes device page table updates, then call
> > - * hmm_vma_range_done(), to check if the snapshot is still valid. The same
> > - * device driver page table update lock must also be used in the
> > - * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page
> > - * table invalidation serializes on it.
> > + * To snapshot the CPU page table you first have to call hmm_range_register()
> > + * to register the range. If hmm_range_register() returns an error then
> > + * something is horribly wrong and you should fail loudly. If it succeeded
> > + * then you can wait for the range to be stable with the
> > + * hmm_range_wait_until_valid() function; a range is valid when there are no
> > + * concurrent changes to the CPU page table for the range.
> > + *
> > + * Once the range is valid you can call hmm_range_snapshot(); if that 
>

Re: [PATCH v2 06/11] mm/hmm: improve driver API to work and wait over a range v2

2019-03-28 Thread Ira Weiny
On Mon, Mar 25, 2019 at 10:40:06AM -0400, Jerome Glisse wrote:
> From: Jérôme Glisse 
> 
> A common use case for an HMM mirror is a user trying to mirror a range
> that gets invalidated by some core mm event before they can program the
> hardware. Instead of having the user retry right away, provide a
> completion mechanism that lets them wait for any active invalidation
> affecting the range.
> 
> This also changes how hmm_range_snapshot() and hmm_range_fault() work:
> they no longer rely on the vma, so that we can drop the mmap_sem while
> waiting and look up the vma again on retry.
> 
> Changes since v1:
> - squashed: Dan Carpenter: potential deadlock in nonblocking code
> 
> Signed-off-by: Jérôme Glisse 
> Reviewed-by: Ralph Campbell 
> Cc: Andrew Morton 
> Cc: John Hubbard 
> Cc: Dan Williams 
> Cc: Dan Carpenter 
> Cc: Matthew Wilcox 
> ---
>  include/linux/hmm.h | 208 ++---
>  mm/hmm.c            | 528 +---
>  2 files changed, 428 insertions(+), 308 deletions(-)
> 
> diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> index e9afd23c2eac..79671036cb5f 100644
> --- a/include/linux/hmm.h
> +++ b/include/linux/hmm.h
> @@ -77,8 +77,34 @@
>  #include <linux/migrate.h>
>  #include <linux/memremap.h>
>  #include <linux/completion.h>
> +#include <linux/mmu_notifier.h>
>  
> -struct hmm;
> +
> +/*
> + * struct hmm - HMM per mm struct
> + *
> + * @mm: mm struct this HMM struct is bound to
> + * @lock: lock protecting the ranges list
> + * @ranges: list of ranges being snapshotted
> + * @mirrors: list of mirrors for this mm
> + * @mmu_notifier: mmu notifier to track updates to CPU page table
> + * @mirrors_sem: read/write semaphore protecting the mirrors list
> + * @wq: wait queue for users waiting on a range invalidation
> + * @notifiers: count of active mmu notifiers
> + * @dead: is the mm dead?
> + */
> +struct hmm {
> +    struct mm_struct        *mm;
> +    struct kref             kref;
> +    struct mutex            lock;
> +    struct list_head        ranges;
> +    struct list_head        mirrors;
> +    struct mmu_notifier     mmu_notifier;
> +    struct rw_semaphore     mirrors_sem;
> +    wait_queue_head_t       wq;
> +    long                    notifiers;
> +    bool                    dead;
> +};
>  
>  /*
>   * hmm_pfn_flag_e - HMM flag enums
> @@ -155,6 +181,38 @@ struct hmm_range {
>  bool                    valid;
>  };
>  
> +/*
> + * hmm_range_wait_until_valid() - wait for range to be valid
> + * @range: range affected by invalidation to wait on
> + * @timeout: timeout for the wait in ms (i.e. abort the wait after that
> + *           period of time)
> + * Returns: true if the range is valid, false otherwise.
> + */
> +static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
> +   unsigned long timeout)
> +{
> + /* Check if the mm is dead? */
> + if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) {
> + range->valid = false;
> + return false;
> + }
> + if (range->valid)
> + return true;
> + wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead,
> +msecs_to_jiffies(timeout));
> + /* Return current valid status just in case we get lucky */
> + return range->valid;
> +}
> +
> +/*
> + * hmm_range_valid() - test if a range is valid or not
> + * @range: range
> + * Returns: true if the range is valid, false otherwise.
> + */
> +static inline bool hmm_range_valid(struct hmm_range *range)
> +{
> + return range->valid;
> +}
> +
>  /*
>   * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn
>   * @range: range use to decode HMM pfn value
> @@ -357,51 +415,133 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
>  
>  
>  /*
> - * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
> - * driver lock that serializes device page table updates, then call
> - * hmm_vma_range_done(), to check if the snapshot is still valid. The same
> - * device driver page table update lock must also be used in the
> - * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page
> - * table invalidation serializes on it.
> + * To snapshot the CPU page table you first have to call hmm_range_register()
> + * to register the range. If hmm_range_register() returns an error then
> + * something is horribly wrong and you should fail loudly. If it succeeded
> + * then you can wait for the range to be stable with the
> + * hmm_range_wait_until_valid() function; a range is valid when there are no
> + * concurrent changes to the CPU page table for the range.
> + *
> + * Once the range is valid you can call hmm_range_snapshot(); if that returns
> + * without error then you can take your device page table lock (the same lock
> + * you use in the HMM mirror sync_cpu_device_pagetables() callback). After
> + * taking that lock you have to check the range validity; if it is still
> + * valid (i.e. hmm_range_valid() returns true) 
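
A minimal sketch of the snapshot workflow the quoted comment describes.
struct my_device, its pt_lock, and my_device_populate() are hypothetical
driver details; the hmm_* calls are the ones added by the patch.

struct my_device {
        struct mutex            pt_lock;        /* device page table lock */
        struct hmm_mirror       mirror;
};

static int my_driver_snapshot(struct my_device *dev, struct hmm_range *range,
                              struct mm_struct *mm)
{
        long ret;

again:
        ret = hmm_range_register(range, mm, range->start, range->end);
        if (ret)
                return (int)ret;        /* something is horribly wrong */

        /* Wait for any in-flight CPU page table invalidation to finish. */
        if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
                hmm_range_unregister(range);
                return -EAGAIN;
        }

        ret = hmm_range_snapshot(range);
        if (ret < 0) {
                hmm_range_unregister(range);
                return (int)ret;
        }

        /* Same lock the sync_cpu_device_pagetables() callback must take. */
        mutex_lock(&dev->pt_lock);
        if (!hmm_range_valid(range)) {
                /* Invalidated while we were snapshotting: start over. */
                mutex_unlock(&dev->pt_lock);
                hmm_range_unregister(range);
                goto again;
        }
        my_device_populate(dev, range);         /* program the device */
        mutex_unlock(&dev->pt_lock);

        hmm_range_unregister(range);
        return 0;
}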

Re: [PATCH v2 06/11] mm/hmm: improve driver API to work and wait over a range v2

2019-03-28 Thread Jerome Glisse
On Thu, Mar 28, 2019 at 06:11:54AM -0700, Ira Weiny wrote:
> On Mon, Mar 25, 2019 at 10:40:06AM -0400, Jerome Glisse wrote:
> > From: Jérôme Glisse 
> > 
> > A common use case for an HMM mirror is a user trying to mirror a range
> > that gets invalidated by some core mm event before they can program the
> > hardware. Instead of having the user retry right away, provide a
> > completion mechanism that lets them wait for any active invalidation
> > affecting the range.
> > 
> > This also changes how hmm_range_snapshot() and hmm_range_fault() work:
> > they no longer rely on the vma, so that we can drop the mmap_sem while
> > waiting and look up the vma again on retry.
> > 
> > Changes since v1:
> > - squashed: Dan Carpenter: potential deadlock in nonblocking code
> > 
> > Signed-off-by: Jérôme Glisse 
> > Reviewed-by: Ralph Campbell 
> > Cc: Andrew Morton 
> > Cc: John Hubbard 
> > Cc: Dan Williams 
> > Cc: Dan Carpenter 
> > Cc: Matthew Wilcox 
> > ---
> >  include/linux/hmm.h | 208 ++---
> >  mm/hmm.c            | 528 +---
> >  2 files changed, 428 insertions(+), 308 deletions(-)
> > 
> > diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> > index e9afd23c2eac..79671036cb5f 100644
> > --- a/include/linux/hmm.h
> > +++ b/include/linux/hmm.h
> > @@ -77,8 +77,34 @@
> >  #include <linux/migrate.h>
> >  #include <linux/memremap.h>
> >  #include <linux/completion.h>
> > +#include <linux/mmu_notifier.h>
> >  
> > -struct hmm;
> > +
> > +/*
> > + * struct hmm - HMM per mm struct
> > + *
> > + * @mm: mm struct this HMM struct is bound to
> > + * @lock: lock protecting the ranges list
> > + * @ranges: list of ranges being snapshotted
> > + * @mirrors: list of mirrors for this mm
> > + * @mmu_notifier: mmu notifier to track updates to CPU page table
> > + * @mirrors_sem: read/write semaphore protecting the mirrors list
> > + * @wq: wait queue for users waiting on a range invalidation
> > + * @notifiers: count of active mmu notifiers
> > + * @dead: is the mm dead?
> > + */
> > +struct hmm {
> > +    struct mm_struct        *mm;
> > +    struct kref             kref;
> > +    struct mutex            lock;
> > +    struct list_head        ranges;
> > +    struct list_head        mirrors;
> > +    struct mmu_notifier     mmu_notifier;
> > +    struct rw_semaphore     mirrors_sem;
> > +    wait_queue_head_t       wq;
> > +    long                    notifiers;
> > +    bool                    dead;
> > +};
> >  
> >  /*
> >   * hmm_pfn_flag_e - HMM flag enums
> > @@ -155,6 +181,38 @@ struct hmm_range {
> >  bool                    valid;
> >  };
> >  
> > +/*
> > + * hmm_range_wait_until_valid() - wait for range to be valid
> > + * @range: range affected by invalidation to wait on
> > + * @timeout: timeout for the wait in ms (i.e. abort the wait after that
> > + *           period of time)
> > + * Returns: true if the range is valid, false otherwise.
> > + */
> > +static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
> > + unsigned long timeout)
> > +{
> > +   /* Check if the mm is dead? */
> > +   if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) {
> > +   range->valid = false;
> > +   return false;
> > +   }
> > +   if (range->valid)
> > +   return true;
> > +   wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead,
> > +  msecs_to_jiffies(timeout));
> > +   /* Return current valid status just in case we get lucky */
> > +   return range->valid;
> > +}
> > +
> > +/*
> > + * hmm_range_valid() - test if a range is valid or not
> > + * @range: range
> > + * Returns: true if the range is valid, false otherwise.
> > + */
> > +static inline bool hmm_range_valid(struct hmm_range *range)
> > +{
> > +   return range->valid;
> > +}
> > +
> >  /*
> >   * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn
> >   * @range: range use to decode HMM pfn value
> > @@ -357,51 +415,133 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
> >  
> >  
> >  /*
> > - * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
> > - * driver lock that serializes device page table updates, then call
> > - * hmm_vma_range_done(), to check if the snapshot is still valid. The same
> > - * device driver page table update lock must also be used in the
> > - * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page
> > - * table invalidation serializes on it.
> > + * To snapshot the CPU page table you first have to call hmm_range_register()
> > + * to register the range. If hmm_range_register() returns an error then
> > + * something is horribly wrong and you should fail loudly. If it succeeded
> > + * then you can wait for the range to be stable with the
> > + * hmm_range_wait_until_valid() function; a range is valid when there are no
> > + * concurrent changes to the CPU page table for the range.
> > + *
> > + * Once the range is valid you can call hmm_range_snapshot(); if that 
>
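
To complete the picture, a hedged sketch of the other half of the locking
contract described above: the invalidation side, driven by the driver's
sync_cpu_device_pagetables() mirror callback, takes the same device page
table lock before tearing down device mappings, so an invalidation cannot
race with the snapshot path programming the device. This reuses the
hypothetical my_device from the earlier sketch; the helper name and body
are illustrative only.

static void my_device_invalidate_span(struct my_device *dev,
                                      unsigned long start, unsigned long end)
{
        /* Same pt_lock the snapshot path holds while programming. */
        mutex_lock(&dev->pt_lock);
        my_device_clear_mappings(dev, start, end);      /* hypothetical */
        mutex_unlock(&dev->pt_lock);

        /*
         * When the mmu notifier that triggered this invalidation finishes,
         * HMM drops its active-notifier count and wakes hmm->wq, letting
         * hmm_range_wait_until_valid() callers proceed.
         */
}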
