On 19/11/16 05:18, Jérôme Glisse wrote:
> This is a heterogeneous memory management (HMM) process address space
> mirroring. In a nutshell this provides an API to mirror a process address
> space on a device. This boils down to keeping the CPU and device page tables
> synchronized (we assume that both device and CPU are cache coherent, as
> PCIe devices can be).
> 
> This patch provides a simple API for device drivers to achieve address
> space mirroring, thus avoiding each device driver growing its own CPU
> page table walker and its own CPU page table synchronization mechanism.
> 
> This is useful for NVidia GPU >= Pascal, Mellanox IB >= mlx5 and more
> hardware in the future.
> 
> Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
> Signed-off-by: Jatin Kumar <jaku...@nvidia.com>
> Signed-off-by: John Hubbard <jhubb...@nvidia.com>
> Signed-off-by: Mark Hairgrove <mhairgr...@nvidia.com>
> Signed-off-by: Sherry Cheung <sche...@nvidia.com>
> Signed-off-by: Subhash Gutti <sgu...@nvidia.com>
> ---
>  include/linux/hmm.h |  97 +++++++++++++++++++++++++++++++
>  mm/hmm.c            | 160 
> ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 257 insertions(+)
> 
> diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> index 54dd529..f44e270 100644
> --- a/include/linux/hmm.h
> +++ b/include/linux/hmm.h
> @@ -88,6 +88,7 @@
>  
>  #if IS_ENABLED(CONFIG_HMM)
>  
> +struct hmm;
>  
>  /*
>   * hmm_pfn_t - HMM use its own pfn type to keep several flags per page
> @@ -127,6 +128,102 @@ static inline hmm_pfn_t hmm_pfn_from_pfn(unsigned long 
> pfn)
>  }
>  
>  
> +/*
> + * Mirroring: how to synchronize the device page table with the CPU page table?
> + *
> + * Device drivers must always synchronize with CPU page table updates. For
> + * this they can either use the mmu_notifier API directly or use the
> + * hmm_mirror API. A device driver can decide to register one mirror per
> + * device per process, or just one mirror per process for a group of
> + * devices. The pattern is:
> + *
> + *      int device_bind_address_space(..., struct mm_struct *mm, ...)
> + *      {
> + *          struct device_address_space *das;
> + *          int ret;
> + *          // Device driver specific initialization, and allocation of das
> + *          // which contain an hmm_mirror struct as one of its field.
> + *          ret = hmm_mirror_register(&das->mirror, mm, &device_mirror_ops);
> + *          if (ret) {
> + *              // Cleanup on error
> + *              return ret;
> + *          }
> + *          // Other device driver specific initialization
> + *      }
> + *
> + * The device driver must not free the struct containing the hmm_mirror
> + * struct before calling hmm_mirror_unregister(). Expected usage is to do
> + * that when the device driver is unbinding from an address space.
> + *
> + *      void device_unbind_address_space(struct device_address_space *das)
> + *      {
> + *          // Device driver specific cleanup
> + *          hmm_mirror_unregister(&das->mirror);
> + *          // Other device driver specific cleanup and now das can be free
> + *      }
> + *
> + * Once an hmm_mirror is registered for an address space, the device
> + * driver will get callbacks through the update() operation (see the
> + * hmm_mirror_ops struct).
> + */
> +
> +struct hmm_mirror;
> +
> +/*
> + * enum hmm_update - type of update
> + * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why)
> + */
> +enum hmm_update {
> +     HMM_UPDATE_INVALIDATE,
> +};
> +
> +/*
> + * struct hmm_mirror_ops - HMM mirror device operations callback
> + *
> + * @update: callback to update range on a device
> + */
> +struct hmm_mirror_ops {
> +     /* update() - update virtual address range of memory
> +      *
> +      * @mirror: pointer to struct hmm_mirror
> +      * @update: update's type (turn read only, unmap, ...)
> +      * @start: virtual start address of the range to update
> +      * @end: virtual end address of the range to update
> +      *
> +      * This callback is called when the CPU page table is updated; the device
> +      * driver must update its device page table according to the update's action.
> +      *
> +      * The device driver callback must wait until the device has fully
> +      * updated its view of the range. Note we plan to make this asynchronous
> +      * in later patches, so that multiple devices can schedule updates to
> +      * their page tables, and once all devices have scheduled the update we
> +      * then wait for them to propagate.
> +      */
> +     void (*update)(struct hmm_mirror *mirror,
> +                    enum hmm_update action,
> +                    unsigned long start,
> +                    unsigned long end);
> +};
> +
> +/*
> + * struct hmm_mirror - mirror struct for a device driver
> + *
> + * @hmm: pointer to struct hmm (which is unique per mm_struct)
> + * @ops: device driver callback for HMM mirror operations
> + * @list: for list of mirrors of a given mm
> + *
> + * Each address space (mm_struct) being mirrored by a device must register
> + * one hmm_mirror struct with HMM. HMM will track the list of all mirrors
> + * for each mm_struct (or each process).
> + */
> +struct hmm_mirror {
> +     struct hmm                      *hmm;
> +     const struct hmm_mirror_ops     *ops;
> +     struct list_head                list;
> +};
> +
> +int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
> +void hmm_mirror_unregister(struct hmm_mirror *mirror);
> +
> +
>  /* Below are for HMM internal use only ! Not to be use by device driver ! */
>  void hmm_mm_destroy(struct mm_struct *mm);
>  
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 342b596..3594785 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -21,14 +21,27 @@
>  #include <linux/hmm.h>
>  #include <linux/slab.h>
>  #include <linux/sched.h>
> +#include <linux/mmu_notifier.h>
>  
>  /*
>   * struct hmm - HMM per mm struct
>   *
>   * @mm: mm struct this HMM struct is bound to
> + * @lock: lock protecting mirrors list
> + * @mirrors: list of mirrors for this mm
> + * @wait_queue: wait queue
> + * @sequence: we track update to CPU page table with a sequence number
> + * @mmu_notifier: mmu notifier to track update to CPU page table
> + * @notifier_count: number of currently active notifier count
>   */
>  struct hmm {
>       struct mm_struct        *mm;
> +     spinlock_t              lock;
> +     struct list_head        mirrors;
> +     atomic_t                sequence;
> +     wait_queue_head_t       wait_queue;
> +     struct mmu_notifier     mmu_notifier;
> +     atomic_t                notifier_count;
>  };
>  
>  /*
> @@ -48,6 +61,12 @@ static struct hmm *hmm_register(struct mm_struct *mm)
>               hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
>               if (!hmm)
>                       return NULL;
> +             init_waitqueue_head(&hmm->wait_queue);
> +             atomic_set(&hmm->notifier_count, 0);
> +             INIT_LIST_HEAD(&hmm->mirrors);
> +             atomic_set(&hmm->sequence, 0);
> +             hmm->mmu_notifier.ops = NULL;
> +             spin_lock_init(&hmm->lock);
>               hmm->mm = mm;
>       }
>  
> @@ -84,3 +103,144 @@ void hmm_mm_destroy(struct mm_struct *mm)
>  
>       kfree(hmm);
>  }
> +
> +
> +
> +static void hmm_invalidate_range(struct hmm *hmm,
> +                              enum hmm_update action,
> +                              unsigned long start,
> +                              unsigned long end)
> +{
> +     struct hmm_mirror *mirror;
> +
> +     /*
> +      * A mirror being added or removed is a rare event, so list traversal
> +      * isn't protected by a lock; we rely on simple rules. All list
> +      * modifications are done using list_add_rcu() and list_del_rcu() under
> +      * a spinlock to protect against concurrent addition or removal, but
> +      * not traversal.
> +      *
> +      * Because hmm_mirror_unregister() waits for all running invalidations
> +      * to complete (and thus for all list traversals to finish), none of
> +      * the mirror structs can be freed from under us while traversing the
> +      * list, and thus it is safe to dereference their list pointers even
> +      * if they were just removed.
> +      */
> +     list_for_each_entry (mirror, &hmm->mirrors, list)
> +             mirror->ops->update(mirror, action, start, end);
> +}
> +
> +static void hmm_invalidate_page(struct mmu_notifier *mn,
> +                             struct mm_struct *mm,
> +                             unsigned long addr)
> +{
> +     unsigned long start = addr & PAGE_MASK;
> +     unsigned long end = start + PAGE_SIZE;
> +     struct hmm *hmm = mm->hmm;
> +
> +     VM_BUG_ON(!hmm);
> +
> +     atomic_inc(&hmm->notifier_count);
> +     atomic_inc(&hmm->sequence);
> +     hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
> +     atomic_dec(&hmm->notifier_count);
> +     wake_up(&hmm->wait_queue);
> +}
> +
> +static void hmm_invalidate_range_start(struct mmu_notifier *mn,
> +                                    struct mm_struct *mm,
> +                                    unsigned long start,
> +                                    unsigned long end)
> +{
> +     struct hmm *hmm = mm->hmm;
> +
> +     VM_BUG_ON(!hmm);
> +
> +     atomic_inc(&hmm->notifier_count);
> +     atomic_inc(&hmm->sequence);
> +     hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
> +}
> +
> +static void hmm_invalidate_range_end(struct mmu_notifier *mn,
> +                                  struct mm_struct *mm,
> +                                  unsigned long start,
> +                                  unsigned long end)
> +{
> +     struct hmm *hmm = mm->hmm;
> +
> +     VM_BUG_ON(!hmm);
> +
> +     /* Reverse order here because we are getting out of invalidation */
> +     atomic_dec(&hmm->notifier_count);
> +     wake_up(&hmm->wait_queue);
> +}
> +
> +static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
> +     .invalidate_page        = hmm_invalidate_page,
> +     .invalidate_range_start = hmm_invalidate_range_start,
> +     .invalidate_range_end   = hmm_invalidate_range_end,
> +};
> +
> +/*
> + * hmm_mirror_register() - register a mirror against an mm
> + *
> + * @mirror: new mirror struct to register
> + * @mm: mm to register against
> + *
> + * To start mirroring a process address space device driver must register an
> + * HMM mirror struct.
> + */
> +int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
> +{
> +     /* Sanity check */
> +     if (!mm || !mirror || !mirror->ops)
> +             return -EINVAL;
> +
> +     mirror->hmm = hmm_register(mm);
> +     if (!mirror->hmm)
> +             return -ENOMEM;
> +
> +     /* Register mmu_notifier if not already, use mmap_sem for locking */
> +     if (!mirror->hmm->mmu_notifier.ops) {
> +             struct hmm *hmm = mirror->hmm;
> +             down_write(&mm->mmap_sem);
> +             if (!hmm->mmu_notifier.ops) {
> +                     hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
> +                     if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
> +                             hmm->mmu_notifier.ops = NULL;
> +                             up_write(&mm->mmap_sem);
> +                             return -ENOMEM;
> +                     }
> +             }
> +             up_write(&mm->mmap_sem);
> +     }

Does everything get mirrored, every update to the PTE (clear dirty, clear
accessed bit, etc) or does the driver decide?

> +
> +     spin_lock(&mirror->hmm->lock);
> +     list_add_rcu(&mirror->list, &mirror->hmm->mirrors);
> +     spin_unlock(&mirror->hmm->lock);
> +
> +     return 0;
> +}
> +EXPORT_SYMBOL(hmm_mirror_register);
> +
> +/*
> + * hmm_mirror_unregister() - unregister a mirror
> + *
> + * @mirror: mirror struct to unregister
> + *
> + * Stop mirroring a process address space and cleanup.
> + */
> +void hmm_mirror_unregister(struct hmm_mirror *mirror)
> +{
> +     struct hmm *hmm = mirror->hmm;
> +
> +     spin_lock(&hmm->lock);
> +     list_del_rcu(&mirror->list);
> +     spin_unlock(&hmm->lock);
> +
> +     /*
> +      * Wait for all active notifier so that it is safe to traverse mirror
> +      * list without any lock.
> +      */
> +     wait_event(hmm->wait_queue, !atomic_read(&hmm->notifier_count));
> +}
> +EXPORT_SYMBOL(hmm_mirror_unregister);
> 

Reply via email to