Felix Kuehling <felix.kuehl...@amd.com> writes:

> On 2022-09-28 08:01, Alistair Popple wrote:
>> When the CPU tries to access a device private page the migrate_to_ram()
>> callback associated with the pgmap for the page is called. However no
>> reference is taken on the faulting page. Therefore a concurrent
>> migration of the device private page can free the page and possibly the
>> underlying pgmap. This results in a race which can crash the kernel due
>> to the migrate_to_ram() function pointer becoming invalid. It also means
>> drivers can't reliably read the zone_device_data field because the page
>> may have been freed with memunmap_pages().
>>
>> Close the race by getting a reference on the page while holding the ptl
>> to ensure it has not been freed. Unfortunately the elevated reference
>> count will cause the migration required to handle the fault to fail. To
>> avoid this failure pass the faulting page into the migrate_vma functions
>> so that if an elevated reference count is found it can be checked to see
>> if it's expected or not.
>
> Do we really have to drag the fault_page all the way into the migrate 
> structure?
> Is the elevated refcount still needed at that time? Maybe it would be easier 
> to
> drop the refcount early in the ops->migrage_to_ram callbacks, so we won't have
> to deal with it in all the migration code.

That would also work. Honestly I don't really like either solution :-)

I didn't like having to plumb it all through the migration code
but I ended up going this way because I felt it was easier to explain
the life time of vmf->page for the migrate_to_ram() callback. This way
vmf->page is guaranteed to be valid for the duration of the
migrate_to_ram() callbak.

As you suggest we could instead have drivers call put_page(vmf->page)
somewhere in migrate_to_ram() before calling migrate_vma_setup(). The
reason I didn't go this way is IMHO it's more subtle because in general
the page will remain valid after that put_page() anyway. So it would be
easy for drivers to introduce a bug assuming the vmf->page is still
valid and not notice because most of the time it is.

Let me know if you disagree with my reasoning though - would appreciate
any review here.

> Regards,
>   Felix
>
>
>>
>> Signed-off-by: Alistair Popple <apop...@nvidia.com>
>> Cc: Jason Gunthorpe <j...@nvidia.com>
>> Cc: John Hubbard <jhubb...@nvidia.com>
>> Cc: Ralph Campbell <rcampb...@nvidia.com>
>> Cc: Michael Ellerman <m...@ellerman.id.au>
>> Cc: Felix Kuehling <felix.kuehl...@amd.com>
>> Cc: Lyude Paul <ly...@redhat.com>
>> ---
>>   arch/powerpc/kvm/book3s_hv_uvmem.c       | 15 ++++++-----
>>   drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 17 +++++++------
>>   drivers/gpu/drm/amd/amdkfd/kfd_migrate.h |  2 +-
>>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 11 +++++---
>>   include/linux/migrate.h                  |  8 ++++++-
>>   lib/test_hmm.c                           |  7 ++---
>>   mm/memory.c                              | 16 +++++++++++-
>>   mm/migrate.c                             | 34 ++++++++++++++-----------
>>   mm/migrate_device.c                      | 18 +++++++++----
>>   9 files changed, 87 insertions(+), 41 deletions(-)
>>
>> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
>> b/arch/powerpc/kvm/book3s_hv_uvmem.c
>> index 5980063..d4eacf4 100644
>> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
>> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
>> @@ -508,10 +508,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
>>   static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
>>              unsigned long start,
>>              unsigned long end, unsigned long page_shift,
>> -            struct kvm *kvm, unsigned long gpa)
>> +            struct kvm *kvm, unsigned long gpa, struct page *fault_page)
>>   {
>>      unsigned long src_pfn, dst_pfn = 0;
>> -    struct migrate_vma mig;
>> +    struct migrate_vma mig = { 0 };
>>      struct page *dpage, *spage;
>>      struct kvmppc_uvmem_page_pvt *pvt;
>>      unsigned long pfn;
>> @@ -525,6 +525,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct 
>> *vma,
>>      mig.dst = &dst_pfn;
>>      mig.pgmap_owner = &kvmppc_uvmem_pgmap;
>>      mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
>> +    mig.fault_page = fault_page;
>>      /* The requested page is already paged-out, nothing to do */
>>      if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL))
>> @@ -580,12 +581,14 @@ static int __kvmppc_svm_page_out(struct vm_area_struct 
>> *vma,
>>   static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,
>>                                    unsigned long start, unsigned long end,
>>                                    unsigned long page_shift,
>> -                                  struct kvm *kvm, unsigned long gpa)
>> +                                  struct kvm *kvm, unsigned long gpa,
>> +                                  struct page *fault_page)
>>   {
>>      int ret;
>>      mutex_lock(&kvm->arch.uvmem_lock);
>> -    ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa);
>> +    ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa,
>> +                            fault_page);
>>      mutex_unlock(&kvm->arch.uvmem_lock);
>>      return ret;
>> @@ -736,7 +739,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma,
>>              bool pagein)
>>   {
>>      unsigned long src_pfn, dst_pfn = 0;
>> -    struct migrate_vma mig;
>> +    struct migrate_vma mig = { 0 };
>>      struct page *spage;
>>      unsigned long pfn;
>>      struct page *dpage;
>> @@ -994,7 +997,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct 
>> vm_fault *vmf)
>>      if (kvmppc_svm_page_out(vmf->vma, vmf->address,
>>                              vmf->address + PAGE_SIZE, PAGE_SHIFT,
>> -                            pvt->kvm, pvt->gpa))
>> +                            pvt->kvm, pvt->gpa, vmf->page))
>>              return VM_FAULT_SIGBUS;
>>      else
>>              return 0;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>> index b059a77..776448b 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>> @@ -409,7 +409,7 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, 
>> struct svm_range *prange,
>>      uint64_t npages = (end - start) >> PAGE_SHIFT;
>>      struct kfd_process_device *pdd;
>>      struct dma_fence *mfence = NULL;
>> -    struct migrate_vma migrate;
>> +    struct migrate_vma migrate = { 0 };
>>      unsigned long cpages = 0;
>>      dma_addr_t *scratch;
>>      void *buf;
>> @@ -668,7 +668,7 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, 
>> struct svm_range *prange,
>>   static long
>>   svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range 
>> *prange,
>>                     struct vm_area_struct *vma, uint64_t start, uint64_t end,
>> -                   uint32_t trigger)
>> +                   uint32_t trigger, struct page *fault_page)
>>   {
>>      struct kfd_process *p = container_of(prange->svms, struct kfd_process, 
>> svms);
>>      uint64_t npages = (end - start) >> PAGE_SHIFT;
>> @@ -676,7 +676,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, 
>> struct svm_range *prange,
>>      unsigned long cpages = 0;
>>      struct kfd_process_device *pdd;
>>      struct dma_fence *mfence = NULL;
>> -    struct migrate_vma migrate;
>> +    struct migrate_vma migrate = { 0 };
>>      dma_addr_t *scratch;
>>      void *buf;
>>      int r = -ENOMEM;
>> @@ -699,6 +699,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, 
>> struct svm_range *prange,
>>      migrate.src = buf;
>>      migrate.dst = migrate.src + npages;
>> +    migrate.fault_page = fault_page;
>>      scratch = (dma_addr_t *)(migrate.dst + npages);
>>      kfd_smi_event_migration_start(adev->kfd.dev, p->lead_thread->pid,
>> @@ -766,7 +767,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, 
>> struct svm_range *prange,
>>    * 0 - OK, otherwise error code
>>    */
>>   int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
>> -                        uint32_t trigger)
>> +                        uint32_t trigger, struct page *fault_page)
>>   {
>>      struct amdgpu_device *adev;
>>      struct vm_area_struct *vma;
>> @@ -807,7 +808,8 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, 
>> struct mm_struct *mm,
>>              }
>>              next = min(vma->vm_end, end);
>> -            r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, 
>> trigger);
>> +            r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, 
>> trigger,
>> +                    fault_page);
>>              if (r < 0) {
>>                      pr_debug("failed %ld to migrate prange %p\n", r, 
>> prange);
>>                      break;
>> @@ -851,7 +853,7 @@ svm_migrate_vram_to_vram(struct svm_range *prange, 
>> uint32_t best_loc,
>>      pr_debug("from gpu 0x%x to gpu 0x%x\n", prange->actual_loc, best_loc);
>>      do {
>> -            r = svm_migrate_vram_to_ram(prange, mm, trigger);
>> +            r = svm_migrate_vram_to_ram(prange, mm, trigger, NULL);
>>              if (r)
>>                      return r;
>>      } while (prange->actual_loc && --retries);
>> @@ -938,7 +940,8 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault 
>> *vmf)
>>              goto out_unlock_prange;
>>      }
>>   -  r = svm_migrate_vram_to_ram(prange, mm,
>> KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU);
>> +    r = svm_migrate_vram_to_ram(prange, mm, 
>> KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU,
>> +                            vmf->page);
>>      if (r)
>>              pr_debug("failed %d migrate 0x%p [0x%lx 0x%lx] to ram\n", r,
>>                       prange, prange->start, prange->last);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
>> index b3f0754..a5d7e6d 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
>> @@ -43,7 +43,7 @@ enum MIGRATION_COPY_DIR {
>>   int svm_migrate_to_vram(struct svm_range *prange,  uint32_t best_loc,
>>                      struct mm_struct *mm, uint32_t trigger);
>>   int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
>> -                        uint32_t trigger);
>> +                        uint32_t trigger, struct page *fault_page);
>>   unsigned long
>>   svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr);
>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> index 11074cc..9139e5a 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> @@ -2913,13 +2913,15 @@ svm_range_restore_pages(struct amdgpu_device *adev, 
>> unsigned int pasid,
>>                               */
>>                              if (prange->actual_loc)
>>                                      r = svm_migrate_vram_to_ram(prange, mm,
>> -                                       KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
>> +                                       KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
>> +                                       NULL);
>>                              else
>>                                      r = 0;
>>                      }
>>              } else {
>>                      r = svm_migrate_vram_to_ram(prange, mm,
>> -                                    KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
>> +                                    KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
>> +                                    NULL);
>>              }
>>              if (r) {
>>                      pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n",
>> @@ -3242,7 +3244,8 @@ svm_range_trigger_migration(struct mm_struct *mm, 
>> struct svm_range *prange,
>>              return 0;
>>      if (!best_loc) {
>> -            r = svm_migrate_vram_to_ram(prange, mm, 
>> KFD_MIGRATE_TRIGGER_PREFETCH);
>> +            r = svm_migrate_vram_to_ram(prange, mm,
>> +                                    KFD_MIGRATE_TRIGGER_PREFETCH, NULL);
>>              *migrated = !r;
>>              return r;
>>      }
>> @@ -3303,7 +3306,7 @@ static void svm_range_evict_svm_bo_worker(struct 
>> work_struct *work)
>>              mutex_lock(&prange->migrate_mutex);
>>              do {
>>                      r = svm_migrate_vram_to_ram(prange, mm,
>> -                                            
>> KFD_MIGRATE_TRIGGER_TTM_EVICTION);
>> +                                    KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL);
>>              } while (!r && prange->actual_loc && --retries);
>>              if (!r && prange->actual_loc)
>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
>> index 22c0a0c..82ffa47 100644
>> --- a/include/linux/migrate.h
>> +++ b/include/linux/migrate.h
>> @@ -62,6 +62,8 @@ extern const char *migrate_reason_names[MR_TYPES];
>>   #ifdef CONFIG_MIGRATION
>>     extern void putback_movable_pages(struct list_head *l);
>> +int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
>> +            struct folio *src, enum migrate_mode mode, int extra_count);
>>   int migrate_folio(struct address_space *mapping, struct folio *dst,
>>              struct folio *src, enum migrate_mode mode);
>>   extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t 
>> free,
>> @@ -212,6 +214,12 @@ struct migrate_vma {
>>       */
>>      void                    *pgmap_owner;
>>      unsigned long           flags;
>> +
>> +    /*
>> +     * Set to vmf->page if this is being called to migrate a page as part of
>> +     * a migrate_to_ram() callback.
>> +     */
>> +    struct page             *fault_page;
>>   };
>>     int migrate_vma_setup(struct migrate_vma *args);
>> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
>> index e3965ca..89463ff 100644
>> --- a/lib/test_hmm.c
>> +++ b/lib/test_hmm.c
>> @@ -907,7 +907,7 @@ static int dmirror_migrate_to_system(struct dmirror 
>> *dmirror,
>>      struct vm_area_struct *vma;
>>      unsigned long src_pfns[64] = { 0 };
>>      unsigned long dst_pfns[64] = { 0 };
>> -    struct migrate_vma args;
>> +    struct migrate_vma args = { 0 };
>>      unsigned long next;
>>      int ret;
>>   @@ -968,7 +968,7 @@ static int dmirror_migrate_to_device(struct dmirror
>> *dmirror,
>>      unsigned long src_pfns[64] = { 0 };
>>      unsigned long dst_pfns[64] = { 0 };
>>      struct dmirror_bounce bounce;
>> -    struct migrate_vma args;
>> +    struct migrate_vma args = { 0 };
>>      unsigned long next;
>>      int ret;
>>   @@ -1334,7 +1334,7 @@ static void dmirror_devmem_free(struct page *page)
>>     static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
>>   {
>> -    struct migrate_vma args;
>> +    struct migrate_vma args = { 0 };
>>      unsigned long src_pfns = 0;
>>      unsigned long dst_pfns = 0;
>>      struct page *rpage;
>> @@ -1357,6 +1357,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault 
>> *vmf)
>>      args.dst = &dst_pfns;
>>      args.pgmap_owner = dmirror->mdevice;
>>      args.flags = dmirror_select_device(dmirror);
>> +    args.fault_page = vmf->page;
>>      if (migrate_vma_setup(&args))
>>              return VM_FAULT_SIGBUS;
>> diff --git a/mm/memory.c b/mm/memory.c
>> index b994784..65d3977 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -3742,7 +3742,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>                      ret = remove_device_exclusive_entry(vmf);
>>              } else if (is_device_private_entry(entry)) {
>>                      vmf->page = pfn_swap_entry_to_page(entry);
>> -                    ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
>> +                    vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>> +                                    vmf->address, &vmf->ptl);
>> +                    if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
>> +                            spin_unlock(vmf->ptl);
>> +                            goto out;
>> +                    }
>> +
>> +                    /*
>> +                     * Get a page reference while we know the page can't be
>> +                     * freed.
>> +                     */
>> +                    get_page(vmf->page);
>> +                    pte_unmap_unlock(vmf->pte, vmf->ptl);
>> +                    vmf->page->pgmap->ops->migrate_to_ram(vmf);
>> +                    put_page(vmf->page);
>>              } else if (is_hwpoison_entry(entry)) {
>>                      ret = VM_FAULT_HWPOISON;
>>              } else if (is_swapin_error_entry(entry)) {
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index ce6a58f..e3f78a7 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -620,6 +620,25 @@ EXPORT_SYMBOL(folio_migrate_copy);
>>    *                    Migration functions
>>    ***********************************************************/
>>   +int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
>> +            struct folio *src, enum migrate_mode mode, int extra_count)
>> +{
>> +    int rc;
>> +
>> +    BUG_ON(folio_test_writeback(src));      /* Writeback must be complete */
>> +
>> +    rc = folio_migrate_mapping(mapping, dst, src, extra_count);
>> +
>> +    if (rc != MIGRATEPAGE_SUCCESS)
>> +            return rc;
>> +
>> +    if (mode != MIGRATE_SYNC_NO_COPY)
>> +            folio_migrate_copy(dst, src);
>> +    else
>> +            folio_migrate_flags(dst, src);
>> +    return MIGRATEPAGE_SUCCESS;
>> +}
>> +
>>   /**
>>    * migrate_folio() - Simple folio migration.
>>    * @mapping: The address_space containing the folio.
>> @@ -635,20 +654,7 @@ EXPORT_SYMBOL(folio_migrate_copy);
>>   int migrate_folio(struct address_space *mapping, struct folio *dst,
>>              struct folio *src, enum migrate_mode mode)
>>   {
>> -    int rc;
>> -
>> -    BUG_ON(folio_test_writeback(src));      /* Writeback must be complete */
>> -
>> -    rc = folio_migrate_mapping(mapping, dst, src, 0);
>> -
>> -    if (rc != MIGRATEPAGE_SUCCESS)
>> -            return rc;
>> -
>> -    if (mode != MIGRATE_SYNC_NO_COPY)
>> -            folio_migrate_copy(dst, src);
>> -    else
>> -            folio_migrate_flags(dst, src);
>> -    return MIGRATEPAGE_SUCCESS;
>> +    return migrate_folio_extra(mapping, dst, src, mode, 0);
>>   }
>>   EXPORT_SYMBOL(migrate_folio);
>>   diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>> index 7235424..f756c00 100644
>> --- a/mm/migrate_device.c
>> +++ b/mm/migrate_device.c
>> @@ -313,14 +313,14 @@ static void migrate_vma_collect(struct migrate_vma 
>> *migrate)
>>    * folio_migrate_mapping(), except that here we allow migration of a
>>    * ZONE_DEVICE page.
>>    */
>> -static bool migrate_vma_check_page(struct page *page)
>> +static bool migrate_vma_check_page(struct page *page, struct page 
>> *fault_page)
>>   {
>>      /*
>>       * One extra ref because caller holds an extra reference, either from
>>       * isolate_lru_page() for a regular page, or migrate_vma_collect() for
>>       * a device page.
>>       */
>> -    int extra = 1;
>> +    int extra = 1 + (page == fault_page);
>>      /*
>>       * FIXME support THP (transparent huge page), it is bit more complex to
>> @@ -393,7 +393,8 @@ static void migrate_vma_unmap(struct migrate_vma 
>> *migrate)
>>              if (folio_mapped(folio))
>>                      try_to_migrate(folio, 0);
>>   -          if (page_mapped(page) || !migrate_vma_check_page(page)) {
>> +            if (page_mapped(page) ||
>> +                !migrate_vma_check_page(page, migrate->fault_page)) {
>>                      if (!is_zone_device_page(page)) {
>>                              get_page(page);
>>                              putback_lru_page(page);
>> @@ -505,6 +506,8 @@ int migrate_vma_setup(struct migrate_vma *args)
>>              return -EINVAL;
>>      if (!args->src || !args->dst)
>>              return -EINVAL;
>> +    if (args->fault_page && !is_device_private_page(args->fault_page))
>> +            return -EINVAL;
>>      memset(args->src, 0, sizeof(*args->src) * nr_pages);
>>      args->cpages = 0;
>> @@ -735,8 +738,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
>>                      continue;
>>              }
>>   -          r = migrate_folio(mapping, page_folio(newpage),
>> -                            page_folio(page), MIGRATE_SYNC_NO_COPY);
>> +            if (migrate->fault_page == page)
>> +                    r = migrate_folio_extra(mapping, page_folio(newpage),
>> +                                            page_folio(page),
>> +                                            MIGRATE_SYNC_NO_COPY, 1);
>> +            else
>> +                    r = migrate_folio(mapping, page_folio(newpage),
>> +                                    page_folio(page), MIGRATE_SYNC_NO_COPY);
>>              if (r != MIGRATEPAGE_SUCCESS)
>>                      migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
>>      }

Reply via email to