Re: [PATCH 1/7] mm/memory.c: Fix race when faulting a device private page

2022-09-29 Thread Michael Ellerman
Alistair Popple  writes:
> Michael Ellerman  writes:
>> Alistair Popple  writes:
>>> When the CPU tries to access a device private page the migrate_to_ram()
>>> callback associated with the pgmap for the page is called. However no
>>> reference is taken on the faulting page. Therefore a concurrent
>>> migration of the device private page can free the page and possibly the
>>> underlying pgmap. This results in a race which can crash the kernel due
>>> to the migrate_to_ram() function pointer becoming invalid. It also means
>>> drivers can't reliably read the zone_device_data field because the page
>>> may have been freed with memunmap_pages().
>>>
>>> Close the race by getting a reference on the page while holding the ptl
>>> to ensure it has not been freed. Unfortunately the elevated reference
>>> count will cause the migration required to handle the fault to fail. To
>>> avoid this failure pass the faulting page into the migrate_vma functions
>>> so that if an elevated reference count is found it can be checked to see
>>> if it's expected or not.
>>>
>>> Signed-off-by: Alistair Popple 
>>> ---
>>>  arch/powerpc/kvm/book3s_hv_uvmem.c   | 15 ++-
>>>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 17 +++--
>>>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.h |  2 +-
>>>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 11 +---
>>>  include/linux/migrate.h  |  8 ++-
>>>  lib/test_hmm.c   |  7 ++---
>>>  mm/memory.c  | 16 +++-
>>>  mm/migrate.c | 34 ++---
>>>  mm/migrate_device.c  | 18 +
>>>  9 files changed, 87 insertions(+), 41 deletions(-)
>>>
>>> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
>>> b/arch/powerpc/kvm/book3s_hv_uvmem.c
>>> index 5980063..d4eacf4 100644
>>> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
>>> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
>>> @@ -508,10 +508,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
...
>>> @@ -994,7 +997,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct 
>>> vm_fault *vmf)
>>>
>>> if (kvmppc_svm_page_out(vmf->vma, vmf->address,
>>> vmf->address + PAGE_SIZE, PAGE_SHIFT,
>>> -   pvt->kvm, pvt->gpa))
>>> +   pvt->kvm, pvt->gpa, vmf->page))
>>> return VM_FAULT_SIGBUS;
>>> else
>>> return 0;
>>
>> I don't have a UV test system, but as-is it doesn't even compile :)
>
> Ugh, thanks. I did get as far as installing a PPC cross-compiler and
> building a kernel. Apparently I did not get as far as enabling
> CONFIG_PPC_UV :)

No worries, that's really on us. If we're going to keep the code in the
tree then it should really be enabled in at least one of our defconfigs.
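
Something like the below would do it; a sketch only, assuming ppc64_defconfig
is the right home and that its existing KVM HV / POWERNV options already
satisfy CONFIG_PPC_UV's Kconfig dependencies:

diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
@@ ... @@
+CONFIG_PPC_UV=y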

cheers


Re: [PATCH 1/7] mm/memory.c: Fix race when faulting a device private page

2022-09-28 Thread Michael Ellerman
Alistair Popple  writes:
> When the CPU tries to access a device private page the migrate_to_ram()
> callback associated with the pgmap for the page is called. However no
> reference is taken on the faulting page. Therefore a concurrent
> migration of the device private page can free the page and possibly the
> underlying pgmap. This results in a race which can crash the kernel due
> to the migrate_to_ram() function pointer becoming invalid. It also means
> drivers can't reliably read the zone_device_data field because the page
> may have been freed with memunmap_pages().
>
> Close the race by getting a reference on the page while holding the ptl
> to ensure it has not been freed. Unfortunately the elevated reference
> count will cause the migration required to handle the fault to fail. To
> avoid this failure pass the faulting page into the migrate_vma functions
> so that if an elevated reference count is found it can be checked to see
> if it's expected or not.
>
> Signed-off-by: Alistair Popple 
> ---
>  arch/powerpc/kvm/book3s_hv_uvmem.c   | 15 ++-
>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 17 +++--
>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.h |  2 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 11 +---
>  include/linux/migrate.h  |  8 ++-
>  lib/test_hmm.c   |  7 ++---
>  mm/memory.c  | 16 +++-
>  mm/migrate.c | 34 ++---
>  mm/migrate_device.c  | 18 +
>  9 files changed, 87 insertions(+), 41 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
> b/arch/powerpc/kvm/book3s_hv_uvmem.c
> index 5980063..d4eacf4 100644
> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
> @@ -508,10 +508,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
>  static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
>   unsigned long start,
>   unsigned long end, unsigned long page_shift,
> - struct kvm *kvm, unsigned long gpa)
> + struct kvm *kvm, unsigned long gpa, struct page *fault_page)
>  {
>   unsigned long src_pfn, dst_pfn = 0;
> - struct migrate_vma mig;
> + struct migrate_vma mig = { 0 };
>   struct page *dpage, *spage;
>   struct kvmppc_uvmem_page_pvt *pvt;
>   unsigned long pfn;
> @@ -525,6 +525,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct 
> *vma,
>   mig.dst = &dst_pfn;
>   mig.pgmap_owner = &kvmppc_uvmem_pgmap;
>   mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
> + mig.fault_page = fault_page;
>  
>   /* The requested page is already paged-out, nothing to do */
>   if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL))
> @@ -580,12 +581,14 @@ static int __kvmppc_svm_page_out(struct vm_area_struct 
> *vma,
>  static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,
> unsigned long start, unsigned long end,
> unsigned long page_shift,
> -   struct kvm *kvm, unsigned long gpa)
> +   struct kvm *kvm, unsigned long gpa,
> +   struct page *fault_page)
>  {
>   int ret;
>  
>   mutex_lock(&kvm->arch.uvmem_lock);
> - ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa);
> + ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa,
> + fault_page);
>   mutex_unlock(&kvm->arch.uvmem_lock);
>  
>   return ret;
> @@ -736,7 +739,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma,
>   bool pagein)
>  {
>   unsigned long src_pfn, dst_pfn = 0;
> - struct migrate_vma mig;
> + struct migrate_vma mig = { 0 };
>   struct page *spage;
>   unsigned long pfn;
>   struct page *dpage;
> @@ -994,7 +997,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct 
> vm_fault *vmf)
>  
>   if (kvmppc_svm_page_out(vmf->vma, vmf->address,
>   vmf->address + PAGE_SIZE, PAGE_SHIFT,
> - pvt->kvm, pvt->gpa))
> + pvt->kvm, pvt->gpa, vmf->page))
>   return VM_FAULT_SIGBUS;
>   else
>   return 0;

I don't have a UV test system, but as-is it doesn't even compile :)

kvmppc_svm_page_out() is called via some paths other than the
migrate_to_ram callback.

I think it's correct to just pass fault_page = NULL when it's not called
from the migrate_to_ram callback?

Incremental diff below.

cheers


diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
b/arch/powerpc/kvm/book3s_hv_uvmem.c
index d4eacf410956..965c9e9e500b 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -637,7 +637,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot 
*slot,
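
The archived copy of the diff is cut off here. A sketch of the remainder,
assuming it simply passes fault_page = NULL at the two call sites outside
the migrate_to_ram() path (kvmppc_uvmem_drop_pages() and the H_SVM_PAGE_OUT
handler):

 			if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE,
-						  PAGE_SHIFT, kvm, pvt->gpa))
+						  PAGE_SHIFT, kvm, pvt->gpa,
+						  NULL))
 				pr_err("Can't page out gpa:0x%lx addr:0x%lx\n",
 				       pvt->gpa, addr);
@@ ... @@ kvmppc_h_svm_page_out:
-	if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa))
+	if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, NULL))
 		ret = H_SUCCESS;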

Re: [PATCH 1/7] mm/memory.c: Fix race when faulting a device private page

2022-09-28 Thread Alistair Popple


Michael Ellerman  writes:

> Alistair Popple  writes:
>> When the CPU tries to access a device private page the migrate_to_ram()
>> callback associated with the pgmap for the page is called. However no
>> reference is taken on the faulting page. Therefore a concurrent
>> migration of the device private page can free the page and possibly the
>> underlying pgmap. This results in a race which can crash the kernel due
>> to the migrate_to_ram() function pointer becoming invalid. It also means
>> drivers can't reliably read the zone_device_data field because the page
>> may have been freed with memunmap_pages().
>>
>> Close the race by getting a reference on the page while holding the ptl
>> to ensure it has not been freed. Unfortunately the elevated reference
>> count will cause the migration required to handle the fault to fail. To
>> avoid this failure pass the faulting page into the migrate_vma functions
>> so that if an elevated reference count is found it can be checked to see
>> if it's expected or not.
>>
>> Signed-off-by: Alistair Popple 
>> ---
>>  arch/powerpc/kvm/book3s_hv_uvmem.c   | 15 ++-
>>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 17 +++--
>>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.h |  2 +-
>>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 11 +---
>>  include/linux/migrate.h  |  8 ++-
>>  lib/test_hmm.c   |  7 ++---
>>  mm/memory.c  | 16 +++-
>>  mm/migrate.c | 34 ++---
>>  mm/migrate_device.c  | 18 +
>>  9 files changed, 87 insertions(+), 41 deletions(-)
>>
>> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
>> b/arch/powerpc/kvm/book3s_hv_uvmem.c
>> index 5980063..d4eacf4 100644
>> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
>> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
>> @@ -508,10 +508,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
>>  static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
>>  unsigned long start,
>>  unsigned long end, unsigned long page_shift,
>> -struct kvm *kvm, unsigned long gpa)
>> +struct kvm *kvm, unsigned long gpa, struct page *fault_page)
>>  {
>>  unsigned long src_pfn, dst_pfn = 0;
>> -struct migrate_vma mig;
>> +struct migrate_vma mig = { 0 };
>>  struct page *dpage, *spage;
>>  struct kvmppc_uvmem_page_pvt *pvt;
>>  unsigned long pfn;
>> @@ -525,6 +525,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct 
>> *vma,
>>  mig.dst = &dst_pfn;
>>  mig.pgmap_owner = &kvmppc_uvmem_pgmap;
>>  mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
>> +mig.fault_page = fault_page;
>>
>>  /* The requested page is already paged-out, nothing to do */
>>  if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL))
>> @@ -580,12 +581,14 @@ static int __kvmppc_svm_page_out(struct vm_area_struct 
>> *vma,
>>  static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,
>>unsigned long start, unsigned long end,
>>unsigned long page_shift,
>> -  struct kvm *kvm, unsigned long gpa)
>> +  struct kvm *kvm, unsigned long gpa,
>> +  struct page *fault_page)
>>  {
>>  int ret;
>>
>>  mutex_lock(&kvm->arch.uvmem_lock);
>> -ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa);
>> +ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa,
>> +fault_page);
>>  mutex_unlock(&kvm->arch.uvmem_lock);
>>
>>  return ret;
>> @@ -736,7 +739,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma,
>>  bool pagein)
>>  {
>>  unsigned long src_pfn, dst_pfn = 0;
>> -struct migrate_vma mig;
>> +struct migrate_vma mig = { 0 };
>>  struct page *spage;
>>  unsigned long pfn;
>>  struct page *dpage;
>> @@ -994,7 +997,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct 
>> vm_fault *vmf)
>>
>>  if (kvmppc_svm_page_out(vmf->vma, vmf->address,
>>  vmf->address + PAGE_SIZE, PAGE_SHIFT,
>> -pvt->kvm, pvt->gpa))
>> +pvt->kvm, pvt->gpa, vmf->page))
>>  return VM_FAULT_SIGBUS;
>>  else
>>  return 0;
>
> I don't have a UV test system, but as-is it doesn't even compile :)

Ugh, thanks. I did get as far as installing a PPC cross-compiler and
building a kernel. Apparently I did not get as far as enabling
CONFIG_PPC_UV :)

> kvmppc_svm_page_out() is called via some paths other than the
> migrate_to_ram callback.
>
> I think it's correct to just pass fault_page = NULL when it's not called
> from the migrate_to_ram callback?
>
> Incremental diff below.
>
> cheers
>
>
> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
> 

[PATCH 1/7] mm/memory.c: Fix race when faulting a device private page

2022-09-26 Thread Alistair Popple
When the CPU tries to access a device private page the migrate_to_ram()
callback associated with the pgmap for the page is called. However no
reference is taken on the faulting page. Therefore a concurrent
migration of the device private page can free the page and possibly the
underlying pgmap. This results in a race which can crash the kernel due
to the migrate_to_ram() function pointer becoming invalid. It also means
drivers can't reliably read the zone_device_data field because the page
may have been freed with memunmap_pages().

Close the race by getting a reference on the page while holding the ptl
to ensure it has not been freed. Unfortunately the elevated reference
count will cause the migration required to handle the fault to fail. To
avoid this failure pass the faulting page into the migrate_vma functions
so that if an elevated reference count is found it can be checked to see
if it's expected or not.

Signed-off-by: Alistair Popple 
---
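The mm/memory.c hunk is not reproduced in this archive. A minimal sketch of
the do_swap_page() change described above, assuming the shape the fix took
when merged — the reference is taken under the PTL (after rechecking the PTE)
and dropped once migrate_to_ram() returns:

 		} else if (is_device_private_entry(entry)) {
 			vmf->page = pfn_swap_entry_to_page(entry);
-			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
+			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+					vmf->address, &vmf->ptl);
+			if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
+				goto unlock;
+
+			/*
+			 * Get a page reference while we know the page can't
+			 * be freed.
+			 */
+			get_page(vmf->page);
+			pte_unmap_unlock(vmf->pte, vmf->ptl);
+			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
+			put_page(vmf->page);
 		} else if (is_hwpoison_entry(entry)) {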
 arch/powerpc/kvm/book3s_hv_uvmem.c   | 15 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 17 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.h |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 11 +---
 include/linux/migrate.h  |  8 ++-
 lib/test_hmm.c   |  7 ++---
 mm/memory.c  | 16 +++-
 mm/migrate.c | 34 ++---
 mm/migrate_device.c  | 18 +
 9 files changed, 87 insertions(+), 41 deletions(-)
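
The include/linux/migrate.h and mm/migrate_device.c parts are also not shown
below. Roughly, as a sketch assuming the merged form: struct migrate_vma gains
a fault_page member recording the page the CPU fault is being handled for, and
the reference-count check tolerates exactly one extra reference on that page:

struct migrate_vma {
	...
	/*
	 * Set to vmf->page if this is being called to migrate a page as part
	 * of a migrate_to_ram() callback.
	 */
	struct page *fault_page;
};

static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
{
	/*
	 * The faulting page is expected to carry one extra reference, since
	 * do_swap_page() now holds it across the migrate_to_ram() callback.
	 */
	int extra = 1 + (page == fault_page);
	...
}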

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 5980063..d4eacf4 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -508,10 +508,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
unsigned long start,
unsigned long end, unsigned long page_shift,
-   struct kvm *kvm, unsigned long gpa)
+   struct kvm *kvm, unsigned long gpa, struct page *fault_page)
 {
unsigned long src_pfn, dst_pfn = 0;
-   struct migrate_vma mig;
+   struct migrate_vma mig = { 0 };
struct page *dpage, *spage;
struct kvmppc_uvmem_page_pvt *pvt;
unsigned long pfn;
@@ -525,6 +525,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
	mig.dst = &dst_pfn;
	mig.pgmap_owner = &kvmppc_uvmem_pgmap;
mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
+   mig.fault_page = fault_page;
 
/* The requested page is already paged-out, nothing to do */
if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL))
@@ -580,12 +581,14 @@ static int __kvmppc_svm_page_out(struct vm_area_struct 
*vma,
 static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,
  unsigned long start, unsigned long end,
  unsigned long page_shift,
- struct kvm *kvm, unsigned long gpa)
+ struct kvm *kvm, unsigned long gpa,
+ struct page *fault_page)
 {
int ret;
 
	mutex_lock(&kvm->arch.uvmem_lock);
-   ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa);
+   ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa,
+   fault_page);
	mutex_unlock(&kvm->arch.uvmem_lock);
 
return ret;
@@ -736,7 +739,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma,
bool pagein)
 {
unsigned long src_pfn, dst_pfn = 0;
-   struct migrate_vma mig;
+   struct migrate_vma mig = { 0 };
struct page *spage;
unsigned long pfn;
struct page *dpage;
@@ -994,7 +997,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct 
vm_fault *vmf)
 
if (kvmppc_svm_page_out(vmf->vma, vmf->address,
vmf->address + PAGE_SIZE, PAGE_SHIFT,
-   pvt->kvm, pvt->gpa))
+   pvt->kvm, pvt->gpa, vmf->page))
return VM_FAULT_SIGBUS;
else
return 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index b059a77..776448b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -409,7 +409,7 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct 
svm_range *prange,
uint64_t npages = (end - start) >> PAGE_SHIFT;
struct kfd_process_device *pdd;
struct dma_fence *mfence = NULL;
-   struct migrate_vma migrate;
+   struct migrate_vma migrate = { 0 };
unsigned long cpages = 0;
dma_addr_t *scratch;
void *buf;
@@ -668,7 +668,7 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct 
svm_range *prange,
 static long