Re: [PATCH] drm/amdkfd: Only apply heavy-weight TLB flush on Aldebaran

2021-07-14 Thread Felix Kuehling
Am 2021-07-08 um 3:53 p.m. schrieb Eric Huang:
> This works around a HW bug on other ASICs and is based on
> reverting two commits:
>   drm/amdkfd: Add heavy-weight TLB flush after unmapping
>   drm/amdkfd: Add memory sync before TLB flush on unmap
>
> Signed-off-by: Eric Huang 

Reviewed-by: Felix Kuehling 


> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 37 +---
>  1 file changed, 20 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index ebb4872c5a9d..5f2655cf0162 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1773,26 +1773,29 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct 
> file *filep,
>   }
>   mutex_unlock(&p->mutex);
>  
> - err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd, (struct kgd_mem *) mem, 
> true);
> - if (err) {
> - pr_debug("Sync memory failed, wait interrupted by user 
> signal\n");
> - goto sync_memory_failed;
> - }
> + if (dev->device_info->asic_family == CHIP_ALDEBARAN) {
> + err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd,
> + (struct kgd_mem *) mem, true);
>  
> - /* Flush TLBs after waiting for the page table updates to complete */
> - for (i = 0; i < args->n_devices; i++) {
> - peer = kfd_device_by_id(devices_arr[i]);
> - if (WARN_ON_ONCE(!peer))
> - continue;
> - peer_pdd = kfd_get_process_device_data(peer, p);
> - if (WARN_ON_ONCE(!peer_pdd))
> - continue;
> - if (!amdgpu_read_lock(peer->ddev, true)) {
> - kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
> - amdgpu_read_unlock(peer->ddev);
> + if (err) {
> + pr_debug("Sync memory failed, wait interrupted by user 
> signal\n");
> + goto sync_memory_failed;
>   }
> - }
>  
> + /* Flush TLBs after waiting for the page table updates to 
> complete */
> + for (i = 0; i < args->n_devices; i++) {
> + peer = kfd_device_by_id(devices_arr[i]);
> + if (WARN_ON_ONCE(!peer))
> + continue;
> + peer_pdd = kfd_get_process_device_data(peer, p);
> + if (WARN_ON_ONCE(!peer_pdd))
> + continue;
> + if (!amdgpu_read_lock(peer->ddev, true)) {
> + kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
> + amdgpu_read_unlock(peer->ddev);
> + }
> + }
> + }
>   kfree(devices_arr);
>  
>   return 0;
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdkfd: Only apply heavy-weight TLB flush on Aldebaran

2021-07-14 Thread Eric Huang

Correction inline.

On 2021-07-14 11:22 a.m., Eric Huang wrote:

Hi Felix,

I was not able to reproduce the VM fault issue of SWDEV-292611 (not 
SWDEV-249241), which is the only regression reported on MI200. So the 
patch is valid to review. Please take a look.


Thanks,
Eric

On 2021-07-09 1:45 a.m., Chen, Guchun wrote:

[Public]

Original patch will cause regressions on Aldebaran as well, so this 
workaround is still invalid.


Regards,
Guchun

-----Original Message-----
From: amd-gfx  On Behalf Of 
Eric Huang

Sent: Friday, July 9, 2021 3:54 AM
To: amd-gfx@lists.freedesktop.org
Cc: Huang, JinHuiEric ; Kuehling, Felix 

Subject: [PATCH] drm/amdkfd: Only apply heavy-weight TLB flush on 
Aldebaran


This works around a HW bug on other ASICs and is based on reverting two 
commits:

   drm/amdkfd: Add heavy-weight TLB flush after unmapping
   drm/amdkfd: Add memory sync before TLB flush on unmap

Signed-off-by: Eric Huang 
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 37 +---
  1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c

index ebb4872c5a9d..5f2655cf0162 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1773,26 +1773,29 @@ static int 
kfd_ioctl_unmap_memory_from_gpu(struct file *filep,

  }
  mutex_unlock(&p->mutex);
  -    err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd, (struct 
kgd_mem *) mem, true);

-    if (err) {
-    pr_debug("Sync memory failed, wait interrupted by user 
signal\n");

-    goto sync_memory_failed;
-    }
+    if (dev->device_info->asic_family == CHIP_ALDEBARAN) {
+    err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd,
+    (struct kgd_mem *) mem, true);
  -    /* Flush TLBs after waiting for the page table updates to 
complete */

-    for (i = 0; i < args->n_devices; i++) {
-    peer = kfd_device_by_id(devices_arr[i]);
-    if (WARN_ON_ONCE(!peer))
-    continue;
-    peer_pdd = kfd_get_process_device_data(peer, p);
-    if (WARN_ON_ONCE(!peer_pdd))
-    continue;
-    if (!amdgpu_read_lock(peer->ddev, true)) {
-    kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
-    amdgpu_read_unlock(peer->ddev);
+    if (err) {
+    pr_debug("Sync memory failed, wait interrupted by user 
signal\n");

+    goto sync_memory_failed;
  }
-    }
  +    /* Flush TLBs after waiting for the page table updates to 
complete */

+    for (i = 0; i < args->n_devices; i++) {
+    peer = kfd_device_by_id(devices_arr[i]);
+    if (WARN_ON_ONCE(!peer))
+    continue;
+    peer_pdd = kfd_get_process_device_data(peer, p);
+    if (WARN_ON_ONCE(!peer_pdd))
+    continue;
+    if (!amdgpu_read_lock(peer->ddev, true)) {
+    kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
+    amdgpu_read_unlock(peer->ddev);
+    }
+    }
+    }
  kfree(devices_arr);
    return 0;
--
2.25.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=04%7C01%7CJinHuiEric.Huang%40amd.com%7Ce30b956566b74126b7ba08d946db394e%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637618729638094020%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000sdata=RdMx%2FnUxesURFrs1LfdVWIPT4sxZecCRJ4yTRZx8h4g%3Dreserved=0 



___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=04%7C01%7CJinHuiEric.Huang%40amd.com%7Ce30b956566b74126b7ba08d946db394e%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637618729638103981%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000sdata=zpQKYQiqEUIrT%2Fdhmqwi8Cgo4cfZAo33i95h8etyplg%3Dreserved=0 



___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdkfd: Only apply heavy-weight TLB flush on Aldebaran

2021-07-14 Thread Eric Huang

Hi Felix,

I was not able to reproduce the VM fault issue of SWDEV-249241, which is 
the only regression reported on MI200. So the patch is valid to review. 
Please take a look.


Thanks,
Eric

On 2021-07-09 1:45 a.m., Chen, Guchun wrote:

[Public]

Original patch will cause regressions on Aldebaran as well, so this workaround 
is still invalid.

Regards,
Guchun

-----Original Message-----
From: amd-gfx  On Behalf Of Eric Huang
Sent: Friday, July 9, 2021 3:54 AM
To: amd-gfx@lists.freedesktop.org
Cc: Huang, JinHuiEric ; Kuehling, Felix 

Subject: [PATCH] drm/amdkfd: Only apply heavy-weight TLB flush on Aldebaran

This works around a HW bug on other ASICs and is based on reverting two commits:
   drm/amdkfd: Add heavy-weight TLB flush after unmapping
   drm/amdkfd: Add memory sync before TLB flush on unmap

Signed-off-by: Eric Huang 
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 37 +---
  1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index ebb4872c5a9d..5f2655cf0162 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1773,26 +1773,29 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file 
*filep,
}
mutex_unlock(&p->mutex);
  
-	err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd, (struct kgd_mem *) mem, true);

-   if (err) {
-   pr_debug("Sync memory failed, wait interrupted by user 
signal\n");
-   goto sync_memory_failed;
-   }
+   if (dev->device_info->asic_family == CHIP_ALDEBARAN) {
+   err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd,
+   (struct kgd_mem *) mem, true);
  
-	/* Flush TLBs after waiting for the page table updates to complete */

-   for (i = 0; i < args->n_devices; i++) {
-   peer = kfd_device_by_id(devices_arr[i]);
-   if (WARN_ON_ONCE(!peer))
-   continue;
-   peer_pdd = kfd_get_process_device_data(peer, p);
-   if (WARN_ON_ONCE(!peer_pdd))
-   continue;
-   if (!amdgpu_read_lock(peer->ddev, true)) {
-   kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
-   amdgpu_read_unlock(peer->ddev);
+   if (err) {
+   pr_debug("Sync memory failed, wait interrupted by user 
signal\n");
+   goto sync_memory_failed;
}
-   }
  
+		/* Flush TLBs after waiting for the page table updates to complete */

+   for (i = 0; i < args->n_devices; i++) {
+   peer = kfd_device_by_id(devices_arr[i]);
+   if (WARN_ON_ONCE(!peer))
+   continue;
+   peer_pdd = kfd_get_process_device_data(peer, p);
+   if (WARN_ON_ONCE(!peer_pdd))
+   continue;
+   if (!amdgpu_read_lock(peer->ddev, true)) {
+   kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
+   amdgpu_read_unlock(peer->ddev);
+   }
+   }
+   }
kfree(devices_arr);
  
  	return 0;

--
2.25.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=04%7C01%7Cguchun.chen%40amd.com%7C69113cf367eb450a8f8808d9424a23fe%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637613708477013366%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000sdata=0ESYvG5kCSJaFT9dR4jW5VacL8x7TghGw1aKWTRa9R4%3Dreserved=0


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdkfd: Only apply heavy-weight TLB flush on Aldebaran

2021-07-09 Thread Chen, Guchun
[Public]

Original patch will cause regressions on Aldebaran as well, so this workaround 
is still invalid.

Regards,
Guchun

-----Original Message-----
From: amd-gfx  On Behalf Of Eric Huang
Sent: Friday, July 9, 2021 3:54 AM
To: amd-gfx@lists.freedesktop.org
Cc: Huang, JinHuiEric ; Kuehling, Felix 

Subject: [PATCH] drm/amdkfd: Only apply heavy-weight TLB flush on Aldebaran

This works around a HW bug on other ASICs and is based on reverting two commits:
  drm/amdkfd: Add heavy-weight TLB flush after unmapping
  drm/amdkfd: Add memory sync before TLB flush on unmap

Signed-off-by: Eric Huang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 37 +---
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index ebb4872c5a9d..5f2655cf0162 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1773,26 +1773,29 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file 
*filep,
}
mutex_unlock(&p->mutex);
 
-   err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd, (struct kgd_mem *) mem, 
true);
-   if (err) {
-   pr_debug("Sync memory failed, wait interrupted by user 
signal\n");
-   goto sync_memory_failed;
-   }
+   if (dev->device_info->asic_family == CHIP_ALDEBARAN) {
+   err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd,
+   (struct kgd_mem *) mem, true);
 
-   /* Flush TLBs after waiting for the page table updates to complete */
-   for (i = 0; i < args->n_devices; i++) {
-   peer = kfd_device_by_id(devices_arr[i]);
-   if (WARN_ON_ONCE(!peer))
-   continue;
-   peer_pdd = kfd_get_process_device_data(peer, p);
-   if (WARN_ON_ONCE(!peer_pdd))
-   continue;
-   if (!amdgpu_read_lock(peer->ddev, true)) {
-   kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
-   amdgpu_read_unlock(peer->ddev);
+   if (err) {
+   pr_debug("Sync memory failed, wait interrupted by user 
signal\n");
+   goto sync_memory_failed;
}
-   }
 
+   /* Flush TLBs after waiting for the page table updates to 
complete */
+   for (i = 0; i < args->n_devices; i++) {
+   peer = kfd_device_by_id(devices_arr[i]);
+   if (WARN_ON_ONCE(!peer))
+   continue;
+   peer_pdd = kfd_get_process_device_data(peer, p);
+   if (WARN_ON_ONCE(!peer_pdd))
+   continue;
+   if (!amdgpu_read_lock(peer->ddev, true)) {
+   kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
+   amdgpu_read_unlock(peer->ddev);
+   }
+   }
+   }
kfree(devices_arr);
 
return 0;
--
2.25.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=04%7C01%7Cguchun.chen%40amd.com%7C69113cf367eb450a8f8808d9424a23fe%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637613708477013366%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000sdata=0ESYvG5kCSJaFT9dR4jW5VacL8x7TghGw1aKWTRa9R4%3Dreserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx