Re: [PATCH] drm/amdkfd: Only apply heavy-weight TLB flush on Aldebaran
Am 2021-07-08 um 3:53 p.m. schrieb Eric Huang: > It is to work around a HW bug on other ASICs and is based on > reverting two commits: > drm/amdkfd: Add heavy-weight TLB flush after unmapping > drm/amdkfd: Add memory sync before TLB flush on unmap > > Signed-off-by: Eric Huang Reviewed-by: Felix Kuehling > --- > drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 37 +--- > 1 file changed, 20 insertions(+), 17 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > index ebb4872c5a9d..5f2655cf0162 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > @@ -1773,26 +1773,29 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct > file *filep, > } > mutex_unlock(&p->mutex); > > - err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd, (struct kgd_mem *) mem, > true); > - if (err) { > - pr_debug("Sync memory failed, wait interrupted by user > signal\n"); > - goto sync_memory_failed; > - } > + if (dev->device_info->asic_family == CHIP_ALDEBARAN) { > + err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd, > + (struct kgd_mem *) mem, true); > > - /* Flush TLBs after waiting for the page table updates to complete */ > - for (i = 0; i < args->n_devices; i++) { > - peer = kfd_device_by_id(devices_arr[i]); > - if (WARN_ON_ONCE(!peer)) > - continue; > - peer_pdd = kfd_get_process_device_data(peer, p); > - if (WARN_ON_ONCE(!peer_pdd)) > - continue; > - if (!amdgpu_read_lock(peer->ddev, true)) { > - kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT); > - amdgpu_read_unlock(peer->ddev); > + if (err) { > + pr_debug("Sync memory failed, wait interrupted by user > signal\n"); > + goto sync_memory_failed; > } > - } > > + /* Flush TLBs after waiting for the page table updates to > complete */ > + for (i = 0; i < args->n_devices; i++) { > + peer = kfd_device_by_id(devices_arr[i]); > + if (WARN_ON_ONCE(!peer)) > + continue; > + peer_pdd = kfd_get_process_device_data(peer, p); > + if (WARN_ON_ONCE(!peer_pdd)) > + 
continue; > + if (!amdgpu_read_lock(peer->ddev, true)) { > + kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT); > + amdgpu_read_unlock(peer->ddev); > + } > + } > + } > kfree(devices_arr); > > return 0; ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] drm/amdkfd: Only apply heavy-weight TLB flush on Aldebaran
Correction inline. On 2021-07-14 11:22 a.m., Eric Huang wrote: Hi Felix, I was not able to reproduce the VM fault issue of SWDEV-292611 (not SWDEV-249241), which is the only regression reported on MI200. So the patch is valid and ready for review. Please take a look. Thanks, Eric On 2021-07-09 1:45 a.m., Chen, Guchun wrote: [Public] The original patch will cause regressions on Aldebaran as well, so this workaround is still invalid. Regards, Guchun -----Original Message----- From: amd-gfx On Behalf Of Eric Huang Sent: Friday, July 9, 2021 3:54 AM To: amd-gfx@lists.freedesktop.org Cc: Huang, JinHuiEric; Kuehling, Felix Subject: [PATCH] drm/amdkfd: Only apply heavy-weight TLB flush on Aldebaran It is to work around a HW bug on other ASICs and is based on reverting two commits: drm/amdkfd: Add heavy-weight TLB flush after unmapping drm/amdkfd: Add memory sync before TLB flush on unmap Signed-off-by: Eric Huang --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 37 +--- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index ebb4872c5a9d..5f2655cf0162 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1773,26 +1773,29 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, } mutex_unlock(&p->mutex); - err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd, (struct kgd_mem *) mem, true); - if (err) { - pr_debug("Sync memory failed, wait interrupted by user signal\n"); - goto sync_memory_failed; - } + if (dev->device_info->asic_family == CHIP_ALDEBARAN) { + err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd, + (struct kgd_mem *) mem, true); - /* Flush TLBs after waiting for the page table updates to complete */ - for (i = 0; i < args->n_devices; i++) { - peer = kfd_device_by_id(devices_arr[i]); - if (WARN_ON_ONCE(!peer)) - continue; - peer_pdd = kfd_get_process_device_data(peer, p); - if (WARN_ON_ONCE(!peer_pdd)) - continue; - if (!amdgpu_read_lock(peer->ddev, 
true)) { - kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT); - amdgpu_read_unlock(peer->ddev); + if (err) { + pr_debug("Sync memory failed, wait interrupted by user signal\n"); + goto sync_memory_failed; } - } + /* Flush TLBs after waiting for the page table updates to complete */ + for (i = 0; i < args->n_devices; i++) { + peer = kfd_device_by_id(devices_arr[i]); + if (WARN_ON_ONCE(!peer)) + continue; + peer_pdd = kfd_get_process_device_data(peer, p); + if (WARN_ON_ONCE(!peer_pdd)) + continue; + if (!amdgpu_read_lock(peer->ddev, true)) { + kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT); + amdgpu_read_unlock(peer->ddev); + } + } + } kfree(devices_arr); return 0; -- 2.25.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=04%7C01%7CJinHuiEric.Huang%40amd.com%7Ce30b956566b74126b7ba08d946db394e%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637618729638094020%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000sdata=RdMx%2FnUxesURFrs1LfdVWIPT4sxZecCRJ4yTRZx8h4g%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=04%7C01%7CJinHuiEric.Huang%40amd.com%7Ce30b956566b74126b7ba08d946db394e%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637618729638103981%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000sdata=zpQKYQiqEUIrT%2Fdhmqwi8Cgo4cfZAo33i95h8etyplg%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] drm/amdkfd: Only apply heavy-weight TLB flush on Aldebaran
Hi Felix, I was not able to reproduce the VM fault issue of SWDEV-249241, which is the only regression reported on MI200. So the patch is valid and ready for review. Please take a look. Thanks, Eric On 2021-07-09 1:45 a.m., Chen, Guchun wrote: [Public] The original patch will cause regressions on Aldebaran as well, so this workaround is still invalid. Regards, Guchun -----Original Message----- From: amd-gfx On Behalf Of Eric Huang Sent: Friday, July 9, 2021 3:54 AM To: amd-gfx@lists.freedesktop.org Cc: Huang, JinHuiEric; Kuehling, Felix Subject: [PATCH] drm/amdkfd: Only apply heavy-weight TLB flush on Aldebaran It is to work around a HW bug on other ASICs and is based on reverting two commits: drm/amdkfd: Add heavy-weight TLB flush after unmapping drm/amdkfd: Add memory sync before TLB flush on unmap Signed-off-by: Eric Huang --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 37 +--- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index ebb4872c5a9d..5f2655cf0162 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1773,26 +1773,29 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, } mutex_unlock(&p->mutex); - err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd, (struct kgd_mem *) mem, true); - if (err) { - pr_debug("Sync memory failed, wait interrupted by user signal\n"); - goto sync_memory_failed; - } + if (dev->device_info->asic_family == CHIP_ALDEBARAN) { + err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd, + (struct kgd_mem *) mem, true); - /* Flush TLBs after waiting for the page table updates to complete */ - for (i = 0; i < args->n_devices; i++) { - peer = kfd_device_by_id(devices_arr[i]); - if (WARN_ON_ONCE(!peer)) - continue; - peer_pdd = kfd_get_process_device_data(peer, p); - if (WARN_ON_ONCE(!peer_pdd)) - continue; - if (!amdgpu_read_lock(peer->ddev, true)) { - kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT); - 
amdgpu_read_unlock(peer->ddev); + if (err) { + pr_debug("Sync memory failed, wait interrupted by user signal\n"); + goto sync_memory_failed; } - } + /* Flush TLBs after waiting for the page table updates to complete */ + for (i = 0; i < args->n_devices; i++) { + peer = kfd_device_by_id(devices_arr[i]); + if (WARN_ON_ONCE(!peer)) + continue; + peer_pdd = kfd_get_process_device_data(peer, p); + if (WARN_ON_ONCE(!peer_pdd)) + continue; + if (!amdgpu_read_lock(peer->ddev, true)) { + kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT); + amdgpu_read_unlock(peer->ddev); + } + } + } kfree(devices_arr); return 0; -- 2.25.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=04%7C01%7Cguchun.chen%40amd.com%7C69113cf367eb450a8f8808d9424a23fe%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637613708477013366%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000sdata=0ESYvG5kCSJaFT9dR4jW5VacL8x7TghGw1aKWTRa9R4%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH] drm/amdkfd: Only apply heavy-weight TLB flush on Aldebaran
[Public] The original patch will cause regressions on Aldebaran as well, so this workaround is still invalid. Regards, Guchun -----Original Message----- From: amd-gfx On Behalf Of Eric Huang Sent: Friday, July 9, 2021 3:54 AM To: amd-gfx@lists.freedesktop.org Cc: Huang, JinHuiEric; Kuehling, Felix Subject: [PATCH] drm/amdkfd: Only apply heavy-weight TLB flush on Aldebaran It is to work around a HW bug on other ASICs and is based on reverting two commits: drm/amdkfd: Add heavy-weight TLB flush after unmapping drm/amdkfd: Add memory sync before TLB flush on unmap Signed-off-by: Eric Huang --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 37 +--- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index ebb4872c5a9d..5f2655cf0162 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1773,26 +1773,29 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, } mutex_unlock(&p->mutex); - err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd, (struct kgd_mem *) mem, true); - if (err) { - pr_debug("Sync memory failed, wait interrupted by user signal\n"); - goto sync_memory_failed; - } + if (dev->device_info->asic_family == CHIP_ALDEBARAN) { + err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd, + (struct kgd_mem *) mem, true); - /* Flush TLBs after waiting for the page table updates to complete */ - for (i = 0; i < args->n_devices; i++) { - peer = kfd_device_by_id(devices_arr[i]); - if (WARN_ON_ONCE(!peer)) - continue; - peer_pdd = kfd_get_process_device_data(peer, p); - if (WARN_ON_ONCE(!peer_pdd)) - continue; - if (!amdgpu_read_lock(peer->ddev, true)) { - kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT); - amdgpu_read_unlock(peer->ddev); + if (err) { + pr_debug("Sync memory failed, wait interrupted by user signal\n"); + goto sync_memory_failed; } - } + /* Flush TLBs after waiting for the page table updates to complete */ + for (i = 0; i < args->n_devices; 
i++) { + peer = kfd_device_by_id(devices_arr[i]); + if (WARN_ON_ONCE(!peer)) + continue; + peer_pdd = kfd_get_process_device_data(peer, p); + if (WARN_ON_ONCE(!peer_pdd)) + continue; + if (!amdgpu_read_lock(peer->ddev, true)) { + kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT); + amdgpu_read_unlock(peer->ddev); + } + } + } kfree(devices_arr); return 0; -- 2.25.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=04%7C01%7Cguchun.chen%40amd.com%7C69113cf367eb450a8f8808d9424a23fe%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637613708477013366%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000sdata=0ESYvG5kCSJaFT9dR4jW5VacL8x7TghGw1aKWTRa9R4%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx