On 29/05/2026 11:30, Timur Kristóf wrote:
Add a fence callback to the VM update and ACK the retry CAM
after the VM update is finished. Previously, we would ACK it
immediately after calling amdgpu_vm_handle_fault() which
caused a race condition that was likely to trigger the same
interrupt again, causing the same fault to be handled
multiple times.

Signed-off-by: Timur Kristóf <[email protected]>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c     | 28 +++++++++++++++++++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h     |  8 ++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c |  2 +-
  4 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 26aea960e2759..21c8d87477448 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -545,6 +545,16 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device 
*adev, uint64_t addr,
        } while (fault->timestamp < tmp);
  }
+static void amdgpu_gmc_retry_fault_handled(struct dma_fence *fence,
+                                          struct dma_fence_cb *cb)
+{
+       struct amdgpu_fence_cb *afc = container_of(cb, struct amdgpu_fence_cb, 
cb);
+       struct amdgpu_device *adev = afc->adev;
+
+       /* CAM index is the array index of the current callback struct */
+       adev->irq.ih_funcs->retry_cam_ack(adev, afc - &adev->gmc.retry_cb[0]);

Is the "afc - &adev->gmc.retry_cb[0]" part correct? It will be the index of the array element, while ->retry_cam_ack() expects the content of that element, no?

+}
+
  int amdgpu_gmc_handle_retry_fault(struct amdgpu_device *adev,
                                  struct amdgpu_iv_entry *entry,
                                  u64 addr,
@@ -552,6 +562,7 @@ int amdgpu_gmc_handle_retry_fault(struct amdgpu_device 
*adev,
                                  u32 node_id,
                                  bool write_fault)
  {
+       struct dma_fence *fence = NULL;
        int ret;
if (adev->irq.retry_cam_enabled) {
@@ -564,8 +575,21 @@ int amdgpu_gmc_handle_retry_fault(struct amdgpu_device 
*adev,
                }
ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-                                            addr, entry->timestamp, 
write_fault, NULL);
-               adev->irq.ih_funcs->retry_cam_ack(adev, cam_index);
+                                            addr, entry->timestamp, write_fault, 
&fence);
+
+               /* If the update is already done, ACK now, otherwise when it's 
done. */
+               if (fence) {
+                       adev->gmc.retry_cb[cam_index].adev = adev;

Why is 16 retry_cb elements enough? I see in the code cam_index extraced from the IV entry with a mask such as 0x3ff.
+
+                       if (dma_fence_add_callback(fence, 
&adev->gmc.retry_cb[cam_index].cb,
+                                                  
amdgpu_gmc_retry_fault_handled))
+                               adev->irq.ih_funcs->retry_cam_ack(adev, 
cam_index);
+
+                       dma_fence_put(fence);
+               } else {
+                       adev->irq.ih_funcs->retry_cam_ack(adev, cam_index);
+               }
+
                if (ret)
                        return 1;
        } else {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 77eb153802845..3bfb06e011a86 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -27,6 +27,7 @@
  #define __AMDGPU_GMC_H__
#include <linux/types.h>
+#include <linux/dma-fence.h>
#include "amdgpu_irq.h"
  #include "amdgpu_xgmi.h"
@@ -214,6 +215,11 @@ struct amdgpu_gmc_memrange {
        int nid_mask;
  };
+struct amdgpu_fence_cb {
+       struct amdgpu_device *adev;
+       struct dma_fence_cb cb;
+};
+
  enum amdgpu_gart_placement {
        AMDGPU_GART_PLACEMENT_BEST_FIT = 0,
        AMDGPU_GART_PLACEMENT_HIGH,
@@ -305,6 +311,8 @@ struct amdgpu_gmc {
        } fault_hash[AMDGPU_GMC_FAULT_HASH_SIZE];
        uint64_t                last_fault:AMDGPU_GMC_FAULT_RING_ORDER;
+ struct amdgpu_fence_cb retry_cb[16];
+
        bool tmz_enabled;
        bool is_app_apu;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 8c3ba7213eb22..f5e9b97e92a8c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -3035,7 +3035,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, 
u32 pasid,
r = amdgpu_vm_update_pdes(adev, vm, true); - *fence = vm->last_update;
+       *fence = dma_fence_get(vm->last_update);

Ah! But passing over since you said you are dropping that patch anyway.

  error_unlock:
        amdgpu_bo_unreserve(root);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
index 2eb64df6daa94..6e28f0e435bf5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
@@ -132,7 +132,7 @@ static int amdgpu_vm_sdma_commit(struct 
amdgpu_vm_update_params *p,
                                   DMA_RESV_USAGE_BOOKKEEP);
        }
- if (fence && !p->immediate) {
+       if (fence) {

Is this deliberate and if so what it is about? Commit message should explain it as well.

Regards,

Tvrtko

                /*
                 * Most hw generations now have a separate queue for page table
                 * updates, but when the queue is shared with userspace we need

Reply via email to