amdgpu: ACK the retry CAM after VM update finishes

Tvrtko Ursulin Wed, 24 Jun 2026 07:31:32 -0700


On 29/05/2026 11:30, Timur Kristóf wrote:

Add a fence callback to the VM update and ACK the retry CAM
after the VM update is finished. Previously, we would ACK it
immediately after calling amdgpu_vm_handle_fault() which
caused a race condition that was likely to trigger the same
interrupt again, causing the same fault to be handled
multiple times.

Signed-off-by: Timur Kristóf <[email protected]>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c     | 28 +++++++++++++++++++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h     |  8 ++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c |  2 +-
  4 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 26aea960e2759..21c8d87477448 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -545,6 +545,16 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device 
*adev, uint64_t addr,
        } while (fault->timestamp < tmp);
  }

+static void amdgpu_gmc_retry_fault_handled(struct dma_fence *fence,

+                                          struct dma_fence_cb *cb)
+{
+       struct amdgpu_fence_cb *afc = container_of(cb, struct amdgpu_fence_cb, 
cb);
+       struct amdgpu_device *adev = afc->adev;
+
+       /* CAM index is the array index of the current callback struct */
+       adev->irq.ih_funcs->retry_cam_ack(adev, afc - &adev->gmc.retry_cb[0]);

Is the "afc - &adev->gmc.retry_cb[0]" part correct? It will be the indexof the array element, while ->retry_cam_ack() expects the content ofthat element, no?

+}
+
  int amdgpu_gmc_handle_retry_fault(struct amdgpu_device *adev,
                                  struct amdgpu_iv_entry *entry,
                                  u64 addr,
@@ -552,6 +562,7 @@ int amdgpu_gmc_handle_retry_fault(struct amdgpu_device 
*adev,
                                  u32 node_id,
                                  bool write_fault)
  {
+       struct dma_fence *fence = NULL;
        int ret;

if (adev->irq.retry_cam_enabled) {

@@ -564,8 +575,21 @@ int amdgpu_gmc_handle_retry_fault(struct amdgpu_device 
*adev,
                }

ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,

-                                            addr, entry->timestamp, 
write_fault, NULL);
-               adev->irq.ih_funcs->retry_cam_ack(adev, cam_index);
+                                            addr, entry->timestamp, write_fault, 
&fence);
+
+               /* If the update is already done, ACK now, otherwise when it's 
done. */
+               if (fence) {
+                       adev->gmc.retry_cb[cam_index].adev = adev;

Why is 16 retry_cb elements enough? I see in the code cam_index extracedfrom the IV entry with a mask such as 0x3ff.

+
+                       if (dma_fence_add_callback(fence, 
&adev->gmc.retry_cb[cam_index].cb,
+                                                  
amdgpu_gmc_retry_fault_handled))
+                               adev->irq.ih_funcs->retry_cam_ack(adev, 
cam_index);
+
+                       dma_fence_put(fence);
+               } else {
+                       adev->irq.ih_funcs->retry_cam_ack(adev, cam_index);
+               }
+
                if (ret)
                        return 1;
        } else {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 77eb153802845..3bfb06e011a86 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -27,6 +27,7 @@
  #define __AMDGPU_GMC_H__

#include <linux/types.h>

+#include <linux/dma-fence.h>

#include "amdgpu_irq.h"

  #include "amdgpu_xgmi.h"
@@ -214,6 +215,11 @@ struct amdgpu_gmc_memrange {
        int nid_mask;
  };

+struct amdgpu_fence_cb {

+       struct amdgpu_device *adev;
+       struct dma_fence_cb cb;
+};
+
  enum amdgpu_gart_placement {
        AMDGPU_GART_PLACEMENT_BEST_FIT = 0,
        AMDGPU_GART_PLACEMENT_HIGH,
@@ -305,6 +311,8 @@ struct amdgpu_gmc {
        } fault_hash[AMDGPU_GMC_FAULT_HASH_SIZE];
        uint64_t                last_fault:AMDGPU_GMC_FAULT_RING_ORDER;

+ struct amdgpu_fence_cb retry_cb[16];

+
        bool tmz_enabled;
        bool is_app_apu;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

index 8c3ba7213eb22..f5e9b97e92a8c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -3035,7 +3035,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, 
u32 pasid,

r = amdgpu_vm_update_pdes(adev, vm, true);- *fence = vm->last_update;

+       *fence = dma_fence_get(vm->last_update);


Ah! But passing over since you said you are dropping that patch anyway.

  error_unlock:
        amdgpu_bo_unreserve(root);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
index 2eb64df6daa94..6e28f0e435bf5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
@@ -132,7 +132,7 @@ static int amdgpu_vm_sdma_commit(struct 
amdgpu_vm_update_params *p,
                                   DMA_RESV_USAGE_BOOKKEEP);
        }

- if (fence && !p->immediate) {

+       if (fence) {

Is this deliberate and if so what it is about? Commit message shouldexplain it as well.


Regards,

Tvrtko

                /*
                 * Most hw generations now have a separate queue for page table
                 * updates, but when the queue is shared with userspace we need

Re: [PATCH 2/7] drm/amdgpu: ACK the retry CAM after VM update finishes

Reply via email to