When a GPU VM mapping is updated through the GEM VA ioctl, the driver
returns a fence so userspace knows when the page table update has
completed.

The previous refactor could pick the wrong fence, or rely on NULL checks
that do not hold for always-valid/PRT mappings, which stay mapped even
without backing memory. In some cases this could hand back an invalid
fence or leave fence reference counts unbalanced.

Fix this by:
- Always returning a valid fence (using a stub fence if no real work ran).
- Using the VM-level fence for always-valid / PRT mappings.
- Using the per-BO fence for normal MAP and REPLACE operations.
- Using the clear-freed fence for UNMAP and CLEAR operations.
- Making sure fence references are always properly released.

This keeps the exported VM timeline fence correct and prevents crashes
caused by unbalanced fence references.

Fixes: 463b33e780ae ("drm/amdgpu: Refactor amdgpu_gem_va_ioctl for Handling Last Fence Update and Timeline Management v4")
Suggested-by: Christian König <[email protected]>
Signed-off-by: Srinivasan Shanmugam <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 84 ++++++++++++++++---------
 1 file changed, 55 insertions(+), 29 deletions(-)
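
A minimal standalone sketch of the fence-selection policy implemented below,
for review convenience only. The enum, the pick_fence() helper and the string
placeholders are illustrative stand-ins for the real amdgpu structures and
struct dma_fence pointers; they are not part of the patch or the driver API.

/* Illustrative only: mirrors the fence choice per VA operation. */
#include <stdbool.h>
#include <stdio.h>

enum va_op { VA_OP_MAP, VA_OP_REPLACE, VA_OP_UNMAP, VA_OP_CLEAR };

/* Placeholder "fences"; in the kernel these are struct dma_fence pointers. */
static const char *vm_last_update = "vm->last_update";
static const char *bo_last_pt_update = "bo_va->last_pt_update";
static const char *clear_freed_fence = "clear_fence (amdgpu_vm_clear_freed)";

static const char *pick_fence(enum va_op op, bool have_bo, bool always_valid)
{
	switch (op) {
	case VA_OP_MAP:
	case VA_OP_REPLACE:
		/* Always-valid / PRT mappings export the VM-level fence. */
		if (have_bo && always_valid)
			return vm_last_update;
		/* Normal mappings export the per-BO page table fence. */
		if (have_bo)
			return bo_last_pt_update;
		/* Defensive fallback, as in the patch. */
		return vm_last_update;
	case VA_OP_UNMAP:
	case VA_OP_CLEAR:
	default:
		/* Removal operations export the clear-freed fence. */
		return clear_freed_fence;
	}
}

int main(void)
{
	printf("MAP, always-valid BO -> %s\n", pick_fence(VA_OP_MAP, true, true));
	printf("REPLACE, normal BO   -> %s\n", pick_fence(VA_OP_REPLACE, true, false));
	printf("UNMAP                -> %s\n", pick_fence(VA_OP_UNMAP, true, false));
	return 0;
}

In every case a non-NULL fence is returned; in the real code the same property
is guaranteed by initializing last_update from a stub fence and only ever
replacing it with another valid fence reference.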

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index f387e47541fc..2a685db91b7c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -730,13 +730,28 @@ amdgpu_gem_va_update_vm(struct amdgpu_device *adev,
                        uint32_t operation)
 {
        struct dma_fence *clear_fence = dma_fence_get_stub();
-       struct dma_fence *last_update = NULL;
-       int r;
+       struct dma_fence *last_update;
+       int r = 0;
+
+       /*
+        * Always return a valid fence.
+        *
+        * A fence signals completion of VM work. If no real GPU work was needed,
+        * return a stub fence instead of NULL.
+        *
+        * This prevents callers from seeing an invalid or uninitialized fence.
+        */
+       last_update = dma_fence_get(clear_fence);
 
        if (!amdgpu_vm_ready(vm))
-               return clear_fence;
+               goto out;
 
-       /* First clear freed BOs and get a fence for that work, if any. */
+       /*
+        * First clean up any freed GPU mappings.
+        *
+        * This may return a real fence if GPU work was scheduled.
+        * If there is nothing to do, it stays as a stub fence.
+        */
        r = amdgpu_vm_clear_freed(adev, vm, &clear_fence);
        if (r)
                goto error;
@@ -757,33 +772,42 @@ amdgpu_gem_va_update_vm(struct amdgpu_device *adev,
        /*
         * Decide which fence represents the "last update" for this VM/BO:
         *
-        * - For MAP/REPLACE we want the PT update fence, which is tracked as
-        *   either vm->last_update (for always-valid BOs) or bo_va->last_pt_update
-        *   (for per-BO updates).
+        * UNMAP/CLEAR:
+        *   The fence returned by amdgpu_vm_clear_freed() covers the page table
+        *   work for removing mappings.
+        *
+        * MAP/REPLACE:
+        *   - For always-valid / PRT mappings, vm->last_update is the correct
+        *     fence to export.
+        *   - Otherwise, bo_va->last_pt_update is the correct per-BO fence.
         *
-        * - For UNMAP/CLEAR we rely on the fence returned by
-        *   amdgpu_vm_clear_freed(), which already covers the page table work
-        *   for the removed mappings.
+        * Note: vm->last_update and bo_va->last_pt_update are stub-initialized
+        * and not expected to be NULL, so redundant NULL checks are avoided and
+        * last_update stays valid by construction.
         */
        switch (operation) {
        case AMDGPU_VA_OP_MAP:
        case AMDGPU_VA_OP_REPLACE:
-               if (bo_va && bo_va->base.bo) {
-                       if (amdgpu_vm_is_bo_always_valid(vm, bo_va->base.bo)) {
-                               if (vm->last_update)
-                                       last_update = dma_fence_get(vm->last_update);
-                       } else {
-                               if (bo_va->last_pt_update)
-                                       last_update = dma_fence_get(bo_va->last_pt_update);
-                       }
-               }
+               /* MAP/REPLACE should return the page table update fence. */
+               dma_fence_put(last_update);
+
+               /*
+                * Only treat this as always-valid when bo_va and bo both exist.
+                * This is required for correct PRT and always-valid mappings.
+                */
+               if (bo_va && bo_va->base.bo &&
+                   amdgpu_vm_is_bo_always_valid(vm, bo_va->base.bo))
+                       last_update = dma_fence_get(vm->last_update);
+               else if (bo_va)
+                       last_update = dma_fence_get(bo_va->last_pt_update);
+               else
+                       /* Defensive fallback: keep last_update valid. */
+                       last_update = dma_fence_get(vm->last_update);
                break;
        case AMDGPU_VA_OP_UNMAP:
        case AMDGPU_VA_OP_CLEAR:
-               if (clear_fence)
-                       last_update = dma_fence_get(clear_fence);
-               break;
        default:
+               /* UNMAP/CLEAR: last_update is refreshed to clear_fence below. */
                break;
        }
 
@@ -791,17 +815,19 @@ amdgpu_gem_va_update_vm(struct amdgpu_device *adev,
        if (r && r != -ERESTARTSYS)
                DRM_ERROR("Couldn't update BO_VA (%d)\n", r);
 
+out:
        /*
-        * If we managed to pick a more specific last-update fence, prefer it
-        * over the generic clear_fence and drop the extra reference to the
-        * latter.
+        * For UNMAP/CLEAR we return the fence from amdgpu_vm_clear_freed().
+        * That function may replace the stub fence with a real one, so refresh
+        * last_update to the final clear_fence before returning.
         */
-       if (last_update) {
-               dma_fence_put(clear_fence);
-               return last_update;
+       if (operation != AMDGPU_VA_OP_MAP && operation != AMDGPU_VA_OP_REPLACE) {
+               dma_fence_put(last_update);
+               last_update = dma_fence_get(clear_fence);
        }
 
-       return clear_fence;
+       dma_fence_put(clear_fence);
+       return last_update;
 }
 
 int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
-- 
2.34.1
