When GPU memory mappings are updated, the driver returns a fence so
userspace knows when the update is finished.
The previous refactor could pick the wrong fence or rely on checks that
are not safe for GPU mappings that stay valid even when memory is
missing. In some cases this could return an invalid fence or cause fence
reference counting problems.
Fix this by (v6, per Christian):
- Starting from the VM’s existing last update fence, so a valid and
meaningful fence is always returned even when no new work is required.
- Selecting the VM-level fence only for always-valid / PRT mappings using
the required combined bo_va + bo guard.
- Using the per-BO page table update fence for normal MAP and REPLACE
operations.
- For UNMAP and CLEAR, returning the fence provided by
amdgpu_vm_clear_freed(), which may remain unchanged when nothing needs
clearing.
- Keeping fence reference counting balanced.
This makes VM timeline fences correct and prevents crashes caused by
incorrect fence handling.
Fixes: 463b33e780ae ("drm/amdgpu: Refactor amdgpu_gem_va_ioctl for Handling
Last Fence Update and Timeline Management v4")
Suggested-by: Christian König <[email protected]>
Signed-off-by: Srinivasan Shanmugam <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 75 +++++++++++++------------
1 file changed, 39 insertions(+), 36 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index e5fb8f5346b6..c1dc0cae0cfb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -732,15 +732,23 @@ amdgpu_gem_va_update_vm(struct amdgpu_device *adev,
struct amdgpu_bo_va *bo_va,
uint32_t operation)
{
- struct dma_fence *clear_fence = dma_fence_get_stub();
- struct dma_fence *last_update = NULL;
- int r;
+ struct dma_fence *fence;
+ int r = 0;
+
+ /* Always start from the VM's existing last update fence. */
+ fence = dma_fence_get(vm->last_update);
if (!amdgpu_vm_ready(vm))
- return clear_fence;
+ return fence;
- /* First clear freed BOs and get a fence for that work, if any. */
- r = amdgpu_vm_clear_freed(adev, vm, &clear_fence);
+ /*
+ * First clean up any freed mappings in the VM.
+ *
+ * amdgpu_vm_clear_freed() may replace @fence with a new fence if it
+ * schedules GPU work. If nothing needs clearing, @fence can remain as
+ * the original vm->last_update.
+ */
+ r = amdgpu_vm_clear_freed(adev, vm, &fence);
if (r)
goto error;
@@ -758,35 +766,40 @@ amdgpu_gem_va_update_vm(struct amdgpu_device *adev,
goto error;
/*
- * Decide which fence represents the "last update" for this VM/BO:
+ * Decide which fence best represents the last update:
+ *
+ * MAP/REPLACE:
+ * - For always-valid / PRT mappings, export vm->last_update.
+ * - Otherwise, export bo_va->last_pt_update.
*
- * - For MAP/REPLACE we want the PT update fence, which is tracked as
- * either vm->last_update (for always-valid BOs) or
bo_va->last_pt_update
- * (for per-BO updates).
+ * UNMAP/CLEAR:
+ * Keep the fence returned by amdgpu_vm_clear_freed(). If no work was
+ * needed, it can remain as vm->last_pt_update.
*
- * - For UNMAP/CLEAR we rely on the fence returned by
- * amdgpu_vm_clear_freed(), which already covers the page table work
- * for the removed mappings.
+ * The VM and BO update fences are always initialized to a valid value.
+ * vm->last_update and bo_va->last_pt_update always start as valid
fences.
+ * and are never expected to be NULL.
*/
switch (operation) {
case AMDGPU_VA_OP_MAP:
case AMDGPU_VA_OP_REPLACE:
- if (bo_va && bo_va->base.bo) {
- if (amdgpu_vm_is_bo_always_valid(vm, bo_va->base.bo)) {
- if (vm->last_update)
- last_update =
dma_fence_get(vm->last_update);
- } else {
- if (bo_va->last_pt_update)
- last_update =
dma_fence_get(bo_va->last_pt_update);
- }
- }
+ /*
+ * For MAP/REPLACE, return the page table update fence for the
+ * mapping we just modified. bo_va is expected to be valid here.
+ */
+ dma_fence_put(fence);
+
+ /* Combined guard required for always-valid / PRT correctness */
+ if (bo_va && bo_va->base.bo &&
+ amdgpu_vm_is_bo_always_valid(vm, bo_va->base.bo))
+ fence = dma_fence_get(vm->last_update);
+ else
+ fence = dma_fence_get(bo_va->last_pt_update);
break;
case AMDGPU_VA_OP_UNMAP:
case AMDGPU_VA_OP_CLEAR:
- if (clear_fence)
- last_update = dma_fence_get(clear_fence);
- break;
default:
+ /* keep @fence as returned by amdgpu_vm_clear_freed() */
break;
}
@@ -794,17 +807,7 @@ amdgpu_gem_va_update_vm(struct amdgpu_device *adev,
if (r && r != -ERESTARTSYS)
DRM_ERROR("Couldn't update BO_VA (%d)\n", r);
- /*
- * If we managed to pick a more specific last-update fence, prefer it
- * over the generic clear_fence and drop the extra reference to the
- * latter.
- */
- if (last_update) {
- dma_fence_put(clear_fence);
- return last_update;
- }
-
- return clear_fence;
+ return fence;
}
int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
--
2.34.1