When we do a full adapter reset, if we know the timed-out fence, mark the fence with -ETIME rather than -ECANCELED so it gets properly handled by userspace.
Reviewed-by: Christian König <[email protected]> Signed-off-by: Alex Deucher <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++++- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 28 +++++++++++++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 4 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 21 ++++++++++------ 7 files changed, 47 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 1f3e52637326b..e36c8e3cfb0f0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1960,7 +1960,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) /* swap out the old fences */ amdgpu_ib_preempt_fences_swap(ring, fences); - amdgpu_fence_driver_force_completion(ring); + amdgpu_fence_driver_force_completion(ring, NULL); /* resubmit unfinished jobs */ amdgpu_ib_preempt_job_recovery(&ring->sched); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 05efa31c3f6a0..52b90c9fef0dd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5792,6 +5792,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, { int i, r = 0; struct amdgpu_job *job = NULL; + struct dma_fence *fence = NULL; struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; bool need_full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); @@ -5804,6 +5805,9 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, amdgpu_fence_driver_isr_toggle(adev, true); + if (job) + fence = &job->hw_fence->base; + /* block all schedulers and reset given job's ring */ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; @@ -5812,7 +5816,7 @@ int 
amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, continue; /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ - amdgpu_fence_driver_force_completion(ring); + amdgpu_fence_driver_force_completion(ring, fence); } amdgpu_fence_driver_isr_toggle(adev, false); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index c7a2dff33d80b..d48f61076c06a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -568,7 +568,7 @@ void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev) r = -ENODEV; /* no need to trigger GPU reset as we are unloading */ if (r) - amdgpu_fence_driver_force_completion(ring); + amdgpu_fence_driver_force_completion(ring, NULL); if (!drm_dev_is_unplugged(adev_to_drm(adev)) && ring->fence_drv.irq_src && @@ -683,16 +683,34 @@ void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error) * amdgpu_fence_driver_force_completion - force signal latest fence of ring * * @ring: fence of the ring to signal + * @timedout_fence: fence of the timedout job * */ -void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring) +void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring, + struct dma_fence *timedout_fence) { - amdgpu_fence_driver_set_error(ring, -ECANCELED); + struct amdgpu_fence_driver *drv = &ring->fence_drv; + unsigned long flags; + + spin_lock_irqsave(&drv->lock, flags); + for (unsigned int i = 0; i <= drv->num_fences_mask; ++i) { + struct dma_fence *fence; + + fence = rcu_dereference_protected(drv->fences[i], + lockdep_is_held(&drv->lock)); + if (fence && !dma_fence_is_signaled_locked(fence)) { + if (fence == timedout_fence) + dma_fence_set_error(fence, -ETIME); + else + dma_fence_set_error(fence, -ECANCELED); + } + } + spin_unlock_irqrestore(&drv->lock, flags); + amdgpu_fence_write(ring, ring->fence_drv.sync_seq); amdgpu_fence_process(ring); } - /* * Kernel queue reset handling * @@ -753,7 
+771,7 @@ void amdgpu_fence_driver_update_timedout_fence_state(struct amdgpu_fence *af) if (reemitted) { /* if we've already reemitted once then just cancel everything */ - amdgpu_fence_driver_force_completion(af->ring); + amdgpu_fence_driver_force_completion(af->ring, &af->base); af->ring->ring_backup_entries_to_copy = 0; } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index 86a788d476957..ce095427611fb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -160,7 +160,8 @@ struct amdgpu_fence { extern const struct drm_sched_backend_ops amdgpu_sched_ops; void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error); -void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring); +void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring, + struct dma_fence *timedout_fence); void amdgpu_fence_driver_update_timedout_fence_state(struct amdgpu_fence *af); void amdgpu_fence_save_wptr(struct amdgpu_fence *af); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c index 8b8a04138711c..c270a40de5e5d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c @@ -597,10 +597,10 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, * to be submitted to the queues after the reset is complete. 
*/ if (!ret) { - amdgpu_fence_driver_force_completion(gfx_ring); + amdgpu_fence_driver_force_completion(gfx_ring, NULL); drm_sched_wqueue_start(&gfx_ring->sched); if (adev->sdma.has_page_queue) { - amdgpu_fence_driver_force_completion(page_ring); + amdgpu_fence_driver_force_completion(page_ring, NULL); drm_sched_wqueue_start(&page_ring->sched); } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c index 9d5cca7da1d9e..3a3bc0d370fa6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c @@ -512,7 +512,7 @@ int amdgpu_uvd_resume(struct amdgpu_device *adev) } memset_io(ptr, 0, size); /* to restore uvd fence seq */ - amdgpu_fence_driver_force_completion(&adev->uvd.inst[i].ring); + amdgpu_fence_driver_force_completion(&adev->uvd.inst[i].ring, NULL); } } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c index 75ae9b429420e..d22c8980fa42b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c @@ -1482,15 +1482,16 @@ int vcn_set_powergating_state(struct amdgpu_ip_block *ip_block, /** * amdgpu_vcn_reset_engine - Reset a specific VCN engine - * @adev: Pointer to the AMDGPU device - * @instance_id: VCN engine instance to reset + * @ring: Pointer to the VCN ring + * @timedout_fence: fence that timed out * * Returns: 0 on success, or a negative error code on failure. 
*/ -static int amdgpu_vcn_reset_engine(struct amdgpu_device *adev, - uint32_t instance_id) +static int amdgpu_vcn_reset_engine(struct amdgpu_ring *ring, + struct amdgpu_fence *timedout_fence) { - struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[instance_id]; + struct amdgpu_device *adev = ring->adev; + struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[ring->me]; int r, i; mutex_lock(&vinst->engine_reset_mutex); @@ -1514,9 +1515,13 @@ static int amdgpu_vcn_reset_engine(struct amdgpu_device *adev, if (r) goto unlock; } - amdgpu_fence_driver_force_completion(&vinst->ring_dec); + amdgpu_fence_driver_force_completion(&vinst->ring_dec, + (&vinst->ring_dec == ring) ? + &timedout_fence->base : NULL); for (i = 0; i < vinst->num_enc_rings; i++) - amdgpu_fence_driver_force_completion(&vinst->ring_enc[i]); + amdgpu_fence_driver_force_completion(&vinst->ring_enc[i], + (&vinst->ring_enc[i] == ring) ? + &timedout_fence->base : NULL); /* Restart the scheduler's work queue for the dec and enc rings * if they were stopped by this function. This allows new tasks @@ -1552,7 +1557,7 @@ int amdgpu_vcn_ring_reset(struct amdgpu_ring *ring, if (adev->vcn.inst[ring->me].using_unified_queue) return -EINVAL; - return amdgpu_vcn_reset_engine(adev, ring->me); + return amdgpu_vcn_reset_engine(ring, timedout_fence); } int amdgpu_vcn_reg_dump_init(struct amdgpu_device *adev, -- 2.52.0
