Align the refcount behaviour of the amdgpu_job embedded HW fence with
classic pointer-style HW fences by increasing the refcount each time
emit is called, so amdgpu code doesn't need workarounds based on
amdgpu_job.job_run_counter to keep the HW fence refcount balanced.

Also, since the previous patch resumed setting s_fence->parent to NULL
in drm_sched_stop, switch to directly checking whether job->hw_fence is
signaled in order to short-circuit the reset if it has already signaled.

Signed-off-by: Andrey Grodzovsky <andrey.grodzov...@amd.com>
Tested-by: Yiqing Yao <yiqing....@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 23 ++++++++++++++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  |  7 ++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  4 ----
 4 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 513c57f839d8..447bd92c4856 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -684,6 +684,8 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
                goto err_ib_sched;
        }
 
+       /* Drop the initial kref_init count (see drm_sched_main as example) */
+       dma_fence_put(f);
        ret = dma_fence_wait(f, false);
 
 err_ib_sched:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c99541685804..f9718119834f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5009,16 +5009,28 @@ static void amdgpu_device_recheck_guilty_jobs(
 
                /* clear job's guilty and depend the folowing step to decide 
the real one */
                drm_sched_reset_karma(s_job);
-               /* for the real bad job, it will be resubmitted twice, adding a 
dma_fence_get
-                * to make sure fence is balanced */
-               dma_fence_get(s_job->s_fence->parent);
                drm_sched_resubmit_jobs_ext(&ring->sched, 1);
 
+               if (!s_job->s_fence->parent) {
+                       DRM_WARN("Failed to get a HW fence for job!");
+                       continue;
+               }
+
                ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, 
ring->sched.timeout);
                if (ret == 0) { /* timeout */
                        DRM_ERROR("Found the real bad job! ring:%s, 
job_id:%llx\n",
                                                ring->sched.name, s_job->id);
 
+
+                       /* Clear this failed job from fence array */
+                       amdgpu_fence_driver_clear_job_fences(ring);
+
+                       /* Since the job won't signal and we go for
+                        * another resubmit drop this parent pointer
+                        */
+                       dma_fence_put(s_job->s_fence->parent);
+                       s_job->s_fence->parent = NULL;
+
                        /* set guilty */
                        drm_sched_increase_karma(s_job);
 retry:
@@ -5047,7 +5059,6 @@ static void amdgpu_device_recheck_guilty_jobs(
 
                /* got the hw fence, signal finished fence */
                atomic_dec(ring->sched.score);
-               dma_fence_put(s_job->s_fence->parent);
                dma_fence_get(&s_job->s_fence->finished);
                dma_fence_signal(&s_job->s_fence->finished);
                dma_fence_put(&s_job->s_fence->finished);
@@ -5220,8 +5231,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
         *
         * job->base holds a reference to parent fence
         */
-       if (job && job->base.s_fence->parent &&
-           dma_fence_is_signaled(job->base.s_fence->parent)) {
+       if (job && (job->hw_fence.ops != NULL) &&
+           dma_fence_is_signaled(&job->hw_fence)) {
                job_signaled = true;
                dev_info(adev->dev, "Guilty job already signaled, skipping HW 
reset");
                goto skip_hw_reset;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index d6d54ba4c185..9bd4e18212fc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -164,11 +164,16 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct 
dma_fence **f, struct amd
        if (job && job->job_run_counter) {
                /* reinit seq for resubmitted jobs */
                fence->seqno = seq;
+               /* TO be inline with external fence creation and other drivers 
*/
+               dma_fence_get(fence);
        } else {
-               if (job)
+               if (job) {
                        dma_fence_init(fence, &amdgpu_job_fence_ops,
                                       &ring->fence_drv.lock,
                                       adev->fence_context + ring->idx, seq);
+                       /* Against remove in amdgpu_job_{free, free_cb} */
+                       dma_fence_get(fence);
+               }
                else
                        dma_fence_init(fence, &amdgpu_fence_ops,
                                       &ring->fence_drv.lock,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 58568fdde2d0..638e1d600258 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -267,10 +267,6 @@ static struct dma_fence *amdgpu_job_run(struct 
drm_sched_job *sched_job)
                        DRM_ERROR("Error scheduling IBs (%d)\n", r);
        }
 
-       if (!job->job_run_counter)
-               dma_fence_get(fence);
-       else if (finished->error < 0)
-               dma_fence_put(&job->hw_fence);
        job->job_run_counter++;
        amdgpu_job_free_resources(job);
 
-- 
2.25.1

Reply via email to