Re: [PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embedded hw_fence
Hi Andrey, The latest patch [PATCH v4] drm/amd/amdgpu embed hw_fence into amdgpu_job has been sent to amd-gfx. can you help review this patch? Best Regards, Jingwen On Tue Aug 10, 2021 at 10:51:17AM +0800, Jingwen Chen wrote: > On Mon Aug 09, 2021 at 12:24:37PM -0400, Andrey Grodzovsky wrote: > > > > On 2021-08-05 4:31 a.m., Jingwen Chen wrote: > > > [Why] > > > After embeded hw_fence to amdgpu_job, we need to add tdr support > > > for this feature. > > > > > > [How] > > > 1. Add a resubmit_flag for resubmit jobs. > > > 2. Clear job fence from RCU and force complete vm flush fences in > > > pre_asic_reset > > > 3. skip dma_fence_get for resubmit jobs and add a dma_fence_put > > > for guilty jobs. > > > v2: > > > use a job_run_counter in amdgpu_job to replace the resubmit_flag in > > > drm_sched_job. When the job_run_counter >= 1, it means this job is a > > > resubmit job. > > > > > > Signed-off-by: Jack Zhang > > > Signed-off-by: Jingwen Chen > > > --- > > > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++- > > > drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 13 + > > > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c| 5 - > > > drivers/gpu/drm/amd/amdgpu/amdgpu_job.h| 3 +++ > > > 4 files changed, 27 insertions(+), 6 deletions(-) > > > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > > index 9e53ff851496..ade2fa07a50a 100644 > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > > @@ -4447,7 +4447,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device > > > *adev) > > > int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, > > >struct amdgpu_reset_context > > > *reset_context) > > > { > > > - int i, r = 0; > > > + int i, j, r = 0; > > > struct amdgpu_job *job = NULL; > > > bool need_full_reset = > > > test_bit(AMDGPU_NEED_FULL_RESET, _context->flags); > > > @@ -4471,6 +4471,16 @@ int amdgpu_device_pre_asic_reset(struct > > > amdgpu_device 
*adev, > > > if (!ring || !ring->sched.thread) > > > continue; > > > + /*clear job fence from fence drv to avoid force_completion > > > + *leave NULL and vm flush fence in fence drv */ > > > + for (j = 0; j <= ring->fence_drv.num_fences_mask; j ++) { > > > + struct dma_fence *old,**ptr; > > > + ptr = >fence_drv.fences[j]; > > > + old = rcu_dereference_protected(*ptr, 1); > > > + if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, > > > >flags)) { > > > + RCU_INIT_POINTER(*ptr, NULL); > > > + } > > > + } > > > /* after all hw jobs are reset, hw fence is > > > meaningless, so force_completion */ > > > amdgpu_fence_driver_force_completion(ring); > > > } > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > > index 5e29d797a265..c9752cf794fb 100644 > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > > @@ -159,10 +159,15 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, > > > struct dma_fence **f, struct amd > > > } > > > seq = ++ring->fence_drv.sync_seq; > > > - dma_fence_init(fence, _fence_ops, > > > ->fence_drv.lock, > > > -adev->fence_context + ring->idx, > > > -seq); > > > + if (job != NULL && job->job_run_counter) { > > > + /* reinit seq for resubmitted jobs */ > > > + fence->seqno = seq; > > > + } else { > > > + dma_fence_init(fence, _fence_ops, > > > + >fence_drv.lock, > > > + adev->fence_context + ring->idx, > > > + seq); > > > + } > > > > > > I think this should be in the first patch actually (and the counter too), > > without it the first patch is buggy. > > > I was originally split these two patches by adding job submission > seqence and adding tdr sequence, But yes, I should merge these two > patches otherwise the tdr sequence will fail without second patch. > Will send a merged version today. 
> > Best Regards, > Jingwen > > > > > if (job != NULL) { > > > /* mark this fence has a parent job */ > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > > index 65a395060de2..19b13a65c73b 100644 > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > > @@ -254,6 +254,7 @@ static struct dma_fence *amdgpu_job_run(struct > > > drm_sched_job *sched_job) > > > dma_fence_set_error(finished, -ECANCELED);/* skip IB as > > > well if VRAM lost */
Re: [PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embedded hw_fence
On Mon Aug 09, 2021 at 12:24:37PM -0400, Andrey Grodzovsky wrote: > > On 2021-08-05 4:31 a.m., Jingwen Chen wrote: > > [Why] > > After embeded hw_fence to amdgpu_job, we need to add tdr support > > for this feature. > > > > [How] > > 1. Add a resubmit_flag for resubmit jobs. > > 2. Clear job fence from RCU and force complete vm flush fences in > > pre_asic_reset > > 3. skip dma_fence_get for resubmit jobs and add a dma_fence_put > > for guilty jobs. > > v2: > > use a job_run_counter in amdgpu_job to replace the resubmit_flag in > > drm_sched_job. When the job_run_counter >= 1, it means this job is a > > resubmit job. > > > > Signed-off-by: Jack Zhang > > Signed-off-by: Jingwen Chen > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++- > > drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 13 + > > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c| 5 - > > drivers/gpu/drm/amd/amdgpu/amdgpu_job.h| 3 +++ > > 4 files changed, 27 insertions(+), 6 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > index 9e53ff851496..ade2fa07a50a 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > @@ -4447,7 +4447,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device > > *adev) > > int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, > > struct amdgpu_reset_context *reset_context) > > { > > - int i, r = 0; > > + int i, j, r = 0; > > struct amdgpu_job *job = NULL; > > bool need_full_reset = > > test_bit(AMDGPU_NEED_FULL_RESET, _context->flags); > > @@ -4471,6 +4471,16 @@ int amdgpu_device_pre_asic_reset(struct > > amdgpu_device *adev, > > if (!ring || !ring->sched.thread) > > continue; > > + /*clear job fence from fence drv to avoid force_completion > > +*leave NULL and vm flush fence in fence drv */ > > + for (j = 0; j <= ring->fence_drv.num_fences_mask; j ++) { > > + struct dma_fence *old,**ptr; > > + ptr = >fence_drv.fences[j]; > > + old = 
rcu_dereference_protected(*ptr, 1); > > + if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, > > >flags)) { > > + RCU_INIT_POINTER(*ptr, NULL); > > + } > > + } > > /* after all hw jobs are reset, hw fence is meaningless, so > > force_completion */ > > amdgpu_fence_driver_force_completion(ring); > > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > index 5e29d797a265..c9752cf794fb 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > @@ -159,10 +159,15 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, > > struct dma_fence **f, struct amd > > } > > seq = ++ring->fence_drv.sync_seq; > > - dma_fence_init(fence, _fence_ops, > > - >fence_drv.lock, > > - adev->fence_context + ring->idx, > > - seq); > > + if (job != NULL && job->job_run_counter) { > > + /* reinit seq for resubmitted jobs */ > > + fence->seqno = seq; > > + } else { > > + dma_fence_init(fence, _fence_ops, > > + >fence_drv.lock, > > + adev->fence_context + ring->idx, > > + seq); > > + } > > > I think this should be in the first patch actually (and the counter too), > without it the first patch is buggy. > I was originally split these two patches by adding job submission seqence and adding tdr sequence, But yes, I should merge these two patches otherwise the tdr sequence will fail without second patch. Will send a merged version today. 
Best Regards, Jingwen > > > if (job != NULL) { > > /* mark this fence has a parent job */ > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > index 65a395060de2..19b13a65c73b 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > @@ -254,6 +254,7 @@ static struct dma_fence *amdgpu_job_run(struct > > drm_sched_job *sched_job) > > dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if > > VRAM lost */ > > if (finished->error < 0) { > > + dma_fence_put(>hw_fence); > > > Would put this check bellow with the job_run_counter check > > > > DRM_INFO("Skip scheduling IBs!\n"); > > } else { > > r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job, > > @@ -262,7 +263,9 @@ static struct dma_fence *amdgpu_job_run(struct > > drm_sched_job *sched_job) > > DRM_ERROR("Error scheduling IBs (%d)\n", r); > > } > > -
Re: [PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embedded hw_fence
On 2021-08-05 4:31 a.m., Jingwen Chen wrote: [Why] After embeded hw_fence to amdgpu_job, we need to add tdr support for this feature. [How] 1. Add a resubmit_flag for resubmit jobs. 2. Clear job fence from RCU and force complete vm flush fences in pre_asic_reset 3. skip dma_fence_get for resubmit jobs and add a dma_fence_put for guilty jobs. v2: use a job_run_counter in amdgpu_job to replace the resubmit_flag in drm_sched_job. When the job_run_counter >= 1, it means this job is a resubmit job. Signed-off-by: Jack Zhang Signed-off-by: Jingwen Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 13 + drivers/gpu/drm/amd/amdgpu/amdgpu_job.c| 5 - drivers/gpu/drm/amd/amdgpu/amdgpu_job.h| 3 +++ 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9e53ff851496..ade2fa07a50a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4447,7 +4447,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev) int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, struct amdgpu_reset_context *reset_context) { - int i, r = 0; + int i, j, r = 0; struct amdgpu_job *job = NULL; bool need_full_reset = test_bit(AMDGPU_NEED_FULL_RESET, _context->flags); @@ -4471,6 +4471,16 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, if (!ring || !ring->sched.thread) continue; + /*clear job fence from fence drv to avoid force_completion +*leave NULL and vm flush fence in fence drv */ + for (j = 0; j <= ring->fence_drv.num_fences_mask; j ++) { + struct dma_fence *old,**ptr; + ptr = >fence_drv.fences[j]; + old = rcu_dereference_protected(*ptr, 1); + if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, >flags)) { + RCU_INIT_POINTER(*ptr, NULL); + } + } /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 
amdgpu_fence_driver_force_completion(ring); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 5e29d797a265..c9752cf794fb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -159,10 +159,15 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd } seq = ++ring->fence_drv.sync_seq; - dma_fence_init(fence, _fence_ops, - >fence_drv.lock, - adev->fence_context + ring->idx, - seq); + if (job != NULL && job->job_run_counter) { + /* reinit seq for resubmitted jobs */ + fence->seqno = seq; + } else { + dma_fence_init(fence, _fence_ops, + >fence_drv.lock, + adev->fence_context + ring->idx, + seq); + } I think this should be in the first patch actually (and the counter too), without it the first patch is buggy. if (job != NULL) { /* mark this fence has a parent job */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 65a395060de2..19b13a65c73b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -254,6 +254,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */ if (finished->error < 0) { + dma_fence_put(>hw_fence); Would put this check bellow with the job_run_counter check DRM_INFO("Skip scheduling IBs!\n"); } else { r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job, @@ -262,7 +263,9 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) DRM_ERROR("Error scheduling IBs (%d)\n", r); } - dma_fence_get(fence); + if (!job->job_run_counter) + dma_fence_get(fence); + job->job_run_counter ++; amdgpu_job_free_resources(job); Here you modify code you already changed in patch 1, looks to me like those 2 patches should be squashed into one patch as the changes are directly dependent and it's hard to follow Andrey fence = r ? 
ERR_PTR(r) : fence; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h index 92324c978534..1fa667f245e1 100644 ---
[PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embedded hw_fence
[Why] After embeded hw_fence to amdgpu_job, we need to add tdr support for this feature. [How] 1. Add a resubmit_flag for resubmit jobs. 2. Clear job fence from RCU and force complete vm flush fences in pre_asic_reset 3. skip dma_fence_get for resubmit jobs and add a dma_fence_put for guilty jobs. v2: use a job_run_counter in amdgpu_job to replace the resubmit_flag in drm_sched_job. When the job_run_counter >= 1, it means this job is a resubmit job. Signed-off-by: Jack Zhang Signed-off-by: Jingwen Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 13 + drivers/gpu/drm/amd/amdgpu/amdgpu_job.c| 5 - drivers/gpu/drm/amd/amdgpu/amdgpu_job.h| 3 +++ 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9e53ff851496..ade2fa07a50a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4447,7 +4447,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev) int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, struct amdgpu_reset_context *reset_context) { - int i, r = 0; + int i, j, r = 0; struct amdgpu_job *job = NULL; bool need_full_reset = test_bit(AMDGPU_NEED_FULL_RESET, _context->flags); @@ -4471,6 +4471,16 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, if (!ring || !ring->sched.thread) continue; + /*clear job fence from fence drv to avoid force_completion +*leave NULL and vm flush fence in fence drv */ + for (j = 0; j <= ring->fence_drv.num_fences_mask; j ++) { + struct dma_fence *old,**ptr; + ptr = >fence_drv.fences[j]; + old = rcu_dereference_protected(*ptr, 1); + if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, >flags)) { + RCU_INIT_POINTER(*ptr, NULL); + } + } /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ amdgpu_fence_driver_force_completion(ring); } diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 5e29d797a265..c9752cf794fb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -159,10 +159,15 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd } seq = ++ring->fence_drv.sync_seq; - dma_fence_init(fence, _fence_ops, - >fence_drv.lock, - adev->fence_context + ring->idx, - seq); + if (job != NULL && job->job_run_counter) { + /* reinit seq for resubmitted jobs */ + fence->seqno = seq; + } else { + dma_fence_init(fence, _fence_ops, + >fence_drv.lock, + adev->fence_context + ring->idx, + seq); + } if (job != NULL) { /* mark this fence has a parent job */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 65a395060de2..19b13a65c73b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -254,6 +254,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */ if (finished->error < 0) { + dma_fence_put(>hw_fence); DRM_INFO("Skip scheduling IBs!\n"); } else { r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job, @@ -262,7 +263,9 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) DRM_ERROR("Error scheduling IBs (%d)\n", r); } - dma_fence_get(fence); + if (!job->job_run_counter) + dma_fence_get(fence); + job->job_run_counter ++; amdgpu_job_free_resources(job); fence = r ? 
ERR_PTR(r) : fence; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h index 92324c978534..1fa667f245e1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h @@ -64,6 +64,9 @@ struct amdgpu_job { /* user fence handling */ uint64_tuf_addr; uint64_tuf_sequence; + + /* job_run_counter >= 1 means a resubmit job */ + uint32_tjob_run_counter; }; int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, -- 2.25.1