Re: [PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embedded hw_fence

2021-08-10 Thread Jingwen Chen
Hi Andrey,

The latest patch, [PATCH v4] drm/amd/amdgpu embed hw_fence into
amdgpu_job, has been sent to amd-gfx. Can you help review this patch?

Best Regards,
Jingwen
On Tue Aug 10, 2021 at 10:51:17AM +0800, Jingwen Chen wrote:
> On Mon Aug 09, 2021 at 12:24:37PM -0400, Andrey Grodzovsky wrote:
> > 
> > On 2021-08-05 4:31 a.m., Jingwen Chen wrote:
> > > [Why]
> > > After embedding the hw_fence into amdgpu_job, we need to add tdr support
> > > for this feature.
> > > 
> > > [How]
> > > 1. Add a resubmit_flag for resubmit jobs.
> > > 2. Clear job fence from RCU and force complete vm flush fences in
> > > pre_asic_reset
> > > 3. skip dma_fence_get for resubmit jobs and add a dma_fence_put
> > > for guilty jobs.
> > > v2:
> > > use a job_run_counter in amdgpu_job to replace the resubmit_flag in
> > > drm_sched_job. When the job_run_counter >= 1, it means this job is a
> > > resubmit job.
> > > 
> > > Signed-off-by: Jack Zhang 
> > > Signed-off-by: Jingwen Chen 
> > > ---
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++-
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 13 +
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c|  5 -
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h|  3 +++
> > >   4 files changed, 27 insertions(+), 6 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > index 9e53ff851496..ade2fa07a50a 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > @@ -4447,7 +4447,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device 
> > > *adev)
> > >   int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> > >struct amdgpu_reset_context 
> > > *reset_context)
> > >   {
> > > - int i, r = 0;
> > > + int i, j, r = 0;
> > >   struct amdgpu_job *job = NULL;
> > >   bool need_full_reset =
> > >   test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
> > > @@ -4471,6 +4471,16 @@ int amdgpu_device_pre_asic_reset(struct 
> > > amdgpu_device *adev,
> > >   if (!ring || !ring->sched.thread)
> > >   continue;
> > > + /* clear job fences from fence_drv so they are not force-completed;
> > > +  * leave only NULL entries and vm flush fences in fence_drv */
> > > + for (j = 0; j <= ring->fence_drv.num_fences_mask; j++) {
> > > + struct dma_fence *old, **ptr;
> > > + ptr = &ring->fence_drv.fences[j];
> > > + old = rcu_dereference_protected(*ptr, 1);
> > > + if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) {
> > > + RCU_INIT_POINTER(*ptr, NULL);
> > > + }
> > > + }
> > >   /* after all hw jobs are reset, hw fence is 
> > > meaningless, so force_completion */
> > >   amdgpu_fence_driver_force_completion(ring);
> > >   }
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > index 5e29d797a265..c9752cf794fb 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > @@ -159,10 +159,15 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, 
> > > struct dma_fence **f, struct amd
> > >   }
> > >   seq = ++ring->fence_drv.sync_seq;
> > > - dma_fence_init(fence, &amdgpu_fence_ops,
> > > -&ring->fence_drv.lock,
> > > -adev->fence_context + ring->idx,
> > > -seq);
> > > + if (job != NULL && job->job_run_counter) {
> > > + /* reinit seq for resubmitted jobs */
> > > + fence->seqno = seq;
> > > + } else {
> > > + dma_fence_init(fence, &amdgpu_fence_ops,
> > > + &ring->fence_drv.lock,
> > > + adev->fence_context + ring->idx,
> > > + seq);
> > > + }
> > 
> > 
> > I think this should actually be in the first patch (and the counter too);
> > without it, the first patch is buggy.
> > 
> I originally split these two patches into adding the job submission
> sequence and adding the TDR sequence. But yes, I should merge these two
> patches, otherwise the TDR sequence will fail without the second patch.
> I will send a merged version today.
> 
> Best Regards,
> Jingwen
> > 
> > >   if (job != NULL) {
> > >   /* mark this fence has a parent job */
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > index 65a395060de2..19b13a65c73b 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > > @@ -254,6 +254,7 @@ static struct dma_fence *amdgpu_job_run(struct 
> > > drm_sched_job *sched_job)
> > >   dma_fence_set_error(finished, -ECANCELED);/* skip IB as 
> > > well if VRAM lost */

Re: [PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embedded hw_fence

2021-08-10 Thread Jingwen Chen
On Mon Aug 09, 2021 at 12:24:37PM -0400, Andrey Grodzovsky wrote:
> 
> On 2021-08-05 4:31 a.m., Jingwen Chen wrote:
> > [Why]
> > After embedding the hw_fence into amdgpu_job, we need to add tdr support
> > for this feature.
> > 
> > [How]
> > 1. Add a resubmit_flag for resubmit jobs.
> > 2. Clear job fence from RCU and force complete vm flush fences in
> > pre_asic_reset
> > 3. skip dma_fence_get for resubmit jobs and add a dma_fence_put
> > for guilty jobs.
> > v2:
> > use a job_run_counter in amdgpu_job to replace the resubmit_flag in
> > drm_sched_job. When the job_run_counter >= 1, it means this job is a
> > resubmit job.
> > 
> > Signed-off-by: Jack Zhang 
> > Signed-off-by: Jingwen Chen 
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 13 +
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c|  5 -
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h|  3 +++
> >   4 files changed, 27 insertions(+), 6 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 9e53ff851496..ade2fa07a50a 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -4447,7 +4447,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device 
> > *adev)
> >   int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> >  struct amdgpu_reset_context *reset_context)
> >   {
> > -   int i, r = 0;
> > +   int i, j, r = 0;
> > struct amdgpu_job *job = NULL;
> > bool need_full_reset =
> > test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
> > @@ -4471,6 +4471,16 @@ int amdgpu_device_pre_asic_reset(struct 
> > amdgpu_device *adev,
> > if (!ring || !ring->sched.thread)
> > continue;
> > +   /* clear job fences from fence_drv so they are not force-completed;
> > +    * leave only NULL entries and vm flush fences in fence_drv */
> > +   for (j = 0; j <= ring->fence_drv.num_fences_mask; j++) {
> > +   struct dma_fence *old, **ptr;
> > +   ptr = &ring->fence_drv.fences[j];
> > +   old = rcu_dereference_protected(*ptr, 1);
> > +   if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) {
> > +   RCU_INIT_POINTER(*ptr, NULL);
> > +   }
> > +   }
> > /* after all hw jobs are reset, hw fence is meaningless, so 
> > force_completion */
> > amdgpu_fence_driver_force_completion(ring);
> > }
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > index 5e29d797a265..c9752cf794fb 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > @@ -159,10 +159,15 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, 
> > struct dma_fence **f, struct amd
> > }
> > seq = ++ring->fence_drv.sync_seq;
> > -   dma_fence_init(fence, &amdgpu_fence_ops,
> > -  &ring->fence_drv.lock,
> > -  adev->fence_context + ring->idx,
> > -  seq);
> > +   if (job != NULL && job->job_run_counter) {
> > +   /* reinit seq for resubmitted jobs */
> > +   fence->seqno = seq;
> > +   } else {
> > +   dma_fence_init(fence, &amdgpu_fence_ops,
> > +   &ring->fence_drv.lock,
> > +   adev->fence_context + ring->idx,
> > +   seq);
> > +   }
> 
> 
> I think this should actually be in the first patch (and the counter too);
> without it, the first patch is buggy.
> 
I originally split these two patches into adding the job submission
sequence and adding the TDR sequence. But yes, I should merge these two
patches, otherwise the TDR sequence will fail without the second patch.
I will send a merged version today.

Best Regards,
Jingwen
> 
> > if (job != NULL) {
> > /* mark this fence has a parent job */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > index 65a395060de2..19b13a65c73b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > @@ -254,6 +254,7 @@ static struct dma_fence *amdgpu_job_run(struct 
> > drm_sched_job *sched_job)
> > dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if 
> > VRAM lost */
> > if (finished->error < 0) {
> > +   dma_fence_put(&job->hw_fence);
> 
> 
> I would put this check below, with the job_run_counter check.
> 
> 
> > DRM_INFO("Skip scheduling IBs!\n");
> > } else {
> > r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
> > @@ -262,7 +263,9 @@ static struct dma_fence *amdgpu_job_run(struct 
> > drm_sched_job *sched_job)
> > DRM_ERROR("Error scheduling IBs (%d)\n", r);
> > }
> > -   

Re: [PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embedded hw_fence

2021-08-09 Thread Andrey Grodzovsky



On 2021-08-05 4:31 a.m., Jingwen Chen wrote:

[Why]
After embedding the hw_fence into amdgpu_job, we need to add tdr support
for this feature.

[How]
1. Add a resubmit_flag for resubmit jobs.
2. Clear job fence from RCU and force complete vm flush fences in
pre_asic_reset
3. skip dma_fence_get for resubmit jobs and add a dma_fence_put
for guilty jobs.
v2:
use a job_run_counter in amdgpu_job to replace the resubmit_flag in
drm_sched_job. When the job_run_counter >= 1, it means this job is a
resubmit job.

Signed-off-by: Jack Zhang 
Signed-off-by: Jingwen Chen 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 13 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c|  5 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h|  3 +++
  4 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 9e53ff851496..ade2fa07a50a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4447,7 +4447,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
  int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 struct amdgpu_reset_context *reset_context)
  {
-   int i, r = 0;
+   int i, j, r = 0;
struct amdgpu_job *job = NULL;
bool need_full_reset =
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
@@ -4471,6 +4471,16 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device 
*adev,
if (!ring || !ring->sched.thread)
continue;
  
+   /* clear job fences from fence_drv so they are not force-completed;
+    * leave only NULL entries and vm flush fences in fence_drv */
+   for (j = 0; j <= ring->fence_drv.num_fences_mask; j++) {
+   struct dma_fence *old, **ptr;
+   ptr = &ring->fence_drv.fences[j];
+   old = rcu_dereference_protected(*ptr, 1);
+   if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) {
+   RCU_INIT_POINTER(*ptr, NULL);
+   }
+   }
/* after all hw jobs are reset, hw fence is meaningless, so 
force_completion */
amdgpu_fence_driver_force_completion(ring);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 5e29d797a265..c9752cf794fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -159,10 +159,15 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct 
dma_fence **f, struct amd
}
  
  	seq = ++ring->fence_drv.sync_seq;

-   dma_fence_init(fence, &amdgpu_fence_ops,
-  &ring->fence_drv.lock,
-  adev->fence_context + ring->idx,
-  seq);
+   if (job != NULL && job->job_run_counter) {
+   /* reinit seq for resubmitted jobs */
+   fence->seqno = seq;
+   } else {
+   dma_fence_init(fence, &amdgpu_fence_ops,
+   &ring->fence_drv.lock,
+   adev->fence_context + ring->idx,
+   seq);
+   }



I think this should actually be in the first patch (and the counter too);
without it, the first patch is buggy.


  
  	if (job != NULL) {

/* mark this fence has a parent job */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 65a395060de2..19b13a65c73b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -254,6 +254,7 @@ static struct dma_fence *amdgpu_job_run(struct 
drm_sched_job *sched_job)
dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if 
VRAM lost */
  
  	if (finished->error < 0) {

+   dma_fence_put(&job->hw_fence);



I would put this check below, with the job_run_counter check.
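
One possible shape for that, as an untested sketch (not part of the patch): the
dma_fence_put() is simply moved down next to the run-counter handling in
amdgpu_job_run(); the intent is that behaviour stays the same.

	if (!job->job_run_counter)
		dma_fence_get(fence);
	if (finished->error < 0)
		dma_fence_put(&job->hw_fence); /* guilty job: drop the extra reference */
	job->job_run_counter++;
	amdgpu_job_free_resources(job);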



DRM_INFO("Skip scheduling IBs!\n");
} else {
r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
@@ -262,7 +263,9 @@ static struct dma_fence *amdgpu_job_run(struct 
drm_sched_job *sched_job)
DRM_ERROR("Error scheduling IBs (%d)\n", r);
}
  
-	dma_fence_get(fence);

+   if (!job->job_run_counter)
+   dma_fence_get(fence);
+   job->job_run_counter ++;
amdgpu_job_free_resources(job);



Here you modify code you already changed in patch 1. It looks to me like
those 2 patches should be squashed into one, as the changes are directly
dependent and hard to follow.

Andrey


  
  	fence = r ? ERR_PTR(r) : fence;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 92324c978534..1fa667f245e1 100644
--- 

[PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embedded hw_fence

2021-08-05 Thread Jingwen Chen
[Why]
After embedding the hw_fence into amdgpu_job, we need to add tdr support
for this feature.

[How]
1. Add a resubmit_flag for resubmit jobs.
2. Clear job fence from RCU and force complete vm flush fences in
   pre_asic_reset
3. skip dma_fence_get for resubmit jobs and add a dma_fence_put
   for guilty jobs.
v2:
use a job_run_counter in amdgpu_job to replace the resubmit_flag in
drm_sched_job. When the job_run_counter >= 1, it means this job is a
resubmit job.

Signed-off-by: Jack Zhang 
Signed-off-by: Jingwen Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 13 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c|  5 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h|  3 +++
 4 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 9e53ff851496..ade2fa07a50a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4447,7 +4447,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 struct amdgpu_reset_context *reset_context)
 {
-   int i, r = 0;
+   int i, j, r = 0;
struct amdgpu_job *job = NULL;
bool need_full_reset =
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
@@ -4471,6 +4471,16 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device 
*adev,
if (!ring || !ring->sched.thread)
continue;
 
+   /* clear job fences from fence_drv so they are not force-completed;
+    * leave only NULL entries and vm flush fences in fence_drv */
+   for (j = 0; j <= ring->fence_drv.num_fences_mask; j++) {
+   struct dma_fence *old, **ptr;
+   ptr = &ring->fence_drv.fences[j];
+   old = rcu_dereference_protected(*ptr, 1);
+   if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) {
+   RCU_INIT_POINTER(*ptr, NULL);
+   }
+   }
/* after all hw jobs are reset, hw fence is meaningless, so 
force_completion */
amdgpu_fence_driver_force_completion(ring);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 5e29d797a265..c9752cf794fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -159,10 +159,15 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct 
dma_fence **f, struct amd
}
 
seq = ++ring->fence_drv.sync_seq;
-   dma_fence_init(fence, &amdgpu_fence_ops,
-  &ring->fence_drv.lock,
-  adev->fence_context + ring->idx,
-  seq);
+   if (job != NULL && job->job_run_counter) {
+   /* reinit seq for resubmitted jobs */
+   fence->seqno = seq;
+   } else {
+   dma_fence_init(fence, &amdgpu_fence_ops,
+   &ring->fence_drv.lock,
+   adev->fence_context + ring->idx,
+   seq);
+   }
 
if (job != NULL) {
/* mark this fence has a parent job */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 65a395060de2..19b13a65c73b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -254,6 +254,7 @@ static struct dma_fence *amdgpu_job_run(struct 
drm_sched_job *sched_job)
dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if 
VRAM lost */
 
if (finished->error < 0) {
+   dma_fence_put(&job->hw_fence);
DRM_INFO("Skip scheduling IBs!\n");
} else {
r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
@@ -262,7 +263,9 @@ static struct dma_fence *amdgpu_job_run(struct 
drm_sched_job *sched_job)
DRM_ERROR("Error scheduling IBs (%d)\n", r);
}
 
-   dma_fence_get(fence);
+   if (!job->job_run_counter)
+   dma_fence_get(fence);
+   job->job_run_counter ++;
amdgpu_job_free_resources(job);
 
fence = r ? ERR_PTR(r) : fence;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 92324c978534..1fa667f245e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -64,6 +64,9 @@ struct amdgpu_job {
/* user fence handling */
uint64_t	uf_addr;
uint64_t	uf_sequence;
+
+   /* job_run_counter >= 1 means a resubmit job */
+   uint32_t	job_run_counter;
 };
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
-- 
2.25.1
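
For reference, the reference-counting rule described in the commit message
(take a reference on the embedded hw_fence only on the job's first run, skip
it on resubmission, and drop one reference for a guilty job) can be modelled
with a small stand-alone C program. This is purely illustrative: the toy_*
names are invented for this sketch, and the real driver manipulates struct
dma_fence through dma_fence_get()/dma_fence_put() rather than a plain counter.

/* Stand-alone toy model (not driver code) of the job_run_counter rule. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct toy_fence { int refcount; };

struct toy_job {
	struct toy_fence hw_fence;	/* models the embedded hw_fence */
	uint32_t job_run_counter;	/* >= 1 means the job was resubmitted */
};

static void toy_fence_get(struct toy_fence *f) { f->refcount++; }
static void toy_fence_put(struct toy_fence *f) { f->refcount--; }

/* Mirrors the flow in the amdgpu_job_run() hunk above: a guilty/cancelled
 * run drops one reference, and only the very first run takes a reference. */
static void toy_job_run(struct toy_job *job, int canceled)
{
	if (canceled)
		toy_fence_put(&job->hw_fence);
	if (!job->job_run_counter)
		toy_fence_get(&job->hw_fence);
	job->job_run_counter++;
}

int main(void)
{
	/* one reference held by the job that embeds the fence */
	struct toy_job job = { .hw_fence = { .refcount = 1 }, .job_run_counter = 0 };

	toy_job_run(&job, 0);	/* first submission: takes one reference */
	toy_job_run(&job, 0);	/* resubmission after reset: no extra get */
	printf("refcount=%d run_counter=%u\n",
	       job.hw_fence.refcount, (unsigned)job.job_run_counter);
	assert(job.hw_fence.refcount == 2);
	return 0;
}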