Re: [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job

2021-08-09 Thread Jingwen Chen
On Mon Aug 09, 2021 at 10:18:37AM +0800, Jingwen Chen wrote:
> On Fri Aug 06, 2021 at 11:48:04AM +0200, Christian König wrote:
> > 
> > 
> > Am 06.08.21 um 07:52 schrieb Jingwen Chen:
> > > On Thu Aug 05, 2021 at 05:13:22PM -0400, Andrey Grodzovsky wrote:
> > > > On 2021-08-05 4:31 a.m., Jingwen Chen wrote:
> > > > > From: Jack Zhang 
> > > > > 
> > > > > Why: Previously hw fence is alloced separately with job.
> > > > > It caused historical lifetime issues and corner cases.
> > > > > The ideal situation is to take fence to manage both job
> > > > > and fence's lifetime, and simplify the design of gpu-scheduler.
> > > > > 
> > > > > How:
> > > > > We propose to embed hw_fence into amdgpu_job.
> > > > > 1. We cover the normal job submission by this method.
> > > > > 2. For ib_test, and submit without a parent job keep the
> > > > > legacy way to create a hw fence separately.
> > > > > v2:
> > > > > use AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT to show that the fence is
> > > > > embeded in a job.
> > > > > 
> > > > > Signed-off-by: Jingwen Chen 
> > > > > Signed-off-by: Jack Zhang 
> > > > > ---
> > > > >drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  1 -
> > > > >drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
> > > > >drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   | 63 
> > > > > -
> > > > >drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  |  2 +-
> > > > >drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 35 
> > > > >drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  4 +-
> > > > >drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h|  5 +-
> > > > >drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |  2 +-
> > > > >8 files changed, 84 insertions(+), 30 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> > > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > > > > index 7b46ba551cb2..3003ee1c9487 100644
> > > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > > > > @@ -714,7 +714,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, 
> > > > > enum kgd_engine_type engine,
> > > > >   ret = dma_fence_wait(f, false);
> > > > >err_ib_sched:
> > > > > - dma_fence_put(f);
> > > > >   amdgpu_job_free(job);
> > > > >err:
> > > > >   return ret;
> > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> > > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > > > > index 536005bff24a..277128846dd1 100644
> > > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > > > > @@ -1414,7 +1414,7 @@ static void 
> > > > > amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
> > > > >   continue;
> > > > >   }
> > > > >   job = to_amdgpu_job(s_job);
> > > > > - if (preempted && job->fence == fence)
> > > > > + if (preempted && (>hw_fence) == fence)
> > > > >   /* mark the job as preempted */
> > > > >   job->preemption_status |= AMDGPU_IB_PREEMPTED;
> > > > >   }
> > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
> > > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > > > index 7495911516c2..5e29d797a265 100644
> > > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > > > @@ -129,30 +129,46 @@ static u32 amdgpu_fence_read(struct amdgpu_ring 
> > > > > *ring)
> > > > > *
> > > > > * @ring: ring the fence is associated with
> > > > > * @f: resulting fence object
> > > > > + * @job: job the fence is embeded in
> > > > > * @flags: flags to pass into the subordinate .emit_fence() call
> > > > > *
> > > > > * Emits a fence command on the requested ring (all asics).
> > > > > * Returns 0 on success, -ENOMEM on failure.
> > > > > */
> > > > > -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
> > > > > +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence 
> > > > > **f, struct amdgpu_job *job,
> > > > > unsigned flags)
> > > > >{
> > > > >   struct amdgpu_device *adev = ring->adev;
> > > > > - struct amdgpu_fence *fence;
> > > > > + struct dma_fence *fence;
> > > > > + struct amdgpu_fence *am_fence;
> > > > >   struct dma_fence __rcu **ptr;
> > > > >   uint32_t seq;
> > > > >   int r;
> > > > > - fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
> > > > > - if (fence == NULL)
> > > > > - return -ENOMEM;
> > > > > + if (job == NULL) {
> > > > > + /* create a sperate hw fence */
> > > > > + am_fence = kmem_cache_alloc(amdgpu_fence_slab, 
> > > > > GFP_ATOMIC);
> > > > > + if (am_fence == NULL)
> > > > > + return -ENOMEM;
> > > > > + fence = _fence->base;
> > > > > + 

Re: [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job

2021-08-09 Thread Jingwen Chen
On Fri Aug 06, 2021 at 11:48:04AM +0200, Christian König wrote:
> 
> 
> Am 06.08.21 um 07:52 schrieb Jingwen Chen:
> > On Thu Aug 05, 2021 at 05:13:22PM -0400, Andrey Grodzovsky wrote:
> > > On 2021-08-05 4:31 a.m., Jingwen Chen wrote:
> > > > From: Jack Zhang 
> > > > 
> > > > Why: Previously hw fence is alloced separately with job.
> > > > It caused historical lifetime issues and corner cases.
> > > > The ideal situation is to take fence to manage both job
> > > > and fence's lifetime, and simplify the design of gpu-scheduler.
> > > > 
> > > > How:
> > > > We propose to embed hw_fence into amdgpu_job.
> > > > 1. We cover the normal job submission by this method.
> > > > 2. For ib_test, and submit without a parent job keep the
> > > > legacy way to create a hw fence separately.
> > > > v2:
> > > > use AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT to show that the fence is
> > > > embeded in a job.
> > > > 
> > > > Signed-off-by: Jingwen Chen 
> > > > Signed-off-by: Jack Zhang 
> > > > ---
> > > >drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  1 -
> > > >drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
> > > >drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   | 63 
> > > > -
> > > >drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  |  2 +-
> > > >drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 35 
> > > >drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  4 +-
> > > >drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h|  5 +-
> > > >drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |  2 +-
> > > >8 files changed, 84 insertions(+), 30 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > > > index 7b46ba551cb2..3003ee1c9487 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > > > @@ -714,7 +714,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, 
> > > > enum kgd_engine_type engine,
> > > > ret = dma_fence_wait(f, false);
> > > >err_ib_sched:
> > > > -   dma_fence_put(f);
> > > > amdgpu_job_free(job);
> > > >err:
> > > > return ret;
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > > > index 536005bff24a..277128846dd1 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > > > @@ -1414,7 +1414,7 @@ static void 
> > > > amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
> > > > continue;
> > > > }
> > > > job = to_amdgpu_job(s_job);
> > > > -   if (preempted && job->fence == fence)
> > > > +   if (preempted && (>hw_fence) == fence)
> > > > /* mark the job as preempted */
> > > > job->preemption_status |= AMDGPU_IB_PREEMPTED;
> > > > }
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
> > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > > index 7495911516c2..5e29d797a265 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > > > @@ -129,30 +129,46 @@ static u32 amdgpu_fence_read(struct amdgpu_ring 
> > > > *ring)
> > > > *
> > > > * @ring: ring the fence is associated with
> > > > * @f: resulting fence object
> > > > + * @job: job the fence is embeded in
> > > > * @flags: flags to pass into the subordinate .emit_fence() call
> > > > *
> > > > * Emits a fence command on the requested ring (all asics).
> > > > * Returns 0 on success, -ENOMEM on failure.
> > > > */
> > > > -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
> > > > +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, 
> > > > struct amdgpu_job *job,
> > > >   unsigned flags)
> > > >{
> > > > struct amdgpu_device *adev = ring->adev;
> > > > -   struct amdgpu_fence *fence;
> > > > +   struct dma_fence *fence;
> > > > +   struct amdgpu_fence *am_fence;
> > > > struct dma_fence __rcu **ptr;
> > > > uint32_t seq;
> > > > int r;
> > > > -   fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
> > > > -   if (fence == NULL)
> > > > -   return -ENOMEM;
> > > > +   if (job == NULL) {
> > > > +   /* create a sperate hw fence */
> > > > +   am_fence = kmem_cache_alloc(amdgpu_fence_slab, 
> > > > GFP_ATOMIC);
> > > > +   if (am_fence == NULL)
> > > > +   return -ENOMEM;
> > > > +   fence = _fence->base;
> > > > +   am_fence->ring = ring;
> > > > +   } else {
> > > > +   /* take use of job-embedded fence */
> > > > +   fence = >hw_fence;
> > > > +   job->ring = ring;
> > > 
> > > If you 

Re: [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job

2021-08-06 Thread Jingwen Chen
On Thu Aug 05, 2021 at 05:13:22PM -0400, Andrey Grodzovsky wrote:
> 
> On 2021-08-05 4:31 a.m., Jingwen Chen wrote:
> > From: Jack Zhang 
> > 
> > Why: Previously hw fence is alloced separately with job.
> > It caused historical lifetime issues and corner cases.
> > The ideal situation is to take fence to manage both job
> > and fence's lifetime, and simplify the design of gpu-scheduler.
> > 
> > How:
> > We propose to embed hw_fence into amdgpu_job.
> > 1. We cover the normal job submission by this method.
> > 2. For ib_test, and submit without a parent job keep the
> > legacy way to create a hw fence separately.
> > v2:
> > use AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT to show that the fence is
> > embeded in a job.
> > 
> > Signed-off-by: Jingwen Chen 
> > Signed-off-by: Jack Zhang 
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  1 -
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   | 63 -
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  |  2 +-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 35 
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  4 +-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h|  5 +-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |  2 +-
> >   8 files changed, 84 insertions(+), 30 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > index 7b46ba551cb2..3003ee1c9487 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > @@ -714,7 +714,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum 
> > kgd_engine_type engine,
> > ret = dma_fence_wait(f, false);
> >   err_ib_sched:
> > -   dma_fence_put(f);
> > amdgpu_job_free(job);
> >   err:
> > return ret;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > index 536005bff24a..277128846dd1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > @@ -1414,7 +1414,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct 
> > amdgpu_ring *ring)
> > continue;
> > }
> > job = to_amdgpu_job(s_job);
> > -   if (preempted && job->fence == fence)
> > +   if (preempted && (>hw_fence) == fence)
> > /* mark the job as preempted */
> > job->preemption_status |= AMDGPU_IB_PREEMPTED;
> > }
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > index 7495911516c2..5e29d797a265 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> > @@ -129,30 +129,46 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
> >*
> >* @ring: ring the fence is associated with
> >* @f: resulting fence object
> > + * @job: job the fence is embeded in
> >* @flags: flags to pass into the subordinate .emit_fence() call
> >*
> >* Emits a fence command on the requested ring (all asics).
> >* Returns 0 on success, -ENOMEM on failure.
> >*/
> > -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
> > +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, 
> > struct amdgpu_job *job,
> >   unsigned flags)
> >   {
> > struct amdgpu_device *adev = ring->adev;
> > -   struct amdgpu_fence *fence;
> > +   struct dma_fence *fence;
> > +   struct amdgpu_fence *am_fence;
> > struct dma_fence __rcu **ptr;
> > uint32_t seq;
> > int r;
> > -   fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
> > -   if (fence == NULL)
> > -   return -ENOMEM;
> > +   if (job == NULL) {
> > +   /* create a sperate hw fence */
> > +   am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
> > +   if (am_fence == NULL)
> > +   return -ENOMEM;
> > +   fence = _fence->base;
> > +   am_fence->ring = ring;
> > +   } else {
> > +   /* take use of job-embedded fence */
> > +   fence = >hw_fence;
> > +   job->ring = ring;
> 
> 
> If you would make hw_fence of type amdgpu_fence
> you could probably avoid the special job->ring = ring
> See more in related comment at the bottom
> 
Hi Andry,

I'm only make the amdgpu_fence for the fence without job parameter
provided to amdgpu_fence_emit. For embeded fence which is the hw_fence
in amdgpu_job, it will be allocated along with amdgpu_job as dma_fence.

Regards,
Jingwen Chen
> 
> > +   }
> > seq = ++ring->fence_drv.sync_seq;
> > -   fence->ring = ring;
> > -   dma_fence_init(>base, _fence_ops,
> > +   dma_fence_init(fence, _fence_ops,
> >>fence_drv.lock,
> >adev->fence_context + ring->idx,
> >seq);
> > +
> > +   if 

Re: [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job

2021-08-06 Thread Christian König




Am 06.08.21 um 07:52 schrieb Jingwen Chen:

On Thu Aug 05, 2021 at 05:13:22PM -0400, Andrey Grodzovsky wrote:

On 2021-08-05 4:31 a.m., Jingwen Chen wrote:

From: Jack Zhang 

Why: Previously hw fence is alloced separately with job.
It caused historical lifetime issues and corner cases.
The ideal situation is to take fence to manage both job
and fence's lifetime, and simplify the design of gpu-scheduler.

How:
We propose to embed hw_fence into amdgpu_job.
1. We cover the normal job submission by this method.
2. For ib_test, and submit without a parent job keep the
legacy way to create a hw fence separately.
v2:
use AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT to show that the fence is
embeded in a job.

Signed-off-by: Jingwen Chen 
Signed-off-by: Jack Zhang 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  1 -
   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   | 63 -
   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  |  2 +-
   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 35 
   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  4 +-
   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h|  5 +-
   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |  2 +-
   8 files changed, 84 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7b46ba551cb2..3003ee1c9487 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -714,7 +714,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum 
kgd_engine_type engine,
ret = dma_fence_wait(f, false);
   err_ib_sched:
-   dma_fence_put(f);
amdgpu_job_free(job);
   err:
return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 536005bff24a..277128846dd1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1414,7 +1414,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct 
amdgpu_ring *ring)
continue;
}
job = to_amdgpu_job(s_job);
-   if (preempted && job->fence == fence)
+   if (preempted && (>hw_fence) == fence)
/* mark the job as preempted */
job->preemption_status |= AMDGPU_IB_PREEMPTED;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 7495911516c2..5e29d797a265 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -129,30 +129,46 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
*
* @ring: ring the fence is associated with
* @f: resulting fence object
+ * @job: job the fence is embeded in
* @flags: flags to pass into the subordinate .emit_fence() call
*
* Emits a fence command on the requested ring (all asics).
* Returns 0 on success, -ENOMEM on failure.
*/
-int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
+int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct 
amdgpu_job *job,
  unsigned flags)
   {
struct amdgpu_device *adev = ring->adev;
-   struct amdgpu_fence *fence;
+   struct dma_fence *fence;
+   struct amdgpu_fence *am_fence;
struct dma_fence __rcu **ptr;
uint32_t seq;
int r;
-   fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
-   if (fence == NULL)
-   return -ENOMEM;
+   if (job == NULL) {
+   /* create a sperate hw fence */
+   am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
+   if (am_fence == NULL)
+   return -ENOMEM;
+   fence = _fence->base;
+   am_fence->ring = ring;
+   } else {
+   /* take use of job-embedded fence */
+   fence = >hw_fence;
+   job->ring = ring;


If you would make hw_fence of type amdgpu_fence
you could probably avoid the special job->ring = ring
See more in related comment at the bottom


Hi Andry,

I'm only make the amdgpu_fence for the fence without job parameter
provided to amdgpu_fence_emit. For embeded fence which is the hw_fence
in amdgpu_job, it will be allocated along with amdgpu_job as dma_fence.


When you have the job and need the ring you can just do 
conatiner_of(job->sched, struct amdgpu_ring, sched).


No need for an extra ring pointer here.

Regards,
Christian.



Regards,
Jingwen Chen

+   }
seq = ++ring->fence_drv.sync_seq;
-   fence->ring = ring;
-   dma_fence_init(>base, _fence_ops,
+   dma_fence_init(fence, _fence_ops,
   >fence_drv.lock,
   adev->fence_context + ring->idx,
   seq);
+
+   if (job != NULL) {
+   /* 

Re: [PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job

2021-08-05 Thread Andrey Grodzovsky



On 2021-08-05 4:31 a.m., Jingwen Chen wrote:

From: Jack Zhang 

Why: Previously hw fence is alloced separately with job.
It caused historical lifetime issues and corner cases.
The ideal situation is to take fence to manage both job
and fence's lifetime, and simplify the design of gpu-scheduler.

How:
We propose to embed hw_fence into amdgpu_job.
1. We cover the normal job submission by this method.
2. For ib_test, and submit without a parent job keep the
legacy way to create a hw fence separately.
v2:
use AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT to show that the fence is
embeded in a job.

Signed-off-by: Jingwen Chen 
Signed-off-by: Jack Zhang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  1 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   | 63 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 35 
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  4 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h|  5 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |  2 +-
  8 files changed, 84 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7b46ba551cb2..3003ee1c9487 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -714,7 +714,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum 
kgd_engine_type engine,
ret = dma_fence_wait(f, false);
  
  err_ib_sched:

-   dma_fence_put(f);
amdgpu_job_free(job);
  err:
return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 536005bff24a..277128846dd1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1414,7 +1414,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct 
amdgpu_ring *ring)
continue;
}
job = to_amdgpu_job(s_job);
-   if (preempted && job->fence == fence)
+   if (preempted && (>hw_fence) == fence)
/* mark the job as preempted */
job->preemption_status |= AMDGPU_IB_PREEMPTED;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 7495911516c2..5e29d797a265 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -129,30 +129,46 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
   *
   * @ring: ring the fence is associated with
   * @f: resulting fence object
+ * @job: job the fence is embeded in
   * @flags: flags to pass into the subordinate .emit_fence() call
   *
   * Emits a fence command on the requested ring (all asics).
   * Returns 0 on success, -ENOMEM on failure.
   */
-int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
+int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct 
amdgpu_job *job,
  unsigned flags)
  {
struct amdgpu_device *adev = ring->adev;
-   struct amdgpu_fence *fence;
+   struct dma_fence *fence;
+   struct amdgpu_fence *am_fence;
struct dma_fence __rcu **ptr;
uint32_t seq;
int r;
  
-	fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);

-   if (fence == NULL)
-   return -ENOMEM;
+   if (job == NULL) {
+   /* create a sperate hw fence */
+   am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
+   if (am_fence == NULL)
+   return -ENOMEM;
+   fence = _fence->base;
+   am_fence->ring = ring;
+   } else {
+   /* take use of job-embedded fence */
+   fence = >hw_fence;
+   job->ring = ring;



If you would make hw_fence of type amdgpu_fence
you could probably avoid the special job->ring = ring
See more in related comment at the bottom



+   }
  
  	seq = ++ring->fence_drv.sync_seq;

-   fence->ring = ring;
-   dma_fence_init(>base, _fence_ops,
+   dma_fence_init(fence, _fence_ops,
   >fence_drv.lock,
   adev->fence_context + ring->idx,
   seq);
+
+   if (job != NULL) {
+   /* mark this fence has a parent job */
+   set_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, >flags);
+   }
+
amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
   seq, flags | AMDGPU_FENCE_FLAG_INT);
pm_runtime_get_noresume(adev_to_drm(adev)->dev);
@@ -175,9 +191,9 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct 
dma_fence **f,
/* This function can't be called concurrently anyway, otherwise
 * emitting the fence would mess up the hardware ring buffer.
 */

[PATCHv2 1/2] drm/amd/amdgpu embed hw_fence into amdgpu_job

2021-08-05 Thread Jingwen Chen
From: Jack Zhang 

Why: Previously hw fence is alloced separately with job.
It caused historical lifetime issues and corner cases.
The ideal situation is to take fence to manage both job
and fence's lifetime, and simplify the design of gpu-scheduler.

How:
We propose to embed hw_fence into amdgpu_job.
1. We cover the normal job submission by this method.
2. For ib_test, and submit without a parent job keep the
legacy way to create a hw fence separately.
v2:
use AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT to show that the fence is
embeded in a job.

Signed-off-by: Jingwen Chen 
Signed-off-by: Jack Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c  |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   | 63 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 35 
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h|  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |  2 +-
 8 files changed, 84 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7b46ba551cb2..3003ee1c9487 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -714,7 +714,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum 
kgd_engine_type engine,
ret = dma_fence_wait(f, false);
 
 err_ib_sched:
-   dma_fence_put(f);
amdgpu_job_free(job);
 err:
return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 536005bff24a..277128846dd1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1414,7 +1414,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct 
amdgpu_ring *ring)
continue;
}
job = to_amdgpu_job(s_job);
-   if (preempted && job->fence == fence)
+   if (preempted && (>hw_fence) == fence)
/* mark the job as preempted */
job->preemption_status |= AMDGPU_IB_PREEMPTED;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 7495911516c2..5e29d797a265 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -129,30 +129,46 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
  *
  * @ring: ring the fence is associated with
  * @f: resulting fence object
+ * @job: job the fence is embeded in
  * @flags: flags to pass into the subordinate .emit_fence() call
  *
  * Emits a fence command on the requested ring (all asics).
  * Returns 0 on success, -ENOMEM on failure.
  */
-int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
+int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct 
amdgpu_job *job,
  unsigned flags)
 {
struct amdgpu_device *adev = ring->adev;
-   struct amdgpu_fence *fence;
+   struct dma_fence *fence;
+   struct amdgpu_fence *am_fence;
struct dma_fence __rcu **ptr;
uint32_t seq;
int r;
 
-   fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
-   if (fence == NULL)
-   return -ENOMEM;
+   if (job == NULL) {
+   /* create a sperate hw fence */
+   am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
+   if (am_fence == NULL)
+   return -ENOMEM;
+   fence = _fence->base;
+   am_fence->ring = ring;
+   } else {
+   /* take use of job-embedded fence */
+   fence = >hw_fence;
+   job->ring = ring;
+   }
 
seq = ++ring->fence_drv.sync_seq;
-   fence->ring = ring;
-   dma_fence_init(>base, _fence_ops,
+   dma_fence_init(fence, _fence_ops,
   >fence_drv.lock,
   adev->fence_context + ring->idx,
   seq);
+
+   if (job != NULL) {
+   /* mark this fence has a parent job */
+   set_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, >flags);
+   }
+
amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
   seq, flags | AMDGPU_FENCE_FLAG_INT);
pm_runtime_get_noresume(adev_to_drm(adev)->dev);
@@ -175,9 +191,9 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct 
dma_fence **f,
/* This function can't be called concurrently anyway, otherwise
 * emitting the fence would mess up the hardware ring buffer.
 */
-   rcu_assign_pointer(*ptr, dma_fence_get(>base));
+   rcu_assign_pointer(*ptr, dma_fence_get(fence));
 
-   *f = >base;
+   *f = fence;
 
return 0;
 }
@@ -621,8 +637,16 @@ static const