Re: [PATCH 10/10] drm/amdgpu: add gang submit frontend v2

2022-07-14 Thread Andrey Grodzovsky

Acked-by: Andrey Grodzovsky 

Andrey

On 2022-07-14 06:39, Christian König wrote:

Allows submitting jobs as a gang which needs to run on multiple engines at the
same time.

All members of the gang get the same implicit, explicit and VM dependencies. So
no gang member will start running until everything else is ready.

The last job is considered the gang leader (usually a submission to the GFX
ring) and used for signaling output dependencies.

Each job is remembered individually as a user of a buffer object, so there is no
joining of work at the end.
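Purely for illustration, and not part of the patch: with this in place a gang is
formed by a single CS ioctl whose IB chunks target more than one IP. A minimal
userspace sketch of that shape, assuming placeholder context/BO-list handles and
GPU virtual addresses, might look like this:

/* Illustrative sketch only; handles, VAs and the header path are
 * placeholders, and all error handling is omitted. */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/amdgpu_drm.h>

static int submit_gang(int fd, uint32_t ctx_id, uint32_t bo_list,
		       uint64_t comp_va, uint32_t comp_bytes,
		       uint64_t gfx_va, uint32_t gfx_bytes)
{
	/* Two IBs on different IPs -> a gang of two jobs; the job of the
	 * last distinct IP (GFX here) becomes the gang leader. */
	struct drm_amdgpu_cs_chunk_ib ibs[2] = {
		{ .va_start = comp_va, .ib_bytes = comp_bytes,
		  .ip_type = AMDGPU_HW_IP_COMPUTE },
		{ .va_start = gfx_va, .ib_bytes = gfx_bytes,
		  .ip_type = AMDGPU_HW_IP_GFX },
	};
	struct drm_amdgpu_cs_chunk chunks[2];
	uint64_t chunk_ptrs[2];
	union drm_amdgpu_cs cs;
	int i;

	for (i = 0; i < 2; ++i) {
		chunks[i].chunk_id = AMDGPU_CHUNK_ID_IB;
		chunks[i].length_dw = sizeof(ibs[i]) / 4;
		chunks[i].chunk_data = (uintptr_t)&ibs[i];
		chunk_ptrs[i] = (uintptr_t)&chunks[i];
	}

	memset(&cs, 0, sizeof(cs));
	cs.in.ctx_id = ctx_id;
	cs.in.bo_list_handle = bo_list;
	cs.in.num_chunks = 2;
	cs.in.chunks = (uintptr_t)chunk_ptrs;

	return ioctl(fd, DRM_IOCTL_AMDGPU_CS, &cs);
}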

v2: rebase and fix review comments from Andrey and Yogesh

Signed-off-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c| 256 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h|  10 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h |  12 +-
  3 files changed, 183 insertions(+), 95 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 88f491dc7ca2..e1c41db20efb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -69,6 +69,7 @@ static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
 			   unsigned int *num_ibs)
 {
 	struct drm_sched_entity *entity;
+	unsigned int i;
 	int r;
 
 	r = amdgpu_ctx_get_entity(p->ctx, chunk_ib->ip_type,
@@ -77,17 +78,28 @@ static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
 	if (r)
 		return r;
 
-	/* Abort if there is no run queue associated with this entity.
-	 * Possibly because of disabled HW IP*/
+	/*
+	 * Abort if there is no run queue associated with this entity.
+	 * Possibly because of disabled HW IP.
+	 */
 	if (entity->rq == NULL)
 		return -EINVAL;
 
-	/* Currently we don't support submitting to multiple entities */
-	if (p->entity && p->entity != entity)
+	/* Check if we can add this IB to some existing job */
+	for (i = 0; i < p->gang_size; ++i) {
+		if (p->entities[i] == entity)
+			goto found;
+	}
+
+	/* If not increase the gang size if possible */
+	if (i == AMDGPU_CS_GANG_SIZE)
 		return -EINVAL;
 
-	p->entity = entity;
-	++(*num_ibs);
+	p->entities[i] = entity;
+	p->gang_size = i + 1;
+
+found:
+	++(num_ibs[i]);
 	return 0;
 }
 
@@ -161,11 +173,12 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 			   union drm_amdgpu_cs *cs)
 {
 	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
+	unsigned int num_ibs[AMDGPU_CS_GANG_SIZE] = { };
 	struct amdgpu_vm *vm = &fpriv->vm;
 	uint64_t *chunk_array_user;
 	uint64_t *chunk_array;
-	unsigned size, num_ibs = 0;
 	uint32_t uf_offset = 0;
+	unsigned int size;
 	int ret;
 	int i;
 
@@ -231,7 +244,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 			if (size < sizeof(struct drm_amdgpu_cs_chunk_ib))
 				goto free_partial_kdata;
 
-			ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, &num_ibs);
+			ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, num_ibs);
 			if (ret)
 				goto free_partial_kdata;
 			break;
@@ -268,21 +281,28 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 		}
 	}
 
-	ret = amdgpu_job_alloc(p->adev, num_ibs, &p->job, vm);
-	if (ret)
-		goto free_all_kdata;
+	if (!p->gang_size)
+		return -EINVAL;
 
-	ret = drm_sched_job_init(&p->job->base, p->entity, &fpriv->vm);
-	if (ret)
-		goto free_all_kdata;
+	for (i = 0; i < p->gang_size; ++i) {
+		ret = amdgpu_job_alloc(p->adev, num_ibs[i], &p->jobs[i], vm);
+		if (ret)
+			goto free_all_kdata;
+
+		ret = drm_sched_job_init(&p->jobs[i]->base, p->entities[i],
+					 &fpriv->vm);
+		if (ret)
+			goto free_all_kdata;
+	}
+	p->gang_leader = p->jobs[p->gang_size - 1];
 
-	if (p->ctx->vram_lost_counter != p->job->vram_lost_counter) {
+	if (p->ctx->vram_lost_counter != p->gang_leader->vram_lost_counter) {
 		ret = -ECANCELED;
 		goto free_all_kdata;
 	}
 
 	if (p->uf_entry.tv.bo)
-		p->job->uf_addr = uf_offset;
+		p->gang_leader->uf_addr = uf_offset;
 	kvfree(chunk_array);
 
 	/* Use this opportunity to fill in task info for the vm */
@@ -304,22 +324,18 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 	return ret;
 }
 
-static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
-			   struct amdgpu_cs_chunk *chunk,
-			   unsigned int *num_ibs,
-			   unsigned int *ce_preempt,
-
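The archived patch is cut off above. As a reading aid, here is a minimal
stand-alone model of the find-or-grow gang bookkeeping that amdgpu_cs_p1_ib
implements; a simplified sketch with entities reduced to opaque pointers, no
locking, and the gang cap value used only for illustration:

#include <stdio.h>

#define AMDGPU_CS_GANG_SIZE 4	/* illustrative cap, mirrors the driver */

struct parser_model {
	const void *entities[AMDGPU_CS_GANG_SIZE];
	unsigned int gang_size;
};

/* Returns the gang slot for this entity, growing the gang if needed,
 * or -1 when the gang is already full. */
static int gang_slot(struct parser_model *p, const void *entity,
		     unsigned int num_ibs[])
{
	unsigned int i;

	/* Check if we can add this IB to some existing gang member */
	for (i = 0; i < p->gang_size; ++i) {
		if (p->entities[i] == entity)
			goto found;
	}

	/* If not, grow the gang if possible */
	if (i == AMDGPU_CS_GANG_SIZE)
		return -1;

	p->entities[i] = entity;
	p->gang_size = i + 1;

found:
	++num_ibs[i];
	return i;
}

int main(void)
{
	static const int gfx, compute;	/* stand-ins for sched entities */
	unsigned int num_ibs[AMDGPU_CS_GANG_SIZE] = { 0 };
	struct parser_model p = { 0 };

	/* Two IBs on "gfx", one on "compute" -> a gang of two jobs */
	gang_slot(&p, &gfx, num_ibs);
	gang_slot(&p, &gfx, num_ibs);
	gang_slot(&p, &compute, num_ibs);

	printf("gang_size=%u gfx_ibs=%u compute_ibs=%u\n",
	       p.gang_size, num_ibs[0], num_ibs[1]);
	return 0;
}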

RE: [PATCH 10/10] drm/amdgpu: add gang submit frontend

2022-06-01 Thread Mohan Marimuthu, Yogesh
[Public]

Hi Christian,

No other comments. With p->jobs[i] fixed, the test case worked. I still have to
clean up the code and send it for review.
I also want to compare the time taken with and without gang submission and fail
the test case if the former is slower. I will do this later and send the test
case for review first.

Thank you,
Yogesh
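
For concreteness, such a timing check could take roughly the following shape;
this is only a sketch, and submit_gang()/submit_sequential() are hypothetical
helpers standing in for the real test code, which is not part of this thread:

#include <stdbool.h>
#include <time.h>

static double elapsed_s(void (*submit)(void))
{
	struct timespec t0, t1;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	submit();
	clock_gettime(CLOCK_MONOTONIC, &t1);
	return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
}

extern void submit_gang(void);		/* hypothetical helper */
extern void submit_sequential(void);	/* hypothetical helper */

/* Fail the test when the gang path is slower than submitting the
 * same work as independent jobs. */
static bool gang_not_slower(void)
{
	return elapsed_s(submit_gang) <= elapsed_s(submit_sequential);
}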

-Original Message-
From: Koenig, Christian  
Sent: Wednesday, June 1, 2022 5:42 PM
To: Mohan Marimuthu, Yogesh ; Christian König 
; amd-gfx@lists.freedesktop.org; Olsak, Marek 

Subject: Re: [PATCH 10/10] drm/amdgpu: add gang submit frontend



On 01.06.22 at 14:09, Mohan Marimuthu, Yogesh wrote:
> [SNIP]
> -	/* Make sure all BOs are remembered as writers */
> -	amdgpu_bo_list_for_each_entry(e, p->bo_list)
> +	list_for_each_entry(e, &p->validated, tv.head) {
> +
> +		/* Everybody except for the gang leader uses BOOKKEEP */
> +		for (i = 0; i < (p->gang_size - 1); ++i) {
> +			dma_resv_add_fence(e->tv.bo->base.resv,
> +					   &p->jobs[i]->base.s_fence->finished,
> +					   DMA_RESV_USAGE_BOOKKEEP);
> +		}
> +
> +		/* The gang leader is remembered as writer */
>  		e->tv.num_shared = 0;
> +	}
>
>
> p->jobs[i] = NULL is done in the previous loop. Now when running
> p->jobs[i]->base.s_fence->finished there is a NULL pointer crash.

Ah, yes good point. Going to fix that.

Any more comments on this? Did you finish the test cases?

Thanks,
Christian.


Re: [PATCH 10/10] drm/amdgpu: add gang submit frontend

2022-06-01 Thread Christian König




On 01.06.22 at 14:09, Mohan Marimuthu, Yogesh wrote:

[SNIP]
-	/* Make sure all BOs are remembered as writers */
-	amdgpu_bo_list_for_each_entry(e, p->bo_list)
+	list_for_each_entry(e, &p->validated, tv.head) {
+
+		/* Everybody except for the gang leader uses BOOKKEEP */
+		for (i = 0; i < (p->gang_size - 1); ++i) {
+			dma_resv_add_fence(e->tv.bo->base.resv,
+					   &p->jobs[i]->base.s_fence->finished,
+					   DMA_RESV_USAGE_BOOKKEEP);
+		}
+
+		/* The gang leader is remembered as writer */
 		e->tv.num_shared = 0;
+	}


p->jobs[i] = NULL is done in the previous loop. Now when running
p->jobs[i]->base.s_fence->finished there is a NULL pointer crash.


Ah, yes good point. Going to fix that.

Any more comments on this? Did you finish the test cases?

Thanks,
Christian.
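
For readers following the bug above: the crash happens because the loop that
pushes the jobs to the scheduler clears the p->jobs[] slots, and the fence
bookkeeping afterwards then dereferences NULL. One possible shape for the fix,
a sketch only and not necessarily the change that was merged, is to publish the
scheduled fences while the job pointers are still valid:

	list_for_each_entry(e, &p->validated, tv.head) {
		/* Everybody except for the gang leader uses BOOKKEEP */
		for (i = 0; i < (p->gang_size - 1); ++i) {
			dma_resv_add_fence(e->tv.bo->base.resv,
					   &p->jobs[i]->base.s_fence->finished,
					   DMA_RESV_USAGE_BOOKKEEP);
		}

		/* The gang leader is remembered as writer */
		e->tv.num_shared = 0;
	}

	for (i = 0; i < p->gang_size; ++i) {
		/* Ownership moves to the scheduler; only now drop our pointer */
		drm_sched_entity_push_job(&p->jobs[i]->base);
		p->jobs[i] = NULL;
	}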



RE: [PATCH 10/10] drm/amdgpu: add gang submit frontend

2022-06-01 Thread Mohan Marimuthu, Yogesh
[AMD Official Use Only - General]



-Original Message-
From: amd-gfx  On Behalf Of Christian 
König
Sent: Thursday, March 3, 2022 1:53 PM
To: amd-gfx@lists.freedesktop.org; Olsak, Marek 
Cc: Koenig, Christian 
Subject: [PATCH 10/10] drm/amdgpu: add gang submit frontend

Allows submitting jobs as a gang which needs to run on multiple engines at the
same time.

All members of the gang get the same implicit, explicit and VM dependencies. So 
no gang member will start running until everything else is ready.

The last job is considered the gang leader (usually a submission to the GFX
ring) and used for signaling output dependencies.

Each job is remembered individually as a user of a buffer object, so there is no
joining of work at the end.

Signed-off-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c| 244 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h|   9 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h |  12 +-
 3 files changed, 173 insertions(+), 92 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index c6541f7b8f54..7429e64919fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -69,6 +69,7 @@ static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
 			   unsigned int *num_ibs)
 {
 	struct drm_sched_entity *entity;
+	unsigned int i;
 	int r;
 
 	r = amdgpu_ctx_get_entity(p->ctx, chunk_ib->ip_type,
@@ -83,11 +84,19 @@ static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
 		return -EINVAL;
 
 	/* Currently we don't support submitting to multiple entities */
-	if (p->entity && p->entity != entity)
+	for (i = 0; i < p->gang_size; ++i) {
+		if (p->entities[i] == entity)
+			goto found;
+	}
+
+	if (i == AMDGPU_CS_GANG_SIZE)
 		return -EINVAL;
 
-	p->entity = entity;
-	++(*num_ibs);
+	p->entities[i] = entity;
+	p->gang_size = i + 1;
+
+found:
+	++(num_ibs[i]);
 	return 0;
 }
 
@@ -161,11 +170,12 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 			   union drm_amdgpu_cs *cs)
 {
 	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
+	unsigned int num_ibs[AMDGPU_CS_GANG_SIZE] = { };
 	struct amdgpu_vm *vm = &fpriv->vm;
 	uint64_t *chunk_array_user;
 	uint64_t *chunk_array;
-	unsigned size, num_ibs = 0;
 	uint32_t uf_offset = 0;
+	unsigned int size;
 	int ret;
 	int i;
 
@@ -228,7 +238,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 			if (size < sizeof(struct drm_amdgpu_cs_chunk_ib))
 				goto free_partial_kdata;
 
-			ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, &num_ibs);
+			ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, num_ibs);
 			if (ret)
 				goto free_partial_kdata;
 			break;
@@ -265,21 +275,27 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 		}
 	}
 
-	ret = amdgpu_job_alloc(p->adev, num_ibs, &p->job, vm);
-	if (ret)
-		goto free_all_kdata;
+	if (!p->gang_size)
+		return -EINVAL;
 
-	ret = drm_sched_job_init(&p->job->base, p->entity, &fpriv->vm);
-	if (ret)
-		goto free_all_kdata;
+	for (i = 0; i < p->gang_size; ++i) {
+		ret = amdgpu_job_alloc(p->adev, num_ibs[i], &p->jobs[i], vm);
+		if (ret)
+			goto free_all_kdata;
+
+		ret = drm_sched_job_init(&p->jobs[i]->base, p->entities[i],
+					 &fpriv->vm);
+		if (ret)
+			goto free_all_kdata;
+	}
 
-	if (p->ctx->vram_lost_counter != p->job->vram_lost_counter) {
+	if (p->ctx->vram_lost_counter != p->jobs[0]->vram_lost_counter) {
 		ret = -ECANCELED;
 		goto free_all_kdata;
 	}
 
 	if (p->uf_entry.tv.bo)
-		p->job->uf_addr = uf_offset;
+		p->jobs[p->gang_size - 1]->uf_addr = uf_offset;
 	kvfree(chunk_array);
 
 	/* Use this opportunity to fill in task info for the vm */
@@ -301,22 +317,18 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 	return ret;
 }
 
-static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
-			   struct amdgpu_cs_chunk *chunk,
-			   unsigned int *num_ibs,
-			   unsigned int *ce_preempt,
-			   unsigned int *de_preempt)
+static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p, struct amdgpu_job *job,
+			   struct amdgpu_ib *ib, struct amdgpu_cs_chunk *chunk,
+			   unsigned int *ce_preempt, unsigned int *de_preempt)
 {
-	struct amdgpu_ring *ring = 

Re: [PATCH 10/10] drm/amdgpu: add gang submit frontend

2022-03-07 Thread Andrey Grodzovsky



On 2022-03-03 03:23, Christian König wrote:

Allows submitting jobs as a gang which needs to run on multiple engines at the
same time.

All members of the gang get the same implicit, explicit and VM dependencies. So
no gang member will start running until everything else is ready.

The last job is considered the gang leader (usually a submission to the GFX
ring) and used for signaling output dependencies.

Each job is remembered individually as a user of a buffer object, so there is no
joining of work at the end.

Signed-off-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c| 244 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h|   9 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h |  12 +-
  3 files changed, 173 insertions(+), 92 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index c6541f7b8f54..7429e64919fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -69,6 +69,7 @@ static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
 			   unsigned int *num_ibs)
 {
 	struct drm_sched_entity *entity;
+	unsigned int i;
 	int r;
 
 	r = amdgpu_ctx_get_entity(p->ctx, chunk_ib->ip_type,
@@ -83,11 +84,19 @@ static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
 		return -EINVAL;
 
 	/* Currently we don't support submitting to multiple entities */
-	if (p->entity && p->entity != entity)
+	for (i = 0; i < p->gang_size; ++i) {
+		if (p->entities[i] == entity)
+			goto found;
+	}
+
+	if (i == AMDGPU_CS_GANG_SIZE)
 		return -EINVAL;
 
-	p->entity = entity;
-	++(*num_ibs);
+	p->entities[i] = entity;
+	p->gang_size = i + 1;
+
+found:
+	++(num_ibs[i]);
 	return 0;
 }
 
@@ -161,11 +170,12 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 			   union drm_amdgpu_cs *cs)
 {
 	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
+	unsigned int num_ibs[AMDGPU_CS_GANG_SIZE] = { };
 	struct amdgpu_vm *vm = &fpriv->vm;
 	uint64_t *chunk_array_user;
 	uint64_t *chunk_array;
-	unsigned size, num_ibs = 0;
 	uint32_t uf_offset = 0;
+	unsigned int size;
 	int ret;
 	int i;
 
@@ -228,7 +238,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 			if (size < sizeof(struct drm_amdgpu_cs_chunk_ib))
 				goto free_partial_kdata;
 
-			ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, &num_ibs);
+			ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, num_ibs);
 			if (ret)
 				goto free_partial_kdata;
 			break;
@@ -265,21 +275,27 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 		}
 	}
 
-	ret = amdgpu_job_alloc(p->adev, num_ibs, &p->job, vm);
-	if (ret)
-		goto free_all_kdata;
+	if (!p->gang_size)
+		return -EINVAL;
 
-	ret = drm_sched_job_init(&p->job->base, p->entity, &fpriv->vm);
-	if (ret)
-		goto free_all_kdata;
+	for (i = 0; i < p->gang_size; ++i) {
+		ret = amdgpu_job_alloc(p->adev, num_ibs[i], &p->jobs[i], vm);
+		if (ret)
+			goto free_all_kdata;
+
+		ret = drm_sched_job_init(&p->jobs[i]->base, p->entities[i],
+					 &fpriv->vm);
+		if (ret)
+			goto free_all_kdata;
+	}
 
-	if (p->ctx->vram_lost_counter != p->job->vram_lost_counter) {
+	if (p->ctx->vram_lost_counter != p->jobs[0]->vram_lost_counter) {
 		ret = -ECANCELED;
 		goto free_all_kdata;
 	}
 
 	if (p->uf_entry.tv.bo)
-		p->job->uf_addr = uf_offset;
+		p->jobs[p->gang_size - 1]->uf_addr = uf_offset;



I would use some macro here for the index, or maybe even a getter
function or a macro that makes it explicit you are retrieving the gang
leader.

Maybe also something for the 'jobs[0]' above, which as I understand is
just used for retrieving data that is identical for each job in the
gang - but why not just use the leader for all such retrievals?
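
Something like this, purely as a sketch of the suggested accessor (v2 of
the patch, archived above, instead caches the pointer once as
p->gang_leader = p->jobs[p->gang_size - 1]):

	/* Hypothetical helper, name illustrative only */
	static inline struct amdgpu_job *
	amdgpu_cs_gang_leader(struct amdgpu_cs_parser *p)
	{
		return p->jobs[p->gang_size - 1];
	}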

Andrey



 	kvfree(chunk_array);
 
 	/* Use this opportunity to fill in task info for the vm */
@@ -301,22 +317,18 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 	return ret;
 }
 
-static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
-			   struct amdgpu_cs_chunk *chunk,
-			   unsigned int *num_ibs,
-			   unsigned int *ce_preempt,
-			   unsigned int *de_preempt)
+static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p, struct amdgpu_job *job,
+			   struct amdgpu_ib *ib, struct amdgpu_cs_chunk *chunk,
+