Re: [Intel-gfx] [PATCH] drm/i915: avoid concurrent writes to aux_inv

2022-03-29 Thread Tvrtko Ursulin



On 28/03/2022 18:16, fei.y...@intel.com wrote:

From: Fei Yang 

GPU hangs have been observed when multiple engines write to the
same aux_inv register at the same time. To avoid this each engine
should only invalidate its own auxiliary table. The function
gen12_emit_flush_xcs() currently invalidate the auxiliary table for
all engines because the rq->engine is not necessarily the engine
eventually carrying out the request, and potentially the engine
could even be a virtual one (with engine->instance being -1).
With the MMIO remap feature, we can actually set bit 17 of MI_LRI
instruction and let the hardware to figure out the local aux_inv
register at runtime to avoid invalidating auxiliary table for all
engines.

Bspec: 45728

v2: Invalidate AUX table for indirect context as well.

Cc: Stuart Summers 
Cc: Tvrtko Ursulin 
Signed-off-by: Chris Wilson 
Signed-off-by: Fei Yang 


Reviewed-by: Tvrtko Ursulin 

Regards,

Tvrtko


---
  drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 50 +---
  drivers/gpu/drm/i915/gt/gen8_engine_cs.h |  4 +-
  drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  1 +
  drivers/gpu/drm/i915/gt/intel_lrc.c  | 12 +
  4 files changed, 27 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c 
b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
index 36148887c699..047b5a710149 100644
--- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
@@ -6,7 +6,6 @@
  #include "gen8_engine_cs.h"
  #include "i915_drv.h"
  #include "intel_gpu_commands.h"
-#include "intel_gt_regs.h"
  #include "intel_lrc.h"
  #include "intel_ring.h"
  
@@ -165,33 +164,9 @@ static u32 preparser_disable(bool state)

return MI_ARB_CHECK | 1 << 8 | state;
  }
  
-static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)

+u32 *gen12_emit_aux_table_inv(u32 *cs, const i915_reg_t inv_reg)
  {
-   static const i915_reg_t vd[] = {
-   GEN12_VD0_AUX_NV,
-   GEN12_VD1_AUX_NV,
-   GEN12_VD2_AUX_NV,
-   GEN12_VD3_AUX_NV,
-   };
-
-   static const i915_reg_t ve[] = {
-   GEN12_VE0_AUX_NV,
-   GEN12_VE1_AUX_NV,
-   };
-
-   if (engine->class == VIDEO_DECODE_CLASS)
-   return vd[engine->instance];
-
-   if (engine->class == VIDEO_ENHANCEMENT_CLASS)
-   return ve[engine->instance];
-
-   GEM_BUG_ON("unknown aux_inv reg\n");
-   return INVALID_MMIO_REG;
-}
-
-static u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
-{
-   *cs++ = MI_LOAD_REGISTER_IMM(1);
+   *cs++ = MI_LOAD_REGISTER_IMM(1) | MI_LRI_MMIO_REMAP_EN;
*cs++ = i915_mmio_reg_offset(inv_reg);
*cs++ = AUX_INV;
*cs++ = MI_NOOP;
@@ -274,7 +249,7 @@ int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
  
  		if (!HAS_FLAT_CCS(rq->engine->i915)) {

/* hsdes: 1809175790 */
-   cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
+   cs = gen12_emit_aux_table_inv(cs, GEN12_GFX_CCS_AUX_NV);
}
  
  		*cs++ = preparser_disable(false);

@@ -293,10 +268,12 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 
mode)
if (mode & EMIT_INVALIDATE) {
cmd += 2;
  
-		if (!HAS_FLAT_CCS(rq->engine->i915)) {

+   if (!HAS_FLAT_CCS(rq->engine->i915) &&
+   (rq->engine->class == VIDEO_DECODE_CLASS ||
+rq->engine->class == VIDEO_ENHANCEMENT_CLASS)) {
aux_inv = rq->engine->mask & ~BIT(BCS0);
if (aux_inv)
-   cmd += 2 * hweight32(aux_inv) + 2;
+   cmd += 4;
}
}
  
@@ -329,15 +306,10 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)

*cs++ = 0; /* value */
  
  	if (aux_inv) { /* hsdes: 1809175790 */

-   struct intel_engine_cs *engine;
-   unsigned int tmp;
-
-   *cs++ = MI_LOAD_REGISTER_IMM(hweight32(aux_inv));
-   for_each_engine_masked(engine, rq->engine->gt, aux_inv, tmp) {
-   *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
-   *cs++ = AUX_INV;
-   }
-   *cs++ = MI_NOOP;
+   if (rq->engine->class == VIDEO_DECODE_CLASS)
+   cs = gen12_emit_aux_table_inv(cs, GEN12_VD0_AUX_NV);
+   else
+   cs = gen12_emit_aux_table_inv(cs, GEN12_VE0_AUX_NV);
}
  
  	if (mode & EMIT_INVALIDATE)

diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.h 
b/drivers/gpu/drm/i915/gt/gen8_engine_cs.h
index cc6e21d3662a..107ab42539ab 100644
--- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.h
+++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.h
@@ -10,7 +10,7 @@
  #include 
  
  #include "i915_gem.h" /* GEM_BUG_ON */

-
+#include "intel_gt_regs.h"
  #include "intel_gpu_

Re: [Intel-gfx] [PATCH] drm/i915: avoid concurrent writes to aux_inv

2022-03-28 Thread Yang, Fei
>> +u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
>> -static u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
>
> I think all helpers which emit to ring take cs as the first argument so it 
> would be good to make this consistent.

Updated the patch, please review rev10.
This helper function has been there for a long while, I was hesitant to change. 
But I agree cs should be the first argument. Since I removed the "static" 
anyway, so might just change the order all together.

>> @@ -329,15 +306,10 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 
>> mode)
>>  *cs++ = 0; /* value */
>>   
>>  if (aux_inv) { /* hsdes: 1809175790 */
>> -struct intel_engine_cs *engine;
>> -unsigned int tmp;
>> -
>> -*cs++ = MI_LOAD_REGISTER_IMM(hweight32(aux_inv));
>> -for_each_engine_masked(engine, rq->engine->gt, aux_inv, tmp) {
>> -*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
>> -*cs++ = AUX_INV;
>> -}
>> -*cs++ = MI_NOOP;
>> +if (rq->engine->class == VIDEO_DECODE_CLASS)
>> +cs = gen12_emit_aux_table_inv(GEN12_VD0_AUX_NV, cs);
>> +else
>> +cs = gen12_emit_aux_table_inv(GEN12_VE0_AUX_NV, cs);
>
> Not sure if, here and below, it would be worth to put register in a local and 
> then have a single function call - up to you.

I feel it's easier to check the code correctness in the *_rcs/*_xcs functions 
and leave the helper function as simple as possible.

>
> Apart from the cs re-order looks good to me.

If no other problems with rev10, would you please help push it upstream? I 
don't have the commit right, will need to find someone to help take it further.

Thanks,
-Fei

> Reviewed-by: Tvrtko Ursulin 



Re: [Intel-gfx] [PATCH] drm/i915: avoid concurrent writes to aux_inv

2022-03-28 Thread Tvrtko Ursulin



On 28/03/2022 04:16, fei.y...@intel.com wrote:

From: Fei Yang 

GPU hangs have been observed when multiple engines write to the
same aux_inv register at the same time. To avoid this each engine
should only invalidate its own auxiliary table. The function
gen12_emit_flush_xcs() currently invalidate the auxiliary table for
all engines because the rq->engine is not necessarily the engine
eventually carrying out the request, and potentially the engine
could even be a virtual one (with engine->instance being -1).
With the MMIO remap feature, we can actually set bit 17 of MI_LRI
instruction and let the hardware to figure out the local aux_inv
register at runtime to avoid invalidating auxiliary table for all
engines.

Bspec: 45728

v2: Invalidate AUX table for indirect context as well.

Cc: Stuart Summers 
Cc: Tvrtko Ursulin 
Signed-off-by: Chris Wilson 
Signed-off-by: Fei Yang 
---
  drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 48 
  drivers/gpu/drm/i915/gt/gen8_engine_cs.h |  4 +-
  drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  1 +
  drivers/gpu/drm/i915/gt/intel_lrc.c  | 12 +
  4 files changed, 26 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c 
b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
index 36148887c699..8178be083b42 100644
--- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
@@ -6,7 +6,6 @@
  #include "gen8_engine_cs.h"
  #include "i915_drv.h"
  #include "intel_gpu_commands.h"
-#include "intel_gt_regs.h"
  #include "intel_lrc.h"
  #include "intel_ring.h"
  
@@ -165,33 +164,9 @@ static u32 preparser_disable(bool state)

return MI_ARB_CHECK | 1 << 8 | state;
  }
  
-static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)

+u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)


I think all helpers which emit to ring take cs as the first argument so 
it would be good to make this consistent.



  {
-   static const i915_reg_t vd[] = {
-   GEN12_VD0_AUX_NV,
-   GEN12_VD1_AUX_NV,
-   GEN12_VD2_AUX_NV,
-   GEN12_VD3_AUX_NV,
-   };
-
-   static const i915_reg_t ve[] = {
-   GEN12_VE0_AUX_NV,
-   GEN12_VE1_AUX_NV,
-   };
-
-   if (engine->class == VIDEO_DECODE_CLASS)
-   return vd[engine->instance];
-
-   if (engine->class == VIDEO_ENHANCEMENT_CLASS)
-   return ve[engine->instance];
-
-   GEM_BUG_ON("unknown aux_inv reg\n");
-   return INVALID_MMIO_REG;
-}
-
-static u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
-{
-   *cs++ = MI_LOAD_REGISTER_IMM(1);
+   *cs++ = MI_LOAD_REGISTER_IMM(1) | MI_LRI_MMIO_REMAP_EN;
*cs++ = i915_mmio_reg_offset(inv_reg);
*cs++ = AUX_INV;
*cs++ = MI_NOOP;
@@ -293,10 +268,12 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 
mode)
if (mode & EMIT_INVALIDATE) {
cmd += 2;
  
-		if (!HAS_FLAT_CCS(rq->engine->i915)) {

+   if (!HAS_FLAT_CCS(rq->engine->i915) &&
+   (rq->engine->class == VIDEO_DECODE_CLASS ||
+rq->engine->class == VIDEO_ENHANCEMENT_CLASS)) {
aux_inv = rq->engine->mask & ~BIT(BCS0);
if (aux_inv)
-   cmd += 2 * hweight32(aux_inv) + 2;
+   cmd += 4;
}
}
  
@@ -329,15 +306,10 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)

*cs++ = 0; /* value */
  
  	if (aux_inv) { /* hsdes: 1809175790 */

-   struct intel_engine_cs *engine;
-   unsigned int tmp;
-
-   *cs++ = MI_LOAD_REGISTER_IMM(hweight32(aux_inv));
-   for_each_engine_masked(engine, rq->engine->gt, aux_inv, tmp) {
-   *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
-   *cs++ = AUX_INV;
-   }
-   *cs++ = MI_NOOP;
+   if (rq->engine->class == VIDEO_DECODE_CLASS)
+   cs = gen12_emit_aux_table_inv(GEN12_VD0_AUX_NV, cs);
+   else
+   cs = gen12_emit_aux_table_inv(GEN12_VE0_AUX_NV, cs);


Not sure if, here and below, it would be worth to put register in a 
local and then have a single function call - up to you.



}
  
  	if (mode & EMIT_INVALIDATE)

diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.h 
b/drivers/gpu/drm/i915/gt/gen8_engine_cs.h
index cc6e21d3662a..94f589e73d10 100644
--- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.h
+++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.h
@@ -10,7 +10,7 @@
  #include 
  
  #include "i915_gem.h" /* GEM_BUG_ON */

-
+#include "intel_gt_regs.h"
  #include "intel_gpu_commands.h"
  
  struct i915_request;

@@ -38,6 +38,8 @@ u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *rq, 
u32 *cs);
  u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs

Re: [Intel-gfx] [PATCH] drm/i915: avoid concurrent writes to aux_inv

2022-03-21 Thread Tvrtko Ursulin



On 18/03/2022 18:08, fei.y...@intel.com wrote:

From: Fei Yang 

GPU hangs have been observed when multiple engines write to the
same aux_inv register at the same time. To avoid this each engine
should only invalidate its own auxiliary table. The function
gen12_emit_flush_xcs() currently invalidate the auxiliary table for
all engines because the rq->engine is not necessarily the engine
eventually carrying out the request, and potentially the engine
could even be a virtual one (with engine->instance being -1).
With the MMIO remap feature, we can actually set bit 17 of MI_LRI
instruction and let the hardware to figure out the local aux_inv
register at runtime to avoid invalidating auxiliary table for all
engines.

Bspec: 45728

Cc: Stuart Summers 
Cc: Tvrtko Ursulin 
Signed-off-by: Chris Wilson 
Signed-off-by: Fei Yang 
---
  drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 44 +---
  drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  1 +
  2 files changed, 11 insertions(+), 34 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c 
b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
index 36148887c699..6e83ac06aaf6 100644
--- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
@@ -165,30 +165,6 @@ static u32 preparser_disable(bool state)
return MI_ARB_CHECK | 1 << 8 | state;
  }
  
-static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)

-{
-   static const i915_reg_t vd[] = {
-   GEN12_VD0_AUX_NV,
-   GEN12_VD1_AUX_NV,
-   GEN12_VD2_AUX_NV,
-   GEN12_VD3_AUX_NV,
-   };
-
-   static const i915_reg_t ve[] = {
-   GEN12_VE0_AUX_NV,
-   GEN12_VE1_AUX_NV,
-   };
-
-   if (engine->class == VIDEO_DECODE_CLASS)
-   return vd[engine->instance];
-
-   if (engine->class == VIDEO_ENHANCEMENT_CLASS)
-   return ve[engine->instance];
-
-   GEM_BUG_ON("unknown aux_inv reg\n");
-   return INVALID_MMIO_REG;
-}
-
  static u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
  {
*cs++ = MI_LOAD_REGISTER_IMM(1);
@@ -293,10 +269,12 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 
mode)
if (mode & EMIT_INVALIDATE) {
cmd += 2;
  
-		if (!HAS_FLAT_CCS(rq->engine->i915)) {

+   if (!HAS_FLAT_CCS(rq->engine->i915) &&
+   (rq->engine->class == VIDEO_DECODE_CLASS ||
+rq->engine->class == VIDEO_ENHANCEMENT_CLASS)) {
aux_inv = rq->engine->mask & ~BIT(BCS0);
if (aux_inv)
-   cmd += 2 * hweight32(aux_inv) + 2;
+   cmd += 4;
}
}
  
@@ -329,14 +307,12 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)

*cs++ = 0; /* value */
  
  	if (aux_inv) { /* hsdes: 1809175790 */

-   struct intel_engine_cs *engine;
-   unsigned int tmp;
-
-   *cs++ = MI_LOAD_REGISTER_IMM(hweight32(aux_inv));
-   for_each_engine_masked(engine, rq->engine->gt, aux_inv, tmp) {
-   *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
-   *cs++ = AUX_INV;
-   }
+   *cs++ = MI_LOAD_REGISTER_IMM(1) | MI_LRI_MMIO_REMAP_EN;
+   if (rq->engine->class == VIDEO_DECODE_CLASS)
+   *cs++ = i915_mmio_reg_offset(GEN12_VD0_AUX_NV);
+   else
+   *cs++ = i915_mmio_reg_offset(GEN12_VE0_AUX_NV);
+   *cs++ = AUX_INV;
*cs++ = MI_NOOP;
}
  
diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h

index d112ffd56418..4243be030bc1 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -144,6 +144,7 @@
  #define MI_LOAD_REGISTER_IMM(x)   MI_INSTR(0x22, 2*(x)-1)
  /* Gen11+. addr = base + (ctx_restore ? offset & GENMASK(12,2) : offset) */
  #define   MI_LRI_LRM_CS_MMIO  REG_BIT(19)
+#define   MI_LRI_MMIO_REMAP_EN REG_BIT(17)
  #define   MI_LRI_FORCE_POSTED (1<<12)
  #define MI_LOAD_REGISTER_IMM_MAX_REGS (126)
  #define MI_STORE_REGISTER_MEMMI_INSTR(0x24, 1)


LGTM.

Reviewed-by: Tvrtko Ursulin 

Affects igpus from TGL onwards? If so:

Cc: sta...@vger.kernel.org # v5.7+

?

Backporting might end up fun..

Regards,

Tvrtko


Re: [Intel-gfx] [PATCH] drm/i915: avoid concurrent writes to aux_inv

2022-03-18 Thread Yang, Fei
>>   static u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
>>   {
>>  *cs++ = MI_LOAD_REGISTER_IMM(1);
>> @@ -296,7 +272,7 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 
>> mode)
>>  if (!HAS_FLAT_CCS(rq->engine->i915)) {
>>  aux_inv = rq->engine->mask & ~BIT(BCS0);
>>  if (aux_inv)
>> -cmd += 2 * hweight32(aux_inv) + 2;
>> +cmd += 4;
>>  }
>>  }
>>   
>> @@ -329,14 +305,16 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 
>> mode)
>>  *cs++ = 0; /* value */
>>   
>>  if (aux_inv) { /* hsdes: 1809175790 */
>> -struct intel_engine_cs *engine;
>> -unsigned int tmp;
>> -
>> -*cs++ = MI_LOAD_REGISTER_IMM(hweight32(aux_inv));
>> -for_each_engine_masked(engine, rq->engine->gt, aux_inv, tmp) {
>> -*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
>> -*cs++ = AUX_INV;
>> +*cs++ = MI_LOAD_REGISTER_IMM(1) | MI_LRI_MMIO_REMAP_EN;
> 
> Cool, I didn't know this exists. First Bspec link I found did not mention 
> these register, but second did.
> That one however (29545) has a worrying "removed by" tag which seems to point 
> to a story suggesting the
> remapping table might be gone on machines with flat ccs?! Could you double 
> check please?

The variable aux_inv is set only if (!HAS_FLAT_CCS(rq->engine->i915)).

>> +if (rq->engine->class == VIDEO_DECODE_CLASS) {
>> +*cs++ = i915_mmio_reg_offset(GEN12_VD0_AUX_NV);
>> +} else if (rq->engine->class == VIDEO_ENHANCEMENT_CLASS) {
>> +*cs++ = i915_mmio_reg_offset(GEN12_VE0_AUX_NV);
>> +} else {
>> +GEM_BUG_ON("unknown aux_inv reg\n");
>> +*cs++ = i915_mmio_reg_offset(INVALID_MMIO_REG);
>
> I'd consider not emitting the LRI if we don't know what to put in unless 
> there is some hidden point to do it?

That's true. I was following the original logic flow here. I think it would be 
better to check for engine class before setting the variable aux_inv.

>
>>  }
>> +*cs++ = AUX_INV;
>>  *cs++ = MI_NOOP;
>>  }
>>   
>> diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h 
>> b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>> index d112ffd56418..2d150eec5c65 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>> @@ -144,6 +144,7 @@
>>   #define MI_LOAD_REGISTER_IMM(x)MI_INSTR(0x22, 2*(x)-1)
>>   /* Gen11+. addr = base + (ctx_restore ? offset & GENMASK(12,2) : offset) */
>>   #define   MI_LRI_LRM_CS_MMIO   REG_BIT(19)
>> +#define   MI_LRI_MMIO_REMAP_EN  (1 << 17)
>>   #define   MI_LRI_FORCE_POSTED  (1<<12)
>
> Passing observation - three bits, three flavours of expressing them, sigh...
Haha, REG_BIT(17) it is. The other one causes a CHECK:SPACING, but don't want 
to touch that in this patch.

>>   #define MI_LOAD_REGISTER_IMM_MAX_REGS (126)
>>   #define MI_STORE_REGISTER_MEMMI_INSTR(0x24, 1)


Re: [Intel-gfx] [PATCH] drm/i915: avoid concurrent writes to aux_inv

2022-03-18 Thread Tvrtko Ursulin



On 18/03/2022 05:26, fei.y...@intel.com wrote:

From: Fei Yang 

GPU hangs have been observed when multiple engines write to the
same aux_inv register at the same time. To avoid this each engine
should only invalidate its own auxiliary table. The function
gen12_emit_flush_xcs() currently invalidate the auxiliary table for
all engines because the rq->engine is not necessarily the engine
eventually carrying out the request, and potentially the engine
could even be a virtual one (with engine->instance being -1).
With the MMIO remap feature, we can actually set bit 17 of MI_LRI
instruction and let the hardware to figure out the local aux_inv
register at runtime to avoid invalidating auxiliary table for all
engines.

Bspec: 45728

Cc: Stuart Summers 
Cc: Tvrtko Ursulin 
Signed-off-by: Chris Wilson 
Signed-off-by: Fei Yang 
---
  drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 42 +---
  drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  1 +
  2 files changed, 11 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c 
b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
index 36148887c699..d440c5dfb6b7 100644
--- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
@@ -165,30 +165,6 @@ static u32 preparser_disable(bool state)
return MI_ARB_CHECK | 1 << 8 | state;
  }
  
-static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)

-{
-   static const i915_reg_t vd[] = {
-   GEN12_VD0_AUX_NV,
-   GEN12_VD1_AUX_NV,
-   GEN12_VD2_AUX_NV,
-   GEN12_VD3_AUX_NV,
-   };
-
-   static const i915_reg_t ve[] = {
-   GEN12_VE0_AUX_NV,
-   GEN12_VE1_AUX_NV,
-   };
-
-   if (engine->class == VIDEO_DECODE_CLASS)
-   return vd[engine->instance];
-
-   if (engine->class == VIDEO_ENHANCEMENT_CLASS)
-   return ve[engine->instance];
-
-   GEM_BUG_ON("unknown aux_inv reg\n");
-   return INVALID_MMIO_REG;
-}
-
  static u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
  {
*cs++ = MI_LOAD_REGISTER_IMM(1);
@@ -296,7 +272,7 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
if (!HAS_FLAT_CCS(rq->engine->i915)) {
aux_inv = rq->engine->mask & ~BIT(BCS0);
if (aux_inv)
-   cmd += 2 * hweight32(aux_inv) + 2;
+   cmd += 4;
}
}
  
@@ -329,14 +305,16 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)

*cs++ = 0; /* value */
  
  	if (aux_inv) { /* hsdes: 1809175790 */

-   struct intel_engine_cs *engine;
-   unsigned int tmp;
-
-   *cs++ = MI_LOAD_REGISTER_IMM(hweight32(aux_inv));
-   for_each_engine_masked(engine, rq->engine->gt, aux_inv, tmp) {
-   *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
-   *cs++ = AUX_INV;
+   *cs++ = MI_LOAD_REGISTER_IMM(1) | MI_LRI_MMIO_REMAP_EN;


Cool, I didn't know this exists. First Bspec link I found did not 
mention these register, but second did. That one however (29545) has a 
worrying "removed by" tag which seems to point to a story suggesting the 
remapping table might be gone on machines with flat ccs?! Could you 
double check please?



+   if (rq->engine->class == VIDEO_DECODE_CLASS) {
+   *cs++ = i915_mmio_reg_offset(GEN12_VD0_AUX_NV);
+   } else if (rq->engine->class == VIDEO_ENHANCEMENT_CLASS) {
+   *cs++ = i915_mmio_reg_offset(GEN12_VE0_AUX_NV);
+   } else {
+   GEM_BUG_ON("unknown aux_inv reg\n");
+   *cs++ = i915_mmio_reg_offset(INVALID_MMIO_REG);


I'd consider not emitting the LRI if we don't know what to put in unless 
there is some hidden point to do it?



}
+   *cs++ = AUX_INV;
*cs++ = MI_NOOP;
}
  
diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h

index d112ffd56418..2d150eec5c65 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -144,6 +144,7 @@
  #define MI_LOAD_REGISTER_IMM(x)   MI_INSTR(0x22, 2*(x)-1)
  /* Gen11+. addr = base + (ctx_restore ? offset & GENMASK(12,2) : offset) */
  #define   MI_LRI_LRM_CS_MMIO  REG_BIT(19)
+#define   MI_LRI_MMIO_REMAP_EN (1 << 17)
  #define   MI_LRI_FORCE_POSTED (1<<12)


Passing observation - three bits, three flavours of expressing them, sigh...

Regards,

Tvrtko


  #define MI_LOAD_REGISTER_IMM_MAX_REGS (126)
  #define MI_STORE_REGISTER_MEMMI_INSTR(0x24, 1)


Re: [Intel-gfx] [PATCH] drm/i915: avoid concurrent writes to aux_inv

2022-03-16 Thread Yang, Fei
>> diff --git a/drivers/gpu/drm/i915/gt/gen2_engine_cs.c 
>> b/drivers/gpu/drm/i915/gt/gen2_engine_cs.c
>> index 1c82caf525c3..0ec4986e4805 100644
>> --- a/drivers/gpu/drm/i915/gt/gen2_engine_cs.c
>> +++ b/drivers/gpu/drm/i915/gt/gen2_engine_cs.c
>> @@ -37,6 +37,9 @@ int gen2_emit_flush(struct i915_request *rq, u32 
>> mode)
>>   
>>  intel_ring_advance(rq, cs);
>>   
>> +/* hsdes: 1809175790. No fixup needed for gen2 */
>> +rq->aux_inv_fixup = NULL;
>
> Same thing that Stuart mentioned - would it not work for instance to 
> initialize this in __i915_request_create?

I didn't try __i915_request_create because there is code like the following in 
the driver, and I'm not sure how many such allocation is there. I will give it 
a shot.
struct measure_breadcrumb {
struct i915_request rq;
struct intel_ring ring;
u32 cs[2048];
};

static int measure_breadcrumb_dw(struct intel_context *ce)
{
struct intel_engine_cs *engine = ce->engine;
struct measure_breadcrumb *frame;
int dw;

GEM_BUG_ON(!engine->gt->scratch);

frame = kzalloc(sizeof(*frame), GFP_KERNEL);
if (!frame)
return -ENOMEM;

frame->rq.engine = engine;
frame->rq.context = ce;
...
}
>> +
>>  return 0;
>>   }
>>   


Re: [Intel-gfx] [PATCH] drm/i915: avoid concurrent writes to aux_inv

2022-03-16 Thread Tvrtko Ursulin



On 04/03/2022 22:14, fei.y...@intel.com wrote:

From: Fei Yang 

GPU hangs have been observed when multiple engines write to the
same aux_inv register at the same time. To avoid this each engine
should only invalidate its own auxiliary table. The function
gen12_emit_flush_xcs() currently invalidate the auxiliary table for
all engines because the rq->engine is not necessarily the engine
eventually carrying out the request, and potentially the engine
could even be a virtual one (with engine->instance being -1).
With this patch, auxiliary table invalidation is done only for the
engine executing the request. And the mmio address for the aux_inv
register is set after the engine instance becomes certain.

Signed-off-by: Chris Wilson 
Signed-off-by: Fei Yang 
---
  drivers/gpu/drm/i915/gt/gen2_engine_cs.c  |  9 +++
  drivers/gpu/drm/i915/gt/gen6_engine_cs.c  |  9 +++
  drivers/gpu/drm/i915/gt/gen8_engine_cs.c  | 61 ---
  .../drm/i915/gt/intel_execlists_submission.c  | 35 +++
  drivers/gpu/drm/i915/i915_request.h   |  2 +
  5 files changed, 82 insertions(+), 34 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/gen2_engine_cs.c 
b/drivers/gpu/drm/i915/gt/gen2_engine_cs.c
index 1c82caf525c3..0ec4986e4805 100644
--- a/drivers/gpu/drm/i915/gt/gen2_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/gen2_engine_cs.c
@@ -37,6 +37,9 @@ int gen2_emit_flush(struct i915_request *rq, u32 mode)
  
  	intel_ring_advance(rq, cs);
  
+	/* hsdes: 1809175790. No fixup needed for gen2 */

+   rq->aux_inv_fixup = NULL;


Same thing that Stuart mentioned - would it not work for instance to 
initialize this in __i915_request_create?



+
return 0;
  }
  
@@ -123,6 +126,9 @@ int gen4_emit_flush_rcs(struct i915_request *rq, u32 mode)
  
  	intel_ring_advance(rq, cs);
  
+	/* hsdes: 1809175790. No fixup needed for gen4 rcs */

+   rq->aux_inv_fixup = NULL;
+
return 0;
  }
  
@@ -138,6 +144,9 @@ int gen4_emit_flush_vcs(struct i915_request *rq, u32 mode)

*cs++ = MI_NOOP;
intel_ring_advance(rq, cs);
  
+	/* hsdes: 1809175790. No fixup needed for gen4 vcs */

+   rq->aux_inv_fixup = NULL;
+
return 0;
  }
  
diff --git a/drivers/gpu/drm/i915/gt/gen6_engine_cs.c b/drivers/gpu/drm/i915/gt/gen6_engine_cs.c

index 5e65550b4dfb..efe51c4662fe 100644
--- a/drivers/gpu/drm/i915/gt/gen6_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/gen6_engine_cs.c
@@ -137,6 +137,9 @@ int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
*cs++ = 0;
intel_ring_advance(rq, cs);
  
+	/* hsdes: 1809175790. No fixup needed for gen6 */

+   rq->aux_inv_fixup = NULL;
+
return 0;
  }
  
@@ -208,6 +211,9 @@ static int mi_flush_dw(struct i915_request *rq, u32 flags)
  
  	intel_ring_advance(rq, cs);
  
+	/* hsdes: 1809175790. No fixup needed for gen6 */

+   rq->aux_inv_fixup = NULL;
+
return 0;
  }
  
@@ -347,6 +353,9 @@ int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)

*cs++ = 0;
intel_ring_advance(rq, cs);
  
+	/* hsdes: 1809175790. No fixup needed for gen7 rcs */

+   rq->aux_inv_fixup = NULL;
+
return 0;
  }
  
diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c

index b1b9c3fd7bf9..b6374cf53314 100644
--- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
@@ -73,6 +73,9 @@ int gen8_emit_flush_rcs(struct i915_request *rq, u32 mode)
  
  	intel_ring_advance(rq, cs);
  
+	/* hsdes: 1809175790. No fixup needed for gen8 rcs */

+   rq->aux_inv_fixup = NULL;
+
return 0;
  }
  
@@ -106,6 +109,9 @@ int gen8_emit_flush_xcs(struct i915_request *rq, u32 mode)

*cs++ = 0; /* value */
intel_ring_advance(rq, cs);
  
+	/* hsdes: 1809175790. No fixup needed for gen8 xcs */

+   rq->aux_inv_fixup = NULL;
+
return 0;
  }
  
@@ -157,6 +163,9 @@ int gen11_emit_flush_rcs(struct i915_request *rq, u32 mode)

intel_ring_advance(rq, cs);
}
  
+	/* hsdes: 1809175790. No fixup needed for gen11 rcs */

+   rq->aux_inv_fixup = NULL;
+
return 0;
  }
  
@@ -165,30 +174,6 @@ static u32 preparser_disable(bool state)

return MI_ARB_CHECK | 1 << 8 | state;
  }
  
-static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)

-{
-   static const i915_reg_t vd[] = {
-   GEN12_VD0_AUX_NV,
-   GEN12_VD1_AUX_NV,
-   GEN12_VD2_AUX_NV,
-   GEN12_VD3_AUX_NV,
-   };
-
-   static const i915_reg_t ve[] = {
-   GEN12_VE0_AUX_NV,
-   GEN12_VE1_AUX_NV,
-   };
-
-   if (engine->class == VIDEO_DECODE_CLASS)
-   return vd[engine->instance];
-
-   if (engine->class == VIDEO_ENHANCEMENT_CLASS)
-   return ve[engine->instance];
-
-   GEM_BUG_ON("unknown aux_inv reg\n");
-   return INVALID_MMIO_REG;
-}
-
  static u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_

Re: [Intel-gfx] [PATCH] drm/i915: avoid concurrent writes to aux_inv

2022-03-16 Thread Yang, Fei
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> index e1470bb60f34..7e8552414275 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> @@ -1258,6 +1258,34 @@ static bool completed(const struct i915_request 
>>> *rq)
>>> return __i915_request_is_complete(rq);  }
>>>  
>>> +static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine) {
>>> +   static const i915_reg_t vd[] = {
>>> +   GEN12_VD0_AUX_NV,
>>> +   GEN12_VD1_AUX_NV,
>>> +   GEN12_VD2_AUX_NV,
>>> +   GEN12_VD3_AUX_NV,
>>> +   };
>>> +
>>> +   static const i915_reg_t ve[] = {
>>> +   GEN12_VE0_AUX_NV,
>>> +   GEN12_VE1_AUX_NV,
>>> +   };
>>> +
>>> +   if (engine->class == VIDEO_DECODE_CLASS) {
>>> +   GEM_BUG_ON(engine->instance >= ARRAY_SIZE(vd));
>>> +   return vd[engine->instance];
>>> +   }
>>> +
>>> +   if (engine->class == VIDEO_ENHANCEMENT_CLASS) {
>>> +   GEM_BUG_ON(engine->instance >= ARRAY_SIZE(ve));
>>> +   return ve[engine->instance];
>>> +   }
>>> +
>>> +   GEM_BUG_ON("unknown aux_inv reg\n");
>>> +   return INVALID_MMIO_REG;
>>> +}
>>> +
>>>  static void execlists_dequeue(struct intel_engine_cs *engine)
>> 
>> So in the previous implementation, this "worked" for both execlists and guc 
>> submission. But how will this work now for GuC based submission?
>> This flow and the address of the engine is owned by the GuC.
>>
>> If we are going to say this is an execlist only requirement (e.g.
>> platforms using GuC submission don't need this workaround), you should add 
>> an if (!using guc submission) in the sequence you added to the various 
>> emit_flush() routines above.
>
> Good point.
> I didn't consider GuC submission because Chrome doesn't enable GuC for TGL. 
> But it is true that the implementation will have problem with GuC submission.
> I'm not sure if it's possible for i915 to know which engine will eventually 
> carry out the request because it might be scheduled by GuC. I will need to 
> investigate.

I think the same can be done in intel_guc_submission.c after 
__i915_request_submit(rq) calls.

>> Thanks,
>> Stuart



Re: [Intel-gfx] [PATCH] drm/i915: avoid concurrent writes to aux_inv

2022-03-15 Thread Yang, Fei
>> @@ -157,6 +163,9 @@ int gen11_emit_flush_rcs(struct i915_request *rq,
>> u32 mode)
>>  intel_ring_advance(rq, cs);
>>  }
>>  
>> +/* hsdes: 1809175790. No fixup needed for gen11 rcs */
>> +rq->aux_inv_fixup = NULL;
>
> This is a little ugly to me. Can we just set this to 0 or 0xdeadbeef by 
> default maybe and check that value below instead of retroactively adding all 
> of these assignments?

The problem is there are many code paths that a i915_request could be 
allocated, I'm not aware of a unified routine where I could initialize the 
pointer for all i915_requests.

>> +
>>  return 0;
>>  }
>>  
>> +/*
>> + * We don't know which engine will eventually carry out
>> + * this request, so the mmio aux_inv register address
>> is
>> + * unknown at this moment. We save the cs pointer
>> supposed
>> + * to hold the aux_inv address in rq->aux_inv_fixup and
>> set
>> + * it in execlists_dequeue() when the engine instance
>> + * carrying out this request becomes certain
>> + */
>> +*cs++ = MI_LOAD_REGISTER_IMM(1);
>> +rq->aux_inv_fixup = cs; /* save the pointer to aux_inv
>> */
>> +*cs++ = 0; /* mmio addr to be set at submission to HW
>> */
>
>Maybe MI_NOOP instead?

This is supposed to be the mmio address field for the MI_LOAD_REGISTER_IMM 
instruction, setting it to 0 makes more sense?

>> +*cs++ = AUX_INV;
>>  *cs++ = MI_NOOP;
>> -}
>> +} else
>
> Can you add the brackets here on the else:
> } else {
>aux_inv_fixup = NULL
> }
>
>Also good to run checkpatch. I see this showing up as a warning in the 
>checkpatch results.

I noticed the warning, will update.

>> +rq->aux_inv_fixup = NULL;
>>  
>>  if (mode & EMIT_INVALIDATE)
>>  *cs++ = preparser_disable(false);
>> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>> b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>> index e1470bb60f34..7e8552414275 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>> @@ -1258,6 +1258,34 @@ static bool completed(const struct i915_request 
>> *rq)
>>  return __i915_request_is_complete(rq);  }
>>  
>> +static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine) {
>> +static const i915_reg_t vd[] = {
>> +GEN12_VD0_AUX_NV,
>> +GEN12_VD1_AUX_NV,
>> +GEN12_VD2_AUX_NV,
>> +GEN12_VD3_AUX_NV,
>> +};
>> +
>> +static const i915_reg_t ve[] = {
>> +GEN12_VE0_AUX_NV,
>> +GEN12_VE1_AUX_NV,
>> +};
>> +
>> +if (engine->class == VIDEO_DECODE_CLASS) {
>> +GEM_BUG_ON(engine->instance >= ARRAY_SIZE(vd));
>> +return vd[engine->instance];
>> +}
>> +
>> +if (engine->class == VIDEO_ENHANCEMENT_CLASS) {
>> +GEM_BUG_ON(engine->instance >= ARRAY_SIZE(ve));
>> +return ve[engine->instance];
>> +}
>> +
>> +GEM_BUG_ON("unknown aux_inv reg\n");
>> +return INVALID_MMIO_REG;
>> +}
>> +
>>  static void execlists_dequeue(struct intel_engine_cs *engine)
> 
> So in the previous implementation, this "worked" for both execlists and guc 
> submission. But how will this work now for GuC based submission?
> This flow and the address of the engine is owned by the GuC.
>
> If we are going to say this is an execlist only requirement (e.g.
> platforms using GuC submission don't need this workaround), you should add an 
> if (!using guc submission) in the sequence you added to the various 
> emit_flush() routines above.

Good point.
I didn't consider GuC submission because Chrome doesn't enable GuC for TGL. But 
it is true that the implementation will have problem with GuC submission.
I'm not sure if it's possible for i915 to know which engine will eventually 
carry out the request because it might be scheduled by GuC. I will need to 
investigate.

> Thanks,
> Stuart



Re: [Intel-gfx] [PATCH] drm/i915: avoid concurrent writes to aux_inv

2022-03-15 Thread Summers, Stuart
On Fri, 2022-03-04 at 14:14 -0800, fei.y...@intel.com wrote:
> From: Fei Yang 
> 
> GPU hangs have been observed when multiple engines write to the
> same aux_inv register at the same time. To avoid this each engine
> should only invalidate its own auxiliary table. The function
> gen12_emit_flush_xcs() currently invalidate the auxiliary table for
> all engines because the rq->engine is not necessarily the engine
> eventually carrying out the request, and potentially the engine
> could even be a virtual one (with engine->instance being -1).
> With this patch, auxiliary table invalidation is done only for the
> engine executing the request. And the mmio address for the aux_inv
> register is set after the engine instance becomes certain.
> 
> Signed-off-by: Chris Wilson 
> Signed-off-by: Fei Yang 
> ---
>  drivers/gpu/drm/i915/gt/gen2_engine_cs.c  |  9 +++
>  drivers/gpu/drm/i915/gt/gen6_engine_cs.c  |  9 +++
>  drivers/gpu/drm/i915/gt/gen8_engine_cs.c  | 61 -
> --
>  .../drm/i915/gt/intel_execlists_submission.c  | 35 +++
>  drivers/gpu/drm/i915/i915_request.h   |  2 +
>  5 files changed, 82 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/gen2_engine_cs.c
> b/drivers/gpu/drm/i915/gt/gen2_engine_cs.c
> index 1c82caf525c3..0ec4986e4805 100644
> --- a/drivers/gpu/drm/i915/gt/gen2_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/gen2_engine_cs.c
> @@ -37,6 +37,9 @@ int gen2_emit_flush(struct i915_request *rq, u32
> mode)
>  
>   intel_ring_advance(rq, cs);
>  
> + /* hsdes: 1809175790. No fixup needed for gen2 */
> + rq->aux_inv_fixup = NULL;
> +
>   return 0;
>  }
>  
> @@ -123,6 +126,9 @@ int gen4_emit_flush_rcs(struct i915_request *rq,
> u32 mode)
>  
>   intel_ring_advance(rq, cs);
>  
> + /* hsdes: 1809175790. No fixup needed for gen4 rcs */
> + rq->aux_inv_fixup = NULL;
> +
>   return 0;
>  }
>  
> @@ -138,6 +144,9 @@ int gen4_emit_flush_vcs(struct i915_request *rq,
> u32 mode)
>   *cs++ = MI_NOOP;
>   intel_ring_advance(rq, cs);
>  
> + /* hsdes: 1809175790. No fixup needed for gen4 vcs */
> + rq->aux_inv_fixup = NULL;
> +
>   return 0;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/gt/gen6_engine_cs.c
> b/drivers/gpu/drm/i915/gt/gen6_engine_cs.c
> index 5e65550b4dfb..efe51c4662fe 100644
> --- a/drivers/gpu/drm/i915/gt/gen6_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/gen6_engine_cs.c
> @@ -137,6 +137,9 @@ int gen6_emit_flush_rcs(struct i915_request *rq,
> u32 mode)
>   *cs++ = 0;
>   intel_ring_advance(rq, cs);
>  
> + /* hsdes: 1809175790. No fixup needed for gen6 */
> + rq->aux_inv_fixup = NULL;
> +
>   return 0;
>  }
>  
> @@ -208,6 +211,9 @@ static int mi_flush_dw(struct i915_request *rq,
> u32 flags)
>  
>   intel_ring_advance(rq, cs);
>  
> + /* hsdes: 1809175790. No fixup needed for gen6 */
> + rq->aux_inv_fixup = NULL;
> +
>   return 0;
>  }
>  
> @@ -347,6 +353,9 @@ int gen7_emit_flush_rcs(struct i915_request *rq,
> u32 mode)
>   *cs++ = 0;
>   intel_ring_advance(rq, cs);
>  
> + /* hsdes: 1809175790. No fixup needed for gen7 rcs */
> + rq->aux_inv_fixup = NULL;
> +
>   return 0;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> index b1b9c3fd7bf9..b6374cf53314 100644
> --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> @@ -73,6 +73,9 @@ int gen8_emit_flush_rcs(struct i915_request *rq,
> u32 mode)
>  
>   intel_ring_advance(rq, cs);
>  
> + /* hsdes: 1809175790. No fixup needed for gen8 rcs */
> + rq->aux_inv_fixup = NULL;
> +
>   return 0;
>  }
>  
> @@ -106,6 +109,9 @@ int gen8_emit_flush_xcs(struct i915_request *rq,
> u32 mode)
>   *cs++ = 0; /* value */
>   intel_ring_advance(rq, cs);
>  
> + /* hsdes: 1809175790. No fixup needed for gen8 xcs */
> + rq->aux_inv_fixup = NULL;
> +
>   return 0;
>  }
>  
> @@ -157,6 +163,9 @@ int gen11_emit_flush_rcs(struct i915_request *rq,
> u32 mode)
>   intel_ring_advance(rq, cs);
>   }
>  
> + /* hsdes: 1809175790. No fixup needed for gen11 rcs */
> + rq->aux_inv_fixup = NULL;

This is a little ugly to me. Can we just set this to 0 or 0xdeadbeef by
default maybe and check that value below instead of retroactively
adding all of these assignments?

> +
>   return 0;
>  }
>  
> @@ -165,30 +174,6 @@ static u32 preparser_disable(bool state)
>   return MI_ARB_CHECK | 1 << 8 | state;
>  }
>  
> -static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
> -{
> - static const i915_reg_t vd[] = {
> - GEN12_VD0_AUX_NV,
> - GEN12_VD1_AUX_NV,
> - GEN12_VD2_AUX_NV,
> - GEN12_VD3_AUX_NV,
> - };
> -
> - static const i915_reg_t ve[] = {
> - GEN12_VE0_AUX_NV,
> - GEN12_VE1_AUX_NV,
> - };
> -
> - if (engine->class == VIDEO_DECODE_CLASS)

Re: [Intel-gfx] [PATCH] drm/i915: avoid concurrent writes to aux_inv

2022-03-02 Thread Chris Wilson
Quoting fei.y...@intel.com (2022-03-02 18:26:57)
> From: Fei Yang 
> 
> GPU hangs have been observed when multiple engines write to the
> same aux_inv register at the same time. To avoid this each engine
> should only invalidate its own auxiliary table. The function
> gen12_emit_flush_xcs() currently invalidate the auxiliary table for
> all engines because the rq->engine is not necessarily the engine
> eventually carrying out the request, and potentially the engine
> could even be a virtual one (with engine->instance being -1).
> With this patch, auxiliary table invalidation is done only for the
> engine executing the request. And the mmio address for the aux_inv
> register is set after the engine instance becomes certain.
> 
> Signed-off-by: Chris Wilson 
> Signed-off-by: Fei Yang 
> ---
>  drivers/gpu/drm/i915/gt/gen8_engine_cs.c  | 41 ---
>  .../drm/i915/gt/intel_execlists_submission.c  | 38 +
>  drivers/gpu/drm/i915/i915_request.h   |  2 +
>  3 files changed, 47 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c 
> b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> index b1b9c3fd7bf9..af62e2bc2c9b 100644
> --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> @@ -165,30 +165,6 @@ static u32 preparser_disable(bool state)
> return MI_ARB_CHECK | 1 << 8 | state;
>  }
>  
> -static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
> -{
> -   static const i915_reg_t vd[] = {
> -   GEN12_VD0_AUX_NV,
> -   GEN12_VD1_AUX_NV,
> -   GEN12_VD2_AUX_NV,
> -   GEN12_VD3_AUX_NV,
> -   };
> -
> -   static const i915_reg_t ve[] = {
> -   GEN12_VE0_AUX_NV,
> -   GEN12_VE1_AUX_NV,
> -   };
> -
> -   if (engine->class == VIDEO_DECODE_CLASS)
> -   return vd[engine->instance];
> -
> -   if (engine->class == VIDEO_ENHANCEMENT_CLASS)
> -   return ve[engine->instance];
> -
> -   GEM_BUG_ON("unknown aux_inv reg\n");
> -   return INVALID_MMIO_REG;
> -}
> -
>  static u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
>  {
> *cs++ = MI_LOAD_REGISTER_IMM(1);
> @@ -288,7 +264,7 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 
> mode)
> if (mode & EMIT_INVALIDATE)
> aux_inv = rq->engine->mask & ~BIT(BCS0);
> if (aux_inv)
> -   cmd += 2 * hweight32(aux_inv) + 2;
> +   cmd += 4;
>  
> cs = intel_ring_begin(rq, cmd);
> if (IS_ERR(cs))
> @@ -319,16 +295,13 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 
> mode)
> *cs++ = 0; /* value */
>  
> if (aux_inv) { /* hsdes: 1809175790 */
> -   struct intel_engine_cs *engine;
> -   unsigned int tmp;
> -
> -   *cs++ = MI_LOAD_REGISTER_IMM(hweight32(aux_inv));
> -   for_each_engine_masked(engine, rq->engine->gt, aux_inv, tmp) {
> -   *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
> -   *cs++ = AUX_INV;
> -   }
> +   *cs++ = MI_LOAD_REGISTER_IMM(1);
> +   rq->vd_ve_aux_inv = cs;
> +   *cs++ = 0; /* address to be set at submission to HW */
> +   *cs++ = AUX_INV;
> *cs++ = MI_NOOP;
> -   }
> +   } else
> +   rq->vd_ve_aux_inv = NULL;
>  
> if (mode & EMIT_INVALIDATE)
> *cs++ = preparser_disable(false);
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
> b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 1c602d4ae297..a018de6dcac5 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -1258,6 +1258,34 @@ static bool completed(const struct i915_request *rq)
> return __i915_request_is_complete(rq);
>  }
>  
> +static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
> +{
> +   static const i915_reg_t vd[] = {
> +   GEN12_VD0_AUX_NV,
> +   GEN12_VD1_AUX_NV,
> +   GEN12_VD2_AUX_NV,
> +   GEN12_VD3_AUX_NV,
> +   };
> +
> +   static const i915_reg_t ve[] = {
> +   GEN12_VE0_AUX_NV,
> +   GEN12_VE1_AUX_NV,
> +   };
> +
> +   if (engine->class == VIDEO_DECODE_CLASS) {
> +   GEM_BUG_ON(engine->instance >= ARRAY_SIZE(vd));
> +   return vd[engine->instance];
> +   }
> +
> +   if (engine->class == VIDEO_ENHANCEMENT_CLASS) {
> +   GEM_BUG_ON(engine->instance >= ARRAY_SIZE(ve));
> +   return ve[engine->instance];
> +   }
> +
> +   GEM_BUG_ON("unknown aux_inv reg\n");
> +   return INVALID_MMIO_REG;
> +}
> +
>  static void execlists_dequeue(struct intel_engine_cs *engine)
>  {
> struct intel_engine_execlists * const execlists = &