Re: [Mesa-dev] [PATCH] r600: Add spill output to group only if register or target index changes

2018-07-11 Thread Gert Wollny
Am Dienstag, den 10.07.2018, 23:25 +0200 schrieb Roland Scheidegger:
...
thanks for the review,

> 
> I'm not really too familiar with this code, but makes sense to me.
> Are there piglit or dEPQ or whatever failures associated with this?
> If so you could mention them in the commit log.

The piglits that force spilling used only one component, even when they
were called "vec2-4", so I prepared a patch [1] to force the use of all
components, and this would trigger the failures that this patch fixes

[1] https://patchwork.freedesktop.org/series/46064/

best, 
Gert



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] r600: Add spill output to group only if register or target index changes

2018-07-10 Thread Roland Scheidegger
Am 05.07.2018 um 19:11 schrieb Gert Wollny:
> The current spill code checks in each instruction of an instruction group 
> whether
> spliing
spilling

> is needed and if so, it adds spilling for each component as a seperate
> instruction and it allocates a new temporary for each component and since it 
> takes
> the write mask from the TGSI representation, all components might be written
> each time and as a result already written components might be overwritten with
> garbage like: 
> 
>...
>y: MOVR9.y,  [0x4214 37].x
>t: MOVR8.x,  [0x4204 33].y
>... 
>MEM_SCRATCH  WRITE_IND_ACK 0 R9.xy__, @R4.x  ES:3
>MEM_SCRATCH  WRITE_IND_ACK 0 R8.xy__, @R4.x  ES:3
>...
> 
> To resolve this isse accululate
accumulate

> spills to the same memory location so that only one
> memory write instruction is emmited
emitted

> for an instruction group that writes up to all
> four components.
> 
> Signed-off-by: Gert Wollny 
> ---
>  src/gallium/drivers/r600/r600_shader.c | 93 
> +-
>  1 file changed, 57 insertions(+), 36 deletions(-)
> 
> diff --git a/src/gallium/drivers/r600/r600_shader.c 
> b/src/gallium/drivers/r600/r600_shader.c
> index 5479861c58..354091322a 100644
> --- a/src/gallium/drivers/r600/r600_shader.c
> +++ b/src/gallium/drivers/r600/r600_shader.c
> @@ -4342,8 +4342,32 @@ static void tgsi_dst(struct r600_shader_ctx *ctx,
>  
>   if (spilled) {
>   struct r600_bytecode_output cf;
> - int reg = r600_get_temp(ctx);
> + int reg = 0;
>   int r;
> + bool add_pending_output = true;
> +
> + memset(, 0, sizeof(struct r600_bytecode_output));
> + get_spilled_array_base_and_size(ctx, 
> tgsi_dst->Register.Index,
> + _base, _size);
> +
> + /* If no componet
component

> has spilled, reserve a register and add the spill code
> +  *  ctx->bc->n_pending_outputs is cleared after each instruction 
> group */
> + if (ctx->bc->n_pending_outputs == 0) {
> + reg = r600_get_temp(ctx);
> + } else {
> + /* If we already spilling and the output 
> address is the same like
If we are

> +  * before then just reuse the same slot */
> +struct r600_bytecode_output *tmpl = 
> >bc->pending_outputs[ctx->bc->n_pending_outputs-1];
> +if ((cf.array_base + idx == tmpl->array_base) ||
> +(cf.array_base == tmpl->array_base &&
> +   tmpl->index_gpr == ctx->bc->ar_reg &&
> +   tgsi_dst->Register.Indirect)) {
> + reg = ctx->bc->pending_outputs[0].gpr;
> + add_pending_output = false;
> +} else {
> + reg = r600_get_temp(ctx);
> +}
> + }
>  
>   r600_dst->sel = reg;
>   r600_dst->chan = swizzle;
> @@ -4352,42 +4376,39 @@ static void tgsi_dst(struct r600_shader_ctx *ctx,
>   r600_dst->clamp = 1;
>   }
>  
> - // needs to be added after op using tgsi_dst
> - memset(, 0, sizeof(struct r600_bytecode_output));
> - cf.op = CF_OP_MEM_SCRATCH;
> - cf.elem_size = 3;
> - cf.gpr = reg;
> - cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
> - cf.mark = 1;
> - cf.comp_mask = inst->Dst[0].Register.WriteMask;
> - cf.swizzle_x = 0;
> - cf.swizzle_y = 1;
> - cf.swizzle_z = 2;
> - cf.swizzle_w = 3;
> - cf.burst_count = 1;
> -
> - get_spilled_array_base_and_size(ctx, 
> tgsi_dst->Register.Index,
> - _base, _size);
> -
> - if (tgsi_dst->Register.Indirect) {
> - if (ctx->bc->chip_class < R700)
> - cf.type = 
> V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
> - else
> - cf.type = 3; // 
> V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK;
> - cf.index_gpr = ctx->bc->ar_reg;
> - }
> - else {
> - cf.array_base += idx;
> - cf.array_size = 0;
> + /* Add new outputs as pending */
> + if (add_pending_output) {
> + 

[Mesa-dev] [PATCH] r600: Add spill output to group only if register or target index changes

2018-07-05 Thread Gert Wollny
The current spill code checks in each instruction of an instruction group 
whether
spliing is needed and if so, it adds spilling for each component as a seperate
instruction and it allocates a new temporary for each component and since it 
takes
the write mask from the TGSI representation, all components might be written
each time and as a result already written components might be overwritten with
garbage like: 

   ...
   y: MOVR9.y,  [0x4214 37].x
   t: MOVR8.x,  [0x4204 33].y
   ... 
   MEM_SCRATCH  WRITE_IND_ACK 0 R9.xy__, @R4.x  ES:3
   MEM_SCRATCH  WRITE_IND_ACK 0 R8.xy__, @R4.x  ES:3
   ...

To resolve this isse accululate spills to the same memory location so that only 
one
memory write instruction is emmited for an instruction group that writes up to 
all
four components.

Signed-off-by: Gert Wollny 
---
 src/gallium/drivers/r600/r600_shader.c | 93 +-
 1 file changed, 57 insertions(+), 36 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 5479861c58..354091322a 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -4342,8 +4342,32 @@ static void tgsi_dst(struct r600_shader_ctx *ctx,
 
if (spilled) {
struct r600_bytecode_output cf;
-   int reg = r600_get_temp(ctx);
+   int reg = 0;
int r;
+   bool add_pending_output = true;
+
+   memset(, 0, sizeof(struct r600_bytecode_output));
+   get_spilled_array_base_and_size(ctx, 
tgsi_dst->Register.Index,
+   _base, _size);
+
+   /* If no componet has spilled, reserve a register and 
add the spill code
+  *  ctx->bc->n_pending_outputs is cleared after each instruction 
group */
+   if (ctx->bc->n_pending_outputs == 0) {
+   reg = r600_get_temp(ctx);
+   } else {
+   /* If we already spilling and the output 
address is the same like
+* before then just reuse the same slot */
+  struct r600_bytecode_output *tmpl = 
>bc->pending_outputs[ctx->bc->n_pending_outputs-1];
+  if ((cf.array_base + idx == tmpl->array_base) ||
+  (cf.array_base == tmpl->array_base &&
+ tmpl->index_gpr == ctx->bc->ar_reg &&
+ tgsi_dst->Register.Indirect)) {
+   reg = ctx->bc->pending_outputs[0].gpr;
+   add_pending_output = false;
+  } else {
+   reg = r600_get_temp(ctx);
+  }
+   }
 
r600_dst->sel = reg;
r600_dst->chan = swizzle;
@@ -4352,42 +4376,39 @@ static void tgsi_dst(struct r600_shader_ctx *ctx,
r600_dst->clamp = 1;
}
 
-   // needs to be added after op using tgsi_dst
-   memset(, 0, sizeof(struct r600_bytecode_output));
-   cf.op = CF_OP_MEM_SCRATCH;
-   cf.elem_size = 3;
-   cf.gpr = reg;
-   cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
-   cf.mark = 1;
-   cf.comp_mask = inst->Dst[0].Register.WriteMask;
-   cf.swizzle_x = 0;
-   cf.swizzle_y = 1;
-   cf.swizzle_z = 2;
-   cf.swizzle_w = 3;
-   cf.burst_count = 1;
-
-   get_spilled_array_base_and_size(ctx, 
tgsi_dst->Register.Index,
-   _base, _size);
-
-   if (tgsi_dst->Register.Indirect) {
-   if (ctx->bc->chip_class < R700)
-   cf.type = 
V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
-   else
-   cf.type = 3; // 
V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK;
-   cf.index_gpr = ctx->bc->ar_reg;
-   }
-   else {
-   cf.array_base += idx;
-   cf.array_size = 0;
+   /* Add new outputs as pending */
+   if (add_pending_output) {
+  cf.op = CF_OP_MEM_SCRATCH;
+  cf.elem_size = 3;
+  cf.gpr = reg;
+  cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
+