The current spill code checks in each instruction of an instruction group whether spliing is needed and if so, it adds spilling for each component as a seperate instruction and it allocates a new temporary for each component and since it takes the write mask from the TGSI representation, all components might be written each time and as a result already written components might be overwritten with garbage like:
... y: MOV R9.y, [0x42140000 37].x t: MOV R8.x, [0x42040000 33].y ... MEM_SCRATCH WRITE_IND_ACK 0 R9.xy__, @R4.x ES:3 MEM_SCRATCH WRITE_IND_ACK 0 R8.xy__, @R4.x ES:3 ... To resolve this isse accululate spills to the same memory location so that only one memory write instruction is emmited for an instruction group that writes up to all four components. Signed-off-by: Gert Wollny <gert.wol...@collabora.com> --- src/gallium/drivers/r600/r600_shader.c | 93 +++++++++++++++++++++------------- 1 file changed, 57 insertions(+), 36 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 5479861c58..354091322a 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -4342,8 +4342,32 @@ static void tgsi_dst(struct r600_shader_ctx *ctx, if (spilled) { struct r600_bytecode_output cf; - int reg = r600_get_temp(ctx); + int reg = 0; int r; + bool add_pending_output = true; + + memset(&cf, 0, sizeof(struct r600_bytecode_output)); + get_spilled_array_base_and_size(ctx, tgsi_dst->Register.Index, + &cf.array_base, &cf.array_size); + + /* If no componet has spilled, reserve a register and add the spill code + * ctx->bc->n_pending_outputs is cleared after each instruction group */ + if (ctx->bc->n_pending_outputs == 0) { + reg = r600_get_temp(ctx); + } else { + /* If we already spilling and the output address is the same like + * before then just reuse the same slot */ + struct r600_bytecode_output *tmpl = &ctx->bc->pending_outputs[ctx->bc->n_pending_outputs-1]; + if ((cf.array_base + idx == tmpl->array_base) || + (cf.array_base == tmpl->array_base && + tmpl->index_gpr == ctx->bc->ar_reg && + tgsi_dst->Register.Indirect)) { + reg = ctx->bc->pending_outputs[0].gpr; + add_pending_output = false; + } else { + reg = r600_get_temp(ctx); + } + } r600_dst->sel = reg; r600_dst->chan = swizzle; @@ -4352,42 +4376,39 @@ static void tgsi_dst(struct r600_shader_ctx *ctx, r600_dst->clamp = 1; } - // needs to be added after op using tgsi_dst - memset(&cf, 0, sizeof(struct r600_bytecode_output)); - cf.op = CF_OP_MEM_SCRATCH; - cf.elem_size = 3; - cf.gpr = reg; - cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE; - cf.mark = 1; - cf.comp_mask = inst->Dst[0].Register.WriteMask; - cf.swizzle_x = 0; - cf.swizzle_y = 1; - cf.swizzle_z = 2; - cf.swizzle_w = 3; - cf.burst_count = 1; - - get_spilled_array_base_and_size(ctx, tgsi_dst->Register.Index, - &cf.array_base, &cf.array_size); - - if (tgsi_dst->Register.Indirect) { - if (ctx->bc->chip_class < R700) - cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND; - else - cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK; - cf.index_gpr = ctx->bc->ar_reg; - } - else { - cf.array_base += idx; - cf.array_size = 0; + /* Add new outputs as pending */ + if (add_pending_output) { + cf.op = CF_OP_MEM_SCRATCH; + cf.elem_size = 3; + cf.gpr = reg; + cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE; + cf.mark = 1; + cf.comp_mask = inst->Dst[0].Register.WriteMask; + cf.swizzle_x = 0; + cf.swizzle_y = 1; + cf.swizzle_z = 2; + cf.swizzle_w = 3; + cf.burst_count = 1; + + if (tgsi_dst->Register.Indirect) { + if (ctx->bc->chip_class < R700) + cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND; + else + cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK; + cf.index_gpr = ctx->bc->ar_reg; + } + else { + cf.array_base += idx; + cf.array_size = 0; + } + + r = r600_bytecode_add_pending_output(ctx->bc, &cf); + if (r) + return; + + if (ctx->bc->chip_class >= R700) + r600_bytecode_need_wait_ack(ctx->bc, true); } - - r = r600_bytecode_add_pending_output(ctx->bc, &cf); - if (r) - return; - - if (ctx->bc->chip_class >= R700) - r600_bytecode_need_wait_ack(ctx->bc, true); - return; } else { -- 2.16.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev