v2: - Enable spilling for partial DF reads/writes on HSW+ Signed-off-by: Samuel Iglesias Gonsálvez <sigles...@igalia.com> --- src/intel/compiler/brw_vec4_reg_allocate.cpp | 54 ++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 14 deletions(-)
diff --git a/src/intel/compiler/brw_vec4_reg_allocate.cpp b/src/intel/compiler/brw_vec4_reg_allocate.cpp index a6f1070ebd..3ad18b12bb 100644 --- a/src/intel/compiler/brw_vec4_reg_allocate.cpp +++ b/src/intel/compiler/brw_vec4_reg_allocate.cpp @@ -411,17 +411,21 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill) spill_costs[inst->src[i].nr] += loop_scale * spill_cost_for_type(inst->src[i].type); if (inst->src[i].reladdr || - inst->src[i].offset >= REG_SIZE) + (inst->src[i].offset >= REG_SIZE && + (type_sz(inst->src[i].type) != 8 || + !(inst->src[i].offset == 32 && inst->group == 4)))) no_spill[inst->src[i].nr] = true; - /* We don't support unspills of partial DF reads. + /* For execsize == 8, our 64-bit unspills are implemented with + * two 32-bit scratch messages, each one reading that for both + * SIMD4x2 threads that we need to shuffle into correct 64-bit + * data. Ensure that we are reading data for both threads. * - * Our 64-bit unspills are implemented with two 32-bit scratch - * messages, each one reading that for both SIMD4x2 threads that - * we need to shuffle into correct 64-bit data. Ensure that we - * are reading data for both threads. + * For execsize == 4, it is similar but using 1-Oword block + * read messages and we don't need to shuffle data. */ - if (type_sz(inst->src[i].type) == 8 && inst->exec_size != 8) + if (type_sz(inst->src[i].type) == 8 && + inst->exec_size != 8 && inst->exec_size != 4) no_spill[inst->src[i].nr] = true; } @@ -439,16 +443,21 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill) if (inst->dst.file == VGRF && !no_spill[inst->dst.nr]) { spill_costs[inst->dst.nr] += loop_scale * spill_cost_for_type(inst->dst.type); - if (inst->dst.reladdr || inst->dst.offset >= REG_SIZE) + if (inst->dst.reladdr || + (inst->dst.offset >= REG_SIZE && + (type_sz(inst->dst.type) != 8 || + !(inst->dst.offset == 32 && inst->group == 4)))) no_spill[inst->dst.nr] = true; - /* We don't support spills of partial DF writes. + /* For execsize == 8, our 64-bit spills are implemented with two + * 32-bit scratch messages, each one writing that for both SIMD4x2 + * threads. Ensure that we are writing data for both threads. * - * Our 64-bit spills are implemented with two 32-bit scratch messages, - * each one writing that for both SIMD4x2 threads. Ensure that we - * are writing data for both threads. + * For execsize == 4, it is similar but using 1-Oword block + * write messages. */ - if (type_sz(inst->dst.type) == 8 && inst->exec_size != 8) + if (type_sz(inst->dst.type) == 8 && + inst->exec_size != 8 && inst->exec_size != 4) no_spill[inst->dst.nr] = true; /* We can't spill registers that mix 32-bit and 64-bit access (that @@ -514,11 +523,25 @@ vec4_visitor::spill_reg(int spill_reg_nr) /* Generate spill/unspill instructions for the objects being spilled. */ int scratch_reg = -1; + bool do_partial_df_scratch_read = false; foreach_block_and_inst(block, vec4_instruction, inst, cfg) { for (unsigned int i = 0; i < 3; i++) { if (inst->src[i].file == VGRF && inst->src[i].nr == spill_reg_nr) { + /* DF scratch reads are not actual partial reads because we are + * going to read both GRFs in the first read instruction. + * Because of that, we will skip scratch read of the other splitted + * instruction (if any), as it can reuse the read value. We check + * the value of done_scratch_read to know if we need to do scratch + * read or not. + */ + bool do_df_scratch_read = devinfo->gen >= 7 && + type_sz(inst->src[i].type) == 8 && + (inst->exec_size != 4 || do_partial_df_scratch_read); + if (scratch_reg == -1 || - !can_use_scratch_for_source(inst, i, scratch_reg, false)) { + (!can_use_scratch_for_source(inst, i, scratch_reg, + do_partial_df_scratch_read) && + (do_df_scratch_read || type_sz(inst->src[i].type) != 8))) { /* We need to unspill anyway so make sure we read the full vec4 * in any case. This way, the cached register can be reused * for consecutive instructions that read different channels of @@ -532,6 +555,7 @@ vec4_visitor::spill_reg(int spill_reg_nr) emit_scratch_read(block, inst, dst_reg(temp), inst->src[i], spill_offset, false); temp.offset = inst->src[i].offset; + do_partial_df_scratch_read = false; } assert(scratch_reg != -1); inst->src[i].nr = scratch_reg; @@ -541,6 +565,8 @@ vec4_visitor::spill_reg(int spill_reg_nr) if (inst->dst.file == VGRF && inst->dst.nr == spill_reg_nr) { emit_scratch_write(block, inst, spill_offset, false); scratch_reg = inst->dst.nr; + if (type_sz(inst->dst.type) == 8 && inst->exec_size == 4) + do_partial_df_scratch_read = true; } } -- 2.11.0 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev