opt_vector_float() transforms several scalar MOV operations to a single vectorial MOV.
This is done when those MOV covers all the components of the destination register. So something like: mov vgrf3.0.xy:D, 0D mov vgrf3.0.w:D, 1065353216D mov vgrf3.0.z:D, 0D is transformed in: mov vgrf3.0:F, [0F, 0F, 0F, 1F] But there are cases where not all the components are written. For example, in: mov vgrf2.0.x:D, 1073741824D mov vgrf3.0.xy:D, 0D mov vgrf3.0.w:D, 1065353216D mov vgrf4.0.xy:D, 1065353216D mov vgrf4.0.w:D, 0D mov vgrf6.0:UD, u4.xyzw:UD Nor vgrf3 nor vgrf4 .z components are written, so the optimization is not applied. But it could be applied anyway with the components covered, using a writemask to select the ones written. So we could transform it in: mov vgrf2.0.x:D, 1073741824D mov vgrf3.0.xyw:F, [0F, 0F, 0F, 1F] mov vgrf4.0.xyw:F, [1F, 1F, 0F, 0F] mov vgrf6.0:UD, u4.xyzw:UD This commit does precisely that: opportunistically apply opt_vector_float() when possible. The improvement obtained regarding current upstream (11.1-branchpoint-654-gc51c09c) is: total instructions in shared programs: 6846435 -> 6838649 (-0.11%) instructions in affected programs: 393820 -> 386034 (-1.98%) total loops in shared programs: 1971 -> 1971 (0.00%) helped: 3980 HURT: 0 GAINED: 0 LOST: 0 v2: change vectorize_mov() signature (Matt). v3: take in account predicates (Juan). Signed-off-by: Juan A. Suarez Romero <jasua...@igalia.com> --- src/mesa/drivers/dri/i965/brw_vec4.cpp | 62 ++++++++++++++++++++++------------ src/mesa/drivers/dri/i965/brw_vec4.h | 4 +++ 2 files changed, 44 insertions(+), 22 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index a697bdf..ffbbf1a 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -309,6 +309,29 @@ src_reg::equals(const src_reg &r) const } bool +vec4_visitor::vectorize_mov(bblock_t *block, vec4_instruction *inst, uint8_t imm[4], + vec4_instruction *imm_inst[4], int inst_count, + unsigned writemask) +{ + if (inst_count < 2) { + return false; + } + + unsigned vf; + memcpy(&vf, imm, sizeof(vf)); + vec4_instruction *mov = MOV(imm_inst[0]->dst, brw_imm_vf(vf)); + mov->dst.type = BRW_REGISTER_TYPE_F; + mov->dst.writemask = writemask; + inst->insert_before(block, mov); + + for (int i = 0; i < inst_count; i++) { + imm_inst[i]->remove(block); + } + + return true;} + + +bool vec4_visitor::opt_vector_float() { bool progress = false; @@ -316,27 +339,37 @@ vec4_visitor::opt_vector_float() int last_reg = -1, last_reg_offset = -1; enum brw_reg_file last_reg_file = BAD_FILE; - int remaining_channels = 0; - uint8_t imm[4]; + uint8_t imm[4] = { 0 }; int inst_count = 0; vec4_instruction *imm_inst[4]; + unsigned writemask = 0; foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { if (last_reg != inst->dst.nr || last_reg_offset != inst->dst.reg_offset || last_reg_file != inst->dst.file) { + + progress |= vectorize_mov(block, inst, imm, imm_inst, inst_count, writemask); + + inst_count = 0; + writemask = 0; last_reg = inst->dst.nr; last_reg_offset = inst->dst.reg_offset; last_reg_file = inst->dst.file; - remaining_channels = WRITEMASK_XYZW; - - inst_count = 0; + for (int i = 0; i < 4; i++) { + imm[i] = 0; + } } if (inst->opcode != BRW_OPCODE_MOV || inst->dst.writemask == WRITEMASK_XYZW || - inst->src[0].file != IMM) + inst->src[0].file != IMM || + inst->predicate != BRW_PREDICATE_NONE) { + progress |= vectorize_mov(block, inst, imm, imm_inst, inst_count, writemask); + inst_count = 0; + last_reg = -1; continue; + } int vf = brw_float_to_vf(inst->src[0].f); if (vf == -1) @@ -351,23 +384,8 @@ vec4_visitor::opt_vector_float() if ((inst->dst.writemask & WRITEMASK_W) != 0) imm[3] = vf; + writemask |= inst->dst.writemask; imm_inst[inst_count++] = inst; - - remaining_channels &= ~inst->dst.writemask; - if (remaining_channels == 0) { - unsigned vf; - memcpy(&vf, imm, sizeof(vf)); - vec4_instruction *mov = MOV(inst->dst, brw_imm_vf(vf)); - mov->dst.type = BRW_REGISTER_TYPE_F; - mov->dst.writemask = WRITEMASK_XYZW; - inst->insert_after(block, mov); - last_reg = -1; - - for (int i = 0; i < inst_count; i++) { - imm_inst[i]->remove(block); - } - progress = true; - } } if (progress) diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index ae5bf69..6476d3b 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -363,6 +363,10 @@ protected: virtual void gs_end_primitive(); private: + bool vectorize_mov(bblock_t *block, vec4_instruction *inst, uint8_t imm[4], + vec4_instruction *imm_inst[4], int inst_count, + unsigned writemask); + /** * If true, then register allocation should fail instead of spilling. */ -- 2.5.0 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev