Module: Mesa Branch: main Commit: e5dfff0946bfb1468fcf1b481571ba4a0469452a URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=e5dfff0946bfb1468fcf1b481571ba4a0469452a
Author: Lionel Landwerlin <[email protected]> Date: Wed Sep 14 02:40:01 2022 +0300 intel/fs: reduce liveness of variables in lowering passes When lowering a single instruction with a destination VGRF to 2 or more, the VGRF is now considered partially written by each generated instruction and that increases its liveness, especially in loops, thus potentially increasing the number of spills/fills due to register allocation. Putting an UNDEF instruction in front of the lowered instructions allows the IR to limit the liveness of the VGRF, reducing register pressure. This has a pretty dramatic effect on spills/fills for RT shaders. Here are the stats on Q2RTX shaders on DG2 (wiping out any spills/fills due to register allocation): Instructions in all programs: 26150 -> 24955 (-4.6%) SENDs in all programs: 1148 -> 1148 (+0.0%) Loops in all programs: 4 -> 4 (+0.0%) Cycles in all programs: 392179 -> 332787 (-15.1%) Spills in all programs: 132 -> 116 (-12.1%) Fills in all programs: 262 -> 154 (-41.2%) Shader-db results on TGL : total instructions in shared programs: 21158140 -> 21158377 (<.01%) instructions in affected programs: 76629 -> 76866 (0.31%) helped: 18 HURT: 20 helped stats (abs) min: 1 max: 60 x̄: 18.89 x̃: 12 helped stats (rel) min: 0.21% max: 3.61% x̄: 1.02% x̃: 0.77% HURT stats (abs) min: 1 max: 79 x̄: 28.85 x̃: 18 HURT stats (rel) min: 0.04% max: 2.81% x̄: 1.13% x̃: 0.79% 95% mean confidence interval for instructions value: -4.82 17.30 95% mean confidence interval for instructions %-change: -0.34% 0.57% Inconclusive result (value mean confidence interval includes 0). 
total loops in shared programs: 5753 -> 5753 (0.00%) loops in affected programs: 0 -> 0 helped: 0 HURT: 0 total cycles in shared programs: 798856834 -> 798870688 (<.01%) cycles in affected programs: 6208395 -> 6222249 (0.22%) helped: 22 HURT: 17 helped stats (abs) min: 2 max: 8794 x̄: 1438.18 x̃: 782 helped stats (rel) min: 0.05% max: 2.28% x̄: 0.63% x̃: 0.44% HURT stats (abs) min: 2 max: 19178 x̄: 2676.12 x̃: 1358 HURT stats (rel) min: 0.04% max: 23.49% x̄: 2.25% x̃: 0.71% 95% mean confidence interval for cycles value: -952.19 1662.65 95% mean confidence interval for cycles %-change: -0.64% 1.90% Inconclusive result (value mean confidence interval includes 0). total spills in shared programs: 4078 -> 4066 (-0.29%) spills in affected programs: 40 -> 28 (-30.00%) helped: 2 HURT: 0 total fills in shared programs: 2856 -> 2832 (-0.84%) fills in affected programs: 127 -> 103 (-18.90%) helped: 2 HURT: 0 total sends in shared programs: 998554 -> 998554 (0.00%) sends in affected programs: 0 -> 0 helped: 0 HURT: 0 LOST: 0 GAINED: 0 Total CPU time (seconds): 2346.06 -> 2304.80 (-1.76%) Signed-off-by: Lionel Landwerlin <[email protected]> Reviewed-by: Francisco Jerez <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18657> --- src/intel/compiler/brw_fs.cpp | 16 ++++++++++++++++ src/intel/compiler/brw_fs_builder.h | 13 ++++++++++++- src/intel/compiler/brw_fs_lower_pack.cpp | 7 +++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index a5c7e9346fd..6e3429f72e1 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -2549,6 +2549,9 @@ fs_visitor::opt_algebraic() assert(!inst->src[0].negate); const brw::fs_builder ibld(this, block, inst); + if (!inst->is_partial_write()) + ibld.emit_undef_for_dst(inst); + ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_F, 1), subscript(inst->src[0], BRW_REGISTER_TYPE_F, 1)); ibld.MOV(subscript(inst->dst, 
BRW_REGISTER_TYPE_F, 0), @@ -2567,6 +2570,9 @@ fs_visitor::opt_algebraic() assert(!inst->src[0].negate); const brw::fs_builder ibld(this, block, inst); + if (!inst->is_partial_write()) + ibld.emit_undef_for_dst(inst); + ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1), subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1)); ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0), @@ -2697,6 +2703,9 @@ fs_visitor::opt_algebraic() assert(!inst->src[1].abs && !inst->src[1].negate); const brw::fs_builder ibld(this, block, inst); + if (!inst->is_partial_write()) + ibld.emit_undef_for_dst(inst); + set_predicate(inst->predicate, ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0), subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0), @@ -4107,6 +4116,7 @@ fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block) subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0)); ibld.MOV(bd_low, acc); + ibld.UNDEF(bd); ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 0), bd_low); ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 1), bd_high); } @@ -4123,6 +4133,8 @@ fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block) if (devinfo->has_64bit_int) { ibld.MOV(inst->dst, bd); } else { + if (!inst->is_partial_write()) + ibld.emit_undef_for_dst(inst); ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0), subscript(bd, BRW_REGISTER_TYPE_UD, 0)); ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1), @@ -5564,6 +5576,10 @@ fs_visitor::lower_find_live_channel() */ fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)); + const fs_builder ibld(this, block, inst); + if (!inst->is_partial_write()) + ibld.emit_undef_for_dst(inst); + const fs_builder ubld = bld.at(block, inst).exec_all().group(1, 0); /* ce0 doesn't consider the thread dispatch mask (DMask or VMask), diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h index 7b4cdd726a2..d2d8d5e7ef0 100644 --- a/src/intel/compiler/brw_fs_builder.h +++ b/src/intel/compiler/brw_fs_builder.h @@ -565,6 
+565,17 @@ namespace brw { } } + instruction * + emit_undef_for_dst(const instruction *old_inst) const + { + assert(old_inst->dst.file == VGRF); + instruction *inst = emit(SHADER_OPCODE_UNDEF, + retype(old_inst->dst, BRW_REGISTER_TYPE_UD)); + inst->size_written = old_inst->size_written; + + return inst; + } + /** * Assorted arithmetic ops. * @{ @@ -785,7 +796,7 @@ namespace brw { assert(dst.offset % REG_SIZE == 0); instruction *inst = emit(SHADER_OPCODE_UNDEF, retype(dst, BRW_REGISTER_TYPE_UD)); - inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE; + inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset; return inst; } diff --git a/src/intel/compiler/brw_fs_lower_pack.cpp b/src/intel/compiler/brw_fs_lower_pack.cpp index ac7b61de6f0..0b0f9417513 100644 --- a/src/intel/compiler/brw_fs_lower_pack.cpp +++ b/src/intel/compiler/brw_fs_lower_pack.cpp @@ -41,6 +41,13 @@ fs_visitor::lower_pack() fs_reg dst = inst->dst; const fs_builder ibld(this, block, inst); + /* The lowering generates 2 instructions for what was previously 1. This + * can trick the IR to believe we're doing partial writes, but the + * register is actually fully written. Mark it as undef to help the IR + * reduce the liveness of the register. + */ + if (!inst->is_partial_write()) + ibld.emit_undef_for_dst(inst); for (unsigned i = 0; i < inst->sources; i++) ibld.MOV(subscript(dst, inst->src[i].type, i), inst->src[i]);
