Module: Mesa
Branch: main
Commit: e5dfff0946bfb1468fcf1b481571ba4a0469452a
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=e5dfff0946bfb1468fcf1b481571ba4a0469452a

Author: Lionel Landwerlin <[email protected]>
Date:   Wed Sep 14 02:40:01 2022 +0300

intel/fs: reduce liveness of variables in lowering passes

When lowering a single instruction with a destination VGRF into 2 or
more instructions, the VGRF is now considered partially written by
each generated instruction, which increases its liveness, especially
in loops. This potentially increases the number of spills/fills due to
register allocation.

Putting an UNDEF instruction in front of the lowered instructions
allows the IR to limit the liveness of the VGRF, reducing register
pressure.

This has a pretty dramatic effect on spills/fills for RT shaders. Here
are the stats on Q2RTX shaders on DG2 (wiping out any spills/fills due
to register allocation):

Instructions in all programs: 26150 -> 24955 (-4.6%)
SENDs in all programs: 1148 -> 1148 (+0.0%)
Loops in all programs: 4 -> 4 (+0.0%)
Cycles in all programs: 392179 -> 332787 (-15.1%)
Spills in all programs: 132 -> 116 (-12.1%)
Fills in all programs: 262 -> 154 (-41.2%)

Shader-db results on TGL :

total instructions in shared programs: 21158140 -> 21158377 (<.01%)
instructions in affected programs: 76629 -> 76866 (0.31%)
helped: 18
HURT: 20
helped stats (abs) min: 1 max: 60 x̄: 18.89 x̃: 12
helped stats (rel) min: 0.21% max: 3.61% x̄: 1.02% x̃: 0.77%
HURT stats (abs)   min: 1 max: 79 x̄: 28.85 x̃: 18
HURT stats (rel)   min: 0.04% max: 2.81% x̄: 1.13% x̃: 0.79%
95% mean confidence interval for instructions value: -4.82 17.30
95% mean confidence interval for instructions %-change: -0.34% 0.57%
Inconclusive result (value mean confidence interval includes 0).

total loops in shared programs: 5753 -> 5753 (0.00%)
loops in affected programs: 0 -> 0
helped: 0
HURT: 0

total cycles in shared programs: 798856834 -> 798870688 (<.01%)
cycles in affected programs: 6208395 -> 6222249 (0.22%)
helped: 22
HURT: 17
helped stats (abs) min: 2 max: 8794 x̄: 1438.18 x̃: 782
helped stats (rel) min: 0.05% max: 2.28% x̄: 0.63% x̃: 0.44%
HURT stats (abs)   min: 2 max: 19178 x̄: 2676.12 x̃: 1358
HURT stats (rel)   min: 0.04% max: 23.49% x̄: 2.25% x̃: 0.71%
95% mean confidence interval for cycles value: -952.19 1662.65
95% mean confidence interval for cycles %-change: -0.64% 1.90%
Inconclusive result (value mean confidence interval includes 0).

total spills in shared programs: 4078 -> 4066 (-0.29%)
spills in affected programs: 40 -> 28 (-30.00%)
helped: 2
HURT: 0

total fills in shared programs: 2856 -> 2832 (-0.84%)
fills in affected programs: 127 -> 103 (-18.90%)
helped: 2
HURT: 0

total sends in shared programs: 998554 -> 998554 (0.00%)
sends in affected programs: 0 -> 0
helped: 0
HURT: 0

LOST:   0
GAINED: 0

Total CPU time (seconds): 2346.06 -> 2304.80 (-1.76%)

Signed-off-by: Lionel Landwerlin <[email protected]>
Reviewed-by: Francisco Jerez <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18657>

---

 src/intel/compiler/brw_fs.cpp            | 16 ++++++++++++++++
 src/intel/compiler/brw_fs_builder.h      | 13 ++++++++++++-
 src/intel/compiler/brw_fs_lower_pack.cpp |  7 +++++++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index a5c7e9346fd..6e3429f72e1 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2549,6 +2549,9 @@ fs_visitor::opt_algebraic()
             assert(!inst->src[0].negate);
             const brw::fs_builder ibld(this, block, inst);
 
+            if (!inst->is_partial_write())
+               ibld.emit_undef_for_dst(inst);
+
             ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_F, 1),
                      subscript(inst->src[0], BRW_REGISTER_TYPE_F, 1));
             ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_F, 0),
@@ -2567,6 +2570,9 @@ fs_visitor::opt_algebraic()
             assert(!inst->src[0].negate);
             const brw::fs_builder ibld(this, block, inst);
 
+            if (!inst->is_partial_write())
+               ibld.emit_undef_for_dst(inst);
+
             ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
                      subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1));
             ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
@@ -2697,6 +2703,9 @@ fs_visitor::opt_algebraic()
             assert(!inst->src[1].abs && !inst->src[1].negate);
             const brw::fs_builder ibld(this, block, inst);
 
+            if (!inst->is_partial_write())
+               ibld.emit_undef_for_dst(inst);
+
             set_predicate(inst->predicate,
                           ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
                                    subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
@@ -4107,6 +4116,7 @@ fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block)
                 subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
       ibld.MOV(bd_low, acc);
 
+      ibld.UNDEF(bd);
       ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 0), bd_low);
       ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 1), bd_high);
    }
@@ -4123,6 +4133,8 @@ fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block)
    if (devinfo->has_64bit_int) {
       ibld.MOV(inst->dst, bd);
    } else {
+      if (!inst->is_partial_write())
+         ibld.emit_undef_for_dst(inst);
       ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
                subscript(bd, BRW_REGISTER_TYPE_UD, 0));
       ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
@@ -5564,6 +5576,10 @@ fs_visitor::lower_find_live_channel()
        */
       fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));
 
+      const fs_builder ibld(this, block, inst);
+      if (!inst->is_partial_write())
+         ibld.emit_undef_for_dst(inst);
+
       const fs_builder ubld = bld.at(block, inst).exec_all().group(1, 0);
 
       /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h
index 7b4cdd726a2..d2d8d5e7ef0 100644
--- a/src/intel/compiler/brw_fs_builder.h
+++ b/src/intel/compiler/brw_fs_builder.h
@@ -565,6 +565,17 @@ namespace brw {
          }
       }
 
+      instruction *
+      emit_undef_for_dst(const instruction *old_inst) const
+      {
+         assert(old_inst->dst.file == VGRF);
+         instruction *inst = emit(SHADER_OPCODE_UNDEF,
+                                  retype(old_inst->dst, BRW_REGISTER_TYPE_UD));
+         inst->size_written = old_inst->size_written;
+
+         return inst;
+      }
+
       /**
        * Assorted arithmetic ops.
        * @{
@@ -785,7 +796,7 @@ namespace brw {
          assert(dst.offset % REG_SIZE == 0);
          instruction *inst = emit(SHADER_OPCODE_UNDEF,
                                   retype(dst, BRW_REGISTER_TYPE_UD));
-         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE;
+         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;
 
          return inst;
       }
diff --git a/src/intel/compiler/brw_fs_lower_pack.cpp b/src/intel/compiler/brw_fs_lower_pack.cpp
index ac7b61de6f0..0b0f9417513 100644
--- a/src/intel/compiler/brw_fs_lower_pack.cpp
+++ b/src/intel/compiler/brw_fs_lower_pack.cpp
@@ -41,6 +41,13 @@ fs_visitor::lower_pack()
       fs_reg dst = inst->dst;
 
       const fs_builder ibld(this, block, inst);
+      /* The lowering generates 2 instructions for what was previously 1. This
+       * can trick the IR to believe we're doing partial writes, but the
+       * register is actually fully written. Mark it as undef to help the IR
+       * reduce the liveness of the register.
+       */
+      if (!inst->is_partial_write())
+         ibld.emit_undef_for_dst(inst);
       for (unsigned i = 0; i < inst->sources; i++)
          ibld.MOV(subscript(dst, inst->src[i].type, i), inst->src[i]);
 

Reply via email to