Here comes another case that requires run a pass once more, as this is
not the common suggested direction to solve problems, not quite sure
whether it is still a reasonble fix here.  Source code is something like:

ref = ip + *hslot;
while (ip < in_end - 2) {
  unsigned int len = 2;
  len++;
    for ()   {
      do len++;
      while (len < maxlen && ref[len] == ip[len]); //sink code here.
      break;
    }
  len -= 2;
  ip++;
  ip += len + 1;
  if (ip >= in_end - 2)
    break;
}

Before ivopts, the gimple for inner while loop is xxx.c.172t.slp1:

  <bb 31> [local count: 75120046]:
  # len_160 = PHI <len_161(30), len_189(58)>
  len_189 = len_160 + 1;
  _423 = (sizetype) len_189;
  _424 = ip_229 + _423;
  if (maxlen_186 > len_189)
    goto <bb 32>; [94.50%]
  else
    goto <bb 33>; [5.50%]

  <bb 32> [local count: 70988443]:
  _84 = *_424;
  _86 = ref_182 + _423;
  _87 = *_86;
  if (_84 == _87)
    goto <bb 58>; [94.50%]
  else
    goto <bb 33>; [5.50%]

  <bb 58> [local count: 67084079]:
  goto <bb 31>; [100.00%]

  <bb 33> [local count: 14847855]:
  # len_263 = PHI <len_160(32), len_160(31)>
  # _262 = PHI <_423(32), _423(31)>
  # _264 = PHI <_424(32), _424(31)>
  len_190 = len_263 + 4294967295;
  if (len_190 <= 6)
    goto <bb 34>; [0.00%]
  else
    goto <bb 36>; [100.00%]

Then in ivopts, instructions are updated to xxx.c.174t.ivopts:

  <bb 31> [local count: 75120046]:
  # ivtmp.30_29 = PHI <ivtmp.30_32(30), ivtmp.30_31(58)>
  _34 = (unsigned int) ivtmp.30_29;
  len_160 = _34 + 4294967295;
  _423 = ivtmp.30_29;
  _35 = (unsigned long) ip_229;
  _420 = ivtmp.30_29 + _35;
  _419 = (uint8_t *) _420;
  _424 = _419;
  len_418 = (unsigned int) ivtmp.30_29;
  if (maxlen_186 > len_418)
    goto <bb 32>; [94.50%]
  else
    goto <bb 33>; [5.50%]

  <bb 32> [local count: 70988443]:
  _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_29 * 1];
  ivtmp.30_31 = ivtmp.30_29 + 1;
  _417 = ref_182 + 18446744073709551615;
  _87 = MEM[(uint8_t *)_417 + ivtmp.30_31 * 1];
  if (_84 == _87)
    goto <bb 58>; [94.50%]
  else
    goto <bb 33>; [5.50%]

  <bb 58> [local count: 67084079]:
  goto <bb 31>; [100.00%]

  <bb 33> [local count: 14847855]:
  # len_263 = PHI <len_160(32), len_160(31)>
  # _262 = PHI <_423(32), _423(31)>
  # _264 = PHI <_424(32), _424(31)>
  len_190 = len_263 + 4294967295;
  if (len_190 <= 6)
    goto <bb 34>; [0.00%]
  else
    goto <bb 36>; [100.00%]

Some instructions in BB 31 are not used in the loop and could be sinked
out of loop to reduce the computation, but they are not sinked
throughout all passes later.  Run the sink_code pass once more at least
after fre5 could improve this typical case performance 23% due to few
instructions exausted in loop.
xxx.c.209t.sink2:

Sinking _419 = (uint8_t *) _420;
 from bb 31 to bb 89
Sinking _420 = ivtmp.30_29 + _35;
 from bb 31 to bb 89
Sinking _35 = (unsigned long) ip_229;
 from bb 31 to bb 89
Sinking len_160 = _34 + 4294967295;
 from bb 31 to bb 33

I also tested the SPEC2017 performance on P8LE, 544.nab_r is improved
by 2.43%, but no big changes to other cases, GEOMEAN is improved quite
small with 0.25%.

The reason why it should be run after fre5 is fre would do some phi
optimization to expose the optimization.  The patch put it after
pass_modref is due to my guess that some gimple optimizations like
thread_jumps, dse, dce etc. could provide more opportunities for
sinking code.  Not sure it is the correct place to put.  I also
verified this issue exists in both X86 and ARM64.
Any comments?  Thanks.
---
 gcc/passes.def      | 1 +
 gcc/tree-ssa-sink.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/gcc/passes.def b/gcc/passes.def
index 21b2e2af0f7..69106615729 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -355,6 +355,7 @@ along with GCC; see the file COPYING3.  If not see
       NEXT_PASS (pass_uncprop);
       NEXT_PASS (pass_local_pure_const);
       NEXT_PASS (pass_modref);
+      NEXT_PASS (pass_sink_code);
   POP_INSERT_PASSES ()
   NEXT_PASS (pass_all_optimizations_g);
   PUSH_INSERT_PASSES_WITHIN (pass_all_optimizations_g)
diff --git a/gcc/tree-ssa-sink.c b/gcc/tree-ssa-sink.c
index b0abf4147d6..824659f3919 100644
--- a/gcc/tree-ssa-sink.c
+++ b/gcc/tree-ssa-sink.c
@@ -819,6 +819,7 @@ public:
   /* opt_pass methods: */
   virtual bool gate (function *) { return flag_tree_sink != 0; }
   virtual unsigned int execute (function *);
+  opt_pass *clone (void) { return new pass_sink_code (m_ctxt); }
 
 }; // class pass_sink_code
 
-- 
2.27.0.90.geebb51ba8c

Reply via email to