http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60172
--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
I can't really interpret the asm differences but it seems we need more
registers?
Forwprop applies the association transform (the one that fold-const.c already
does when presented with large enough GENERIC trees) - it transforms
(p +p off1) +p off2 to (p +p (off1 + off2)), that is, it associates the
offsets first, so that the pointer is offset only once, and computes the
offset using unsigned integer arithmetic. That enables the reassociation pass
to process the offset expression and simplify it (that pass cannot handle a
pointer addition chain).
This happens in forwprop4 only - thus does -fdisable-tree-forwprop4 fix the
regression?
I really can't see a fundamental difference (other than the reassociated adds)
in the resulting code. So I wonder which RTL transform does / does not trigger
with one of the variants.
On x86_64 the code difference with -O2 [-fdisable-tree-forwprop4] is
@@ -11,22 +11,25 @@
.cfi_startproc
leal 5(%rdx), %r8d
movslq %edx, %rdx
+ salq $2, %rdx
movslq %r8d, %rax
leaq 0(,%rax,4), %r9
- addq %r9, %rax
leaq (%rdi,%r9), %r10
- leaq (%rax,%rax,4), %rax
+ addq %r9, %rax
movl %ecx, (%r10)
movl %ecx, 4(%rdi,%r9)
- leaq (%rsi,%rax,4), %rax
+ leaq (%rax,%rax,4), %rcx
movl %r8d, 60(%rdi,%r9)
- leaq (%rax,%rdx,4), %rax
+ salq $2, %rcx
+ leaq (%rdx,%rcx), %rax
+ addq %rsi, %rax
addl $1, 16(%rax)
movl %r8d, 20(%rax)
movl %r8d, 24(%rax)
- movl (%r10), %edx
+ movl (%r10), %edi
+ leaq 1000(%rsi,%rcx), %rax
movl $5, Int_Glob(%rip)
- movl %edx, 1020(%rax)
+ movl %edi, 20(%rdx,%rax)
ret
.cfi_endproc
If we look at immediate uses before RTL expansion, the relevant changes
(single-use -> non-single-use change or vice versa - this enables
combine/fwprop) are
-_32 : --> single use.
+_32 : -->2 uses.
+_16 = _41 + _32;
_33 = Arr_2_Par_Ref_22(D) + _32;
which happens when associating
_32 = pretmp_20 + 1000;
_33 = Arr_2_Par_Ref_22(D) + _32;
_34 = *_8;
- _51 = _33 + _41;
+ _16 = _41 + _32;
+ _51 = Arr_2_Par_Ref_22(D) + _16;
MEM[(int[25] *)_51 + 20B] = _34;
but _33 is dead after the transform.
+_33 : --> no uses
so that's a spurious difference. Stmts with no uses are not expanded,
but it seems to change what TER does. Hmm.
-_32 replace with --> _32 = pretmp_20 + 1000;
-
Killing dead stmts with
Index: gcc/tree-outof-ssa.c
===================================================================
--- gcc/tree-outof-ssa.c (revision 207757)
+++ gcc/tree-outof-ssa.c (working copy)
@@ -876,6 +876,21 @@ eliminate_useless_phis (void)
}
}
}
+
+ for (unsigned i = 1; i < num_ssa_names; ++i)
+ {
+ tree name = ssa_name (i);
+ if (!name || !has_zero_uses (name) || virtual_operand_p (name))
+ continue;
+ gimple def_stmt = SSA_NAME_DEF_STMT (name);
+ if (!is_gimple_assign (def_stmt)
+ || gimple_has_side_effects (def_stmt)
+ || stmt_could_throw_p (def_stmt))
+ continue;
+ gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
+ gsi_remove (&gsi, true);
+ release_defs (def_stmt);
+ }
}
fixes that (hack alert). With that we get strictly more TER. Does
-fno-tree-ter also make the testcase regress, even with
-fdisable-tree-forwprop4?