On 11/05/16 12:52, Jiong Wang wrote:
On 09/05/16 16:08, Segher Boessenkool wrote:
Hi Christophe,
On Mon, May 09, 2016 at 03:54:26PM +0200, Christophe Lyon wrote:
After this patch, I've noticed that
gcc.target/arm/pr43920-2.c
now fails at:
/* { dg-final { scan-assembler-times "pop" 2 } } */
Before the patch, the generated code was:
[...]
pop {r3, r4, r5, r6, r7, pc}
.L4:
mov r0, #-1
.L1:
pop {r3, r4, r5, r6, r7, pc}
it is now:
[...]
.L1:
pop {r3, r4, r5, r6, r7, pc}
.L4:
mov r0, #-1
b .L1
The new version does not seem better, as it adds a branch on the path
and it is not smaller.
That looks like bb-reorder isn't doing its job? Maybe it thinks that
pop is too expensive to copy?
I think so. Filed PR71061
The ARM backend is not setting the length attribute correctly, so the bb
failed the copy_bb_p check.
The length attributes for these pop patterns have been corrected by r237331.
Unfortunately I am afraid that even though we fixed the backend length issue,
this testcase
will keep failing, because it specifies "-Os", so some bb copies won't
be triggered.
A further investigation shows that this issue is actually GCC being unable to
optimize "bl to pop" into "pop", which is "jump to return" into "return", so
a better place to fix this issue is at try_optimize_cfg, where we are doing
these jump/return optimizations already:
/* Try to change a branch to a return to just that return. */
rtx_insn *ret, *use;
...
Currently we are using ANY_RETURN_P to check whether an rtx is a return,
while ANY_RETURN_P only covers RETURN and SIMPLE_RETURN without side-effects:
#define ANY_RETURN_P(X) \
(GET_CODE (X) == RETURN || GET_CODE (X) == SIMPLE_RETURN)
It is possible that some architectures support a return instruction with
side-effects; for example, ARM has a pop instruction which will do a return
and pop registers at the same time. The instruction pattern for that is
something like:
(jump_insn 90 89 93 7 (parallel [
(return)
(set/f (reg/f:SI 13 sp)
(plus:SI (reg/f:SI 13 sp)
(const_int 24 [0x18])))
(set/f (reg:SI 3 r3)
(mem/c:SI (reg/f:SI 13 sp) [3 S4 A32]))
(set/f (reg:SI 4 r4)
(mem/c:SI (plus:SI (reg/f:SI 13 sp)
(const_int 4 [0x4])) [3 S4 A32]))
...
The above pattern will fail the ANY_RETURN_P check, so GCC can't optimize the
following jump to return:
[bb 1]
(jump_insn 76 38 77 4 (set (pc)
(label_ref 43)) pr43920-2.c:27 203 {*arm_jump}
(nil)
-> 43)
[bb 2]
(code_label 43)
...
(jump_insn 90 89 93 7 (parallel [
(return)
(set/f (reg/f:SI 13 sp)
(plus:SI (reg/f:SI 13 sp)
(const_int 24 [0x18])))
(set/f (reg:SI 3 r3)
(mem/c:SI (reg/f:SI 13 sp) [3 S4 A32]))
(set/f (reg:SI 4 r4)
(mem/c:SI (plus:SI (reg/f:SI 13 sp)
(const_int 4 [0x4])) [3 S4 A32]))
into:
(jump_insn 76 38 77 4 (parallel [
(return)
(set/f (reg/f:SI 13 sp)
(plus:SI (reg/f:SI 13 sp)
(const_int 24 [0x18])))
(set/f (reg:SI 3 r3)
(mem/c:SI (reg/f:SI 13 sp) [3 S4 A32]))
(set/f (reg:SI 4 r4)
(mem/c:SI (plus:SI (reg/f:SI 13 sp)
(const_int 4 [0x4])) [3 S4 A32]))
(set/f (reg:SI 5 r5)
(mem/c:SI (plus:SI (reg/f:SI 13 sp)
(const_int 8 [0x8])) [3 S4 A32]))
This patch:
* Rename the existing ANY_RETURN_P into ANY_PURE_RETURN_P.
* Re-implement ANY_RETURN_P to consider complex JUMP_INSNs with
PARALLEL in the pattern.
* Remove the side_effects_p check on the last insn in both bb1
and bb2. This is because, suppose bb1 and bb2 each contain
the following single jump_insn and both fall through to EXIT_BB:
bb 1 bb 2
(jump_insn (jump_insn
(parallel [ (parallel [
(return) (return)
(set/f (set/f
(reg:SI 15 pc) (reg:SI 15 pc)
(mem:SI (mem:SI
(post_inc:SI (post_inc:SI
(reg/f:SI 13 sp)) (reg/f:SI 13 sp))
]) ])
-> return) -> return)
\ /
\ /
\ /
v v
EXIT_BB
cross jump optimization will try to change the jump_insn in bb1 into
an unconditional jump to bb2; then we will enter the next iteration
of try_optimize_cfg, and it will be a new "jump to return", so
bb1 will be optimized back into the above pattern, triggering another round
of cross jump optimization, and we will end up with an infinite loop here.
Bootstrap OK on x86_64/arm32/arm64, no gcc/g++ regression on any of
them.
OK for trunk?
2016-06-14 Jiong Wang<jiong.w...@arm.com>
* cfgcleanup.c (output.h): New include.
(try_optimize_cfg): Check instruction length when optimizing jump to
return into return, consider optimize for size.
(flow_find_cross_jump): Remove side_effects_p check on the last insn
in both bb1 and bb2.
* jump.c (return_in_jump): New function.
(redirect_jump_2): Consider complex pattern when updating jump label.
* rtl.c (classify_insn): Use ANY_PURE_RETURN_P instead of
ANY_RETURN_P.
* rtl.h (return_in_jump): New declaration.
(ANY_RETURN_P): Rename to ANY_PURE_RETURN_P. Re-implement
through return_in_jump.
diff --git a/gcc/cfgcleanup.c b/gcc/cfgcleanup.c
index 023b9d2..a4bea09 100644
--- a/gcc/cfgcleanup.c
+++ b/gcc/cfgcleanup.c
@@ -41,6 +41,7 @@ along with GCC; see the file COPYING3. If not see
#include "tm_p.h"
#include "insn-config.h"
#include "emit-rtl.h"
+#include "output.h"
#include "cselib.h"
#include "params.h"
#include "tree-pass.h"
@@ -1337,7 +1338,7 @@ flow_find_cross_jump (basic_block bb1, basic_block bb2, rtx_insn **f1,
i1 = BB_END (bb1);
last1 = afterlast1 = last2 = afterlast2 = NULL;
if (onlyjump_p (i1)
- || (returnjump_p (i1) && !side_effects_p (PATTERN (i1))))
+ || returnjump_p (i1))
{
last1 = i1;
i1 = PREV_INSN (i1);
@@ -1345,7 +1346,7 @@ flow_find_cross_jump (basic_block bb1, basic_block bb2, rtx_insn **f1,
i2 = BB_END (bb2);
if (onlyjump_p (i2)
- || (returnjump_p (i2) && !side_effects_p (PATTERN (i2))))
+ || returnjump_p (i2))
{
last2 = i2;
/* Count everything except for unconditional jump as insn.
@@ -2825,7 +2826,11 @@ try_optimize_cfg (int mode)
rtx_insn *ret, *use;
if (single_succ_p (b)
&& onlyjump_p (BB_END (b))
- && bb_is_just_return (single_succ (b), &ret, &use))
+ && bb_is_just_return (single_succ (b), &ret, &use)
+ && ((get_attr_min_length (ret)
+ <= get_attr_min_length (BB_END (b)))
+ || optimize_bb_for_speed_p (b)))
+
{
if (redirect_jump (as_a <rtx_jump_insn *> (BB_END (b)),
PATTERN (ret), 0))
diff --git a/gcc/jump.c b/gcc/jump.c
index 5b433af..e962921 100644
--- a/gcc/jump.c
+++ b/gcc/jump.c
@@ -1437,7 +1437,35 @@ delete_for_peephole (rtx_insn *from, rtx_insn *to)
since the peephole that replaces this sequence
is also an unconditional jump in that case. */
}
-
+
+/* If X is RETURN or SIMPLE_RETURN then return itself. If X is PARALLEL, return
+ the contained RETURN or SIMPLE_RETURN if it's not a CALL_INSN, otherwise
+ return NULL_RTX. */
+
+rtx
+return_in_jump (rtx x)
+{
+ if (ANY_PURE_RETURN_P (x))
+ return x;
+
+ rtx ret = NULL_RTX;
+
+ if (GET_CODE (x) == PARALLEL)
+ {
+ int j = 0;
+ for (; j < XVECLEN (x, 0); j++)
+ if (ANY_PURE_RETURN_P (XVECEXP (x, 0, j)))
+ ret = XVECEXP (x, 0, j);
+ /* Return NULL_RTX for CALL_INSN. */
+ else if (GET_CODE (XVECEXP (x, 0, j)) == CALL
+ || (GET_CODE (XVECEXP (x, 0, j)) == SET
+ && GET_CODE (SET_SRC (XVECEXP (x, 0, j))) == CALL))
+ return NULL_RTX;
+ }
+
+ return ret;
+}
+
/* A helper function for redirect_exp_1; examines its input X and returns
either a LABEL_REF around a label, or a RETURN if X was NULL. */
static rtx
@@ -1586,8 +1614,14 @@ redirect_jump_2 (rtx_jump_insn *jump, rtx olabel, rtx nlabel, int delete_unused,
moving FUNCTION_END note. Just sanity check that no user still worry
about this. */
gcc_assert (delete_unused >= 0);
- JUMP_LABEL (jump) = nlabel;
- if (!ANY_RETURN_P (nlabel))
+ /* "nlabel" might be a pattern where "return/simple_return" is inside a
+ PARALLEL that it can't be used for JUMP_LABEL directly. */
+ rtx ret = return_in_jump (nlabel);
+ if (ret)
+ JUMP_LABEL (jump) = ret;
+ else
+ JUMP_LABEL (jump) = nlabel;
+ if (!ret)
++LABEL_NUSES (nlabel);
/* Update labels in any REG_EQUAL note. */
diff --git a/gcc/rtl.h b/gcc/rtl.h
index b531ab7..a2db61b 100644
--- a/gcc/rtl.h
+++ b/gcc/rtl.h
@@ -961,9 +961,12 @@ is_a_helper <rtx_note *>::test (rtx_insn *insn)
}
/* Predicate yielding nonzero iff X is a return or simple_return. */
-#define ANY_RETURN_P(X) \
+#define ANY_PURE_RETURN_P(X) \
(GET_CODE (X) == RETURN || GET_CODE (X) == SIMPLE_RETURN)
+/* See comments for return_in_jump. */
+#define ANY_RETURN_P(X) (return_in_jump (X) != NULL_RTX)
+
/* 1 if X is a unary operator. */
#define UNARY_P(X) \
@@ -3503,6 +3506,7 @@ extern enum rtx_code reversed_comparison_code_parts (enum rtx_code, const_rtx,
const_rtx, const_rtx);
extern void delete_for_peephole (rtx_insn *, rtx_insn *);
extern int condjump_in_parallel_p (const rtx_insn *);
+extern rtx return_in_jump (rtx);
/* In emit-rtl.c. */
extern int max_reg_num (void);
diff --git a/gcc/rtl.c b/gcc/rtl.c
index a445cdc..fd96839 100644
--- a/gcc/rtl.c
+++ b/gcc/rtl.c
@@ -694,7 +694,7 @@ classify_insn (rtx x)
return CODE_LABEL;
if (GET_CODE (x) == CALL)
return CALL_INSN;
- if (ANY_RETURN_P (x))
+ if (ANY_PURE_RETURN_P (x))
return JUMP_INSN;
if (GET_CODE (x) == SET)
{
@@ -712,7 +712,7 @@ classify_insn (rtx x)
for (j = XVECLEN (x, 0) - 1; j >= 0; j--)
if (GET_CODE (XVECEXP (x, 0, j)) == CALL)
return CALL_INSN;
- else if (ANY_RETURN_P (XVECEXP (x, 0, j)))
+ else if (ANY_PURE_RETURN_P (XVECEXP (x, 0, j)))
has_return_p = true;
else if (GET_CODE (XVECEXP (x, 0, j)) == SET
&& GET_CODE (SET_DEST (XVECEXP (x, 0, j))) == PC)