http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60884
Oleg Endo changed:
What|Removed |Added
CC||christian.bruel at st dot com
--- Comment #1 from Oleg Endo ---
With the following patch applied to current trunk (r210026)
Index: gcc/config/sh/sh-mem.cc
===
--- gcc/config/sh/sh-mem.cc	(revision 210037)
+++ gcc/config/sh/sh-mem.cc	(working copy)
@@ -568,7 +568,7 @@
addr1 = adjust_automodify_address (addr1, SImode, current_addr, 0);
- /*start long loop. */
+ /* start long loop. */
emit_label (L_loop_long);
/* tmp1 is aligned, OK to load. */
@@ -589,29 +589,15 @@
addr1 = adjust_address (addr1, QImode, 0);
/* unroll remaining bytes. */
- emit_insn (gen_extendqisi2 (tmp1, addr1));
- emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
- jump = emit_jump_insn (gen_branch_true (L_return));
- add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+ for (int i = 0; i < 4; ++i)
+{
+ emit_insn (gen_extendqisi2 (tmp1, addr1));
+ emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
+ emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
+ jump = emit_jump_insn (gen_branch_true (L_return));
+ add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+}
- emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
-
- emit_insn (gen_extendqisi2 (tmp1, addr1));
- emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
- jump = emit_jump_insn (gen_branch_true (L_return));
- add_int_reg_note (jump, REG_BR_PROB, prob_likely);
-
- emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
-
- emit_insn (gen_extendqisi2 (tmp1, addr1));
- emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
- jump = emit_jump_insn (gen_branch_true (L_return));
- add_int_reg_note (jump, REG_BR_PROB, prob_likely);
-
- emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
-
- emit_insn (gen_extendqisi2 (tmp1, addr1));
- jump = emit_jump_insn (gen_jump_compact (L_return));
emit_barrier_after (jump);
/* start byte loop. */
@@ -626,10 +612,9 @@
/* end loop. */
- emit_insn (gen_addsi3 (start_addr, start_addr, GEN_INT (1)));
-
emit_label (L_return);
+ emit_insn (gen_addsi3 (start_addr, start_addr, GEN_INT (1)));
emit_insn (gen_subsi3 (operands[0], current_addr, start_addr));
return true;
I get the following when compiling
unsigned int test (const char* x)
{
return __builtin_strlen (x);
}
with -O2 -m4:
_test:
	mov	r4,r0
	tst	#3,r0
	bf/s	.L12
	mov	r4,r1
	mov	#0,r3
.L4:
	mov.l	@r1+,r2
	cmp/str	r3,r2
	bf	.L4
	add	#-4,r1
	mov.b	@r1+,r2
	tst	r2,r2
	bt	.L2
	mov.b	@r1+,r2
	tst	r2,r2
	bt	.L2
	mov.b	@r1+,r2
	tst	r2,r2
	mov	#-1,r2
	negc	r2,r2
	add	r2,r1
.L2:
	mov	r1,r0
	rts
	subc	r4,r0
	.align 1
.L12:
	mov.b	@r1+,r2
	tst	r2,r2
	bf/s	.L12
	mov	r1,r0
	rts
	subc	r4,r0
which is 5 insns shorter than the currently expanded sequence.
It seems that other optimizers are able to figure out that the 4th byte load is
not needed and eliminate it.
Moving the 'emit_insn (gen_addsi3 (...))' after the return label allows the
compiler to utilize the subc insn, which is difficult to get otherwise, as
combine looks only at one BB at a time.
Christian, what do you think? Any objections?