[Bug target/60884] [SH] improve inlined strlen-like builtin functions

2015-01-17 Thread olegendo at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60884

--- Comment #5 from Oleg Endo <olegendo at gcc dot gnu.org> ---
The test case gcc.target/sh/memset.c:

void
test00(char *dstb)
{
  __builtin_memset (dstb, 0, 15);
}


compiles to:
mov r4,r0
tst #3,r0
mov #0,r1
bf/s    .L5
mov #15,r2
mov #3,r2
.L3:
mov.l   r1,@r4      ! loop runs 3x.
dt      r2          ! better emit 3x mov.l
bf/s    .L3
add #4,r4

mov.b   r1,@r4
add #1,r4
mov.b   r1,@r4
add #1,r4
rts
mov.b   r1,@r4
.align 1
.L5:
mov.b   r1,@r4
dt  r2
bf/s    .L5
add #1,r4
rts
nop

Especially when the number of the iterations is known, we should try to unroll
the loops.


[Bug target/60884] [SH] improve inlined strlen-like builtin functions

2014-05-07 Thread olegendo at gcc dot gnu.org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60884

--- Comment #4 from Oleg Endo <olegendo at gcc dot gnu.org> ---
Author: olegendo
Date: Wed May  7 20:08:23 2014
New Revision: 210187

URL: http://gcc.gnu.org/viewcvs?rev=210187&root=gcc&view=rev
Log:
gcc/
PR target/60884
* config/sh/sh-mem.cc (sh_expand_strlen): Use loop when emitting
unrolled byte insns.  Emit address increments after move insns.

gcc/testsuite/
PR target/60884
* gcc.target/sh/pr53976-1.c (test_02): Remove inappropriate test case.
(test_03): Rename to test_02.


Modified:
trunk/gcc/ChangeLog
trunk/gcc/config/sh/sh-mem.cc
trunk/gcc/testsuite/ChangeLog
trunk/gcc/testsuite/gcc.target/sh/pr53976-1.c


[Bug target/60884] [SH] improve inlined strlen-like builtin functions

2014-05-05 Thread chrbr at gcc dot gnu.org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60884

chrbr at gcc dot gnu.org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |chrbr at gcc dot gnu.org

--- Comment #3 from chrbr at gcc dot gnu.org ---

 
> Christian, what do you think?  Any objections?

yes, nice catch to use the subc to swallow the last iteration. post-inc is best
this way indeed, thanks.


[Bug target/60884] [SH] improve inlined strlen-like builtin functions

2014-05-04 Thread olegendo at gcc dot gnu.org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60884

--- Comment #2 from Oleg Endo <olegendo at gcc dot gnu.org> ---
I was trying to see how to implement the strchr builtin function which could
also utilize the cmp/str insn.  However, it seems that the necessary builtin
expansion code for strchr is not there (yet).


[Bug target/60884] [SH] improve inlined strlen-like builtin functions

2014-05-03 Thread olegendo at gcc dot gnu.org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60884

Oleg Endo <olegendo at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |christian.bruel at st dot com

--- Comment #1 from Oleg Endo <olegendo at gcc dot gnu.org> ---
With the following patch applied to current trunk (r210026)

Index: gcc/config/sh/sh-mem.cc
===
--- gcc/config/sh/sh-mem.cc	(revision 210037)
+++ gcc/config/sh/sh-mem.cc	(working copy)
@@ -568,7 +568,7 @@

   addr1 = adjust_automodify_address (addr1, SImode, current_addr, 0);

-  /*start long loop.  */
+  /* start long loop.  */
   emit_label (L_loop_long);

   /* tmp1 is aligned, OK to load.  */
@@ -589,29 +589,15 @@
   addr1 = adjust_address (addr1, QImode, 0);

   /* unroll remaining bytes.  */
-  emit_insn (gen_extendqisi2 (tmp1, addr1));
-  emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
-  jump = emit_jump_insn (gen_branch_true (L_return));
-  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+  for (int i = 0; i < 4; ++i)
+{
+  emit_insn (gen_extendqisi2 (tmp1, addr1));
+  emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
+  emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
+  jump = emit_jump_insn (gen_branch_true (L_return));
+  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+}

-  emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
-
-  emit_insn (gen_extendqisi2 (tmp1, addr1));
-  emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
-  jump = emit_jump_insn (gen_branch_true (L_return));
-  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
-
-  emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
-
-  emit_insn (gen_extendqisi2 (tmp1, addr1));
-  emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
-  jump = emit_jump_insn (gen_branch_true (L_return));
-  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
-
-  emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
-
-  emit_insn (gen_extendqisi2 (tmp1, addr1));
-  jump = emit_jump_insn (gen_jump_compact (L_return));
   emit_barrier_after (jump);

   /* start byte loop.  */
@@ -626,10 +612,9 @@

   /* end loop.  */

-  emit_insn (gen_addsi3 (start_addr, start_addr, GEN_INT (1)));
-
   emit_label (L_return);

+  emit_insn (gen_addsi3 (start_addr, start_addr, GEN_INT (1)));
   emit_insn (gen_subsi3 (operands[0], current_addr, start_addr));

   return true;


I get the following when compiling

unsigned int test (const char* x)
{
  return __builtin_strlen (x);
}

with -O2 -m4:
_test:
mov     r4,r0
tst     #3,r0
bf/s    .L12
mov     r4,r1
mov     #0,r3
.L4:
mov.l   @r1+,r2
cmp/str r3,r2
bf      .L4
add     #-4,r1
mov.b   @r1+,r2
tst     r2,r2
bt      .L2
mov.b   @r1+,r2
tst     r2,r2
bt      .L2
mov.b   @r1+,r2
tst     r2,r2
mov     #-1,r2
negc    r2,r2
add     r2,r1
.L2:
mov     r1,r0
rts
subc    r4,r0
.align 1
.L12:
mov.b   @r1+,r2
tst     r2,r2
bf/s    .L12
mov     r1,r0
rts
subc    r4,r0

which is 5 insns shorter than the currently expanded sequence.
It seems that other optimizers are able to figure out that the 4th byte load is
not needed and eliminate it.
Moving the 'emit_insn (gen_addsi3 ...' after the return label allows it to
utilize the subc insn, which is difficult to get otherwise, as combine looks
only at one BB at a time.

Christian, what do you think?  Any objections?