Hello,

This patch adds a few instructions to the inlined builtin_strlen to
unroll the remaining bytes after the word-at-a-time loop. This gives
2 distinct execution paths (no fall-through into the byte-at-a-time loop),
which allows alignment to be assigned to each block. This partially
addresses the problem reported by Oleg in [Bug target/0539] New: [SH] builtin
string functions ignore loop and label alignment.

With this patch, the test now expands (-O2 -m4) as:
        mov     r4,r0
        tst     #3,r0
        mov     r4,r2
        bf/s    .L12
        mov     r4,r3
        mov     #0,r2
.L4:
        mov.l   @r4+,r1
        cmp/str r2,r1
        bf      .L4
        add     #-4,r4
        mov.b   @r4,r1
        tst     r1,r1
        bt      .L2
        add     #1,r4
        mov.b   @r4,r1
        tst     r1,r1
        bt      .L2
        add     #1,r4
        mov.b   @r4,r1
        tst     r1,r1
        mov     #-1,r1
        negc    r1,r1
        add     r1,r4
.L2:
        mov     r4,r0
        rts
        sub     r3,r0
        .align 1
.L12:
        mov.b   @r4+,r1
        tst     r1,r1
        bf/s    .L12
        mov     r2,r3
        add     #1,r3
        mov     r4,r0
        rts
        sub     r3,r0


The best gain I measured compared to the "compact" version is ~1% on a C++
regular expression benchmark, but in any case the code looks better this way.

Regression tested for -m2 and -m4.

OK for trunk?


2014-03-20  Christian Bruel  <christian.br...@st.com>

	* config/sh/sh-mem.cc (sh_expand_strlen): Unroll last word.

Index: gcc/config/sh/sh-mem.cc
===================================================================
--- gcc/config/sh/sh-mem.cc	(revision 208745)
+++ gcc/config/sh/sh-mem.cc	(working copy)
@@ -586,9 +586,35 @@ sh_expand_strlen (rtx *operands)
 
   emit_move_insn (current_addr, plus_constant (Pmode, current_addr, -4));
 
-  /* start byte loop.  */
   addr1 = adjust_address (addr1, QImode, 0);
 
+  /* unroll remaining bytes.  */
+  emit_insn (gen_extendqisi2 (tmp1, addr1));
+  emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
+  jump = emit_jump_insn (gen_branch_true (L_return));
+  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+
+  emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
+
+  emit_insn (gen_extendqisi2 (tmp1, addr1));
+  emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
+  jump = emit_jump_insn (gen_branch_true (L_return));
+  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+
+  emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
+
+  emit_insn (gen_extendqisi2 (tmp1, addr1));
+  emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
+  jump = emit_jump_insn (gen_branch_true (L_return));
+  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+
+  emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
+
+  emit_insn (gen_extendqisi2 (tmp1, addr1));
+  jump = emit_jump_insn (gen_jump_compact (L_return));
+  emit_barrier_after (jump);
+
+  /* start byte loop.  */
   emit_label (L_loop_byte);
 
   emit_insn (gen_extendqisi2 (tmp1, addr1));
@@ -600,11 +626,12 @@ sh_expand_strlen (rtx *operands)
 
   /* end loop.  */
 
+  emit_insn (gen_addsi3 (start_addr, start_addr, GEN_INT (1)));
+
   emit_label (L_return);
 
-  emit_insn (gen_addsi3 (start_addr, start_addr, GEN_INT (1)));
-
   emit_insn (gen_subsi3 (operands[0], current_addr, start_addr));
 
   return true;
 }
+

Reply via email to