Hello, This patch adds a few instructions to the inlined builtin_strlen to unroll the remaining bytes after the word-at-a-time loop. This gives two distinct execution paths (no fall-through into the byte-at-a-time loop), which allows block alignment to be assigned. This partially addresses the problem reported by Oleg in [Bug target/0539] New: [SH] builtin string functions ignore loop and label alignment
whereas the test now expands (-O2 -m4) as mov r4,r0 tst #3,r0 mov r4,r2 bf/s .L12 mov r4,r3 mov #0,r2 .L4: mov.l @r4+,r1 cmp/str r2,r1 bf .L4 add #-4,r4 mov.b @r4,r1 tst r1,r1 bt .L2 add #1,r4 mov.b @r4,r1 tst r1,r1 bt .L2 add #1,r4 mov.b @r4,r1 tst r1,r1 mov #-1,r1 negc r1,r1 add r1,r4 .L2: mov r4,r0 rts sub r3,r0 .align 1 .L12: mov.b @r4+,r1 tst r1,r1 bf/s .L12 mov r2,r3 add #1,r3 mov r4,r0 rts sub r3,r0 The best gain I measured compared to the "compact" version is ~1% on a C++ regular expression benchmark, but, well, the code looks best this way. Regression tested for -m2 and -m4. OK for trunk?
2014-03-20 Christian Bruel <christian.br...@st.com> * config/sh/sh-mem.cc (sh_expand_strlen): Unroll last word. Index: gcc/config/sh/sh-mem.cc =================================================================== --- gcc/config/sh/sh-mem.cc (revision 208745) +++ gcc/config/sh/sh-mem.cc (working copy) @@ -586,9 +586,35 @@ sh_expand_strlen (rtx *operands) emit_move_insn (current_addr, plus_constant (Pmode, current_addr, -4)); - /* start byte loop. */ addr1 = adjust_address (addr1, QImode, 0); + /* unroll remaining bytes. */ + emit_insn (gen_extendqisi2 (tmp1, addr1)); + emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx)); + jump = emit_jump_insn (gen_branch_true (L_return)); + add_int_reg_note (jump, REG_BR_PROB, prob_likely); + + emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1)); + + emit_insn (gen_extendqisi2 (tmp1, addr1)); + emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx)); + jump = emit_jump_insn (gen_branch_true (L_return)); + add_int_reg_note (jump, REG_BR_PROB, prob_likely); + + emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1)); + + emit_insn (gen_extendqisi2 (tmp1, addr1)); + emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx)); + jump = emit_jump_insn (gen_branch_true (L_return)); + add_int_reg_note (jump, REG_BR_PROB, prob_likely); + + emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1)); + + emit_insn (gen_extendqisi2 (tmp1, addr1)); + jump = emit_jump_insn (gen_jump_compact (L_return)); + emit_barrier_after (jump); + + /* start byte loop. */ emit_label (L_loop_byte); emit_insn (gen_extendqisi2 (tmp1, addr1)); @@ -600,11 +626,12 @@ sh_expand_strlen (rtx *operands) /* end loop. */ + emit_insn (gen_addsi3 (start_addr, start_addr, GEN_INT (1))); + emit_label (L_return); - emit_insn (gen_addsi3 (start_addr, start_addr, GEN_INT (1))); - emit_insn (gen_subsi3 (operands[0], current_addr, start_addr)); return true; } +