Hi Haochen, on 2024/1/10 09:35, HAO CHEN GUI wrote: > Hi, > This patch refactors function expand_compare_loop and splits it into two > functions. One is for fixed length and another is for variable length. > These two functions share some low-level common helper functions.
I'd expect a refactoring not to introduce any functional changes, but this patch has some enhancements as described below, so I think the subject is off; it's more like a rework. > > Besides above changes, the patch also does: > 1. Don't generate load and compare loop when max_bytes is less than > loop bytes. > 2. Remove do_load_mask_compare as it's not needed. All sub-targets > entering the function should support efficient overlapping load and > compare. > 3. Implement a variable length overlapping load and compare for the > case in which the remaining bytes are less than the loop bytes in variable length > compare. The 4k boundary test and one-byte load and compare loop are > removed as they're not needed now. > 4. Remove the code for "bytes > max_bytes" with fixed length as the > case is already excluded by pre-checking. > 5. Remove run-time code for "bytes > max_bytes" with variable length > as it should jump to the library call at the beginning. > 6. Enhance do_overlap_load_compare to avoid overlapping load and compare > when the remaining bytes can be loaded and compared by a smaller unit. Considering it's stage 4 now and the impact of this patch, let's defer this to the next stage 1. If possible, could you organize the above changes into patches: 1) Refactor expand_compare_loop by splitting it into two functions without any functional changes. 2) Remove some useless code like 2, 4, 5. 3) Some more enhancements like 1, 3, 6? That would be helpful for the review. One concrete issue I noticed while reading: in do_load_compare_rest_of_loop, "gcc_assert (loop_bytes = 2 * load_mode_size);" uses assignment rather than "==", so the assert always passes and silently overwrites loop_bytes — it should be "gcc_assert (loop_bytes == 2 * load_mode_size);". Thanks! BR, Kewen > > Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no > regressions. Is this OK for trunk? > > Thanks > Gui Haochen > > > ChangeLog > rs6000: Refactor expand_compare_loop and split it to two functions > > The original expand_compare_loop has complicated logic as it's > designed for both fixed and variable length. This patch splits it into > two functions and makes these two functions share common helper functions. 
> Also the 4K boundary test and corresponding one byte load and compare > are replaced by variable length overlapping load and compare. The > do_load_mask_compare is removed as all sub-targets entering the function > has efficient overlapping load and compare so that mask load is no needed. > > gcc/ > * config/rs6000/rs6000-string.cc (do_isel): Remove. > (do_load_mask_compare): Remove. > (do_reg_compare): New. > (do_load_and_compare): New. > (do_overlap_load_compare): Do load and compare with a small unit > other than overlapping load and compare when the remain bytes can > be done by one instruction. > (expand_compare_loop): Remove. > (get_max_inline_loop_bytes): New. > (do_load_compare_rest_of_loop): New. > (generate_6432_conversion): Set it to a static function and move > ahead of gen_diff_handle. > (gen_diff_handle): New. > (gen_load_compare_loop): New. > (gen_library_call): New. > (expand_compare_with_fixed_length): New. > (expand_compare_with_variable_length): New. > (expand_block_compare): Call expand_compare_with_variable_length > to expand block compare for variable length. Call > expand_compare_with_fixed_length to expand block compare loop for > fixed length. > > gcc/testsuite/ > * gcc.target/powerpc/block-cmp-5.c: New. > * gcc.target/powerpc/block-cmp-6.c: New. > * gcc.target/powerpc/block-cmp-7.c: New. > > patch.diff > diff --git a/gcc/config/rs6000/rs6000-string.cc > b/gcc/config/rs6000/rs6000-string.cc > index f707bb2727e..018b87f2501 100644 > --- a/gcc/config/rs6000/rs6000-string.cc > +++ b/gcc/config/rs6000/rs6000-string.cc > @@ -404,21 +404,6 @@ do_ifelse (machine_mode cmpmode, rtx_code comparison, > LABEL_NUSES (true_label) += 1; > } > > -/* Emit an isel of the proper mode for DEST. > - > - DEST is the isel destination register. > - SRC1 is the isel source if CR is true. > - SRC2 is the isel source if CR is false. > - CR is the condition for the isel. 
*/ > -static void > -do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr) > -{ > - if (GET_MODE (dest) == DImode) > - emit_insn (gen_isel_cc_di (dest, cmp, src_t, src_f, cr)); > - else > - emit_insn (gen_isel_cc_si (dest, cmp, src_t, src_f, cr)); > -} > - > /* Emit a subtract of the proper mode for DEST. > > DEST is the destination register for the subtract. > @@ -499,65 +484,61 @@ do_rotl3 (rtx dest, rtx src1, rtx src2) > emit_insn (gen_rotlsi3 (dest, src1, src2)); > } > > -/* Generate rtl for a load, shift, and compare of less than a full word. > - > - LOAD_MODE is the machine mode for the loads. > - DIFF is the reg for the difference. > - CMP_REM is the reg containing the remaining bytes to compare. > - DCOND is the CCUNS reg for the compare if we are doing P9 code with setb. > - SRC1_ADDR is the first source address. > - SRC2_ADDR is the second source address. > - ORIG_SRC1 is the original first source block's address rtx. > - ORIG_SRC2 is the original second source block's address rtx. */ > +/* Do the compare for two registers. 
*/ > static void > -do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem, > rtx dcond, > - rtx src1_addr, rtx src2_addr, rtx orig_src1, rtx > orig_src2) > +do_reg_compare (bool use_vec, rtx vec_result, rtx diff, rtx *dcond, rtx d1, > + rtx d2) > { > - HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode); > - rtx shift_amount = gen_reg_rtx (word_mode); > - rtx d1 = gen_reg_rtx (word_mode); > - rtx d2 = gen_reg_rtx (word_mode); > - > - do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1); > - do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2); > - do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem); > - > - if (word_mode == DImode) > - { > - emit_insn (gen_ashldi3 (shift_amount, shift_amount, > - GEN_INT (LOG2_BITS_PER_UNIT))); > - emit_insn (gen_lshrdi3 (d1, d1, > - gen_lowpart (SImode, shift_amount))); > - emit_insn (gen_lshrdi3 (d2, d2, > - gen_lowpart (SImode, shift_amount))); > - } > - else > - { > - emit_insn (gen_ashlsi3 (shift_amount, shift_amount, > - GEN_INT (LOG2_BITS_PER_UNIT))); > - emit_insn (gen_lshrsi3 (d1, d1, shift_amount)); > - emit_insn (gen_lshrsi3 (d2, d2, shift_amount)); > - } > + gcc_assert (!use_vec || vec_result != NULL_RTX); > + gcc_assert (REG_P (d1) && REG_P (d2)); > + gcc_assert (GET_MODE (d1) == GET_MODE (d2)); > > - if (TARGET_P9_MISC) > + if (use_vec) > + emit_insn (gen_altivec_vcmpequb_p (vec_result, d1, d2)); > + else if (TARGET_P9_MISC) > { > /* Generate a compare, and convert with a setb later. 
*/ > rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2); > - emit_insn (gen_rtx_SET (dcond, cmp)); > + emit_insn (gen_rtx_SET (*dcond, cmp)); > } > else > { > + *dcond = gen_reg_rtx (CCmode); > if (word_mode == DImode) > - emit_insn (gen_subfdi3_carry (diff, d2, d1)); > + emit_insn (gen_subfdi3_carry_dot2 (diff, d2, d1, *dcond)); > else > - emit_insn (gen_subfsi3_carry (diff, d2, d1)); > + emit_insn (gen_subfsi3_carry_dot2 (diff, d2, d1, *dcond)); > } > } > > +/* Load the memory to register and do the compare. */ > +static void > +do_load_and_compare (machine_mode load_mode, rtx addr1, rtx addr2, rtx > *dcond, > + rtx diff, rtx orig_src1, rtx orig_src2) > +{ > + rtx d1 = gen_reg_rtx (word_mode); > + rtx d2 = gen_reg_rtx (word_mode); > + > + if (MEM_P (addr1)) > + do_load_for_compare (d1, addr1, load_mode); > + else > + do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1); > + > + if (MEM_P (addr2)) > + do_load_for_compare (d2, addr2, load_mode); > + else > + do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2); > + > + do_reg_compare (false, NULL_RTX, diff, dcond, d1, d2); > +} > + > /* Generate rtl for an overlapping load and compare of less than a > full load_mode. This assumes that the previous word is part of the > block being compared so it's ok to back up part of a word so we can > compare the last unaligned full word that ends at the end of the block. > + If the remain bytes can be loaded and compared by a small unit with > + only one instruction, just do the load and compare by the small unit > + other than the full word overlapping load and compare. > > LOAD_MODE is the machine mode for the loads. > ISCONST tells whether the remaining length is a constant or in a register. > @@ -569,55 +550,41 @@ do_load_mask_compare (const machine_mode load_mode, rtx > diff, rtx cmp_rem, rtx d > SRC2_ADDR is the second source address. > ORIG_SRC1 is the original first source block's address rtx. 
> ORIG_SRC2 is the original second source block's address rtx. */ > + > static void > -do_overlap_load_compare (machine_mode load_mode, bool isConst, > - HOST_WIDE_INT bytes_rem, rtx diff, > - rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr, > - rtx orig_src1, rtx orig_src2) > +do_overlap_load_compare (machine_mode load_mode, HOST_WIDE_INT bytes_rem, > + rtx diff, rtx *dcond, rtx orig_src1, rtx orig_src2, > + HOST_WIDE_INT length) > { > HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode); > - HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem; > - rtx d1 = gen_reg_rtx (word_mode); > - rtx d2 = gen_reg_rtx (word_mode); > + gcc_assert (IN_RANGE (bytes_rem, 0, load_mode_size - 1)); > > rtx addr1, addr2; > - if (!isConst || addr_adj) > - { > - rtx adj_reg = gen_reg_rtx (word_mode); > - if (isConst) > - emit_move_insn (adj_reg, GEN_INT (-addr_adj)); > - else > - { > - rtx reg_lms = gen_reg_rtx (word_mode); > - emit_move_insn (reg_lms, GEN_INT (load_mode_size)); > - do_sub3 (adj_reg, cmp_rem, reg_lms); > - } > > - addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg); > - addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg); > - } > - else > + switch (bytes_rem) > { > - addr1 = src1_addr; > - addr2 = src2_addr; > + case 0: > + return; > + case 1: > + load_mode = QImode; > + break; > + case 2: > + load_mode = HImode; > + break; > + case 4: > + load_mode = SImode; > + break; > + case 8: > + if (TARGET_POWERPC64) > + load_mode = DImode; > + break; > } > > - do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1); > - do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2); > - > - if (TARGET_P9_MISC) > - { > - /* Generate a compare, and convert with a setb later. 
*/ > - rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2); > - emit_insn (gen_rtx_SET (dcond, cmp)); > - } > - else > - { > - if (word_mode == DImode) > - emit_insn (gen_subfdi3_carry (diff, d2, d1)); > - else > - emit_insn (gen_subfsi3_carry (diff, d2, d1)); > - } > + load_mode_size = GET_MODE_SIZE (load_mode); > + addr1 = adjust_address (orig_src1, load_mode, length - load_mode_size); > + addr2 = adjust_address (orig_src2, load_mode, length - load_mode_size); > + do_load_and_compare (load_mode, addr1, addr2, dcond, diff, > + orig_src1, orig_src2); > } > > /* Generate the sequence of compares for strcmp/strncmp using vec/vsx > @@ -889,790 +856,550 @@ emit_final_compare_vec (rtx str1, rtx str2, rtx > result, > return; > } > > -/* Expand a block compare operation using loop code, and return true > - if successful. Return false if we should let the compiler generate > - normal code, probably a memcmp call. > - > - OPERANDS[0] is the target (result). > - OPERANDS[1] is the first source. > - OPERANDS[2] is the second source. > - OPERANDS[3] is the length. > - OPERANDS[4] is the alignment. */ > -bool > -expand_compare_loop (rtx operands[]) > +static HOST_WIDE_INT > +get_max_inline_loop_bytes (bool bytes_is_const, int align) > { > - rtx target = operands[0]; > - rtx orig_src1 = operands[1]; > - rtx orig_src2 = operands[2]; > - rtx bytes_rtx = operands[3]; > - rtx align_rtx = operands[4]; > - > - /* This case is complicated to handle because the subtract > - with carry instructions do not generate the 64-bit > - carry and so we must emit code to calculate it ourselves. > - We choose not to implement this yet. */ > - if (TARGET_32BIT && TARGET_POWERPC64) > - return false; > - > - /* Allow non-const length. */ > - int bytes_is_const = CONST_INT_P (bytes_rtx); > - > - /* This must be a fixed size alignment. 
*/ > - if (!CONST_INT_P (align_rtx)) > - return false; > - > - HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT; > - HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT; > - HOST_WIDE_INT minalign = MIN (align1, align2); > - > - bool isP7 = (rs6000_tune == PROCESSOR_POWER7); > - > - gcc_assert (GET_MODE (target) == SImode); > - > - /* Anything to move? */ > - HOST_WIDE_INT bytes = 0; > - if (bytes_is_const) > - bytes = INTVAL (bytes_rtx); > - > - if (bytes_is_const && bytes == 0) > - return true; > - > - /* Limit the amount we compare, if known statically. */ > - HOST_WIDE_INT max_bytes; > switch (rs6000_tune) > { > case PROCESSOR_POWER7: > if (!bytes_is_const) > - if (minalign < 8) > - max_bytes = 0; > + if (align < 8) > + return 0; > else > - max_bytes = 128; > + return 128; > else > - if (minalign < 8) > - max_bytes = 32; > + if (align < 8) > + return 32; > else > - max_bytes = 128; > + return 128; > break; > case PROCESSOR_POWER8: > if (!bytes_is_const) > - max_bytes = 0; > + return 0; > else > - if (minalign < 8) > - max_bytes = 128; > + if (align < 8) > + return 128; > else > - max_bytes = 64; > + return 64; > break; > case PROCESSOR_POWER9: > case PROCESSOR_POWER10: > if (bytes_is_const) > - max_bytes = 191; > + return 191; > else > - max_bytes = 0; > + return 0; > break; > default: > - max_bytes = 128; > + return 128; > } > +} > > - /* Allow the option to override the default. */ > - if (rs6000_block_compare_inline_loop_limit >= 0) > - max_bytes = (unsigned HOST_WIDE_INT) > rs6000_block_compare_inline_loop_limit; > - > - if (max_bytes == 0) > - return false; > +/* Do the load and compare when remain bytes is less than loop bytes > + and it's a variable length compare. expand_bytes indicates the > + maximum bytes needed to be expanded. 
*/ > +static void > +do_load_compare_rest_of_loop (machine_mode load_mode, rtx src1_addr, > + rtx src2_addr, rtx cmp_rem, rtx diff, > + rtx diff_label, rtx *dcond, rtx final_label, > + rtx orig_src1, rtx orig_src2, > + HOST_WIDE_INT loop_bytes, > + HOST_WIDE_INT expand_bytes) > +{ > + gcc_assert ((TARGET_POWERPC64 && load_mode == DImode) > + || (!TARGET_POWERPC64 && load_mode == SImode)); > + HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode); > + gcc_assert (loop_bytes = 2 * load_mode_size); > + gcc_assert (expand_bytes < loop_bytes); > > - rtx cmp_rem = gen_reg_rtx (word_mode); /* Remainder for library call. */ > - rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop. > */ > - HOST_WIDE_INT niter; > - rtx iter = gen_reg_rtx (word_mode); > - rtx iv1 = gen_reg_rtx (word_mode); > - rtx iv2 = gen_reg_rtx (word_mode); > - rtx d1_1 = gen_reg_rtx (word_mode); /* Addr expression src1+iv1 */ > - rtx d1_2 = gen_reg_rtx (word_mode); /* Addr expression src1+iv2 */ > - rtx d2_1 = gen_reg_rtx (word_mode); /* Addr expression src2+iv1 */ > - rtx d2_2 = gen_reg_rtx (word_mode); /* Addr expression src2+iv2 */ > + rtx adj_reg = gen_reg_rtx (word_mode); > + rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); > + rtx j; > + rtx cmp; > + rtx ccreg = gen_reg_rtx (CCmode); > > - /* Strip unneeded subreg from length if there is one. */ > - if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx)) > - bytes_rtx = SUBREG_REG (bytes_rtx); > - /* Extend bytes_rtx to word_mode if needed. But, we expect only to > - maybe have to deal with the case were bytes_rtx is SImode and > - word_mode is DImode. */ > - if (!bytes_is_const) > + if (TARGET_POWERPC64 && expand_bytes >= 8) > { > - if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode)) > - /* Do not expect length longer than word_mode. */ > - return false; > - else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE > (word_mode)) > + /* Compare with 8 bytes. 
*/ > + rtx cmp_4 = gen_label_rtx (); > + cmp = gen_rtx_COMPARE (CCmode, cmp_rem, GEN_INT (8)); > + emit_insn (gen_rtx_SET (ccreg, cmp)); > + do_ifelse (CCmode, LT, NULL_RTX, NULL_RTX, ccreg, cmp_4, > + profile_probability::even ()); > + do_load_and_compare (DImode, src1_addr, src2_addr, dcond, diff, > + orig_src1, orig_src2); > + do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX, > + *dcond, diff_label, profile_probability::unlikely ()); > + > + if (expand_bytes > 8) > { > - bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx); > - bytes_rtx = force_reg (word_mode, > - gen_rtx_fmt_e (ZERO_EXTEND, word_mode, > - bytes_rtx)); > + do_ifelse (CCmode, EQ, NULL_RTX, NULL_RTX, ccreg, final_label, > + profile_probability::unlikely ()); > + > + /* cmp_rem is great than 8 bytes. Do 8 bytes overlap compare. */ > + do_add3 (adj_reg, cmp_rem, GEN_INT (-8)); > + do_add3 (src1_addr, src1_addr, adj_reg); > + do_add3 (src2_addr, src2_addr, adj_reg); > + do_load_and_compare (DImode, src1_addr, src2_addr, dcond, diff, > + orig_src1, orig_src2); > + do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX, > + *dcond, diff_label, profile_probability::likely ()); > } > - else > - /* Make sure it's in a register before we get started. */ > - bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx); > - } > - > - machine_mode load_mode = word_mode; > - HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode); > - > - /* Number of bytes per iteration of the unrolled loop. */ > - HOST_WIDE_INT loop_bytes = 2 * load_mode_size; > - /* max iters and bytes compared in the loop. 
*/ > - HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes; > - HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes; > - int l2lb = floor_log2 (loop_bytes); > > - if (bytes_is_const && (max_bytes < load_mode_size > - || !IN_RANGE (bytes, load_mode_size, max_bytes))) > - return false; > + j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); > + JUMP_LABEL (j) = final_label; > + LABEL_NUSES (final_label) += 1; > + emit_barrier (); > > - bool no_remainder_code = false; > - rtx final_label = gen_label_rtx (); > - rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); > - rtx diff_label = gen_label_rtx (); > - rtx library_call_label = NULL; > - rtx cleanup_label = gen_label_rtx (); > + emit_label (cmp_4); > + } > > - rtx cr; > + if (expand_bytes >= 4) > + { > + /* Compare with 4 bytes. */ > + rtx cmp_2 = gen_label_rtx (); > + cmp = gen_rtx_COMPARE (CCmode, cmp_rem, GEN_INT (4)); > + emit_insn (gen_rtx_SET (ccreg, cmp)); > + do_ifelse (CCmode, LT, NULL_RTX, NULL_RTX, ccreg, cmp_2, > + profile_probability::even ()); > + do_load_and_compare (SImode, src1_addr, src2_addr, dcond, diff, > + orig_src1, orig_src2); > + do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX, > + *dcond, diff_label, profile_probability::unlikely ()); > + > + if (expand_bytes > 4) > + { > + do_ifelse (CCmode, EQ, NULL_RTX, NULL_RTX, ccreg, final_label, > + profile_probability::unlikely ()); > + > + /* cmp_rem is great than 4 bytes. Do 4 bytes overlap compare. 
*/ > + do_add3 (adj_reg, cmp_rem, GEN_INT (-4)); > + do_add3 (src1_addr, src1_addr, adj_reg); > + do_add3 (src2_addr, src2_addr, adj_reg); > + do_load_and_compare (SImode, src1_addr, src2_addr, dcond, diff, > + orig_src1, orig_src2); > + do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX, > + *dcond, diff_label, profile_probability::likely ()); > + } > > - rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0)); > - rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0)); > + j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); > + JUMP_LABEL (j) = final_label; > + LABEL_NUSES (final_label) += 1; > + emit_barrier (); > > - /* Difference found is stored here before jump to diff_label. */ > - rtx diff = gen_reg_rtx (word_mode); > - rtx_insn *j; > + emit_label (cmp_2); > + } > > - /* Example of generated code for 35 bytes aligned 1 byte. > - > - mtctr 8 > - li 6,0 > - li 5,8 > - .L13: > - ldbrx 7,3,6 > - ldbrx 9,10,6 > - ldbrx 0,3,5 > - ldbrx 4,10,5 > - addi 6,6,16 > - addi 5,5,16 > - subfc. 9,9,7 > - bne 0,.L10 > - subfc. 9,4,0 > - bdnzt 2,.L13 > - bne 0,.L10 > - add 3,3,6 > - add 10,10,6 > - addi 9,3,-5 > - ldbrx 7,0,9 > - addi 9,10,-5 > - ldbrx 9,0,9 > - subfc 9,9,7 > - .p2align 4,,15 > - .L10: > - popcntd 9,9 > - subfe 10,10,10 > - or 9,9,10 > - > - Compiled with -fno-reorder-blocks for clarity. */ > - > - /* Structure of what we're going to do: > - Two separate lengths: what we will compare before bailing to library > - call (max_bytes), and the total length to be checked. 
> - if length <= 16, branch to linear cleanup code starting with > - remainder length check (length not known at compile time) > - set up 2 iv's and load count reg, compute remainder length > - unrollx2 compare loop > - if loop exit due to a difference, branch to difference handling code > - if remainder length < 8, branch to final cleanup compare > - load and compare 8B > - final cleanup comparison (depends on alignment and length) > - load 8B, shift off bytes past length, compare > - load 8B ending at last byte and compare > - load/compare 1 byte at a time (short block abutting 4k boundary) > - difference handling, 64->32 conversion > - final result > - branch around memcmp call > - memcmp library call > - */ > - > - /* If bytes is not const, compare length and branch directly > - to the cleanup code that can handle 0-16 bytes if length > - is >= 16. Stash away bytes-max_bytes for the library call. */ > - if (bytes_is_const) > + if (expand_bytes >= 2) > { > - /* These need to be set for some of the places we may jump to. */ > - if (bytes > max_bytes) > - { > - no_remainder_code = true; > - niter = max_loop_iter; > - library_call_label = gen_label_rtx (); > - } > - else > + /* Compare with 2 bytes. */ > + rtx cmp_1 = gen_label_rtx (); > + cmp = gen_rtx_COMPARE (CCmode, cmp_rem, GEN_INT (2)); > + emit_insn (gen_rtx_SET (ccreg, cmp)); > + do_ifelse (CCmode, LT, NULL_RTX, NULL_RTX, ccreg, cmp_1, > + profile_probability::even ()); > + do_load_and_compare (HImode, src1_addr, src2_addr, dcond, diff, > + orig_src1, orig_src2); > + do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX, > + *dcond, diff_label, profile_probability::unlikely ()); > + > + if (expand_bytes > 2) > { > - niter = bytes / loop_bytes; > + do_ifelse (CCmode, EQ, NULL_RTX, NULL_RTX, ccreg, final_label, > + profile_probability::unlikely ()); > + > + /* cmp_rem equals to 3 bytes and leave 1 byte to load and > + compare. 
*/ > + do_add3 (src1_addr, src1_addr, GEN_INT (2)); > + do_add3 (src2_addr, src2_addr, GEN_INT (2)); > } > - emit_move_insn (iter, GEN_INT (niter)); > - emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes)); > - emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes)); > - } > - else > - { > - library_call_label = gen_label_rtx (); > > - /* If we go to the cleanup code, it expects length to be in cmp_rem. > */ > - emit_move_insn (cmp_rem, bytes_rtx); > + emit_label (cmp_1); > + } > > - /* Check for > max_bytes bytes. We want to bail out as quickly as > - possible if we have to go over to memcmp. */ > - do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes), > - NULL_RTX, library_call_label, profile_probability::even ()); > + /* Do 1 byte load and compare. */ > + do_load_and_compare (QImode, src1_addr, src2_addr, dcond, diff, > + orig_src1, orig_src2); > + do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX, > + *dcond, diff_label, profile_probability::likely ()); > + j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); > + JUMP_LABEL (j) = final_label; > + LABEL_NUSES (final_label) += 1; > + emit_barrier (); > +} > > - /* Check for < loop_bytes bytes. */ > - do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes), > - NULL_RTX, cleanup_label, profile_probability::even ()); > +/* Generate code to convert a DImode-plus-carry subtract result into > + a SImode result that has the same <0 / ==0 / >0 properties to > + produce the final result from memcmp. > > - /* Loop compare bytes and iterations if bytes>max_bytes. */ > - rtx mb_reg = gen_reg_rtx (word_mode); > - emit_move_insn (mb_reg, GEN_INT (max_loop_bytes)); > - rtx mi_reg = gen_reg_rtx (word_mode); > - emit_move_insn (mi_reg, GEN_INT (max_loop_iter)); > + TARGET is the rtx for the register to receive the memcmp result. > + SUB_RESULT is the rtx for the register contining the subtract result. */ > > - /* Compute number of loop iterations if bytes <= max_bytes. 
*/ > - if (word_mode == DImode) > - emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb))); > - else > - emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb))); > +static void > +generate_6432_conversion (rtx target, rtx sub_result) > +{ > + /* We need to produce DI result from sub, then convert to target SI > + while maintaining <0 / ==0 / >0 properties. This sequence works: > + subfc L,A,B > + subfe H,H,H > + popcntd L,L > + rldimi L,H,6,0 > > - /* Compute bytes to compare in loop if bytes <= max_bytes. */ > - rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb); > - if (word_mode == DImode) > - { > - emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask)); > - } > - else > - { > - emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask)); > - } > + This is an alternate one Segher cooked up if somebody > + wants to expand this for something that doesn't have popcntd: > + subfc L,a,b > + subfe H,x,x > + addic t,L,-1 > + subfe v,t,L > + or z,v,H > > - /* Check for bytes <= max_bytes. */ > - if (TARGET_ISEL) > - { > - /* P9 has fast isel so we use one compare and two isel. */ > - cr = gen_reg_rtx (CCmode); > - rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx, > - GEN_INT (max_bytes)); > - emit_move_insn (cr, compare_rtx); > - rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx); > - do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr); > - do_isel (iter, cmp_rtx, iter, mi_reg, cr); > - } > - else > - { > - rtx lab_after = gen_label_rtx (); > - do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes), > - NULL_RTX, lab_after, profile_probability::even ()); > - emit_move_insn (loop_cmp, mb_reg); > - emit_move_insn (iter, mi_reg); > - emit_label (lab_after); > - } > + And finally, p9 can just do this: > + cmpld A,B > + setb r > + . */ > > - /* Now compute remainder bytes which isn't used until after the loop. 
> */ > - do_sub3 (cmp_rem, bytes_rtx, loop_cmp); > + if (TARGET_64BIT) > + { > + rtx tmp_reg_ca = gen_reg_rtx (DImode); > + emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca)); > + rtx popcnt = gen_reg_rtx (DImode); > + emit_insn (gen_popcntddi2 (popcnt, sub_result)); > + rtx tmp2 = gen_reg_rtx (DImode); > + emit_insn (gen_iordi3 (tmp2, popcnt, tmp_reg_ca)); > + emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp2))); > + } > + else > + { > + rtx tmp_reg_ca = gen_reg_rtx (SImode); > + emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca)); > + rtx popcnt = gen_reg_rtx (SImode); > + emit_insn (gen_popcntdsi2 (popcnt, sub_result)); > + emit_insn (gen_iorsi3 (target, popcnt, tmp_reg_ca)); > } > +} > > - rtx dcond = NULL_RTX; /* Used for when we jump to diff_label. */ > - /* For p9 we need to have just one of these as multiple places define > - it and it gets used by the setb at the end. */ > +/* Generate the return value when memcmp finds a difference from > + compare. */ > +static void > +gen_diff_handle (rtx target, rtx dcond, rtx diff, rtx diff_label, > + rtx final_label) > +{ > + emit_label (diff_label); > if (TARGET_P9_MISC) > - dcond = gen_reg_rtx (CCUNSmode); > + emit_insn (gen_setb_unsigned (target, dcond)); > + else > + generate_6432_conversion (target, diff); > > - if (!bytes_is_const || bytes >= loop_bytes) > - { > - /* It should not be possible to come here if remaining bytes is > - < 16 in the runtime case either. Compute number of loop > - iterations. We compare 2*word_mode per iteration so 16B for > - 64-bit code and 8B for 32-bit. Set up two induction > - variables and load count register. */ > - > - /* HACK ALERT: create hard reg for CTR here. If we just use a > - pseudo, cse will get rid of it and then the allocator will > - see it used in the lshr above and won't give us ctr. 
*/ > - rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO); > - emit_move_insn (ctr, iter); > - emit_move_insn (diff, GEN_INT (0)); > - emit_move_insn (iv1, GEN_INT (0)); > - emit_move_insn (iv2, GEN_INT (load_mode_size)); > - > - /* inner loop to compare 2*word_mode */ > - rtx loop_top_label = gen_label_rtx (); > - emit_label (loop_top_label); > - > - rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1); > - rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1); > - > - do_load_for_compare_from_addr (load_mode, d1_1, > - src1_ix1, orig_src1); > - do_load_for_compare_from_addr (load_mode, d2_1, > - src2_ix1, orig_src2); > - do_add3 (iv1, iv1, GEN_INT (loop_bytes)); > - > - rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2); > - rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2); > - > - do_load_for_compare_from_addr (load_mode, d1_2, > - src1_ix2, orig_src1); > - do_load_for_compare_from_addr (load_mode, d2_2, > - src2_ix2, orig_src2); > - do_add3 (iv2, iv2, GEN_INT (loop_bytes)); > - > - if (TARGET_P9_MISC) > - { > - /* Generate a compare, and convert with a setb later. 
*/ > - rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1); > - emit_insn (gen_rtx_SET (dcond, cmp)); > - } > - else > - { > - dcond = gen_reg_rtx (CCmode); > - if (word_mode == DImode) > - emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond)); > - else > - emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond)); > - } > + rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); > + rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); > + JUMP_LABEL (j) = final_label; > + LABEL_NUSES (final_label) += 1; > + emit_barrier (); > +} > > - do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX, > - dcond, diff_label, profile_probability::unlikely ()); > +static void > +gen_load_compare_loop (machine_mode load_mode, rtx src1_addr, rtx src2_addr, > + rtx orig_src1, rtx orig_src2, rtx diff, rtx diff_label, > + rtx cmp_rem, rtx *dcond, HOST_WIDE_INT loop_bytes, > + rtx final_label, bool bytes_is_const) > +{ > + HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode); > + gcc_assert (loop_bytes == 2 * load_mode_size); > > - if (TARGET_P9_MISC) > - { > - /* Generate a compare, and convert with a setb later. 
*/ > - rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2); > - emit_insn (gen_rtx_SET (dcond, cmp)); > - } > - else > - { > - dcond = gen_reg_rtx (CCmode); > - if (word_mode == DImode) > - emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond)); > - else > - emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond)); > - } > + rtx iter = gen_reg_rtx (word_mode); > > - rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2); > - if (TARGET_64BIT) > - j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr, > - eqrtx, dcond)); > - else > - j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr, > - eqrtx, dcond)); > - add_reg_br_prob_note (j, profile_probability::likely ()); > - JUMP_LABEL (j) = loop_top_label; > - LABEL_NUSES (loop_top_label) += 1; > - } > + int l2lb = floor_log2 (loop_bytes); > + if (word_mode == DImode) > + emit_insn (gen_lshrdi3 (iter, cmp_rem, GEN_INT (l2lb))); > + else > + emit_insn (gen_lshrsi3 (iter, cmp_rem, GEN_INT (l2lb))); > > - HOST_WIDE_INT bytes_remaining = 0; > - if (bytes_is_const) > - bytes_remaining = (bytes % loop_bytes); > + rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO); > + emit_move_insn (ctr, iter); > > - /* If diff is nonzero, branch to difference handling > - code. If we exit here with a nonzero diff, it is > - because the second word differed. */ > - if (TARGET_P9_MISC) > - do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond, > - diff_label, profile_probability::unlikely ()); > + rtx iv1 = gen_reg_rtx (word_mode); > + rtx iv2 = gen_reg_rtx (word_mode); > + rtx d1_1 = gen_reg_rtx (word_mode); /* Addr expression src1+iv1. */ > + rtx d1_2 = gen_reg_rtx (word_mode); /* Addr expression src1+iv2. */ > + rtx d2_1 = gen_reg_rtx (word_mode); /* Addr expression src2+iv1. */ > + rtx d2_2 = gen_reg_rtx (word_mode); /* Addr expression src2+iv2. 
*/ > + > + emit_move_insn (iv1, GEN_INT (0)); > + emit_move_insn (iv2, GEN_INT (load_mode_size)); > + > + rtx loop_top_label = gen_label_rtx (); > + emit_label (loop_top_label); > + > + /* Manually put two pair of loads together. */ > + rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1); > + rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1); > + do_load_for_compare_from_addr (load_mode, d1_1, src1_ix1, orig_src1); > + do_load_for_compare_from_addr (load_mode, d2_1, src2_ix1, orig_src2); > + rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2); > + rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2); > + do_load_for_compare_from_addr (load_mode, d1_2, src1_ix2, orig_src1); > + do_load_for_compare_from_addr (load_mode, d2_2, src2_ix2, orig_src2); > + > + do_reg_compare (false, NULL_RTX, diff, dcond, d1_1, d2_1); > + do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX, > + *dcond, diff_label, profile_probability::unlikely ()); > + do_reg_compare (false, NULL_RTX, diff, dcond, d1_2, d2_2); > + > + do_add3 (iv1, iv1, GEN_INT (loop_bytes)); > + do_add3 (iv2, iv2, GEN_INT (loop_bytes)); > + > + rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2); > + rtx_insn *j; > + if (TARGET_64BIT) > + j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr, > + eqrtx, *dcond)); > else > - do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX, > - diff_label, profile_probability::unlikely ()); > + j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr, > + eqrtx, *dcond)); > + add_reg_br_prob_note (j, profile_probability::likely ()); > + JUMP_LABEL (j) = loop_top_label; > + LABEL_NUSES (loop_top_label) += 1; > > - if (library_call_label != NULL && bytes_is_const && bytes > max_bytes) > - { > - /* If the length is known at compile time, then we will always > - have a remainder to go to the library call with. 
*/ > - rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, > library_call_label); > - j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref)); > - JUMP_LABEL (j) = library_call_label; > - LABEL_NUSES (library_call_label) += 1; > - emit_barrier (); > - } > + do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX, *dcond, > + diff_label, profile_probability::unlikely ()); > > - if (bytes_is_const && bytes_remaining == 0) > + /* If length is fixed, we know how many bytes are left. So skip the > + remaining bytes test. */ > + if (!bytes_is_const) > { > - /* No remainder and if we are here then diff is 0 so just return 0 */ > - if (TARGET_64BIT) > - emit_insn (gen_movsi (target, gen_lowpart (SImode, diff))); > - else > - emit_move_insn (target, diff); > - j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); > - JUMP_LABEL (j) = final_label; > - LABEL_NUSES (final_label) += 1; > - emit_barrier (); > + do_sub3 (cmp_rem, cmp_rem, iv1); > + do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX, > + final_label, profile_probability::unlikely ()); > } > - else if (!no_remainder_code) > - { > - /* Update addresses to point to the next word to examine. */ > - do_add3 (src1_addr, src1_addr, iv1); > - do_add3 (src2_addr, src2_addr, iv1); > - > - emit_label (cleanup_label); > > - if (!bytes_is_const) > - { > - /* If we're dealing with runtime length, we have to check if > - it's zero after the loop. When length is known at compile > - time the no-remainder condition is dealt with above. By > - doing this after cleanup_label, we also deal with the > - case where length is 0 at the start and we bypass the > - loop with a branch to cleanup_label. */ > - emit_move_insn (target, const0_rtx); > - do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, > - NULL_RTX, final_label, profile_probability::unlikely ()); > - } > - > - rtx final_cleanup = gen_label_rtx (); > - rtx cmp_rem_before = gen_reg_rtx (word_mode); > - /* Compare one more word_mode chunk if needed. 
*/ > - if (!bytes_is_const || bytes_remaining >= load_mode_size) > - { > - /* If remainder length < word length, branch to final > - cleanup compare. */ > - > - if (!bytes_is_const) > - { > - do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size), > - NULL_RTX, final_cleanup, profile_probability::even ()); > - } > + do_add3 (src1_addr, src1_addr, iv1); > + do_add3 (src2_addr, src2_addr, iv1); > +} > > - /* load and compare 8B */ > - do_load_for_compare_from_addr (load_mode, d1_1, > - src1_addr, orig_src1); > - do_load_for_compare_from_addr (load_mode, d2_1, > - src2_addr, orig_src2); > +/* Generate memcmp library call. */ > +static void > +gen_library_call (rtx target, rtx src1_addr, rtx src2_addr, rtx bytes_rtx, > + rtx library_label) > +{ > + emit_label (library_label); > + > + rtx len_rtx = gen_reg_rtx (word_mode); > + emit_move_insn (len_rtx, bytes_rtx); > + tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP); > + emit_library_call_value (XEXP (DECL_RTL (fun), 0), > + target, LCT_NORMAL, GET_MODE (target), > + src1_addr, Pmode, src2_addr, Pmode, > + len_rtx, GET_MODE (len_rtx)); > +} > > - /* Compare the word, see if we need to do the last partial. */ > - if (TARGET_P9_MISC) > - { > - /* Generate a compare, and convert with a setb later. 
*/ > - rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1); > - emit_insn (gen_rtx_SET (dcond, cmp)); > - } > - else > - { > - dcond = gen_reg_rtx (CCmode); > - if (word_mode == DImode) > - emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond)); > - else > - emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond)); > - } > +static bool > +expand_compare_with_fixed_length (rtx operands[]) > +{ > + rtx target = operands[0]; > + rtx orig_src1 = operands[1]; > + rtx orig_src2 = operands[2]; > + rtx bytes_rtx = operands[3]; > + rtx align_rtx = operands[4]; > > - do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX, > - dcond, diff_label, profile_probability::even ()); > + gcc_assert (CONST_INT_P (bytes_rtx)); > + gcc_assert (GET_MODE (target) == SImode); > > - do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size)); > - do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size)); > - emit_move_insn (cmp_rem_before, cmp_rem); > - do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size)); > - if (bytes_is_const) > - bytes_remaining -= load_mode_size; > - else > - /* See if remaining length is now zero. We previously set > - target to 0 so we can just jump to the end. */ > - do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX, > - final_label, profile_probability::unlikely ()); > - } > + if (TARGET_32BIT && TARGET_POWERPC64) > + return false; > > - /* Cases: > - bytes_is_const > - We can always shift back to do an overlapping compare > - of the last chunk because we know length >= 8. 
> - > - !bytes_is_const > - align>=load_mode_size > - Read word_mode and mask > - align<load_mode_size > - avoid stepping past end > - > - Three strategies: > - * decrement address and do overlapping compare > - * read word_mode and mask > - * carefully avoid crossing 4k boundary > - */ > - > - if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7)) > - && align1 >= load_mode_size && align2 >= load_mode_size) > - { > - /* Alignment is larger than word_mode so we do not need to be > - concerned with extra page crossings. But, we do not know > - that the length is larger than load_mode_size so we might > - end up compareing against data before the block if we try > - an overlapping compare. Also we use this on P7 for fixed length > - remainder because P7 doesn't like overlapping unaligned. > - Strategy: load 8B, shift off bytes past length, and compare. */ > - emit_label (final_cleanup); > - do_load_mask_compare (load_mode, diff, cmp_rem, dcond, > - src1_addr, src2_addr, orig_src1, orig_src2); > - } > - else if (bytes_remaining && bytes_is_const) > - { > - /* We do not do loop expand if length < 32 so we know at the > - end we can do an overlapping compare. > - Strategy: shift address back and do word_mode load that > - ends at the end of the block. */ > - emit_label (final_cleanup); > - do_overlap_load_compare (load_mode, true, bytes_remaining, diff, > - cmp_rem, dcond, src1_addr, src2_addr, > - orig_src1, orig_src2); > - } > - else if (!bytes_is_const) > - { > - rtx handle4k_label = gen_label_rtx (); > - rtx nonconst_overlap = gen_label_rtx (); > - emit_label (nonconst_overlap); > - > - /* Here we have to handle the case where whe have runtime > - length which may be too short for overlap compare, and > - alignment is not at least load_mode_size so we have to > - tread carefully to avoid stepping across 4k boundaries. */ > - > - /* If the length after the loop was larger than word_mode > - size, we can just do an overlapping compare and we're > - done. 
We fall through to this code from the word_mode > - compare that preceeds this. */ > - do_overlap_load_compare (load_mode, false, 0, diff, > - cmp_rem, dcond, src1_addr, src2_addr, > - orig_src1, orig_src2); > - > - rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label); > - j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref)); > - JUMP_LABEL (j) = diff_label; > - LABEL_NUSES (diff_label) += 1; > - emit_barrier (); > - > - /* If we couldn't do the overlap compare we have to be more > - careful of the 4k boundary. Test to see if either > - address is less than word_mode_size away from a 4k > - boundary. If not, then we can do a load/shift/compare > - and we are done. We come to this code if length was less > - than word_mode_size. */ > - > - emit_label (final_cleanup); > - > - /* We can still avoid the slow case if the length was larger > - than one loop iteration, in which case go do the overlap > - load compare path. */ > - do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes), > - NULL_RTX, nonconst_overlap, profile_probability::even ()); > - > - rtx rem4k = gen_reg_rtx (word_mode); > - rtx dist1 = gen_reg_rtx (word_mode); > - rtx dist2 = gen_reg_rtx (word_mode); > - do_sub3 (rem4k, GEN_INT (4096), cmp_rem); > - if (word_mode == SImode) > - emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff))); > - else > - emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff))); > - do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX, > - handle4k_label, profile_probability::very_unlikely ()); > - if (word_mode == SImode) > - emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff))); > - else > - emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff))); > - do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX, > - handle4k_label, profile_probability::very_unlikely ()); > + /* This must be a fixed size alignment. */ > + if (!CONST_INT_P (align_rtx)) > + return false; > > - /* We don't have a 4k boundary to deal with, so do > - a load/shift/compare and jump to diff. 
*/ > + HOST_WIDE_INT align = INTVAL (align_rtx) / BITS_PER_UNIT; > + HOST_WIDE_INT bytes = INTVAL (bytes_rtx); > > - do_load_mask_compare (load_mode, diff, cmp_rem, dcond, > - src1_addr, src2_addr, orig_src1, orig_src2); > + if (bytes == 0) > + return true; > > - j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref)); > - JUMP_LABEL (j) = diff_label; > - LABEL_NUSES (diff_label) += 1; > - emit_barrier (); > + /* Limit the amount we compare, if known statically. */ > + HOST_WIDE_INT max_bytes = get_max_inline_loop_bytes (true, align); > > - /* Finally in the unlikely case we are inching up to a > - 4k boundary we use a compact lbzx/compare loop to do > - it a byte at a time. */ > + /* Allow the option to override the default. */ > + if (rs6000_block_compare_inline_loop_limit >= 0) > + max_bytes = (unsigned HOST_WIDE_INT) > rs6000_block_compare_inline_loop_limit; > > - emit_label (handle4k_label); > + if (max_bytes == 0) > + return false; > > - rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO); > - emit_move_insn (ctr, cmp_rem); > - rtx ixreg = gen_reg_rtx (Pmode); > - emit_move_insn (ixreg, const0_rtx); > + machine_mode load_mode = word_mode; > + HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode); > > - rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg); > - rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg); > - rtx d1 = gen_reg_rtx (word_mode); > - rtx d2 = gen_reg_rtx (word_mode); > + if (max_bytes < load_mode_size > + || !IN_RANGE (bytes, load_mode_size, max_bytes)) > + return false; > > - rtx fc_loop = gen_label_rtx (); > - emit_label (fc_loop); > + /* Remainder bytes for compare. */ > + rtx cmp_rem = gen_reg_rtx (word_mode); > + /* Number of bytes per iteration of the unrolled loop. 
*/ > + HOST_WIDE_INT loop_bytes = 2 * load_mode_size; > > - do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1); > - do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2); > + rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0)); > + rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0)); > > - do_add3 (ixreg, ixreg, const1_rtx); > + /* Label for set target when finding a diff. */ > + rtx diff_label = gen_label_rtx (); > + rtx final_label = gen_label_rtx (); > > - rtx cond = gen_reg_rtx (CCmode); > - rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2); > - rs6000_emit_dot_insn (diff, subexpr, 2, cond); > + /* CC used for when we jump to diff_label. */ > + rtx dcond = NULL_RTX; > + /* For p9 we need to have just one of these as multiple places define > + it and it gets used by the setb at the end. */ > + if (TARGET_P9_MISC) > + dcond = gen_reg_rtx (CCUNSmode); > > - rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2); > - if (TARGET_64BIT) > - j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr, > - eqrtx, cond)); > - else > - j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr, > - eqrtx, cond)); > - add_reg_br_prob_note (j, profile_probability::likely ()); > - JUMP_LABEL (j) = fc_loop; > - LABEL_NUSES (fc_loop) += 1; > - > - if (TARGET_64BIT) > - emit_insn (gen_movsi (target, gen_lowpart (SImode, diff))); > - else > - emit_move_insn (target, diff); > - > - /* Since we are comparing bytes, the difference can be used > - as the final result and we are done here. */ > - j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); > - JUMP_LABEL (j) = final_label; > - LABEL_NUSES (final_label) += 1; > - emit_barrier (); > - } > - } > + /* Difference found is stored here before jump to diff_label. 
*/ > + rtx diff = gen_reg_rtx (word_mode); > > - emit_label (diff_label); > - /* difference handling, 64->32 conversion */ > + emit_move_insn (cmp_rem, GEN_INT (bytes)); > + emit_move_insn (target, const0_rtx); > > - /* We need to produce DI result from sub, then convert to target SI > - while maintaining <0 / ==0 / >0 properties. This sequence works: > - subfc L,A,B > - subfe H,H,H > - popcntd L,L > - rldimi L,H,6,0 > + gen_load_compare_loop (load_mode, src1_addr, src2_addr, orig_src1, > + orig_src2, diff, diff_label, cmp_rem, &dcond, > + loop_bytes, final_label, true); > > - This is an alternate one Segher cooked up if somebody > - wants to expand this for something that doesn't have popcntd: > - subfc L,a,b > - subfe H,x,x > - addic t,L,-1 > - subfe v,t,L > - or z,v,H > + HOST_WIDE_INT rem_bytes = bytes % loop_bytes; > > - And finally, p9 can just do this: > - cmpld A,B > - setb r */ > - > - if (TARGET_P9_MISC) > - emit_insn (gen_setb_unsigned (target, dcond)); > - else > + if (rem_bytes >= load_mode_size) > { > - if (TARGET_64BIT) > - { > - rtx tmp_reg_ca = gen_reg_rtx (DImode); > - emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca)); > - emit_insn (gen_popcntddi2 (diff, diff)); > - emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca)); > - emit_insn (gen_movsi (target, gen_lowpart (SImode, diff))); > - } > - else > - { > - rtx tmp_reg_ca = gen_reg_rtx (SImode); > - emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca)); > - emit_insn (gen_popcntdsi2 (diff, diff)); > - emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca)); > - } > + do_load_and_compare (load_mode, src1_addr, src2_addr, &dcond, diff, > + orig_src1, orig_src2); > + do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX, > + dcond, diff_label, profile_probability::unlikely ()); > + rem_bytes -= load_mode_size; > } > > - if (library_call_label != NULL) > + if (rem_bytes > 0) > { > - /* Branch around memcmp call. 
*/ > - j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); > - JUMP_LABEL (j) = final_label; > - LABEL_NUSES (final_label) += 1; > - emit_barrier (); > - > - /* Make memcmp library call. cmp_rem is the remaining bytes that > - were compared and cmp_rem is the expected amount to be compared > - by memcmp. If we don't find a difference in the loop compare, do > - the library call directly instead of doing a small compare just > - to get to an arbitrary boundary before calling it anyway. > - Also, update addresses to point to the next word to examine. */ > - emit_label (library_call_label); > - > - rtx len_rtx = gen_reg_rtx (word_mode); > - if (bytes_is_const) > - { > - emit_move_insn (len_rtx, cmp_rem); > - do_add3 (src1_addr, src1_addr, iv1); > - do_add3 (src2_addr, src2_addr, iv1); > - } > - else > - emit_move_insn (len_rtx, bytes_rtx); > - > - tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP); > - emit_library_call_value (XEXP (DECL_RTL (fun), 0), > - target, LCT_NORMAL, GET_MODE (target), > - src1_addr, Pmode, > - src2_addr, Pmode, > - len_rtx, GET_MODE (len_rtx)); > + do_overlap_load_compare (load_mode, rem_bytes, diff, &dcond, > + orig_src1, orig_src2, bytes); > + do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX, > + dcond, diff_label, profile_probability::unlikely ()); > } > > - /* emit final_label */ > + rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); > + rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); > + JUMP_LABEL (j) = final_label; > + LABEL_NUSES (final_label) += 1; > + emit_barrier (); > + > + gen_diff_handle (target, dcond, diff, diff_label, final_label); > emit_label (final_label); > + > return true; > } > > -/* Generate code to convert a DImode-plus-carry subtract result into > - a SImode result that has the same <0 / ==0 / >0 properties to > - produce the final result from memcmp. 
> +static bool > +expand_compare_with_variable_length (rtx operands[]) > +{ > + rtx target = operands[0]; > + rtx orig_src1 = operands[1]; > + rtx orig_src2 = operands[2]; > + rtx bytes_rtx = operands[3]; > + rtx align_rtx = operands[4]; > > - TARGET is the rtx for the register to receive the memcmp result. > - SUB_RESULT is the rtx for the register contining the subtract result. */ > + gcc_assert (!CONST_INT_P (bytes_rtx)); > > -void > -generate_6432_conversion(rtx target, rtx sub_result) > -{ > - /* We need to produce DI result from sub, then convert to target SI > - while maintaining <0 / ==0 / >0 properties. This sequence works: > - subfc L,A,B > - subfe H,H,H > - popcntd L,L > - rldimi L,H,6,0 > + if (TARGET_32BIT && TARGET_POWERPC64) > + return false; > > - This is an alternate one Segher cooked up if somebody > - wants to expand this for something that doesn't have popcntd: > - subfc L,a,b > - subfe H,x,x > - addic t,L,-1 > - subfe v,t,L > - or z,v,H > + /* This must be a fixed size alignment. */ > + if (!CONST_INT_P (align_rtx)) > + return false; > > - And finally, p9 can just do this: > - cmpld A,B > - setb r */ > + HOST_WIDE_INT align = INTVAL (align_rtx) / BITS_PER_UNIT; > > - if (TARGET_64BIT) > + /* Limit the amount we compare, if known statically. */ > + HOST_WIDE_INT max_bytes = get_max_inline_loop_bytes (false, align); > + > + /* Allow the option to override the default. */ > + if (rs6000_block_compare_inline_loop_limit >= 0) > + max_bytes = rs6000_block_compare_inline_loop_limit; > + if (max_bytes == 0) > + return false; > + > + /* Remainder bytes for compare. */ > + rtx cmp_rem = gen_reg_rtx (word_mode); > + > + /* Strip unneeded subreg from length if there is one. */ > + if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx)) > + bytes_rtx = SUBREG_REG (bytes_rtx); > + > + if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode)) > + /* Do not expect length longer than word_mode. 
*/ > + return false; > + else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode)) > { > - rtx tmp_reg_ca = gen_reg_rtx (DImode); > - emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca)); > - rtx popcnt = gen_reg_rtx (DImode); > - emit_insn (gen_popcntddi2 (popcnt, sub_result)); > - rtx tmp2 = gen_reg_rtx (DImode); > - emit_insn (gen_iordi3 (tmp2, popcnt, tmp_reg_ca)); > - emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp2))); > + bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx); > + bytes_rtx = force_reg (word_mode, > + gen_rtx_fmt_e (ZERO_EXTEND, word_mode, bytes_rtx)); > } > else > + /* Make sure it's in a register before we get started. */ > + bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx); > + > + machine_mode load_mode = word_mode; > + HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode); > + rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0)); > + rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0)); > + > + rtx library_call_label = gen_label_rtx (); > + > + /* Call library if length > max_bytes. */ > + do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes), > + NULL_RTX, library_call_label, profile_probability::unlikely ()); > + > + /* Number of bytes per iteration of the unrolled loop. */ > + HOST_WIDE_INT loop_bytes = 2 * load_mode_size; > + > + /* Label for set target when finding a diff. */ > + rtx diff_label = gen_label_rtx (); > + rtx final_label = gen_label_rtx (); > + > + /* CC used for when we jump to diff_label. */ > + rtx dcond = NULL_RTX; > + /* For p9 we need to have just one of these as multiple places define > + it and it gets used by the setb at the end. */ > + if (TARGET_P9_MISC) > + dcond = gen_reg_rtx (CCUNSmode); > + > + /* Difference found is stored here before jump to diff_label. */ > + rtx diff = gen_reg_rtx (word_mode); > + > + emit_move_insn (target, const0_rtx); > + emit_move_insn (cmp_rem, bytes_rtx); > + > + /* Number of bytes to be expanded at rest of loop. 
*/ > + HOST_WIDE_INT expand_bytes = max_bytes % loop_bytes; > + if (max_bytes >= loop_bytes) > { > - rtx tmp_reg_ca = gen_reg_rtx (SImode); > - emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca)); > - rtx popcnt = gen_reg_rtx (SImode); > - emit_insn (gen_popcntdsi2 (popcnt, sub_result)); > - emit_insn (gen_iorsi3 (target, popcnt, tmp_reg_ca)); > + rtx compare_rest_label = gen_label_rtx (); > + do_ifelse (CCmode, LT, cmp_rem, GEN_INT (loop_bytes), NULL_RTX, > + compare_rest_label, profile_probability::unlikely ()); > + /* max bytes >= cmp_rem >= loop_bytes. */ > + gen_load_compare_loop (load_mode, src1_addr, src2_addr, orig_src1, > + orig_src2, diff, diff_label, cmp_rem, &dcond, > + loop_bytes, final_label, false); > + emit_label (compare_rest_label); > + expand_bytes = loop_bytes - 1; > } > + > + /* cmp_rem < loop_bytes. */ > + do_load_compare_rest_of_loop (load_mode, src1_addr, src2_addr, cmp_rem, > + diff, diff_label, &dcond, final_label, > + orig_src1, orig_src2, loop_bytes, > + expand_bytes); > + > + gen_diff_handle (target, dcond, diff, diff_label, final_label); > + gen_library_call (target, src1_addr, src2_addr, bytes_rtx, > + library_call_label); > + emit_label (final_label); > + > + return true; > } > > /* Generate memcmp expansion using in-line non-loop GPR instructions. > @@ -1975,7 +1702,7 @@ expand_block_compare (rtx operands[]) > /* If this is not a fixed size compare, try generating loop code and > if that fails just call memcmp. */ > if (!CONST_INT_P (bytes_rtx)) > - return expand_compare_loop (operands); > + return expand_compare_with_variable_length (operands); > > /* This must be a fixed size alignment. 
*/ > if (!CONST_INT_P (align_rtx)) > @@ -2016,7 +1743,7 @@ expand_block_compare (rtx operands[]) > max_bytes = ((max_bytes + 1) / 2) - 1; > > if (!IN_RANGE (bytes, 1, max_bytes)) > - return expand_compare_loop (operands); > + return expand_compare_with_fixed_length (operands); > > rtx final_label = NULL; > > @@ -2069,7 +1796,7 @@ expand_block_compare (rtx operands[]) > if (TARGET_P9_MISC) > emit_insn (gen_setb_unsigned (target, cond)); > else > - generate_6432_conversion(target, sub_result); > + generate_6432_conversion (target, sub_result); > } > } > > diff --git a/gcc/testsuite/gcc.target/powerpc/block-cmp-5.c > b/gcc/testsuite/gcc.target/powerpc/block-cmp-5.c > new file mode 100644 > index 00000000000..60a38030784 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/block-cmp-5.c > @@ -0,0 +1,11 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mblock-compare-inline-loop-limit=1" } */ > +/* { dg-final { scan-assembler-not {\mbdnzt\M} } } */ > + > +/* Test that no loop will be generated when the inline loop limit is less > + than the loop bytes (2 * word_mode_size). 
*/ > + > +int foo (const char* s1, const char* s2, int l) > +{ > + return __builtin_memcmp (s1, s2, l); > +} > diff --git a/gcc/testsuite/gcc.target/powerpc/block-cmp-6.c > b/gcc/testsuite/gcc.target/powerpc/block-cmp-6.c > new file mode 100644 > index 00000000000..0e03f2af943 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/block-cmp-6.c > @@ -0,0 +1,5 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -mblock-compare-inline-loop-limit=4" } */ > +/* { dg-timeout-factor 2 } */ > + > +#include "../../gcc.dg/memcmp-1.c" > diff --git a/gcc/testsuite/gcc.target/powerpc/block-cmp-7.c > b/gcc/testsuite/gcc.target/powerpc/block-cmp-7.c > new file mode 100644 > index 00000000000..499f5faee17 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/block-cmp-7.c > @@ -0,0 +1,5 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -mblock-compare-inline-loop-limit=32" } */ > +/* { dg-timeout-factor 2 } */ > + > +#include "../../gcc.dg/memcmp-1.c" BR, Kewen