Hi Haochen,

on 2024/1/10 09:35, HAO CHEN GUI wrote:
> Hi,
>   This patch refactors function expand_compare_loop and splits it into
> two functions.  One is for fixed length and the other is for variable
> length.  These two functions share some low-level common helper functions.

I'd expect a refactoring not to introduce any functional changes, but
this patch includes some enhancements as described below, so I think the
subject line is off; it's more like a rework.

> 
>   Besides above changes, the patch also does:
> 1. Don't generate load and compare loop when max_bytes is less than
> loop bytes.
> 2. Remove do_load_mask_compare as it's not needed.  All sub-targets
> entering the function should support efficient overlapping load and
> compare.
> 3. Implement a variable-length overlapping load and compare for the
> case in which the remaining bytes are less than the loop bytes in a
> variable-length compare.  The 4k boundary test and one-byte load and
> compare loop are removed as they're not needed now.
> 4. Remove the codes for "bytes > max_bytes" with fixed length as the
> case is already excluded by pre-checking.
> 5. Remove running time codes for "bytes > max_bytes" with variable length
> as it should jump to call library at the beginning.
> 6. Enhance do_overlap_load_compare to avoid overlapping load and compare
> when the remain bytes can be loaded and compared by a smaller unit.

Considering it's stage 4 now and the impact of this patch, let's defer
this to the next stage 1.  If possible, could you organize the above
changes into a patch series:

1) Refactor expand_compare_loop by splitting into two functions without
   any functional changes.
2) Remove some useless codes like 2, 4, 5.
3) Some more enhancements like 1, 3, 6.

?  It would be helpful for the review.  Thanks!

BR,
Kewen

> 
>   Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no
> regressions. Is this OK for trunk?
> 
> Thanks
> Gui Haochen
> 
> 
> ChangeLog
> rs6000: Refactor expand_compare_loop and split it to two functions
> 
> The original expand_compare_loop has complicated logic as it's
> designed for both fixed and variable length.  This patch splits it into
> two functions and makes these two functions share common helper
> functions.  Also the 4K boundary test and corresponding one-byte load
> and compare are replaced by a variable-length overlapping load and
> compare.  The do_load_mask_compare is removed as all sub-targets
> entering the function have efficient overlapping load and compare, so a
> mask load is not needed.
> 
> gcc/
>       * config/rs6000/rs6000-string.cc (do_isel): Remove.
>       (do_load_mask_compare): Remove.
>       (do_reg_compare): New.
>       (do_load_and_compare): New.
>       (do_overlap_load_compare): Do load and compare with a small unit
>       other than overlapping load and compare when the remain bytes can
>       be done by one instruction.
>       (expand_compare_loop): Remove.
>       (get_max_inline_loop_bytes): New.
>       (do_load_compare_rest_of_loop): New.
>       (generate_6432_conversion): Set it to a static function and move
>       ahead of gen_diff_handle.
>       (gen_diff_handle): New.
>       (gen_load_compare_loop): New.
>       (gen_library_call): New.
>       (expand_compare_with_fixed_length): New.
>       (expand_compare_with_variable_length): New.
>       (expand_block_compare): Call expand_compare_with_variable_length
>       to expand block compare for variable length.  Call
>       expand_compare_with_fixed_length to expand block compare loop for
>       fixed length.
> 
> gcc/testsuite/
>       * gcc.target/powerpc/block-cmp-5.c: New.
>       * gcc.target/powerpc/block-cmp-6.c: New.
>       * gcc.target/powerpc/block-cmp-7.c: New.
> 
> patch.diff
> diff --git a/gcc/config/rs6000/rs6000-string.cc 
> b/gcc/config/rs6000/rs6000-string.cc
> index f707bb2727e..018b87f2501 100644
> --- a/gcc/config/rs6000/rs6000-string.cc
> +++ b/gcc/config/rs6000/rs6000-string.cc
> @@ -404,21 +404,6 @@ do_ifelse (machine_mode cmpmode, rtx_code comparison,
>    LABEL_NUSES (true_label) += 1;
>  }
> 
> -/* Emit an isel of the proper mode for DEST.
> -
> -   DEST is the isel destination register.
> -   SRC1 is the isel source if CR is true.
> -   SRC2 is the isel source if CR is false.
> -   CR is the condition for the isel.  */
> -static void
> -do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
> -{
> -  if (GET_MODE (dest) == DImode)
> -    emit_insn (gen_isel_cc_di (dest, cmp, src_t, src_f, cr));
> -  else
> -    emit_insn (gen_isel_cc_si (dest, cmp, src_t, src_f, cr));
> -}
> -
>  /* Emit a subtract of the proper mode for DEST.
> 
>     DEST is the destination register for the subtract.
> @@ -499,65 +484,61 @@ do_rotl3 (rtx dest, rtx src1, rtx src2)
>      emit_insn (gen_rotlsi3 (dest, src1, src2));
>  }
> 
> -/* Generate rtl for a load, shift, and compare of less than a full word.
> -
> -   LOAD_MODE is the machine mode for the loads.
> -   DIFF is the reg for the difference.
> -   CMP_REM is the reg containing the remaining bytes to compare.
> -   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
> -   SRC1_ADDR is the first source address.
> -   SRC2_ADDR is the second source address.
> -   ORIG_SRC1 is the original first source block's address rtx.
> -   ORIG_SRC2 is the original second source block's address rtx.  */
> +/* Do the compare for two registers.  */
>  static void
> -do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem, 
> rtx dcond,
> -                   rtx src1_addr, rtx src2_addr, rtx orig_src1, rtx 
> orig_src2)
> +do_reg_compare (bool use_vec, rtx vec_result, rtx diff, rtx *dcond, rtx d1,
> +             rtx d2)
>  {
> -  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
> -  rtx shift_amount = gen_reg_rtx (word_mode);
> -  rtx d1 = gen_reg_rtx (word_mode);
> -  rtx d2 = gen_reg_rtx (word_mode);
> -
> -  do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
> -  do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
> -  do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);
> -
> -  if (word_mode == DImode)
> -    {
> -      emit_insn (gen_ashldi3 (shift_amount, shift_amount,
> -                           GEN_INT (LOG2_BITS_PER_UNIT)));
> -      emit_insn (gen_lshrdi3 (d1, d1,
> -                           gen_lowpart (SImode, shift_amount)));
> -      emit_insn (gen_lshrdi3 (d2, d2,
> -                           gen_lowpart (SImode, shift_amount)));
> -    }
> -  else
> -    {
> -      emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
> -                           GEN_INT (LOG2_BITS_PER_UNIT)));
> -      emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
> -      emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
> -    }
> +  gcc_assert (!use_vec || vec_result != NULL_RTX);
> +  gcc_assert (REG_P (d1) && REG_P (d2));
> +  gcc_assert (GET_MODE (d1) == GET_MODE (d2));
> 
> -  if (TARGET_P9_MISC)
> +  if (use_vec)
> +    emit_insn (gen_altivec_vcmpequb_p (vec_result, d1, d2));
> +  else if (TARGET_P9_MISC)
>      {
>        /* Generate a compare, and convert with a setb later.  */
>        rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
> -      emit_insn (gen_rtx_SET (dcond, cmp));
> +      emit_insn (gen_rtx_SET (*dcond, cmp));
>      }
>    else
>      {
> +      *dcond = gen_reg_rtx (CCmode);
>        if (word_mode == DImode)
> -     emit_insn (gen_subfdi3_carry (diff, d2, d1));
> +     emit_insn (gen_subfdi3_carry_dot2 (diff, d2, d1, *dcond));
>        else
> -     emit_insn (gen_subfsi3_carry (diff, d2, d1));
> +     emit_insn (gen_subfsi3_carry_dot2 (diff, d2, d1, *dcond));
>      }
>  }
> 
> +/* Load the memory to register and do the compare.  */
> +static void
> +do_load_and_compare (machine_mode load_mode, rtx addr1, rtx addr2, rtx 
> *dcond,
> +                  rtx diff, rtx orig_src1, rtx orig_src2)
> +{
> +  rtx d1 = gen_reg_rtx (word_mode);
> +  rtx d2 = gen_reg_rtx (word_mode);
> +
> +  if (MEM_P (addr1))
> +    do_load_for_compare (d1, addr1, load_mode);
> +  else
> +    do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
> +
> +  if (MEM_P (addr2))
> +    do_load_for_compare (d2, addr2, load_mode);
> +  else
> +    do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);
> +
> +  do_reg_compare (false, NULL_RTX, diff, dcond, d1, d2);
> +}
> +
>  /* Generate rtl for an overlapping load and compare of less than a
>     full load_mode.  This assumes that the previous word is part of the
>     block being compared so it's ok to back up part of a word so we can
>     compare the last unaligned full word that ends at the end of the block.
> +   If the remain bytes can be loaded and compared by a small unit with
> +   only one instruction, just do the load and compare by the small unit
> +   other than the full word overlapping load and compare.
> 
>     LOAD_MODE is the machine mode for the loads.
>     ISCONST tells whether the remaining length is a constant or in a register.
> @@ -569,55 +550,41 @@ do_load_mask_compare (const machine_mode load_mode, rtx 
> diff, rtx cmp_rem, rtx d
>     SRC2_ADDR is the second source address.
>     ORIG_SRC1 is the original first source block's address rtx.
>     ORIG_SRC2 is the original second source block's address rtx.  */
> +
>  static void
> -do_overlap_load_compare (machine_mode load_mode, bool isConst,
> -                     HOST_WIDE_INT bytes_rem, rtx diff,
> -                     rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
> -                     rtx orig_src1, rtx orig_src2)
> +do_overlap_load_compare (machine_mode load_mode, HOST_WIDE_INT bytes_rem,
> +                      rtx diff, rtx *dcond, rtx orig_src1, rtx orig_src2,
> +                      HOST_WIDE_INT length)
>  {
>    HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
> -  HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
> -  rtx d1 = gen_reg_rtx (word_mode);
> -  rtx d2 = gen_reg_rtx (word_mode);
> +  gcc_assert (IN_RANGE (bytes_rem, 0, load_mode_size - 1));
> 
>    rtx addr1, addr2;
> -  if (!isConst || addr_adj)
> -    {
> -      rtx adj_reg = gen_reg_rtx (word_mode);
> -      if (isConst)
> -     emit_move_insn (adj_reg, GEN_INT (-addr_adj));
> -      else
> -     {
> -       rtx reg_lms = gen_reg_rtx (word_mode);
> -       emit_move_insn (reg_lms, GEN_INT (load_mode_size));
> -       do_sub3 (adj_reg, cmp_rem, reg_lms);
> -     }
> 
> -      addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
> -      addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
> -    }
> -  else
> +  switch (bytes_rem)
>      {
> -      addr1 = src1_addr;
> -      addr2 = src2_addr;
> +      case 0:
> +     return;
> +      case 1:
> +     load_mode = QImode;
> +     break;
> +      case 2:
> +     load_mode = HImode;
> +     break;
> +      case 4:
> +     load_mode = SImode;
> +     break;
> +      case 8:
> +     if (TARGET_POWERPC64)
> +       load_mode = DImode;
> +     break;
>      }
> 
> -  do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
> -  do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);
> -
> -  if (TARGET_P9_MISC)
> -    {
> -      /* Generate a compare, and convert with a setb later.  */
> -      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
> -      emit_insn (gen_rtx_SET (dcond, cmp));
> -    }
> -  else
> -    {
> -      if (word_mode == DImode)
> -     emit_insn (gen_subfdi3_carry (diff, d2, d1));
> -      else
> -     emit_insn (gen_subfsi3_carry (diff, d2, d1));
> -    }
> +  load_mode_size = GET_MODE_SIZE (load_mode);
> +  addr1 = adjust_address (orig_src1, load_mode, length - load_mode_size);
> +  addr2 = adjust_address (orig_src2, load_mode, length - load_mode_size);
> +  do_load_and_compare (load_mode, addr1, addr2, dcond, diff,
> +                    orig_src1, orig_src2);
>  }
> 
>  /* Generate the sequence of compares for strcmp/strncmp using vec/vsx
> @@ -889,790 +856,550 @@ emit_final_compare_vec (rtx str1, rtx str2, rtx 
> result,
>    return;
>  }
> 
> -/* Expand a block compare operation using loop code, and return true
> -   if successful.  Return false if we should let the compiler generate
> -   normal code, probably a memcmp call.
> -
> -   OPERANDS[0] is the target (result).
> -   OPERANDS[1] is the first source.
> -   OPERANDS[2] is the second source.
> -   OPERANDS[3] is the length.
> -   OPERANDS[4] is the alignment.  */
> -bool
> -expand_compare_loop (rtx operands[])
> +static HOST_WIDE_INT
> +get_max_inline_loop_bytes (bool bytes_is_const, int align)
>  {
> -  rtx target = operands[0];
> -  rtx orig_src1 = operands[1];
> -  rtx orig_src2 = operands[2];
> -  rtx bytes_rtx = operands[3];
> -  rtx align_rtx = operands[4];
> -
> -  /* This case is complicated to handle because the subtract
> -     with carry instructions do not generate the 64-bit
> -     carry and so we must emit code to calculate it ourselves.
> -     We choose not to implement this yet.  */
> -  if (TARGET_32BIT && TARGET_POWERPC64)
> -    return false;
> -
> -  /* Allow non-const length.  */
> -  int bytes_is_const = CONST_INT_P (bytes_rtx);
> -
> -  /* This must be a fixed size alignment.  */
> -  if (!CONST_INT_P (align_rtx))
> -    return false;
> -
> -  HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
> -  HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
> -  HOST_WIDE_INT minalign = MIN (align1, align2);
> -
> -  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);
> -
> -  gcc_assert (GET_MODE (target) == SImode);
> -
> -  /* Anything to move?       */
> -  HOST_WIDE_INT bytes = 0;
> -  if (bytes_is_const)
> -    bytes = INTVAL (bytes_rtx);
> -
> -  if (bytes_is_const && bytes == 0)
> -    return true;
> -
> -  /* Limit the amount we compare, if known statically.  */
> -  HOST_WIDE_INT max_bytes;
>    switch (rs6000_tune)
>      {
>      case PROCESSOR_POWER7:
>        if (!bytes_is_const)
> -     if (minalign < 8)
> -       max_bytes = 0;
> +     if (align < 8)
> +       return 0;
>       else
> -       max_bytes = 128;
> +       return 128;
>        else
> -     if (minalign < 8)
> -       max_bytes = 32;
> +     if (align < 8)
> +       return 32;
>       else
> -       max_bytes = 128;
> +       return 128;
>        break;
>      case PROCESSOR_POWER8:
>        if (!bytes_is_const)
> -     max_bytes = 0;
> +     return 0;
>        else
> -     if (minalign < 8)
> -       max_bytes = 128;
> +     if (align < 8)
> +       return 128;
>       else
> -       max_bytes = 64;
> +       return 64;
>        break;
>      case PROCESSOR_POWER9:
>      case PROCESSOR_POWER10:
>        if (bytes_is_const)
> -     max_bytes = 191;
> +     return 191;
>        else
> -     max_bytes = 0;
> +     return 0;
>        break;
>      default:
> -      max_bytes = 128;
> +      return 128;
>      }
> +}
> 
> -  /* Allow the option to override the default.  */
> -  if (rs6000_block_compare_inline_loop_limit >= 0)
> -    max_bytes = (unsigned HOST_WIDE_INT) 
> rs6000_block_compare_inline_loop_limit;
> -
> -  if (max_bytes == 0)
> -    return false;
> +/* Do the load and compare when remain bytes is less than loop bytes
> +   and it's a variable length compare.  expand_bytes indicates the
> +   maximum bytes needed to be expanded.  */
> +static void
> +do_load_compare_rest_of_loop (machine_mode load_mode, rtx src1_addr,
> +                           rtx src2_addr, rtx cmp_rem, rtx diff,
> +                           rtx diff_label, rtx *dcond, rtx final_label,
> +                           rtx orig_src1, rtx orig_src2,
> +                           HOST_WIDE_INT loop_bytes,
> +                           HOST_WIDE_INT expand_bytes)
> +{
> +  gcc_assert ((TARGET_POWERPC64 && load_mode == DImode)
> +           || (!TARGET_POWERPC64 && load_mode == SImode));
> +  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
> +  gcc_assert (loop_bytes == 2 * load_mode_size);
> +  gcc_assert (expand_bytes < loop_bytes);
> 
> -  rtx cmp_rem = gen_reg_rtx (word_mode);  /* Remainder for library call.  */
> -  rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop. 
>  */
> -  HOST_WIDE_INT niter;
> -  rtx iter = gen_reg_rtx (word_mode);
> -  rtx iv1 = gen_reg_rtx (word_mode);
> -  rtx iv2 = gen_reg_rtx (word_mode);
> -  rtx d1_1 = gen_reg_rtx (word_mode);  /* Addr expression src1+iv1 */
> -  rtx d1_2 = gen_reg_rtx (word_mode);  /* Addr expression src1+iv2 */
> -  rtx d2_1 = gen_reg_rtx (word_mode);  /* Addr expression src2+iv1 */
> -  rtx d2_2 = gen_reg_rtx (word_mode);  /* Addr expression src2+iv2 */
> +  rtx adj_reg = gen_reg_rtx (word_mode);
> +  rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
> +  rtx j;
> +  rtx cmp;
> +  rtx ccreg = gen_reg_rtx (CCmode);
> 
> -  /* Strip unneeded subreg from length if there is one.  */
> -  if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
> -    bytes_rtx = SUBREG_REG (bytes_rtx);
> -  /* Extend bytes_rtx to word_mode if needed.  But, we expect only to
> -   maybe have to deal with the case were bytes_rtx is SImode and
> -   word_mode is DImode.  */
> -  if (!bytes_is_const)
> +  if (TARGET_POWERPC64 && expand_bytes >= 8)
>      {
> -      if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
> -     /* Do not expect length longer than word_mode.  */
> -     return false;
> -      else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE 
> (word_mode))
> +      /* Compare with 8 bytes.  */
> +      rtx cmp_4 = gen_label_rtx ();
> +      cmp = gen_rtx_COMPARE (CCmode, cmp_rem, GEN_INT (8));
> +      emit_insn (gen_rtx_SET (ccreg, cmp));
> +      do_ifelse (CCmode, LT, NULL_RTX, NULL_RTX, ccreg, cmp_4,
> +              profile_probability::even ());
> +      do_load_and_compare (DImode, src1_addr, src2_addr, dcond, diff,
> +                        orig_src1, orig_src2);
> +      do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX,
> +              *dcond, diff_label, profile_probability::unlikely ());
> +
> +      if (expand_bytes > 8)
>       {
> -       bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
> -       bytes_rtx = force_reg (word_mode,
> -                              gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
> -                                             bytes_rtx));
> +       do_ifelse (CCmode, EQ, NULL_RTX, NULL_RTX, ccreg, final_label,
> +                  profile_probability::unlikely ());
> +
> +       /* cmp_rem is greater than 8 bytes.  Do 8 bytes overlap compare.  */
> +       do_add3 (adj_reg, cmp_rem, GEN_INT (-8));
> +       do_add3 (src1_addr, src1_addr, adj_reg);
> +       do_add3 (src2_addr, src2_addr, adj_reg);
> +       do_load_and_compare (DImode, src1_addr, src2_addr, dcond, diff,
> +                            orig_src1, orig_src2);
> +       do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX,
> +                  *dcond, diff_label, profile_probability::likely ());
>       }
> -      else
> -     /* Make sure it's in a register before we get started.  */
> -     bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
> -    }
> -
> -  machine_mode load_mode = word_mode;
> -  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
> -
> -  /* Number of bytes per iteration of the unrolled loop.  */
> -  HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
> -  /* max iters and bytes compared in the loop.  */
> -  HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
> -  HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
> -  int l2lb = floor_log2 (loop_bytes);
> 
> -  if (bytes_is_const && (max_bytes < load_mode_size
> -                      || !IN_RANGE (bytes, load_mode_size, max_bytes)))
> -    return false;
> +      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
> +      JUMP_LABEL (j) = final_label;
> +      LABEL_NUSES (final_label) += 1;
> +      emit_barrier ();
> 
> -  bool no_remainder_code = false;
> -  rtx final_label = gen_label_rtx ();
> -  rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
> -  rtx diff_label = gen_label_rtx ();
> -  rtx library_call_label = NULL;
> -  rtx cleanup_label = gen_label_rtx ();
> +      emit_label (cmp_4);
> +    }
> 
> -  rtx cr;
> +  if (expand_bytes >= 4)
> +    {
> +      /* Compare with 4 bytes.  */
> +      rtx cmp_2 = gen_label_rtx ();
> +      cmp = gen_rtx_COMPARE (CCmode, cmp_rem, GEN_INT (4));
> +      emit_insn (gen_rtx_SET (ccreg, cmp));
> +      do_ifelse (CCmode, LT, NULL_RTX, NULL_RTX, ccreg, cmp_2,
> +              profile_probability::even ());
> +      do_load_and_compare (SImode, src1_addr, src2_addr, dcond, diff,
> +                        orig_src1, orig_src2);
> +      do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX,
> +              *dcond, diff_label, profile_probability::unlikely ());
> +
> +      if (expand_bytes > 4)
> +     {
> +       do_ifelse (CCmode, EQ, NULL_RTX, NULL_RTX, ccreg, final_label,
> +                  profile_probability::unlikely ());
> +
> +       /* cmp_rem is greater than 4 bytes.  Do 4 bytes overlap compare.  */
> +       do_add3 (adj_reg, cmp_rem, GEN_INT (-4));
> +       do_add3 (src1_addr, src1_addr, adj_reg);
> +       do_add3 (src2_addr, src2_addr, adj_reg);
> +       do_load_and_compare (SImode, src1_addr, src2_addr, dcond, diff,
> +                            orig_src1, orig_src2);
> +       do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX,
> +                  *dcond, diff_label, profile_probability::likely ());
> +     }
> 
> -  rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
> -  rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));
> +      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
> +      JUMP_LABEL (j) = final_label;
> +      LABEL_NUSES (final_label) += 1;
> +      emit_barrier ();
> 
> -  /* Difference found is stored here before jump to diff_label.  */
> -  rtx diff = gen_reg_rtx (word_mode);
> -  rtx_insn *j;
> +      emit_label (cmp_2);
> +  }
> 
> -  /* Example of generated code for 35 bytes aligned 1 byte.
> -
> -          mtctr 8
> -          li 6,0
> -          li 5,8
> -     .L13:
> -          ldbrx 7,3,6
> -          ldbrx 9,10,6
> -          ldbrx 0,3,5
> -          ldbrx 4,10,5
> -          addi 6,6,16
> -          addi 5,5,16
> -          subfc. 9,9,7
> -          bne 0,.L10
> -          subfc. 9,4,0
> -          bdnzt 2,.L13
> -          bne 0,.L10
> -          add 3,3,6
> -          add 10,10,6
> -          addi 9,3,-5
> -          ldbrx 7,0,9
> -          addi 9,10,-5
> -          ldbrx 9,0,9
> -          subfc 9,9,7
> -          .p2align 4,,15
> -     .L10:
> -          popcntd 9,9
> -          subfe 10,10,10
> -          or 9,9,10
> -
> -     Compiled with -fno-reorder-blocks for clarity.  */
> -
> -  /* Structure of what we're going to do:
> -     Two separate lengths: what we will compare before bailing to library
> -     call (max_bytes), and the total length to be checked.
> -     if length <= 16, branch to linear cleanup code starting with
> -     remainder length check (length not known at compile time)
> -     set up 2 iv's and load count reg, compute remainder length
> -     unrollx2 compare loop
> -     if loop exit due to a difference, branch to difference handling code
> -     if remainder length < 8, branch to final cleanup compare
> -     load and compare 8B
> -     final cleanup comparison (depends on alignment and length)
> -     load 8B, shift off bytes past length, compare
> -     load 8B ending at last byte and compare
> -     load/compare 1 byte at a time (short block abutting 4k boundary)
> -     difference handling, 64->32 conversion
> -     final result
> -     branch around memcmp call
> -     memcmp library call
> -  */
> -
> -  /* If bytes is not const, compare length and branch directly
> -     to the cleanup code that can handle 0-16 bytes if length
> -     is >= 16.  Stash away bytes-max_bytes for the library call.  */
> -  if (bytes_is_const)
> +  if (expand_bytes >= 2)
>      {
> -      /* These need to be set for some of the places we may jump to.  */
> -      if (bytes > max_bytes)
> -     {
> -       no_remainder_code = true;
> -       niter = max_loop_iter;
> -       library_call_label = gen_label_rtx ();
> -     }
> -      else
> +      /* Compare with 2 bytes.  */
> +      rtx cmp_1 = gen_label_rtx ();
> +      cmp = gen_rtx_COMPARE (CCmode, cmp_rem, GEN_INT (2));
> +      emit_insn (gen_rtx_SET (ccreg, cmp));
> +      do_ifelse (CCmode, LT, NULL_RTX, NULL_RTX, ccreg, cmp_1,
> +              profile_probability::even ());
> +      do_load_and_compare (HImode, src1_addr, src2_addr, dcond, diff,
> +                        orig_src1, orig_src2);
> +      do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX,
> +              *dcond, diff_label, profile_probability::unlikely ());
> +
> +      if (expand_bytes > 2)
>       {
> -       niter = bytes / loop_bytes;
> +       do_ifelse (CCmode, EQ, NULL_RTX, NULL_RTX, ccreg, final_label,
> +                  profile_probability::unlikely ());
> +
> +       /* cmp_rem equals 3 bytes and leaves 1 byte to load and
> +          compare.  */
> +       do_add3 (src1_addr, src1_addr, GEN_INT (2));
> +       do_add3 (src2_addr, src2_addr, GEN_INT (2));
>       }
> -      emit_move_insn (iter, GEN_INT (niter));
> -      emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
> -      emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
> -    }
> -  else
> -    {
> -      library_call_label = gen_label_rtx ();
> 
> -      /* If we go to the cleanup code, it expects length to be in cmp_rem.  
> */
> -      emit_move_insn (cmp_rem, bytes_rtx);
> +      emit_label (cmp_1);
> +    }
> 
> -      /* Check for > max_bytes bytes.  We want to bail out as quickly as
> -      possible if we have to go over to memcmp.  */
> -      do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
> -              NULL_RTX, library_call_label, profile_probability::even ());
> +  /* Do 1 byte load and compare.  */
> +  do_load_and_compare (QImode, src1_addr, src2_addr, dcond, diff,
> +                    orig_src1, orig_src2);
> +  do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX,
> +          *dcond, diff_label, profile_probability::likely ());
> +  j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
> +  JUMP_LABEL (j) = final_label;
> +  LABEL_NUSES (final_label) += 1;
> +  emit_barrier ();
> +}
> 
> -      /* Check for < loop_bytes bytes.  */
> -      do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
> -              NULL_RTX, cleanup_label, profile_probability::even ());
> +/* Generate code to convert a DImode-plus-carry subtract result into
> +   a SImode result that has the same <0 / ==0 / >0 properties to
> +   produce the final result from memcmp.
> 
> -      /* Loop compare bytes and iterations if bytes>max_bytes.  */
> -      rtx mb_reg = gen_reg_rtx (word_mode);
> -      emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
> -      rtx mi_reg = gen_reg_rtx (word_mode);
> -      emit_move_insn (mi_reg, GEN_INT (max_loop_iter));
> +   TARGET is the rtx for the register to receive the memcmp result.
> +   SUB_RESULT is the rtx for the register contining the subtract result.  */
> 
> -      /* Compute number of loop iterations if bytes <= max_bytes.  */
> -      if (word_mode == DImode)
> -     emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
> -      else
> -     emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));
> +static void
> +generate_6432_conversion (rtx target, rtx sub_result)
> +{
> +  /* We need to produce DI result from sub, then convert to target SI
> +     while maintaining <0 / ==0 / >0 properties.  This sequence works:
> +     subfc L,A,B
> +     subfe H,H,H
> +     popcntd L,L
> +     rldimi L,H,6,0
> 
> -      /* Compute bytes to compare in loop if bytes <= max_bytes.  */
> -      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
> -      if (word_mode == DImode)
> -     {
> -       emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
> -     }
> -      else
> -     {
> -       emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));
> -     }
> +     This is an alternate one Segher cooked up if somebody
> +     wants to expand this for something that doesn't have popcntd:
> +     subfc L,a,b
> +     subfe H,x,x
> +     addic t,L,-1
> +     subfe v,t,L
> +     or z,v,H
> 
> -      /* Check for bytes <= max_bytes.  */
> -      if (TARGET_ISEL)
> -     {
> -       /* P9 has fast isel so we use one compare and two isel.  */
> -       cr = gen_reg_rtx (CCmode);
> -       rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
> -                                          GEN_INT (max_bytes));
> -       emit_move_insn (cr, compare_rtx);
> -       rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
> -       do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
> -       do_isel (iter, cmp_rtx, iter, mi_reg, cr);
> -     }
> -      else
> -     {
> -       rtx lab_after = gen_label_rtx ();
> -       do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
> -                  NULL_RTX, lab_after, profile_probability::even ());
> -       emit_move_insn (loop_cmp, mb_reg);
> -       emit_move_insn (iter, mi_reg);
> -       emit_label (lab_after);
> -     }
> +     And finally, p9 can just do this:
> +     cmpld A,B
> +     setb r
> +     .  */
> 
> -      /* Now compute remainder bytes which isn't used until after the loop.  
> */
> -      do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
> +  if (TARGET_64BIT)
> +    {
> +      rtx tmp_reg_ca = gen_reg_rtx (DImode);
> +      emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
> +      rtx popcnt = gen_reg_rtx (DImode);
> +      emit_insn (gen_popcntddi2 (popcnt, sub_result));
> +      rtx tmp2 = gen_reg_rtx (DImode);
> +      emit_insn (gen_iordi3 (tmp2, popcnt, tmp_reg_ca));
> +      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp2)));
> +    }
> +  else
> +    {
> +      rtx tmp_reg_ca = gen_reg_rtx (SImode);
> +      emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
> +      rtx popcnt = gen_reg_rtx (SImode);
> +      emit_insn (gen_popcntdsi2 (popcnt, sub_result));
> +      emit_insn (gen_iorsi3 (target, popcnt, tmp_reg_ca));
>      }
> +}
> 
> -  rtx dcond = NULL_RTX; /* Used for when we jump to diff_label.  */
> -  /* For p9 we need to have just one of these as multiple places define
> -     it and it gets used by the setb at the end.  */
> +/* Generate the return value when memcmp finds a difference from
> +   compare.  */
> +static void
> +gen_diff_handle (rtx target, rtx dcond, rtx diff, rtx diff_label,
> +              rtx final_label)
> +{
> +  emit_label (diff_label);
>    if (TARGET_P9_MISC)
> -    dcond = gen_reg_rtx (CCUNSmode);
> +    emit_insn (gen_setb_unsigned (target, dcond));
> +  else
> +    generate_6432_conversion (target, diff);
> 
> -  if (!bytes_is_const || bytes >= loop_bytes)
> -    {
> -      /* It should not be possible to come here if remaining bytes is
> -      < 16 in the runtime case either.  Compute number of loop
> -      iterations.  We compare 2*word_mode per iteration so 16B for
> -      64-bit code and 8B for 32-bit.  Set up two induction
> -      variables and load count register.  */
> -
> -      /* HACK ALERT: create hard reg for CTR here.  If we just use a
> -      pseudo, cse will get rid of it and then the allocator will
> -      see it used in the lshr above and won't give us ctr.  */
> -      rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
> -      emit_move_insn (ctr, iter);
> -      emit_move_insn (diff, GEN_INT (0));
> -      emit_move_insn (iv1, GEN_INT (0));
> -      emit_move_insn (iv2, GEN_INT (load_mode_size));
> -
> -      /* inner loop to compare 2*word_mode */
> -      rtx loop_top_label = gen_label_rtx ();
> -      emit_label (loop_top_label);
> -
> -      rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
> -      rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);
> -
> -      do_load_for_compare_from_addr (load_mode, d1_1,
> -                                  src1_ix1, orig_src1);
> -      do_load_for_compare_from_addr (load_mode, d2_1,
> -                                  src2_ix1, orig_src2);
> -      do_add3 (iv1, iv1, GEN_INT (loop_bytes));
> -
> -      rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
> -      rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);
> -
> -      do_load_for_compare_from_addr (load_mode, d1_2,
> -                                  src1_ix2, orig_src1);
> -      do_load_for_compare_from_addr (load_mode, d2_2,
> -                                  src2_ix2, orig_src2);
> -      do_add3 (iv2, iv2, GEN_INT (loop_bytes));
> -
> -      if (TARGET_P9_MISC)
> -     {
> -       /* Generate a compare, and convert with a setb later.  */
> -       rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
> -       emit_insn (gen_rtx_SET (dcond, cmp));
> -     }
> -      else
> -     {
> -       dcond = gen_reg_rtx (CCmode);
> -       if (word_mode == DImode)
> -         emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
> -       else
> -         emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
> -     }
> +  rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
> +  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
> +  JUMP_LABEL (j) = final_label;
> +  LABEL_NUSES (final_label) += 1;
> +  emit_barrier ();
> +}
> 
> -      do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
> -              dcond, diff_label, profile_probability::unlikely ());
> +static void
> +gen_load_compare_loop (machine_mode load_mode, rtx src1_addr, rtx src2_addr,
> +                    rtx orig_src1, rtx orig_src2, rtx diff, rtx diff_label,
> +                    rtx cmp_rem, rtx *dcond, HOST_WIDE_INT loop_bytes,
> +                    rtx final_label, bool bytes_is_const)
> +{
> +  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
> +  gcc_assert (loop_bytes == 2 * load_mode_size);
> 
> -      if (TARGET_P9_MISC)
> -     {
> -       /* Generate a compare, and convert with a setb later.  */
> -       rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
> -       emit_insn (gen_rtx_SET (dcond, cmp));
> -     }
> -      else
> -     {
> -       dcond = gen_reg_rtx (CCmode);
> -       if (word_mode == DImode)
> -         emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
> -       else
> -         emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
> -     }
> +  rtx iter = gen_reg_rtx (word_mode);
> 
> -      rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
> -      if (TARGET_64BIT)
> -     j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
> -                                        eqrtx, dcond));
> -      else
> -     j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
> -                                        eqrtx, dcond));
> -      add_reg_br_prob_note (j, profile_probability::likely ());
> -      JUMP_LABEL (j) = loop_top_label;
> -      LABEL_NUSES (loop_top_label) += 1;
> -    }
> +  int l2lb = floor_log2 (loop_bytes);
> +  if (word_mode == DImode)
> +    emit_insn (gen_lshrdi3 (iter, cmp_rem, GEN_INT (l2lb)));
> +  else
> +    emit_insn (gen_lshrsi3 (iter, cmp_rem, GEN_INT (l2lb)));
> 
> -  HOST_WIDE_INT bytes_remaining = 0;
> -  if (bytes_is_const)
> -    bytes_remaining = (bytes % loop_bytes);
> +  rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
> +  emit_move_insn (ctr, iter);
> 
> -  /* If diff is nonzero, branch to difference handling
> -     code.  If we exit here with a nonzero diff, it is
> -     because the second word differed.  */
> -  if (TARGET_P9_MISC)
> -    do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond,
> -            diff_label, profile_probability::unlikely ());
> +  rtx iv1 = gen_reg_rtx (word_mode);
> +  rtx iv2 = gen_reg_rtx (word_mode);
> +  rtx d1_1 = gen_reg_rtx (word_mode);  /* Data loaded from src1+iv1.  */
> +  rtx d1_2 = gen_reg_rtx (word_mode);  /* Data loaded from src1+iv2.  */
> +  rtx d2_1 = gen_reg_rtx (word_mode);  /* Data loaded from src2+iv1.  */
> +  rtx d2_2 = gen_reg_rtx (word_mode);  /* Data loaded from src2+iv2.  */
> +
> +  emit_move_insn (iv1, GEN_INT (0));
> +  emit_move_insn (iv2, GEN_INT (load_mode_size));
> +
> +  rtx loop_top_label = gen_label_rtx ();
> +  emit_label (loop_top_label);
> +
> +  /* Manually put two pair of loads together.  */
> +  rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
> +  rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);
> +  do_load_for_compare_from_addr (load_mode, d1_1, src1_ix1, orig_src1);
> +  do_load_for_compare_from_addr (load_mode, d2_1, src2_ix1, orig_src2);
> +  rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
> +  rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);
> +  do_load_for_compare_from_addr (load_mode, d1_2, src1_ix2, orig_src1);
> +  do_load_for_compare_from_addr (load_mode, d2_2, src2_ix2, orig_src2);
> +
> +  do_reg_compare (false, NULL_RTX, diff, dcond, d1_1, d2_1);
> +  do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX,
> +          *dcond, diff_label, profile_probability::unlikely ());
> +  do_reg_compare (false, NULL_RTX, diff, dcond, d1_2, d2_2);
> +
> +  do_add3 (iv1, iv1, GEN_INT (loop_bytes));
> +  do_add3 (iv2, iv2, GEN_INT (loop_bytes));
> +
> +  rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
> +  rtx_insn *j;
> +  if (TARGET_64BIT)
> +    j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
> +                                    eqrtx, *dcond));
>    else
> -    do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX,
> -            diff_label, profile_probability::unlikely ());
> +    j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
> +                                    eqrtx, *dcond));
> +  add_reg_br_prob_note (j, profile_probability::likely ());
> +  JUMP_LABEL (j) = loop_top_label;
> +  LABEL_NUSES (loop_top_label) += 1;
> 
> -  if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
> -    {
> -      /* If the length is known at compile time, then we will always
> -      have a remainder to go to the library call with.  */
> -      rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, 
> library_call_label);
> -      j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
> -      JUMP_LABEL (j) = library_call_label;
> -      LABEL_NUSES (library_call_label) += 1;
> -      emit_barrier ();
> -    }
> +  do_ifelse (GET_MODE (*dcond), NE, NULL_RTX, NULL_RTX, *dcond,
> +          diff_label, profile_probability::unlikely ());
> 
> -  if (bytes_is_const && bytes_remaining == 0)
> +  /* If length is fixed, we know how many bytes are left.  So skip the
> +     remaining bytes test.  */
> +  if (!bytes_is_const)
>      {
> -      /* No remainder and if we are here then diff is 0 so just return 0 */
> -      if (TARGET_64BIT)
> -     emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
> -      else
> -     emit_move_insn (target, diff);
> -      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
> -      JUMP_LABEL (j) = final_label;
> -      LABEL_NUSES (final_label) += 1;
> -      emit_barrier ();
> +      do_sub3 (cmp_rem, cmp_rem, iv1);
> +      do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX,
> +              final_label, profile_probability::unlikely ());
>      }
> -  else if (!no_remainder_code)
> -    {
> -      /* Update addresses to point to the next word to examine.  */
> -      do_add3 (src1_addr, src1_addr, iv1);
> -      do_add3 (src2_addr, src2_addr, iv1);
> -
> -      emit_label (cleanup_label);
> 
> -      if (!bytes_is_const)
> -     {
> -       /* If we're dealing with runtime length, we have to check if
> -          it's zero after the loop.  When length is known at compile
> -          time the no-remainder condition is dealt with above.  By
> -          doing this after cleanup_label, we also deal with the
> -          case where length is 0 at the start and we bypass the
> -          loop with a branch to cleanup_label.  */
> -       emit_move_insn (target, const0_rtx);
> -       do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
> -                  NULL_RTX, final_label, profile_probability::unlikely ());
> -     }
> -
> -      rtx final_cleanup = gen_label_rtx ();
> -      rtx cmp_rem_before = gen_reg_rtx (word_mode);
> -      /* Compare one more word_mode chunk if needed.  */
> -      if (!bytes_is_const || bytes_remaining >= load_mode_size)
> -     {
> -       /* If remainder length < word length, branch to final
> -          cleanup compare.  */
> -     
> -       if (!bytes_is_const)
> -         {
> -           do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
> -                      NULL_RTX, final_cleanup, profile_probability::even ());
> -         }
> +  do_add3 (src1_addr, src1_addr, iv1);
> +  do_add3 (src2_addr, src2_addr, iv1);
> +}
> 
> -       /* load and compare 8B */
> -       do_load_for_compare_from_addr (load_mode, d1_1,
> -                                      src1_addr, orig_src1);
> -       do_load_for_compare_from_addr (load_mode, d2_1,
> -                                      src2_addr, orig_src2);
> +/* Generate memcmp library call.  */
> +static void
> +gen_library_call (rtx target, rtx src1_addr, rtx src2_addr, rtx bytes_rtx,
> +               rtx library_label)
> +{
> +  emit_label (library_label);
> +
> +  rtx len_rtx = gen_reg_rtx (word_mode);
> +  emit_move_insn (len_rtx, bytes_rtx);
> +  tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
> +  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
> +                        target, LCT_NORMAL, GET_MODE (target),
> +                        src1_addr, Pmode, src2_addr, Pmode,
> +                        len_rtx, GET_MODE (len_rtx));
> +}
> 
> -       /* Compare the word, see if we need to do the last partial.  */
> -       if (TARGET_P9_MISC)
> -         {
> -           /* Generate a compare, and convert with a setb later.  */
> -           rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
> -           emit_insn (gen_rtx_SET (dcond, cmp));
> -         }
> -       else
> -         {
> -           dcond = gen_reg_rtx (CCmode);
> -           if (word_mode == DImode)
> -             emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
> -           else
> -             emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
> -         }
> +static bool
> +expand_compare_with_fixed_length (rtx operands[])
> +{
> +  rtx target = operands[0];
> +  rtx orig_src1 = operands[1];
> +  rtx orig_src2 = operands[2];
> +  rtx bytes_rtx = operands[3];
> +  rtx align_rtx = operands[4];
> 
> -       do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
> -                  dcond, diff_label, profile_probability::even ());
> +  gcc_assert (CONST_INT_P (bytes_rtx));
> +  gcc_assert (GET_MODE (target) == SImode);
> 
> -       do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
> -       do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
> -       emit_move_insn (cmp_rem_before, cmp_rem);
> -       do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
> -       if (bytes_is_const)
> -         bytes_remaining -= load_mode_size;
> -       else
> -         /* See if remaining length is now zero.  We previously set
> -            target to 0 so we can just jump to the end.  */
> -         do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX,
> -                    final_label, profile_probability::unlikely ());
> -     }
> +  if (TARGET_32BIT && TARGET_POWERPC64)
> +    return false;
> 
> -      /* Cases:
> -      bytes_is_const
> -        We can always shift back to do an overlapping compare
> -        of the last chunk because we know length >= 8.
> -
> -      !bytes_is_const
> -        align>=load_mode_size
> -          Read word_mode and mask
> -        align<load_mode_size
> -          avoid stepping past end
> -
> -       Three strategies:
> -       * decrement address and do overlapping compare
> -       * read word_mode and mask
> -       * carefully avoid crossing 4k boundary
> -       */
> -
> -      if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
> -       && align1 >= load_mode_size && align2 >= load_mode_size)
> -     {
> -       /* Alignment is larger than word_mode so we do not need to be
> -          concerned with extra page crossings.  But, we do not know
> -          that the length is larger than load_mode_size so we might
> -          end up compareing against data before the block if we try
> -          an overlapping compare.  Also we use this on P7 for fixed length
> -          remainder because P7 doesn't like overlapping unaligned.
> -          Strategy: load 8B, shift off bytes past length, and compare.  */
> -       emit_label (final_cleanup);
> -       do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
> -                             src1_addr, src2_addr, orig_src1, orig_src2);
> -     }
> -      else if (bytes_remaining && bytes_is_const)
> -     {
> -       /* We do not do loop expand if length < 32 so we know at the
> -          end we can do an overlapping compare.
> -          Strategy: shift address back and do word_mode load that
> -          ends at the end of the block.  */
> -       emit_label (final_cleanup);
> -       do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
> -                                cmp_rem, dcond, src1_addr, src2_addr,
> -                                orig_src1, orig_src2);
> -     }
> -      else if (!bytes_is_const)
> -     {
> -       rtx handle4k_label = gen_label_rtx ();
> -       rtx nonconst_overlap = gen_label_rtx ();
> -       emit_label (nonconst_overlap);
> -
> -       /* Here we have to handle the case where whe have runtime
> -          length which may be too short for overlap compare, and
> -          alignment is not at least load_mode_size so we have to
> -          tread carefully to avoid stepping across 4k boundaries.  */
> -
> -       /* If the length after the loop was larger than word_mode
> -          size, we can just do an overlapping compare and we're
> -          done.  We fall through to this code from the word_mode
> -          compare that preceeds this.  */
> -       do_overlap_load_compare (load_mode, false, 0, diff,
> -                                cmp_rem, dcond, src1_addr, src2_addr,
> -                                orig_src1, orig_src2);
> -
> -       rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
> -       j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
> -       JUMP_LABEL (j) = diff_label;
> -       LABEL_NUSES (diff_label) += 1;
> -       emit_barrier ();
> -
> -       /* If we couldn't do the overlap compare we have to be more
> -          careful of the 4k boundary.  Test to see if either
> -          address is less than word_mode_size away from a 4k
> -          boundary.  If not, then we can do a load/shift/compare
> -          and we are done.  We come to this code if length was less
> -          than word_mode_size.  */
> -
> -       emit_label (final_cleanup);
> -
> -       /* We can still avoid the slow case if the length was larger
> -          than one loop iteration, in which case go do the overlap
> -          load compare path.  */
> -       do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
> -                  NULL_RTX, nonconst_overlap, profile_probability::even ());
> -
> -       rtx rem4k = gen_reg_rtx (word_mode);
> -       rtx dist1 = gen_reg_rtx (word_mode);
> -       rtx dist2 = gen_reg_rtx (word_mode);
> -       do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
> -       if (word_mode == SImode)
> -         emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
> -       else
> -         emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
> -       do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX,
> -                  handle4k_label, profile_probability::very_unlikely ());
> -       if (word_mode == SImode)
> -         emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
> -       else
> -         emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
> -       do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX,
> -                  handle4k_label, profile_probability::very_unlikely ());
> +  /* This must be a fixed size alignment.  */
> +  if (!CONST_INT_P (align_rtx))
> +    return false;
> 
> -       /* We don't have a 4k boundary to deal with, so do
> -          a load/shift/compare and jump to diff.  */
> +  HOST_WIDE_INT align = INTVAL (align_rtx) / BITS_PER_UNIT;
> +  HOST_WIDE_INT bytes = INTVAL (bytes_rtx);
> 
> -       do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
> -                             src1_addr, src2_addr, orig_src1, orig_src2);
> +  if (bytes == 0)
> +    return true;
> 
> -       j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
> -       JUMP_LABEL (j) = diff_label;
> -       LABEL_NUSES (diff_label) += 1;
> -       emit_barrier ();
> +  /* Limit the amount we compare, if known statically.  */
> +  HOST_WIDE_INT max_bytes = get_max_inline_loop_bytes (true, align);
> 
> -       /* Finally in the unlikely case we are inching up to a
> -          4k boundary we use a compact lbzx/compare loop to do
> -          it a byte at a time.  */
> +  /* Allow the option to override the default.  */
> +  if (rs6000_block_compare_inline_loop_limit >= 0)
> +    max_bytes = (unsigned HOST_WIDE_INT) 
> rs6000_block_compare_inline_loop_limit;
> 
> -       emit_label (handle4k_label);
> +  if (max_bytes == 0)
> +    return false;
> 
> -       rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
> -       emit_move_insn (ctr, cmp_rem);
> -       rtx ixreg = gen_reg_rtx (Pmode);
> -       emit_move_insn (ixreg, const0_rtx);
> +  machine_mode load_mode = word_mode;
> +  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
> 
> -       rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
> -       rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
> -       rtx d1 = gen_reg_rtx (word_mode);
> -       rtx d2 = gen_reg_rtx (word_mode);
> +  if (max_bytes < load_mode_size
> +      || !IN_RANGE (bytes, load_mode_size, max_bytes))
> +   return false;
> 
> -       rtx fc_loop = gen_label_rtx ();
> -       emit_label (fc_loop);
> +  /* Remainder bytes for compare.  */
> +  rtx cmp_rem = gen_reg_rtx (word_mode);
> +  /* Number of bytes per iteration of the unrolled loop.  */
> +  HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
> 
> -       do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
> -       do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);
> +  rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
> +  rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));
> 
> -       do_add3 (ixreg, ixreg, const1_rtx);
> +  /* Label for set target when finding a diff.  */
> +  rtx diff_label = gen_label_rtx ();
> +  rtx final_label = gen_label_rtx ();
> 
> -       rtx cond = gen_reg_rtx (CCmode);
> -       rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
> -       rs6000_emit_dot_insn (diff, subexpr, 2, cond);
> +  /* CC used for when we jump to diff_label.  */
> +  rtx dcond = NULL_RTX;
> +  /* For p9 we need to have just one of these as multiple places define
> +     it and it gets used by the setb at the end.  */
> +  if (TARGET_P9_MISC)
> +    dcond = gen_reg_rtx (CCUNSmode);
> 
> -       rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
> -       if (TARGET_64BIT)
> -         j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
> -                                            eqrtx, cond));
> -       else
> -         j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
> -                                            eqrtx, cond));
> -       add_reg_br_prob_note (j, profile_probability::likely ());
> -       JUMP_LABEL (j) = fc_loop;
> -       LABEL_NUSES (fc_loop) += 1;
> -
> -       if (TARGET_64BIT)
> -         emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
> -       else
> -         emit_move_insn (target, diff);
> -
> -       /* Since we are comparing bytes, the difference can be used
> -          as the final result and we are done here.  */
> -       j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
> -       JUMP_LABEL (j) = final_label;
> -       LABEL_NUSES (final_label) += 1;
> -       emit_barrier ();
> -     }
> -    }
> +  /* Difference found is stored here before jump to diff_label.  */
> +  rtx diff = gen_reg_rtx (word_mode);
> 
> -  emit_label (diff_label);
> -  /* difference handling, 64->32 conversion */
> +  emit_move_insn (cmp_rem, GEN_INT (bytes));
> +  emit_move_insn (target, const0_rtx);
> 
> -  /* We need to produce DI result from sub, then convert to target SI
> -     while maintaining <0 / ==0 / >0 properties.  This sequence works:
> -     subfc L,A,B
> -     subfe H,H,H
> -     popcntd L,L
> -     rldimi L,H,6,0
> +  gen_load_compare_loop (load_mode, src1_addr, src2_addr, orig_src1,
> +                      orig_src2, diff, diff_label, cmp_rem, &dcond,
> +                      loop_bytes, final_label, true);
> 
> -     This is an alternate one Segher cooked up if somebody
> -     wants to expand this for something that doesn't have popcntd:
> -     subfc L,a,b
> -     subfe H,x,x
> -     addic t,L,-1
> -     subfe v,t,L
> -     or z,v,H
> +  HOST_WIDE_INT rem_bytes = bytes % loop_bytes;
> 
> -     And finally, p9 can just do this:
> -     cmpld A,B
> -     setb r */
> -
> -  if (TARGET_P9_MISC)
> -    emit_insn (gen_setb_unsigned (target, dcond));
> -  else
> +  if (rem_bytes >= load_mode_size)
>      {
> -      if (TARGET_64BIT)
> -     {
> -       rtx tmp_reg_ca = gen_reg_rtx (DImode);
> -       emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
> -       emit_insn (gen_popcntddi2 (diff, diff));
> -       emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
> -       emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
> -     }
> -      else
> -     {
> -       rtx tmp_reg_ca = gen_reg_rtx (SImode);
> -       emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
> -       emit_insn (gen_popcntdsi2 (diff, diff));
> -       emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
> -     }
> +      do_load_and_compare (load_mode, src1_addr, src2_addr, &dcond, diff,
> +                        orig_src1, orig_src2);
> +      do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
> +              dcond, diff_label, profile_probability::unlikely ());
> +      rem_bytes -= load_mode_size;
>      }
> 
> -  if (library_call_label != NULL)
> +  if (rem_bytes > 0)
>      {
> -      /* Branch around memcmp call.  */
> -      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
> -      JUMP_LABEL (j) = final_label;
> -      LABEL_NUSES (final_label) += 1;
> -      emit_barrier ();
> -
> -      /* Make memcmp library call.  cmp_rem is the remaining bytes that
> -      were compared and cmp_rem is the expected amount to be compared
> -      by memcmp.  If we don't find a difference in the loop compare, do
> -      the library call directly instead of doing a small compare just
> -      to get to an arbitrary boundary before calling it anyway.
> -      Also, update addresses to point to the next word to examine.  */
> -      emit_label (library_call_label);
> -
> -      rtx len_rtx = gen_reg_rtx (word_mode);
> -      if (bytes_is_const)
> -     {
> -       emit_move_insn (len_rtx, cmp_rem);
> -       do_add3 (src1_addr, src1_addr, iv1);
> -       do_add3 (src2_addr, src2_addr, iv1);
> -     }
> -      else
> -     emit_move_insn (len_rtx, bytes_rtx);
> -
> -      tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
> -      emit_library_call_value (XEXP (DECL_RTL (fun), 0),
> -                            target, LCT_NORMAL, GET_MODE (target),
> -                            src1_addr, Pmode,
> -                            src2_addr, Pmode,
> -                            len_rtx, GET_MODE (len_rtx));
> +      do_overlap_load_compare (load_mode, rem_bytes, diff, &dcond,
> +                            orig_src1, orig_src2, bytes);
> +      do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
> +              dcond, diff_label, profile_probability::unlikely ());
>      }
> 
> -  /* emit final_label */
> +  rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
> +  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
> +  JUMP_LABEL (j) = final_label;
> +  LABEL_NUSES (final_label) += 1;
> +  emit_barrier ();
> +
> +  gen_diff_handle (target, dcond, diff, diff_label, final_label);
>    emit_label (final_label);
> +
>    return true;
>  }
> 
> -/* Generate code to convert a DImode-plus-carry subtract result into
> -   a SImode result that has the same <0 / ==0 / >0 properties to
> -   produce the final result from memcmp.
> +static bool
> +expand_compare_with_variable_length (rtx operands[])
> +{
> +  rtx target = operands[0];
> +  rtx orig_src1 = operands[1];
> +  rtx orig_src2 = operands[2];
> +  rtx bytes_rtx = operands[3];
> +  rtx align_rtx = operands[4];
> 
> -   TARGET is the rtx for the register to receive the memcmp result.
> -   SUB_RESULT is the rtx for the register contining the subtract result.  */
> +  gcc_assert (!CONST_INT_P (bytes_rtx));
> 
> -void
> -generate_6432_conversion(rtx target, rtx sub_result)
> -{
> -  /* We need to produce DI result from sub, then convert to target SI
> -     while maintaining <0 / ==0 / >0 properties.  This sequence works:
> -     subfc L,A,B
> -     subfe H,H,H
> -     popcntd L,L
> -     rldimi L,H,6,0
> +  if (TARGET_32BIT && TARGET_POWERPC64)
> +    return false;
> 
> -     This is an alternate one Segher cooked up if somebody
> -     wants to expand this for something that doesn't have popcntd:
> -     subfc L,a,b
> -     subfe H,x,x
> -     addic t,L,-1
> -     subfe v,t,L
> -     or z,v,H
> +  /* This must be a fixed size alignment.  */
> +  if (!CONST_INT_P (align_rtx))
> +    return false;
> 
> -     And finally, p9 can just do this:
> -     cmpld A,B
> -     setb r */
> +  HOST_WIDE_INT align = INTVAL (align_rtx) / BITS_PER_UNIT;
> 
> -  if (TARGET_64BIT)
> +  /* Limit the amount we compare, if known statically.  */
> +  HOST_WIDE_INT max_bytes = get_max_inline_loop_bytes (false, align);
> +
> +  /* Allow the option to override the default.  */
> +  if (rs6000_block_compare_inline_loop_limit >= 0)
> +    max_bytes = rs6000_block_compare_inline_loop_limit;
> +  if (max_bytes == 0)
> +    return false;
> +
> +  /* Remainder bytes for compare.  */
> +  rtx cmp_rem = gen_reg_rtx (word_mode);
> +
> +  /* Strip unneeded subreg from length if there is one.  */
> +  if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
> +    bytes_rtx = SUBREG_REG (bytes_rtx);
> +
> +  if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
> +    /* Do not expect length longer than word_mode.  */
> +    return false;
> +  else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
>      {
> -      rtx tmp_reg_ca = gen_reg_rtx (DImode);
> -      emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
> -      rtx popcnt = gen_reg_rtx (DImode);
> -      emit_insn (gen_popcntddi2 (popcnt, sub_result));
> -      rtx tmp2 = gen_reg_rtx (DImode);
> -      emit_insn (gen_iordi3 (tmp2, popcnt, tmp_reg_ca));
> -      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp2)));
> +      bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
> +      bytes_rtx = force_reg (word_mode,
> +                          gen_rtx_fmt_e (ZERO_EXTEND, word_mode, bytes_rtx));
>      }
>    else
> +    /* Make sure it's in a register before we get started.  */
> +    bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
> +
> +  machine_mode load_mode = word_mode;
> +  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
> +  rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
> +  rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));
> +
> +  rtx library_call_label = gen_label_rtx ();
> +
> +  /* Call library if length > max_bytes.  */
> +  do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
> +          NULL_RTX, library_call_label, profile_probability::unlikely ());
> +
> +  /* Number of bytes per iteration of the unrolled loop.  */
> +  HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
> +
> +  /* Label for set target when finding a diff.  */
> +  rtx diff_label = gen_label_rtx ();
> +  rtx final_label = gen_label_rtx ();
> +
> +  /* CC used for when we jump to diff_label.  */
> +  rtx dcond = NULL_RTX;
> +  /* For p9 we need to have just one of these as multiple places define
> +     it and it gets used by the setb at the end.  */
> +  if (TARGET_P9_MISC)
> +    dcond = gen_reg_rtx (CCUNSmode);
> +
> +  /* Difference found is stored here before jump to diff_label.  */
> +  rtx diff = gen_reg_rtx (word_mode);
> +
> +  emit_move_insn (target, const0_rtx);
> +  emit_move_insn (cmp_rem, bytes_rtx);
> +
> +  /* Number of bytes to be expanded in the rest of the loop.  */
> +  HOST_WIDE_INT expand_bytes = max_bytes % loop_bytes;
> +  if (max_bytes >= loop_bytes)
>      {
> -      rtx tmp_reg_ca = gen_reg_rtx (SImode);
> -      emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
> -      rtx popcnt = gen_reg_rtx (SImode);
> -      emit_insn (gen_popcntdsi2 (popcnt, sub_result));
> -      emit_insn (gen_iorsi3 (target, popcnt, tmp_reg_ca));
> +      rtx compare_rest_label = gen_label_rtx ();
> +      do_ifelse (CCmode, LT, cmp_rem, GEN_INT (loop_bytes), NULL_RTX,
> +              compare_rest_label, profile_probability::unlikely ());
> +      /* max_bytes >= cmp_rem >= loop_bytes.  */
> +      gen_load_compare_loop (load_mode, src1_addr, src2_addr, orig_src1,
> +                          orig_src2, diff, diff_label, cmp_rem, &dcond,
> +                          loop_bytes, final_label, false);
> +      emit_label (compare_rest_label);
> +      expand_bytes = loop_bytes - 1;
>      }
> +
> +  /* cmp_rem < loop_bytes.  */
> +  do_load_compare_rest_of_loop (load_mode, src1_addr, src2_addr, cmp_rem,
> +                             diff, diff_label, &dcond, final_label,
> +                             orig_src1, orig_src2, loop_bytes,
> +                             expand_bytes);
> +
> +  gen_diff_handle (target, dcond, diff, diff_label, final_label);
> +  gen_library_call (target, src1_addr, src2_addr, bytes_rtx,
> +                 library_call_label);
> +  emit_label (final_label);
> +
> +  return true;
>  }
> 
>  /* Generate memcmp expansion using in-line non-loop GPR instructions.
> @@ -1975,7 +1702,7 @@ expand_block_compare (rtx operands[])
>    /* If this is not a fixed size compare, try generating loop code and
>       if that fails just call memcmp.  */
>    if (!CONST_INT_P (bytes_rtx))
> -    return expand_compare_loop (operands);
> +    return expand_compare_with_variable_length (operands);
> 
>    /* This must be a fixed size alignment.  */
>    if (!CONST_INT_P (align_rtx))
> @@ -2016,7 +1743,7 @@ expand_block_compare (rtx operands[])
>      max_bytes = ((max_bytes + 1) / 2) - 1;
> 
>    if (!IN_RANGE (bytes, 1, max_bytes))
> -    return expand_compare_loop (operands);
> +    return expand_compare_with_fixed_length (operands);
> 
>    rtx final_label = NULL;
> 
> @@ -2069,7 +1796,7 @@ expand_block_compare (rtx operands[])
>         if (TARGET_P9_MISC)
>           emit_insn (gen_setb_unsigned (target, cond));
>         else
> -         generate_6432_conversion(target, sub_result);
> +         generate_6432_conversion (target, sub_result);
>       }
>      }
> 
> diff --git a/gcc/testsuite/gcc.target/powerpc/block-cmp-5.c 
> b/gcc/testsuite/gcc.target/powerpc/block-cmp-5.c
> new file mode 100644
> index 00000000000..60a38030784
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/block-cmp-5.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mblock-compare-inline-loop-limit=1" } */
> +/* { dg-final { scan-assembler-not {\mbdnzt\M} } }  */
> +
> +/* Test that no loop will be generated when the inline loop limit is less
> +   than the loop bytes (2 * word_mode_size).  */
> +
> +int foo (const char* s1, const char* s2, int l)
> +{
> +  return __builtin_memcmp (s1, s2, l);
> +}
> diff --git a/gcc/testsuite/gcc.target/powerpc/block-cmp-6.c 
> b/gcc/testsuite/gcc.target/powerpc/block-cmp-6.c
> new file mode 100644
> index 00000000000..0e03f2af943
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/block-cmp-6.c
> @@ -0,0 +1,5 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mblock-compare-inline-loop-limit=4" } */
> +/* { dg-timeout-factor 2 } */
> +
> +#include "../../gcc.dg/memcmp-1.c"
> diff --git a/gcc/testsuite/gcc.target/powerpc/block-cmp-7.c 
> b/gcc/testsuite/gcc.target/powerpc/block-cmp-7.c
> new file mode 100644
> index 00000000000..499f5faee17
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/block-cmp-7.c
> @@ -0,0 +1,5 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mblock-compare-inline-loop-limit=32" } */
> +/* { dg-timeout-factor 2 } */
> +
> +#include "../../gcc.dg/memcmp-1.c"


BR,
Kewen

Reply via email to