Hi,
  As we are now in stage 1, a gentle ping on this patch:
https://gcc.gnu.org/pipermail/gcc-patches/2024-February/646478.html

Thanks
Gui Haochen

在 2024/2/26 10:25, HAO CHEN GUI 写道:
> Hi,
>   This patch enables overlapping memory stores for block memory clear, which
> reduces the number of store instructions. The expander calls
> widest_fixed_size_mode_for_block_clear to get the mode for the looped block
> clear and calls smallest_fixed_size_mode_for_block_clear to get the mode
> for the last overlapped clear.
> 
>     Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no
> regressions. Is it OK for the trunk or next stage 1?
> 
> Thanks
> Gui Haochen
> 
> 
> ChangeLog
> rs6000: Enable overlap memory store for block memory clear
> 
> gcc/
>       * config/rs6000/rs6000-string.cc
>       (widest_fixed_size_mode_for_block_clear): New.
>       (smallest_fixed_size_mode_for_block_clear): New.
>       (expand_block_clear): Call widest_fixed_size_mode_for_block_clear to
>       get the mode for looped memory stores and call
>       smallest_fixed_size_mode_for_block_clear to get the mode for the last
>       overlapped memory store.
> 
> gcc/testsuite
>       * gcc.target/powerpc/block-clear-1.c: New.
> 
> 
> patch.diff
> diff --git a/gcc/config/rs6000/rs6000-string.cc 
> b/gcc/config/rs6000/rs6000-string.cc
> index 133e5382af2..c2a6095a586 100644
> --- a/gcc/config/rs6000/rs6000-string.cc
> +++ b/gcc/config/rs6000/rs6000-string.cc
> @@ -38,6 +38,49 @@
>  #include "profile-count.h"
>  #include "predict.h"
> 
> +/* Return the widest mode whose size is less than or equal to the
> +   size.  */
> +static fixed_size_mode
> +widest_fixed_size_mode_for_block_clear (unsigned int size, unsigned int 
> align,
> +                                     bool unaligned_vsx_ok)
> +{
> +  machine_mode mode;
> +
> +  if (TARGET_ALTIVEC
> +      && size >= 16
> +      && (align >= 128
> +       || unaligned_vsx_ok))
> +    mode = V4SImode;
> +  else if (size >= 8
> +        && TARGET_POWERPC64
> +        && (align >= 64
> +            || !STRICT_ALIGNMENT))
> +    mode = DImode;
> +  else if (size >= 4
> +        && (align >= 32
> +            || !STRICT_ALIGNMENT))
> +    mode = SImode;
> +  else if (size >= 2
> +        && (align >= 16
> +            || !STRICT_ALIGNMENT))
> +    mode = HImode;
> +  else
> +    mode = QImode;
> +
> +  return as_a <fixed_size_mode> (mode);
> +}
> +
> +/* Return the smallest mode whose size is greater than or equal to
> +   the size.  */
> +static fixed_size_mode
> +smallest_fixed_size_mode_for_block_clear (unsigned int size)
> +{
> +  if (size > UNITS_PER_WORD)
> +    return as_a <fixed_size_mode> (V4SImode);
> +
> +  return smallest_int_mode_for_size (size * BITS_PER_UNIT);
> +}
> +
>  /* Expand a block clear operation, and return 1 if successful.  Return 0
>     if we should let the compiler generate normal code.
> 
> @@ -55,7 +98,6 @@ expand_block_clear (rtx operands[])
>    HOST_WIDE_INT align;
>    HOST_WIDE_INT bytes;
>    int offset;
> -  int clear_bytes;
>    int clear_step;
> 
>    /* If this is not a fixed size move, just call memcpy */
> @@ -89,62 +131,36 @@ expand_block_clear (rtx operands[])
> 
>    bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);
> 
> -  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
> +  auto mode = widest_fixed_size_mode_for_block_clear (bytes, align,
> +                                                   unaligned_vsx_ok);
> +  offset = 0;
> +  rtx dest;
> +
> +  do
>      {
> -      machine_mode mode = BLKmode;
> -      rtx dest;
> +      unsigned int size = GET_MODE_SIZE (mode);
> 
> -      if (TARGET_ALTIVEC
> -       && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
> +      while (bytes >= size)
>       {
> -       clear_bytes = 16;
> -       mode = V4SImode;
> -     }
> -      else if (bytes >= 8 && TARGET_POWERPC64
> -            && (align >= 64 || !STRICT_ALIGNMENT))
> -     {
> -       clear_bytes = 8;
> -       mode = DImode;
> -       if (offset == 0 && align < 64)
> -         {
> -           rtx addr;
> +       dest = adjust_address (orig_dest, mode, offset);
> +       emit_move_insn (dest, CONST0_RTX (mode));
> 
> -           /* If the address form is reg+offset with offset not a
> -              multiple of four, reload into reg indirect form here
> -              rather than waiting for reload.  This way we get one
> -              reload, not one per store.  */
> -           addr = XEXP (orig_dest, 0);
> -           if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
> -               && CONST_INT_P (XEXP (addr, 1))
> -               && (INTVAL (XEXP (addr, 1)) & 3) != 0)
> -             {
> -               addr = copy_addr_to_reg (addr);
> -               orig_dest = replace_equiv_address (orig_dest, addr);
> -             }
> -         }
> -     }
> -      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
> -     {                       /* move 4 bytes */
> -       clear_bytes = 4;
> -       mode = SImode;
> -     }
> -      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
> -     {                       /* move 2 bytes */
> -       clear_bytes = 2;
> -       mode = HImode;
> -     }
> -      else /* move 1 byte at a time */
> -     {
> -       clear_bytes = 1;
> -       mode = QImode;
> +       offset += size;
> +       bytes -= size;
>       }
> 
> -      dest = adjust_address (orig_dest, mode, offset);
> +      if (bytes == 0)
> +     return 1;
> 
> -      emit_move_insn (dest, CONST0_RTX (mode));
> +      mode = smallest_fixed_size_mode_for_block_clear (bytes);
> +      int gap = GET_MODE_SIZE (mode) - bytes;
> +      if (gap > 0)
> +     {
> +       offset -= gap;
> +       bytes += gap;
> +     }
>      }
> -
> -  return 1;
> +  while (1);
>  }
> 
>  /* Figure out the correct instructions to generate to load data for
> diff --git a/gcc/testsuite/gcc.target/powerpc/block-clear-1.c 
> b/gcc/testsuite/gcc.target/powerpc/block-clear-1.c
> new file mode 100644
> index 00000000000..5e16c44fea3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/block-clear-1.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-not {\mst[hb]\M} } } */
> +
> +/* Verify that memclear takes overlap store. */
> +void* foo (char* s1)
> +{
> +  __builtin_memset (s1, 0, 31);
> +}

Reply via email to