Wilco Dijkstra <wilco.dijks...@arm.com> writes:
> v2: further cleanups, improved comments
>
> Add support for inline memmove expansions.  The generated code is identical
> to that for memcpy, except that all loads are emitted before stores rather than
> being interleaved.  The maximum size is 256 bytes which requires at most 16
> registers.
>
> Passes regress/bootstrap, OK for commit?
>
> gcc/ChangeLog/
>         * config/aarch64/aarch64.opt (aarch64_mops_memmove_size_threshold):
>         Change default.
>         * config/aarch64/aarch64.md (cpymemdi): Add a parameter.
>         (movmemdi): Call aarch64_expand_cpymem.
>         * config/aarch64/aarch64.cc (aarch64_copy_one_block): Rename function,
>         simplify, support storing generated loads/stores.
>         (aarch64_expand_cpymem): Support expansion of memmove.
>         * config/aarch64/aarch64-protos.h (aarch64_expand_cpymem): Add bool arg.
>
> gcc/testsuite/ChangeLog/
>         * gcc.target/aarch64/memmove.c: Add new test.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 60a55f4bc1956786ea687fc7cad7ec9e4a84e1f0..0d39622bd2826a3fde54d67b5c5da9ee9286cbbd 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -769,7 +769,7 @@ bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
>  tree aarch64_vector_load_decl (tree);
>  void aarch64_expand_call (rtx, rtx, rtx, bool);
>  bool aarch64_expand_cpymem_mops (rtx *, bool);
> -bool aarch64_expand_cpymem (rtx *);
> +bool aarch64_expand_cpymem (rtx *, bool);
>  bool aarch64_expand_setmem (rtx *);
>  bool aarch64_float_const_zero_rtx_p (rtx);
>  bool aarch64_float_const_rtx_p (rtx);
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 2fa5d09de85d385c1165e399bcc97681ef170916..e19e2d1de2e5b30eca672df05d9dcc1bc106ecc8 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25238,52 +25238,37 @@ aarch64_progress_pointer (rtx pointer)
>    return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
>  }
>
> -/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
> -   MODE bytes.  */
> +/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
>
>  static void
> -aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
> -                                             machine_mode mode)
> +aarch64_copy_one_block (rtx *load, rtx *store, rtx src, rtx dst,
> +                       int offset, machine_mode mode)
>  {
> -  /* Handle 256-bit memcpy separately.  We do this by making 2 adjacent memory
> -     address copies using V4SImode so that we can use Q registers.  */
> -  if (known_eq (GET_MODE_BITSIZE (mode), 256))
> +  /* Emit explicit load/store pair instructions for 32-byte copies.  */
> +  if (known_eq (GET_MODE_SIZE (mode), 32))
>      {
>        mode = V4SImode;
> +      rtx src1 = adjust_address (src, mode, offset);
> +      rtx src2 = adjust_address (src, mode, offset + 16);
> +      rtx dst1 = adjust_address (dst, mode, offset);
> +      rtx dst2 = adjust_address (dst, mode, offset + 16);
>        rtx reg1 = gen_reg_rtx (mode);
>        rtx reg2 = gen_reg_rtx (mode);
> -      /* "Cast" the pointers to the correct mode.  */
> -      *src = adjust_address (*src, mode, 0);
> -      *dst = adjust_address (*dst, mode, 0);
> -      /* Emit the memcpy.  */
> -      emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
> -                                       aarch64_progress_pointer (*src)));
> -      emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
> -                                        aarch64_progress_pointer (*dst), reg2));
> -      /* Move the pointers forward.  */
> -      *src = aarch64_move_pointer (*src, 32);
> -      *dst = aarch64_move_pointer (*dst, 32);
> +      *load = aarch64_gen_load_pair (mode, reg1, src1, reg2, src2);
> +      *store = aarch64_gen_store_pair (mode, dst1, reg1, dst2, reg2);
>        return;
>      }
>
>    rtx reg = gen_reg_rtx (mode);
> -
> -  /* "Cast" the pointers to the correct mode.  */
> -  *src = adjust_address (*src, mode, 0);
> -  *dst = adjust_address (*dst, mode, 0);
> -  /* Emit the memcpy.  */
> -  emit_move_insn (reg, *src);
> -  emit_move_insn (*dst, reg);
> -  /* Move the pointers forward.  */
> -  *src = aarch64_progress_pointer (*src);
> -  *dst = aarch64_progress_pointer (*dst);
> +  *load = gen_move_insn (reg, adjust_address (src, mode, offset));
> +  *store = gen_move_insn (adjust_address (dst, mode, offset), reg);
>  }
>
>  /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
>     from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
>     rather than memcpy.  Return true iff we succeeded.  */
>  bool
> -aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
> +aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
>  {
>    if (!TARGET_MOPS)
>      return false;
> @@ -25302,51 +25287,48 @@ aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
>    return true;
>  }
>
> -/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
> -   we succeed, otherwise return false, indicating that a libcall to
> -   memcpy should be emitted.  */
> -
> +/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
> +   OPERANDS are taken from the cpymem/movmem pattern.  IS_MEMMOVE is true
> +   if this is a memmove rather than memcpy.  Return true if we succeed,
> +   otherwise return false, indicating that a libcall should be emitted.  */
>  bool
> -aarch64_expand_cpymem (rtx *operands)
> +aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>  {
> -  int mode_bits;
> +  int mode_bytes;
>    rtx dst = operands[0];
>    rtx src = operands[1];
>    unsigned align = UINTVAL (operands[3]);
>    rtx base;
> -  machine_mode cur_mode = BLKmode;
> -  bool size_p = optimize_function_for_size_p (cfun);
> +  machine_mode cur_mode = BLKmode, next_mode;
>
>    /* Variable-sized or strict-align copies may use the MOPS expansion.  */
>    if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
> -    return aarch64_expand_cpymem_mops (operands);
> +    return aarch64_expand_cpymem_mops (operands, is_memmove);
>
>    unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
>
> -  /* Try to inline up to 256 bytes.  */
> -  unsigned max_copy_size = 256;
> -  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
> +  /* Set inline limits for memmove/memcpy.  MOPS has a separate threshold.  */
> +  unsigned max_copy_size = TARGET_SIMD ? 256 : 128;
> +  unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
> +                                      : aarch64_mops_memcpy_size_threshold;
> +
> +  /* Reduce the maximum size with -Os.  */
> +  if (optimize_function_for_size_p (cfun))
> +    max_copy_size /= 4;
>
>    /* Large copies use MOPS when available or a library call.  */
>    if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
> -    return aarch64_expand_cpymem_mops (operands);
> +    return aarch64_expand_cpymem_mops (operands, is_memmove);
>
> -  int copy_bits = 256;
> +  unsigned copy_max = 32;
>
> -  /* Default to 256-bit LDP/STP on large copies, however small copies, no SIMD
> -     support or slow 256-bit LDP/STP fall back to 128-bit chunks.  */
> +  /* Default to 32-byte LDP/STP on large copies, however small copies, no SIMD
> +     support or slow LDP/STP fall back to 16-byte chunks.  */
>    if (size <= 24
>        || !TARGET_SIMD
>        || (aarch64_tune_params.extra_tuning_flags
>           & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
> -    copy_bits = 128;
> -
> -  /* Emit an inline load+store sequence and count the number of operations
> -     involved.  We use a simple count of just the loads and stores emitted
> -     rather than rtx_insn count as all the pointer adjustments and reg copying
> -     in this function will get optimized away later in the pipeline.  */
> -  start_sequence ();
> -  unsigned nops = 0;
> +    copy_max = 16;
>
>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
>    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> @@ -25354,69 +25336,60 @@ aarch64_expand_cpymem (rtx *operands)
>    base = copy_to_mode_reg (Pmode, XEXP (src, 0));
>    src = adjust_automodify_address (src, VOIDmode, base, 0);
>
> -  /* Convert size to bits to make the rest of the code simpler.  */
> -  int n = size * BITS_PER_UNIT;
> +  const int max_ops = 40;
> +  rtx load[max_ops], store[max_ops];

Please either add a comment explaining why 40 is guaranteed to be
enough, or (my preference) use:

  auto_vec<std::pair<rtx, rtx>, ...> ops;

with safe_push.  Using auto_vec also removes the need to track nops
separately.
>
> -  while (n > 0)
> +  int nops, offset;
> +
> +  for (nops = 0, offset = 0; size > 0; nops++)
>      {
>        /* Find the largest mode in which to do the copy in without over reading
>          or writing.  */
>        opt_scalar_int_mode mode_iter;
>        FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
> -       if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
> +       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
>           cur_mode = mode_iter.require ();
>
> -      gcc_assert (cur_mode != BLKmode);
> +      gcc_assert (cur_mode != BLKmode && nops < max_ops);
>
> -      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
> +      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
>
>        /* Prefer Q-register accesses for the last bytes.  */
> -      if (mode_bits == 128 && copy_bits == 256)
> +      if (mode_bytes == 16 && copy_max == 32)
>         cur_mode = V4SImode;
>
> -      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
> -      /* A single block copy is 1 load + 1 store.  */
> -      nops += 2;
> -      n -= mode_bits;
> +      aarch64_copy_one_block (&load[nops], &store[nops], src, dst, offset, cur_mode);
> +      size -= mode_bytes;
> +      offset += mode_bytes;
>
>        /* Emit trailing copies using overlapping unaligned accesses
> -       (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> -      if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
> +        (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> +      if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
>         {
> -         machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
> -         int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
> -         gcc_assert (n_bits <= mode_bits);
> -         src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
> -         dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
> -         n = n_bits;
> +         next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
> +         int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
> +         gcc_assert (n_bytes <= mode_bytes);
> +         offset -= n_bytes - size;
> +         size = n_bytes;
>         }
>      }
> -  rtx_insn *seq = get_insns ();
> -  end_sequence ();
> -  /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
> -     the constant size into a register.  */
> -  unsigned mops_cost = 3 + 1;
> -
> -  /* If MOPS is available at this point we don't consider the libcall as it's
> -     not a win even on code size.  At this point only consider MOPS if
> -     optimizing for size.  For speed optimizations we will have chosen between
> -     the two based on copy size already.  */
> -  if (TARGET_MOPS)
> -    {
> -      if (size_p && mops_cost < nops)
> -       return aarch64_expand_cpymem_mops (operands);
> -      emit_insn (seq);
> -      return true;
> -    }
>
> -  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
> -     arguments + 1 for the call.  When MOPS is not available and we're
> -     optimizing for size a libcall may be preferable.  */
> -  unsigned libcall_cost = 4;
> -  if (size_p && libcall_cost < nops)
> -    return false;
> +  /* Memcpy interleaves loads with stores, memmove emits all loads first.  */
> +  int i, j, m, inc;
> +  inc = is_memmove ? nops : 3;
> +  if (nops == inc + 1)
> +    inc = nops / 2;

I realise this is personal preference, sorry, but I think this would
be better as:

  inc = is_memmove ? nops : nops == 4 ? 2 : 3;

since it separates the correctness requirement for is_memmove from the
optimisation of interleaving two accesses for nops==4.
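
To spell out the grouping that gives, purely for illustration:

  /* memmove:            inc == nops -> all loads, then all stores
     memcpy, nops == 4:  inc == 2    -> LD LD ST ST  LD LD ST ST
     memcpy, otherwise:  inc == 3    -> LD LD LD ST ST ST  ...  */
  inc = is_memmove ? nops : nops == 4 ? 2 : 3;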

> +  for (i = 0; i < nops; i += inc)
> +    {
> +      m = inc;
> +      if (i + m > nops)
> +       m = nops - i;
>
> -  emit_insn (seq);
> +      for (j = 0; j < m; j++)
> +       emit_insn (load[i + j]);
> +      for (j = 0; j < m; j++)
> +       emit_insn (store[i + j]);
> +    }

Also personal preference, but how about:

    m = MIN (i + inc, nops);
    for (j = i; j < m; j++)
      emit_insn (load[j]);
    for (j = i; j < m; j++)
      emit_insn (store[j]);

(adjusted for the comments above).  Not a strong opinion though, so feel
free to keep your version.

>    return true;
>  }
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 1cb3a01d6791a48dc0b08df5783d97805448c7f2..18dd629c2456041b1185eae6d39de074709b2a39 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1629,7 +1629,7 @@ (define_expand "cpymemdi"
>     (match_operand:DI 3 "immediate_operand")]
>     ""
>  {
> -  if (aarch64_expand_cpymem (operands))
> +  if (aarch64_expand_cpymem (operands, false))
>      DONE;
>    FAIL;
>  }
> @@ -1673,17 +1673,9 @@ (define_expand "movmemdi"
>     (match_operand:BLK 1 "memory_operand")
>     (match_operand:DI 2 "general_operand")
>     (match_operand:DI 3 "immediate_operand")]
> -   "TARGET_MOPS"
> +   ""
>  {
> -   rtx sz_reg = operands[2];
> -   /* For constant-sized memmoves check the threshold.
> -      FIXME: We should add a non-MOPS memmove expansion for smaller,
> -      constant-sized memmove to avoid going to a libcall.  */
> -   if (CONST_INT_P (sz_reg)
> -       && INTVAL (sz_reg) < aarch64_mops_memmove_size_threshold)
> -     FAIL;
> -
> -  if (aarch64_expand_cpymem_mops (operands, true))
> +  if (aarch64_expand_cpymem (operands, true))
>      DONE;
>    FAIL;
>  }
> diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
> index f5a518202a157b5b5bc2b2aa14ac1177fded7d66..0ac9d8c578d706e7bf0f0ae399d84544f0c619dc 100644
> --- a/gcc/config/aarch64/aarch64.opt
> +++ b/gcc/config/aarch64/aarch64.opt
> @@ -327,7 +327,7 @@ Target Joined UInteger Var(aarch64_mops_memcpy_size_threshold) Init(256) Param
>  Constant memcpy size in bytes above which to start using MOPS sequence.
>
>  -param=aarch64-mops-memmove-size-threshold=
> -Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(0) Param
> +Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(256) Param
>  Constant memmove size in bytes above which to start using MOPS sequence.
>
>  -param=aarch64-mops-memset-size-threshold=
> diff --git a/gcc/testsuite/gcc.target/aarch64/memmove.c b/gcc/testsuite/gcc.target/aarch64/memmove.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..6926a97761eb2578d3f1db7e6eb19dba17b888be
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/memmove.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */

Probably best to add:

#pragma GCC target "+nomops"

and have another copy of the test with -mstrict-align.  Keeping the
current test (as a third testcase) is fine too of course.
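
E.g. something along these lines for the strict-align variant (illustrative
only; the dg-final checks would need to match whatever the strict-align
path is expected to generate):

  /* { dg-do compile } */
  /* { dg-options "-O2 -mstrict-align" } */

  #pragma GCC target "+nomops"

  void
  copy1 (int *x, int *y)
  {
    __builtin_memmove (x, y, 12);
  }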

Otherwise it looks good.  You'd know the appropriate thresholds
better than me.

Thanks,
Richard

> +
> +void
> +copy1 (int *x, int *y)
> +{
> +  __builtin_memmove (x, y, 12);
> +}
> +
> +void
> +copy2 (int *x, int *y)
> +{
> +  __builtin_memmove (x, y, 128);
> +}
> +
> +void
> +copy3 (int *x, int *y)
> +{
> +  __builtin_memmove (x, y, 255);
> +}
> +
> +/* { dg-final { scan-assembler-not {\tb\tmemmove} } } */
