Kyrylo Tkachov via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> @@ -23568,6 +23568,28 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
>    *dst = aarch64_progress_pointer (*dst);
>  }
>  
> +/* Expand a cpymem using the MOPS extension.  OPERANDS are taken
> +   from the cpymem pattern.  Return true iff we succeeded.  */
> +static bool
> +aarch64_expand_cpymem_mops (rtx *operands)
> +{
> +  if (!TARGET_MOPS)
> +    return false;
> +  rtx addr_dst = XEXP (operands[0], 0);
> +  rtx addr_src = XEXP (operands[1], 0);
> +  rtx sz_reg = operands[2];
> +
> +  if (!REG_P (sz_reg))
> +    sz_reg = force_reg (DImode, sz_reg);
> +  if (!REG_P (addr_dst))
> +    addr_dst = force_reg (DImode, addr_dst);
> +  if (!REG_P (addr_src))
> +    addr_src = force_reg (DImode, addr_src);
> +  emit_insn (gen_aarch64_cpymemdi (addr_dst, addr_src, sz_reg));
> +
> +  return true;
> +}

On this, I think it would be better to adjust the original src and dst
MEMs if possible, since they carry metadata such as the size of the
access and the alias set information.  It looks like the code above
instead generates an instruction with a wild read and a wild write.

It should be possible to do that with a define_expand/define_insn
pair, where the define_expand takes two extra operands for the MEMs,
but the define_insn contains the same operands as now.
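
For example (an untested sketch; the operand numbering is only for
illustration, and the existing insn would need a new name such as
"*aarch64_cpymemdi" to avoid clashing with the expander):

(define_expand "aarch64_cpymemdi"
  [(parallel
     [(set (match_operand:DI 2 "register_operand") (const_int 0))
      (clobber (match_operand:DI 0 "register_operand"))
      (clobber (match_operand:DI 1 "register_operand"))
      (set (match_operand:BLK 3 "memory_operand")
           (unspec:BLK [(match_operand:BLK 4 "memory_operand")
                        (match_dup 2)] UNSPEC_CPYMEM))])]
  "TARGET_MOPS"
)

The caller would then pass MEMs whose addresses are the registers in
operands 0 and 1, so that the emitted rtl still matches the insn's
(mem:BLK (match_dup 0)) form while the MEMs keep their attributes.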

Since the instruction clobbers all three registers, I think we have to
use copy_to_reg (unconditionally) to force a fresh register for each
operand.  The ultimate caller is not expecting the values of the
registers in the original addresses to change.
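
A sketch of what the caller could then look like (again untested, and
assuming the five-operand expander shape above):

static bool
aarch64_expand_cpymem_mops (rtx *operands)
{
  if (!TARGET_MOPS)
    return false;

  /* The instruction changes all three registers, so give each of the
     destination address, source address and size a fresh register.  */
  rtx dst_addr = copy_to_reg (XEXP (operands[0], 0));
  rtx src_addr = copy_to_reg (XEXP (operands[1], 0));
  rtx sz_reg = copy_to_reg (operands[2]);

  /* Reuse the original MEMs, so that their size and alias set
     information is kept, but give them the fresh address registers.  */
  rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
  rtx src_mem = replace_equiv_address (operands[1], src_addr);

  emit_insn (gen_aarch64_cpymemdi (dst_addr, src_addr, sz_reg,
                                   dst_mem, src_mem));
  return true;
}

copy_to_reg always copies into a fresh pseudo, so it also subsumes the
force_reg calls in the version above.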

Thanks,
Richard



> +
>  /* Expand cpymem, as if from a __builtin_memcpy.  Return true if
>     we succeed, otherwise return false, indicating that a libcall to
>     memcpy should be emitted.  */
> @@ -23581,19 +23603,25 @@ aarch64_expand_cpymem (rtx *operands)
>    rtx base;
>    machine_mode cur_mode = BLKmode;
>  
> -  /* Only expand fixed-size copies.  */
> +  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
>    if (!CONST_INT_P (operands[2]))
> -    return false;
> +    return aarch64_expand_cpymem_mops (operands);
>  
>    unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
>  
> -  /* Try to inline up to 256 bytes.  */
> -  unsigned HOST_WIDE_INT max_copy_size = 256;
> +  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
> +  unsigned HOST_WIDE_INT max_copy_size
> +    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
>  
>    bool size_p = optimize_function_for_size_p (cfun);
>  
> +  /* Large constant-sized cpymem should go through MOPS when possible.
> +     It should be a win even for size optimization in the general case.
> +     For speed optimization the choice between MOPS and the SIMD sequence
> +     depends on the size of the copy, rather than number of instructions,
> +     alignment etc.  */
>    if (size > max_copy_size)
> -    return false;
> +    return aarch64_expand_cpymem_mops (operands);
>  
>    int copy_bits = 256;
>  
> @@ -23643,9 +23671,9 @@ aarch64_expand_cpymem (rtx *operands)
>        nops += 2;
>        n -= mode_bits;
>  
> -      /* Emit trailing copies using overlapping unaligned accesses - this is
> -      smaller and faster.  */
> -      if (n > 0 && n < copy_bits / 2)
> +      /* Emit trailing copies using overlapping unaligned accesses
> +     (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> +      if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
>       {
>         machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
>         int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
> @@ -23657,9 +23685,25 @@ aarch64_expand_cpymem (rtx *operands)
>      }
>    rtx_insn *seq = get_insns ();
>    end_sequence ();
> +  /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
> +     the constant size into a register.  */
> +  unsigned mops_cost = 3 + 1;
> +
> +  /* If MOPS is available at this point we don't consider the libcall as it's
> +     not a win even on code size.  At this point only consider MOPS if
> +     optimizing for size.  For speed optimizations we will have chosen between
> +     the two based on copy size already.  */
> +  if (TARGET_MOPS)
> +    {
> +      if (size_p && mops_cost < nops)
> +     return aarch64_expand_cpymem_mops (operands);
> +      emit_insn (seq);
> +      return true;
> +    }
>  
>    /* A memcpy libcall in the worst case takes 3 instructions to prepare the
> -     arguments + 1 for the call.  */
> +     arguments + 1 for the call.  When MOPS is not available and we're
> +     optimizing for size a libcall may be preferable.  */
>    unsigned libcall_cost = 4;
>    if (size_p && libcall_cost < nops)
>      return false;
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 5297b2d3f95744ac72e36814c6676cc97478d48b..d623c1b00bf62bf24420813fb7a3a6bf09ff1dc0 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -143,6 +143,7 @@ (define_c_enum "unspec" [
>      UNSPEC_AUTIBSP
>      UNSPEC_CALLEE_ABI
>      UNSPEC_CASESI
> +    UNSPEC_CPYMEM
>      UNSPEC_CRC32B
>      UNSPEC_CRC32CB
>      UNSPEC_CRC32CH
> @@ -1572,6 +1573,18 @@ (define_split
>    }
>  )
>  
> +(define_insn "aarch64_cpymemdi"
> +  [(parallel [
> +   (set (match_operand:DI 2 "register_operand" "+&r") (const_int 0))
> +   (clobber (match_operand:DI 0 "register_operand" "+&r"))
> +   (clobber (match_operand:DI 1 "register_operand" "+&r"))
> +   (set (mem:BLK (match_dup 0))
> +        (unspec:BLK [(mem:BLK (match_dup 1)) (match_dup 2)] UNSPEC_CPYMEM))])]
> + "TARGET_MOPS"
> + "cpyfp\t[%x0]!, [%x1]!, %x2!\;cpyfm\t[%x0]!, [%x1]!, %x2!\;cpyfe\t[%x0]!, 
> [%x1]!, %x2!"
> + [(set_attr "length" "12")]
> +)
> +
>  ;; 0 is dst
>  ;; 1 is src
>  ;; 2 is size of copy in bytes
> @@ -1580,9 +1593,9 @@ (define_split
>  (define_expand "cpymemdi"
>    [(match_operand:BLK 0 "memory_operand")
>     (match_operand:BLK 1 "memory_operand")
> -   (match_operand:DI 2 "immediate_operand")
> +   (match_operand:DI 2 "general_operand")
>     (match_operand:DI 3 "immediate_operand")]
> -   "!STRICT_ALIGNMENT"
> +   "!STRICT_ALIGNMENT || TARGET_MOPS"
>  {
>    if (aarch64_expand_cpymem (operands))
>      DONE;
> diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
> index 32191cf1acf43302c4a544b85db60b7b6e14da48..7445ed106cca9cb8a4537414499f6f28476951bf 100644
> --- a/gcc/config/aarch64/aarch64.opt
> +++ b/gcc/config/aarch64/aarch64.opt
> @@ -280,3 +280,7 @@ Target Joined UInteger Var(aarch64_autovec_preference) Init(0) IntegerRange(0, 4
>  
>  -param=aarch64-loop-vect-issue-rate-niters=
>  Target Joined UInteger Var(aarch64_loop_vect_issue_rate_niters) Init(6) IntegerRange(0, 65536) Param
> +
> +-param=aarch64-mops-memcpy-size-threshold=
> +Target Joined UInteger Var(aarch64_mops_memcpy_size_threshold) Init(256) Param
> +Constant memcpy size in bytes above which to start using MOPS sequence.
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 510ed079b99374028e38d20f3edbac12d75f7842..9c38277e4f317332088555c9948ef19935ae890c 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -19135,6 +19135,9 @@ prior to Armv8.2-A is not supported.
>  @item ls64
>  Enable the 64-byte atomic load and store instructions for accelerators.
>  This option is enabled by default for @option{-march=armv8.7-a}.
> +@item mops
> +Enable the instructions to accelerate memory operations like @code{memcpy},
> +@code{memmove}, @code{memset}.
>  @item flagm
>  Enable the Flag Manipulation instructions Extension.
>  @item pauth
> diff --git a/gcc/testsuite/gcc.target/aarch64/mops_1.c b/gcc/testsuite/gcc.target/aarch64/mops_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..661c14192e8a84fd4641a4d818b8db46ab4f1b28
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/mops_1.c
> @@ -0,0 +1,57 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=armv8.6-a+mops --param=aarch64-mops-memcpy-size-threshold=0" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <stdlib.h>
> +
> +/* We want to inline variable-sized memcpy.
> +** do_it_cpy:
> +**   cpyfp   \[x1\]\!, \[x0\]\!, x2\!
> +**   cpyfm   \[x1\]\!, \[x0\]\!, x2\!
> +**   cpyfe   \[x1\]\!, \[x0\]\!, x2\!
> +**   ret
> +*/
> +void do_it_cpy (char * in, char * out, size_t size)
> +{
> +  __builtin_memcpy (out, in, size);
> +}
> +
> +/*
> +** do_it_cpy_large:
> +**   mov     x2, 1024
> +**   cpyfp   \[x1\]\!, \[x0\]!, x2\!
> +**   cpyfm   \[x1\]\!, \[x0\]!, x2\!
> +**   cpyfe   \[x1\]\!, \[x0\]\!, x2\!
> +**   ret
> +*/
> +void do_it_cpy_large (char * in, char * out)
> +{
> +  __builtin_memcpy (out, in, 1024);
> +}
> +
> +/*
> +** do_it_cpy_127:
> +**   mov     x2, 127
> +**   cpyfp   \[x1\]\!, \[x0\]!, x2\!
> +**   cpyfm   \[x1\]\!, \[x0\]!, x2\!
> +**   cpyfe   \[x1\]\!, \[x0\]\!, x2\!
> +**   ret
> +*/
> +void do_it_cpy_127 (char * in, char * out)
> +{
> +  __builtin_memcpy (out, in, 127);
> +}
> +
> +/*
> +** do_it_cpy_128:
> +**   mov     x2, 128
> +**   cpyfp   \[x1\]\!, \[x0\]!, x2\!
> +**   cpyfm   \[x1\]\!, \[x0\]!, x2\!
> +**   cpyfe   \[x1\]\!, \[x0\]\!, x2\!
> +**   ret
> +*/
> +void do_it_cpy_128 (char * in, char * out)
> +{
> +  __builtin_memcpy (out, in, 128);
> +}
> +
