https://gcc.gnu.org/g:19b23bf3c32df3cbb96b3d898a1d7142f7bea4a0

commit r14-9373-g19b23bf3c32df3cbb96b3d898a1d7142f7bea4a0
Author: Wilco Dijkstra <wilco.dijks...@arm.com>
Date:   Wed Feb 21 23:33:58 2024 +0000

    AArch64: memcpy/memset expansions should not emit LDP/STP [PR113618]
    
    The new RTL introduced for LDP/STP results in regressions due to use of UNSPEC.
    Given the new LDP fusion pass is good at finding LDP opportunities, change the
    memcpy, memmove and memset expansions to emit single vector loads/stores.
    This fixes the regression and enables more RTL optimization on the standard
    memory accesses.  Handling of unaligned tail of memcpy/memmove is improved
    with -mgeneral-regs-only.  SPEC2017 performance improves slightly.  Codesize
    is a bit worse due to missed LDP opportunities as discussed in the PR.
    
    gcc/ChangeLog:
            PR target/113618
            * config/aarch64/aarch64.cc (aarch64_copy_one_block): Remove.
            (aarch64_expand_cpymem): Emit single load/store only.
            (aarch64_set_one_block): Emit single stores only.
    
    gcc/testsuite/ChangeLog:
            PR target/113618
            * gcc.target/aarch64/pr113618.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64.cc               | 68 +++++++++--------------------
 gcc/testsuite/gcc.target/aarch64/pr113618.c | 36 +++++++++++++++
 2 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 16318bf9258..0a28e033088 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26465,33 +26465,6 @@ aarch64_progress_pointer (rtx pointer)
   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
 }
 
-typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
-
-/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
-static void
-aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
-                       int offset, machine_mode mode)
-{
-  /* Emit explict load/store pair instructions for 32-byte copies.  */
-  if (known_eq (GET_MODE_SIZE (mode), 32))
-    {
-      mode = V4SImode;
-      rtx src1 = adjust_address (src, mode, offset);
-      rtx dst1 = adjust_address (dst, mode, offset);
-      rtx reg1 = gen_reg_rtx (mode);
-      rtx reg2 = gen_reg_rtx (mode);
-      rtx load = aarch64_gen_load_pair (reg1, reg2, src1);
-      rtx store = aarch64_gen_store_pair (dst1, reg1, reg2);
-      ops.safe_push ({ load, store });
-      return;
-    }
-
-  rtx reg = gen_reg_rtx (mode);
-  rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
-  rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
-  ops.safe_push ({ load, store });
-}
-
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
    from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
    rather than memcpy.  Return true iff we succeeded.  */
@@ -26527,7 +26500,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   rtx src = operands[1];
   unsigned align = UINTVAL (operands[3]);
   rtx base;
-  machine_mode cur_mode = BLKmode, next_mode;
+  machine_mode mode = BLKmode, next_mode;
 
   /* Variable-sized or strict-align copies may use the MOPS expansion.  */
   if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
@@ -26550,16 +26523,12 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
     return aarch64_expand_cpymem_mops (operands, is_memmove);
 
-  unsigned copy_max = 32;
-
-  /* Default to 32-byte LDP/STP on large copies, however small copies, no SIMD
-     support or slow LDP/STP fall back to 16-byte chunks.
-
+  /* Default to 32-byte LDP/STP on large copies, however small copies or
+     no SIMD support fall back to 16-byte chunks.
      ??? Although it would be possible to use LDP/STP Qn in streaming mode
      (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
      whether that would improve performance.  */
-  if (size <= 24 || !use_ldpq)
-    copy_max = 16;
+  bool use_qregs = size > 24 && TARGET_SIMD;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -26567,7 +26536,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
-  copy_ops ops;
+  auto_vec<std::pair<rtx, rtx>, 16> ops;
   int offset = 0;
 
   while (size > 0)
@@ -26576,23 +26545,27 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
         or writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
-         cur_mode = mode_iter.require ();
+       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
+         mode = mode_iter.require ();
 
-      gcc_assert (cur_mode != BLKmode);
+      gcc_assert (mode != BLKmode);
+
+      mode_bytes = GET_MODE_SIZE (mode).to_constant ();
 
-      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
+      /* Prefer Q-register accesses.  */
+      if (mode_bytes == 16 && use_qregs)
+       mode = V4SImode;
 
-      /* Prefer Q-register accesses for the last bytes.  */
-      if (mode_bytes == 16 && copy_max == 32)
-       cur_mode = V4SImode;
-      aarch64_copy_one_block (ops, src, dst, offset, cur_mode);
+      rtx reg = gen_reg_rtx (mode);
+      rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
+      rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
+      ops.safe_push ({ load, store });
       size -= mode_bytes;
       offset += mode_bytes;
 
       /* Emit trailing copies using overlapping unaligned accesses
         (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
+      if (size > 0 && size < 16 && !STRICT_ALIGNMENT)
        {
          next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
          int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
@@ -26604,7 +26577,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
 
   /* Memcpy interleaves loads with stores, memmove emits all loads first.  */
   int nops = ops.length();
-  int inc = is_memmove ? nops : nops == 4 ? 2 : 3;
+  int inc = is_memmove || nops <= 8 ? nops : 6;
 
   for (int i = 0; i < nops; i += inc)
     {
@@ -26633,7 +26606,8 @@ aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
       /* "Cast" the *dst to the correct mode.  */
       *dst = adjust_address (*dst, mode, 0);
       /* Emit the memset.  */
-      emit_insn (aarch64_gen_store_pair (*dst, src, src));
+      emit_move_insn (*dst, src);
+      emit_move_insn (aarch64_move_pointer (*dst, 16), src);
 
       /* Move the pointers forward.  */
       *dst = aarch64_move_pointer (*dst, 32);
diff --git a/gcc/testsuite/gcc.target/aarch64/pr113618.c b/gcc/testsuite/gcc.target/aarch64/pr113618.c
new file mode 100644
index 00000000000..f582360e7c1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr113618.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+
+/*
+** move32:
+**     ...
+**     ldp     q([0-9]+), q([0-9]+), \[x0\]
+**     stp     q\1, q\2, \[x1\]
+**     ...
+*/
+
+void move32 (char *a, char *b)
+{
+    char temp[32];
+    __builtin_memcpy (temp, a, 32);
+    __builtin_memcpy (b, temp, 32);
+}
+
+/*
+** move64:
+**     ...
+**     ldp     q([0-9]+), q([0-9]+), \[x0\]
+**     ldp     q([0-9]+), q([0-9]+), \[x0, 32\]
+**     stp     q\1, q\2, \[x1\]
+**     stp     q\3, q\4, \[x1, 32\]
+**     ...
+*/
+
+void move64 (char *a, char *b)
+{
+    char temp[64];
+    __builtin_memcpy (temp, a, 64);
+    __builtin_memcpy (b, temp, 64);
+}

Reply via email to