On 14/11/2023 16:23, Wilco Dijkstra wrote:
Hi,

I checked code size on SPECINT2017, and 96 gave practically identical size.
Using 128 would also be a reasonable -Os value with a very slight size
increase, and 384 looks good for -O2 - however I didn't want to tune these
values as this is a cleanup patch.

Cheers,
Wilco

Shouldn't this be a param then?  Also, manifest constants in the middle
of code are a potential nightmare; please move it to a #define (even if
that's then used as the default value for the param).
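
For the record, if a param is wanted it could default to the #define, along
the lines of the existing aarch64-mops-memset-size-threshold entry in
aarch64.opt (the param name and Init value below are purely illustrative,
not part of the patch):

    -param=aarch64-inline-memset-max-size=
    Target Joined UInteger Var(aarch64_inline_memset_max_size) Init(256) Param
    Maximum memset size in bytes to expand inline rather than via a libcall or MOPS.

Init() could then take the #define so the constant still lives in one place,
though a single param wouldn't distinguish the -Os and -O2 limits.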

I agree on making this a #define but I wouldn't insist on a param.
Code size IMO has a much more consistent right or wrong answer, as it's
statically determinable.
If this were a speed-related param then I'd expect the flexibility for the
power user to override such heuristics to be more widely useful.
But for code size the compiler should always be able to get it right.

If Richard would still like the param then I'm fine with having the param, but 
I'd be okay with the comment above and making this a #define.

I don't immediately have a feel for how sensitive code would be to the
precise value here.  Is this value something that might affect
individual benchmarks in different ways?  Or something where a future
architecture might want a different value?  For either of those reasons
a param might be useful, but if this is primarily a code size trade off
and the variation in performance is small, then it's probably not
worthwhile having an additional hook.

These are just settings that are good for -Os and -O2. I might tune them once
every 5 years or so, but that's all that is needed. I don't believe there is any
value in giving users too many unnecessary options. Adding the configurable
MOPS settings introduced several bugs that went unnoticed despite multiple
code reviews, so doing this creates extra testing and maintenance overheads.

Cheers,
Wilco

---
v2: Add define MAX_SET_SIZE

Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
bytes throughout.  Simplify the complex calculations when optimizing for size
by using a fixed limit.

Passes regress/bootstrap, OK for commit?

gcc/ChangeLog:
         * config/aarch64/aarch64.h (MAX_SET_SIZE): New define.
         * config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove function.
         (aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
         (aarch64_expand_setmem): Clean up implementation, use byte offsets,
         simplify size calculation.

---

diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 2f0777a37acccb787200d15ae89ec186b4221748..1d98b48db43e09ecf8c4289a8cd4fc55cc2c8a26 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -998,6 +998,10 @@ typedef struct
     mode that should actually be used.  We allow pairs of registers.  */
  #define MAX_FIXED_MODE_SIZE GET_MODE_BITSIZE (TImode)
+/* Maximum bytes set for an inline memset expansion. With -Os use 3 STP
+   and 1 MOVI/DUP (same size as a call).  */
+#define MAX_SET_SIZE(speed) (speed ? 256 : 96)

So it looks like this assumes we have AdvSIMD. What about -mgeneral-regs-only?

R.

+
  /* Maximum bytes moved by a single instruction (load/store pair).  */
  #define MOVE_MAX (UNITS_PER_WORD * 2)
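
(For concreteness, the -Os budget in the MAX_SET_SIZE comment corresponds to
a sequence roughly like the following for a 96-byte memset - registers and
store value illustrative, and assuming AdvSIMD is available, which is exactly
the -mgeneral-regs-only question above:

        movi    v0.16b, #42        // duplicate the store value
        stp     q0, q0, [x0]       // bytes 0-31
        stp     q0, q0, [x0, #32]  // bytes 32-63
        stp     q0, q0, [x0, #64]  // bytes 64-95

Four instructions, matching the worst-case libcall sequence of three
argument-setup instructions plus the bl.)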
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 5a22b576710e29795d65ddf3face9e8587b1df88..83a18b35729ddd07a1925f53a77bc21c9ac7ca36 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25415,8 +25415,7 @@ aarch64_progress_pointer (rtx pointer)
    return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
  }
-/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
-   MODE bytes.  */
+/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
static void
  aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
@@ -25597,46 +25596,22 @@ aarch64_expand_cpymem (rtx *operands)
    return true;
  }
-/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
-   SRC is a register we have created with the duplicated value to be set.  */
+/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
  static void
-aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-                                           machine_mode mode)
+aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
  {
-  /* If we are copying 128bits or 256bits, we can do that straight from
-     the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-    {
-      mode = GET_MODE (src);
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, mode, 0);
-      /* Emit the memset.  */
-      emit_insn (aarch64_gen_store_pair (mode, *dst, src,
-                                        aarch64_progress_pointer (*dst), src));
-
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 32);
-      return;
-    }
-  if (known_eq (GET_MODE_BITSIZE (mode), 128))
+  /* Emit explicit store pair instructions for 32-byte writes.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
      {
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, GET_MODE (src), 0);
-      /* Emit the memset.  */
-      emit_move_insn (*dst, src);
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 16);
+      mode = V16QImode;
+      rtx dst1 = adjust_address (dst, mode, offset);
+      rtx dst2 = adjust_address (dst, mode, offset + 16);
+      emit_insn (aarch64_gen_store_pair (mode, dst1, src, dst2, src));
        return;
      }
-  /* For copying less, we have to extract the right amount from src.  */
-  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
-
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, reg);
-  /* Move the pointer forward.  */
-  *dst = aarch64_progress_pointer (*dst);
+  if (known_lt (GET_MODE_SIZE (mode), 16))
+    src = lowpart_subreg (mode, src, GET_MODE (src));
+  emit_move_insn (adjust_address (dst, mode, offset), src);
  }

 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
@@ -25665,7 +25640,7 @@ aarch64_expand_setmem_mops (rtx *operands)
  bool
  aarch64_expand_setmem (rtx *operands)
  {
-  int n, mode_bits;
+  int mode_bytes;
    unsigned HOST_WIDE_INT len;
    rtx dst = operands[0];
    rtx val = operands[2], src;
@@ -25678,11 +25653,9 @@ aarch64_expand_setmem (rtx *operands)
        || (STRICT_ALIGNMENT && align < 16))
      return aarch64_expand_setmem_mops (operands);

-  bool size_p = optimize_function_for_size_p (cfun);
-
    /* Default the maximum to 256-bytes when considering only libcall vs
       SIMD broadcast sequence.  */
-  unsigned max_set_size = 256;
+  unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
    unsigned mops_threshold = aarch64_mops_memset_size_threshold;

   len = UINTVAL (operands[1]);
@@ -25691,91 +25664,55 @@ aarch64_expand_setmem (rtx *operands)
    if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
      return aarch64_expand_setmem_mops (operands);

-  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
-  /* The MOPS sequence takes:
-     3 instructions for the memory storing
-     + 1 to move the constant size into a reg
-     + 1 if VAL is a non-zero constant to move into a reg
-    (zero constants can use XZR directly).  */
-  unsigned mops_cost = 3 + 1 + cst_val;
-  /* A libcall to memset in the worst case takes 3 instructions to prepare
-     the arguments + 1 for the call.  */
-  unsigned libcall_cost = 4;
-
-  /* Attempt a sequence with a vector broadcast followed by stores.
-     Count the number of operations involved to see if it's worth it
-     against the alternatives.  A simple counter simd_ops on the
-     algorithmically-relevant operations is used rather than an rtx_insn count
-     as all the pointer adjusmtents and mode reinterprets will be optimized
-     away later.  */
-  start_sequence ();
-  unsigned simd_ops = 0;
-
    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
    dst = adjust_automodify_address (dst, VOIDmode, base, 0);

   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
    src = expand_vector_broadcast (V16QImode, val);
    src = force_reg (V16QImode, src);
-  simd_ops++;
-  /* Convert len to bits to make the rest of the code simpler.  */
-  n = len * BITS_PER_UNIT;

-  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
-     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
-  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
-                         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
-                         ? GET_MODE_BITSIZE (TImode) : 256;
+  /* Set maximum amount to write in one go.  We allow 32-byte chunks based
+     on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
+  unsigned set_max = 32;

-  while (n > 0)
+  if (len <= 24 || (aarch64_tune_params.extra_tuning_flags
+                   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+    set_max = 16;
+
+  int offset = 0;
+  while (len > 0)
      {
        /* Find the largest mode in which to do the copy without
         over writing.  */
        opt_scalar_int_mode mode_iter;
        FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-       if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
+       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, set_max))
          cur_mode = mode_iter.require ();
       gcc_assert (cur_mode != BLKmode);

-      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
-      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
-      simd_ops++;
-      n -= mode_bits;
+      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
+
+      /* Prefer Q-register accesses for the last bytes.  */
+      if (mode_bytes == 16)
+       cur_mode = V16QImode;
+
+      aarch64_set_one_block (src, dst, offset, cur_mode);
+      len -= mode_bytes;
+      offset += mode_bytes;

       /* Emit trailing writes using overlapping unaligned accesses
-       (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
+        (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
+      if (len > 0 && len < set_max / 2 && !STRICT_ALIGNMENT)
        {
-         next_mode = smallest_mode_for_size (n, MODE_INT);
-         int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
-         gcc_assert (n_bits <= mode_bits);
-         dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
-         n = n_bits;
+         next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
+         int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
+         gcc_assert (n_bytes <= mode_bytes);
+         offset -= n_bytes - len;
+         len = n_bytes;
        }
      }
-  rtx_insn *seq = get_insns ();
-  end_sequence ();

-  if (size_p)
-    {
-      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
-        call to memset or the MOPS expansion.  */
-      if (TARGET_MOPS
-         && mops_cost <= libcall_cost
-         && mops_cost <= simd_ops)
-       return aarch64_expand_setmem_mops (operands);
-      /* If MOPS is not available or not shorter pick a libcall if the SIMD
-        sequence is too long.  */
-      else if (libcall_cost < simd_ops)
-       return false;
-      emit_insn (seq);
-      return true;
-    }
-
-  /* At this point the SIMD broadcast sequence is the best choice when
-     optimizing for speed.  */
-  emit_insn (seq);
    return true;
  }
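
(To make the new byte/offset logic concrete, a hand trace - my own
illustration, not part of the patch - for len = 41, where set_max = 32
because len > 24 and Q-register STPs are enabled, and !STRICT_ALIGNMENT:

  iteration 1: the largest int mode <= MIN (41, 32) selects a 32-byte
               chunk; aarch64_set_one_block emits an STP of two Q
               registers at offset 0; len = 9, offset = 32.
  tail fix-up: 0 < 9 < 16, so next_mode is TImode (16 bytes) and
               offset -= 16 - 9, giving offset = 25, len = 16.
  iteration 2: one 16-byte V16QI store at offset 25, overlapping bytes
               25-31 that were already written.

Two store instructions plus the initial MOVI cover all 41 bytes, instead
of a byte/halfword/word tail sequence.)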
