Hi Richard, >> +#define MAX_SET_SIZE(speed) (speed ? 256 : 96) > > Since this isn't (AFAIK) a standard macro, there doesn't seem to be > any need to put it in the header file. It could just go at the head > of aarch64.cc instead.
Sure, I've moved it in v4. >> + if (len <= 24 || (aarch64_tune_params.extra_tuning_flags >> + & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)) >> + set_max = 16; > > I think we should take the tuning parameter into account when applying > the MAX_SET_SIZE limit for -Os. Shouldn't it be 48 rather than 96 in > that case? (Alternatively, I suppose it would make sense to ignore > the param for -Os, although we don't seem to do that elsewhere.) That tune is only used by an obsolete core. I ran the memcpy and memset benchmarks from Optimized Routines on xgene-1 with and without LDP/STP. There is no measurable penalty for using LDP/STP. I'm not sure why it was ever added given it does not do anything useful. I'll post a separate patch to remove it to reduce the maintenance overhead. Cheers, Wilco Here is v4 (move MAX_SET_SIZE definition to aarch64.cc): Cleanup memset implementation. Similar to memcpy/memmove, use an offset and bytes throughout. Simplify the complex calculations when optimizing for size by using a fixed limit. Passes regress/bootstrap, OK for commit? gcc/ChangeLog: * config/aarch64/aarch64.cc (MAX_SET_SIZE): New define. (aarch64_progress_pointer): Remove function. (aarch64_set_one_block_and_progress_pointer): Simplify and clean up. (aarch64_expand_setmem): Clean up implementation, use byte offsets, simplify size calculation. --- diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index a5a6b52730d6c5013346d128e89915883f1707ae..62f4eee429c1c5195d54604f1d341a8a5a499d89 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -101,6 +101,10 @@ /* Defined for convenience. */ #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT) +/* Maximum bytes set for an inline memset expansion. With -Os use 3 STP + and 1 MOVI/DUP (same size as a call). */ +#define MAX_SET_SIZE(speed) (speed ? 256 : 96) + /* Flags that describe how a function shares certain architectural state with its callers. 
@@ -26321,15 +26325,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount) next, amount); } -/* Return a new RTX holding the result of moving POINTER forward by the - size of the mode it points to. */ - -static rtx -aarch64_progress_pointer (rtx pointer) -{ - return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer))); -} - typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops; /* Copy one block of size MODE from SRC to DST at offset OFFSET. */ @@ -26484,45 +26479,21 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove) return true; } -/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where - SRC is a register we have created with the duplicated value to be set. */ +/* Set one block of size MODE at DST at offset OFFSET to value in SRC. */ static void -aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst, - machine_mode mode) +aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode) { - /* If we are copying 128bits or 256bits, we can do that straight from - the SIMD register we prepared. */ - if (known_eq (GET_MODE_BITSIZE (mode), 256)) - { - mode = GET_MODE (src); - /* "Cast" the *dst to the correct mode. */ - *dst = adjust_address (*dst, mode, 0); - /* Emit the memset. */ - emit_insn (aarch64_gen_store_pair (*dst, src, src)); - - /* Move the pointers forward. */ - *dst = aarch64_move_pointer (*dst, 32); - return; - } - if (known_eq (GET_MODE_BITSIZE (mode), 128)) + /* Emit explicit store pair instructions for 32-byte writes. */ + if (known_eq (GET_MODE_SIZE (mode), 32)) { - /* "Cast" the *dst to the correct mode. */ - *dst = adjust_address (*dst, GET_MODE (src), 0); - /* Emit the memset. */ - emit_move_insn (*dst, src); - /* Move the pointers forward. */ - *dst = aarch64_move_pointer (*dst, 16); + mode = V16QImode; + rtx dst1 = adjust_address (dst, mode, offset); + emit_insn (aarch64_gen_store_pair (dst1, src, src)); return; } - /* For copying less, we have to extract the right amount from src. 
*/ - rtx reg = lowpart_subreg (mode, src, GET_MODE (src)); - - /* "Cast" the *dst to the correct mode. */ - *dst = adjust_address (*dst, mode, 0); - /* Emit the memset. */ - emit_move_insn (*dst, reg); - /* Move the pointer forward. */ - *dst = aarch64_progress_pointer (*dst); + if (known_lt (GET_MODE_SIZE (mode), 16)) + src = lowpart_subreg (mode, src, GET_MODE (src)); + emit_move_insn (adjust_address (dst, mode, offset), src); } /* Expand a setmem using the MOPS instructions. OPERANDS are the same @@ -26551,7 +26522,7 @@ aarch64_expand_setmem_mops (rtx *operands) bool aarch64_expand_setmem (rtx *operands) { - int n, mode_bits; + int mode_bytes; unsigned HOST_WIDE_INT len; rtx dst = operands[0]; rtx val = operands[2], src; @@ -26564,11 +26535,9 @@ aarch64_expand_setmem (rtx *operands) || (STRICT_ALIGNMENT && align < 16)) return aarch64_expand_setmem_mops (operands); - bool size_p = optimize_function_for_size_p (cfun); - /* Default the maximum to 256-bytes when considering only libcall vs SIMD broadcast sequence. */ - unsigned max_set_size = 256; + unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun)); unsigned mops_threshold = aarch64_mops_memset_size_threshold; len = UINTVAL (operands[1]); @@ -26577,91 +26546,55 @@ aarch64_expand_setmem (rtx *operands) if (len > max_set_size || (TARGET_MOPS && len > mops_threshold)) return aarch64_expand_setmem_mops (operands); - int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0)); - /* The MOPS sequence takes: - 3 instructions for the memory storing - + 1 to move the constant size into a reg - + 1 if VAL is a non-zero constant to move into a reg - (zero constants can use XZR directly). */ - unsigned mops_cost = 3 + 1 + cst_val; - /* A libcall to memset in the worst case takes 3 instructions to prepare - the arguments + 1 for the call. */ - unsigned libcall_cost = 4; - - /* Attempt a sequence with a vector broadcast followed by stores. 
- Count the number of operations involved to see if it's worth it - against the alternatives. A simple counter simd_ops on the - algorithmically-relevant operations is used rather than an rtx_insn count - as all the pointer adjusmtents and mode reinterprets will be optimized - away later. */ - start_sequence (); - unsigned simd_ops = 0; - base = copy_to_mode_reg (Pmode, XEXP (dst, 0)); dst = adjust_automodify_address (dst, VOIDmode, base, 0); /* Prepare the val using a DUP/MOVI v0.16B, val. */ src = expand_vector_broadcast (V16QImode, val); src = force_reg (V16QImode, src); - simd_ops++; - /* Convert len to bits to make the rest of the code simpler. */ - n = len * BITS_PER_UNIT; - /* Maximum amount to copy in one go. We allow 256-bit chunks based on the - AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */ - const int copy_limit = (aarch64_tune_params.extra_tuning_flags - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) - ? GET_MODE_BITSIZE (TImode) : 256; + /* Set maximum amount to write in one go. We allow 32-byte chunks based + on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */ + unsigned set_max = 32; - while (n > 0) + if (len <= 24 || (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)) + set_max = 16; + + int offset = 0; + while (len > 0) { /* Find the largest mode in which to do the copy without over writing. */ opt_scalar_int_mode mode_iter; FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT) - if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit)) + if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, set_max)) cur_mode = mode_iter.require (); gcc_assert (cur_mode != BLKmode); - mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant (); - aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode); - simd_ops++; - n -= mode_bits; + mode_bytes = GET_MODE_SIZE (cur_mode).to_constant (); + + /* Prefer Q-register accesses for the last bytes. 
*/ + if (mode_bytes == 16) + cur_mode = V16QImode; + + aarch64_set_one_block (src, dst, offset, cur_mode); + len -= mode_bytes; + offset += mode_bytes; /* Emit trailing writes using overlapping unaligned accesses - (when !STRICT_ALIGNMENT) - this is smaller and faster. */ - if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT) + (when !STRICT_ALIGNMENT) - this is smaller and faster. */ + if (len > 0 && len < set_max / 2 && !STRICT_ALIGNMENT) { - next_mode = smallest_mode_for_size (n, MODE_INT); - int n_bits = GET_MODE_BITSIZE (next_mode).to_constant (); - gcc_assert (n_bits <= mode_bits); - dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT); - n = n_bits; + next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT); + int n_bytes = GET_MODE_SIZE (next_mode).to_constant (); + gcc_assert (n_bytes <= mode_bytes); + offset -= n_bytes - len; + len = n_bytes; } } - rtx_insn *seq = get_insns (); - end_sequence (); - - if (size_p) - { - /* When optimizing for size we have 3 options: the SIMD broadcast sequence, - call to memset or the MOPS expansion. */ - if (TARGET_MOPS - && mops_cost <= libcall_cost - && mops_cost <= simd_ops) - return aarch64_expand_setmem_mops (operands); - /* If MOPS is not available or not shorter pick a libcall if the SIMD - sequence is too long. */ - else if (libcall_cost < simd_ops) - return false; - emit_insn (seq); - return true; - } - /* At this point the SIMD broadcast sequence is the best choice when - optimizing for speed. */ - emit_insn (seq); return true; }