Wilco Dijkstra <wilco.dijks...@arm.com> writes: > v2: further cleanups, improved comments > > Add support for inline memmove expansions. The generated code is identical > as for memcpy, except that all loads are emitted before stores rather than > being interleaved. The maximum size is 256 bytes which requires at most 16 > registers. > > Passes regress/bootstrap, OK for commit? > > gcc/ChangeLog/ > * config/aarch64/aarch64.opt (aarch64_mops_memmove_size_threshold): > Change default. > * config/aarch64/aarch64.md (cpymemdi): Add a parameter. > (movmemdi): Call aarch64_expand_cpymem. > * config/aarch64/aarch64.cc (aarch64_copy_one_block): Rename function, > simplify, support storing generated loads/stores. > (aarch64_expand_cpymem): Support expansion of memmove. > * config/aarch64/aarch64-protos.h (aarch64_expand_cpymem): Add bool > arg. > > gcc/testsuite/ChangeLog/ > * gcc.target/aarch64/memmove.c: Add new test. > > --- > > diff --git a/gcc/config/aarch64/aarch64-protos.h > b/gcc/config/aarch64/aarch64-protos.h > index > 60a55f4bc1956786ea687fc7cad7ec9e4a84e1f0..0d39622bd2826a3fde54d67b5c5da9ee9286cbbd > 100644 > --- a/gcc/config/aarch64/aarch64-protos.h > +++ b/gcc/config/aarch64/aarch64-protos.h > @@ -769,7 +769,7 @@ bool aarch64_emit_approx_sqrt (rtx, rtx, bool); > tree aarch64_vector_load_decl (tree); > void aarch64_expand_call (rtx, rtx, rtx, bool); > bool aarch64_expand_cpymem_mops (rtx *, bool); > -bool aarch64_expand_cpymem (rtx *); > +bool aarch64_expand_cpymem (rtx *, bool); > bool aarch64_expand_setmem (rtx *); > bool aarch64_float_const_zero_rtx_p (rtx); > bool aarch64_float_const_rtx_p (rtx); > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index > 2fa5d09de85d385c1165e399bcc97681ef170916..e19e2d1de2e5b30eca672df05d9dcc1bc106ecc8 > 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -25238,52 +25238,37 @@ aarch64_progress_pointer (rtx pointer) > return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer))); > } > > -/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by > - MODE bytes. */ > +/* Copy one block of size MODE from SRC to DST at offset OFFSET. */ > > static void > -aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst, > - machine_mode mode) > +aarch64_copy_one_block (rtx *load, rtx *store, rtx src, rtx dst, > + int offset, machine_mode mode) > { > - /* Handle 256-bit memcpy separately. We do this by making 2 adjacent > memory > - address copies using V4SImode so that we can use Q registers. */ > - if (known_eq (GET_MODE_BITSIZE (mode), 256)) > + /* Emit explict load/store pair instructions for 32-byte copies. */ > + if (known_eq (GET_MODE_SIZE (mode), 32)) > { > mode = V4SImode; > + rtx src1 = adjust_address (src, mode, offset); > + rtx src2 = adjust_address (src, mode, offset + 16); > + rtx dst1 = adjust_address (dst, mode, offset); > + rtx dst2 = adjust_address (dst, mode, offset + 16); > rtx reg1 = gen_reg_rtx (mode); > rtx reg2 = gen_reg_rtx (mode); > - /* "Cast" the pointers to the correct mode. */ > - *src = adjust_address (*src, mode, 0); > - *dst = adjust_address (*dst, mode, 0); > - /* Emit the memcpy. */ > - emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2, > - aarch64_progress_pointer (*src))); > - emit_insn (aarch64_gen_store_pair (mode, *dst, reg1, > - aarch64_progress_pointer (*dst), > reg2)); > - /* Move the pointers forward. */ > - *src = aarch64_move_pointer (*src, 32); > - *dst = aarch64_move_pointer (*dst, 32); > + *load = aarch64_gen_load_pair (mode, reg1, src1, reg2, src2); > + *store = aarch64_gen_store_pair (mode, dst1, reg1, dst2, reg2); > return; > } > > rtx reg = gen_reg_rtx (mode); > - > - /* "Cast" the pointers to the correct mode. */ > - *src = adjust_address (*src, mode, 0); > - *dst = adjust_address (*dst, mode, 0); > - /* Emit the memcpy. */ > - emit_move_insn (reg, *src); > - emit_move_insn (*dst, reg); > - /* Move the pointers forward. */ > - *src = aarch64_progress_pointer (*src); > - *dst = aarch64_progress_pointer (*dst); > + *load = gen_move_insn (reg, adjust_address (src, mode, offset)); > + *store = gen_move_insn (adjust_address (dst, mode, offset), reg); > } > > /* Expand a cpymem/movmem using the MOPS extension. OPERANDS are taken > from the cpymem/movmem pattern. IS_MEMMOVE is true if this is a memmove > rather than memcpy. Return true iff we succeeded. */ > bool > -aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false) > +aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove) > { > if (!TARGET_MOPS) > return false; > @@ -25302,51 +25287,48 @@ aarch64_expand_cpymem_mops (rtx *operands, bool > is_memmove = false) > return true; > } > > -/* Expand cpymem, as if from a __builtin_memcpy. Return true if > - we succeed, otherwise return false, indicating that a libcall to > - memcpy should be emitted. */ > - > +/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove. > + OPERANDS are taken from the cpymem/movmem pattern. IS_MEMMOVE is true > + if this is a memmove rather than memcpy. Return true if we succeed, > + otherwise return false, indicating that a libcall should be emitted. */ > bool > -aarch64_expand_cpymem (rtx *operands) > +aarch64_expand_cpymem (rtx *operands, bool is_memmove) > { > - int mode_bits; > + int mode_bytes; > rtx dst = operands[0]; > rtx src = operands[1]; > unsigned align = UINTVAL (operands[3]); > rtx base; > - machine_mode cur_mode = BLKmode; > - bool size_p = optimize_function_for_size_p (cfun); > + machine_mode cur_mode = BLKmode, next_mode; > > /* Variable-sized or strict-align copies may use the MOPS expansion. */ > if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16)) > - return aarch64_expand_cpymem_mops (operands); > + return aarch64_expand_cpymem_mops (operands, is_memmove); > > unsigned HOST_WIDE_INT size = UINTVAL (operands[2]); > > - /* Try to inline up to 256 bytes. */ > - unsigned max_copy_size = 256; > - unsigned mops_threshold = aarch64_mops_memcpy_size_threshold; > + /* Set inline limits for memmove/memcpy. MOPS has a separate threshold. > */ > + unsigned max_copy_size = TARGET_SIMD ? 256 : 128; > + unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold > + : aarch64_mops_memcpy_size_threshold; > + > + /* Reduce the maximum size with -Os. */ > + if (optimize_function_for_size_p (cfun)) > + max_copy_size /= 4; > > /* Large copies use MOPS when available or a library call. */ > if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold)) > - return aarch64_expand_cpymem_mops (operands); > + return aarch64_expand_cpymem_mops (operands, is_memmove); > > - int copy_bits = 256; > + unsigned copy_max = 32; > > - /* Default to 256-bit LDP/STP on large copies, however small copies, no > SIMD > - support or slow 256-bit LDP/STP fall back to 128-bit chunks. */ > + /* Default to 32-byte LDP/STP on large copies, however small copies, no > SIMD > + support or slow LDP/STP fall back to 16-byte chunks. */ > if (size <= 24 > || !TARGET_SIMD > || (aarch64_tune_params.extra_tuning_flags > & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)) > - copy_bits = 128; > - > - /* Emit an inline load+store sequence and count the number of operations > - involved. We use a simple count of just the loads and stores emitted > - rather than rtx_insn count as all the pointer adjustments and reg > copying > - in this function will get optimized away later in the pipeline. */ > - start_sequence (); > - unsigned nops = 0; > + copy_max = 16; > > base = copy_to_mode_reg (Pmode, XEXP (dst, 0)); > dst = adjust_automodify_address (dst, VOIDmode, base, 0); > @@ -25354,69 +25336,60 @@ aarch64_expand_cpymem (rtx *operands) > base = copy_to_mode_reg (Pmode, XEXP (src, 0)); > src = adjust_automodify_address (src, VOIDmode, base, 0); > > - /* Convert size to bits to make the rest of the code simpler. */ > - int n = size * BITS_PER_UNIT; > + const int max_ops = 40; > + rtx load[max_ops], store[max_ops];
Please either add a comment explaining why 40 is guaranteed to be enough, or (my preference) use: auto_vec<std::pair<rtx, rtx>, ...> ops; with safe_push. Using auto_vec also removes the need to track nops separately. > > - while (n > 0) > + int nops, offset; > + > + for (nops = 0, offset = 0; size > 0; nops++) > { > /* Find the largest mode in which to do the copy in without over > reading > or writing. */ > opt_scalar_int_mode mode_iter; > FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT) > - if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits)) > + if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max)) > cur_mode = mode_iter.require (); > > - gcc_assert (cur_mode != BLKmode); > + gcc_assert (cur_mode != BLKmode && nops < max_ops); > > - mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant (); > + mode_bytes = GET_MODE_SIZE (cur_mode).to_constant (); > > /* Prefer Q-register accesses for the last bytes. */ > - if (mode_bits == 128 && copy_bits == 256) > + if (mode_bytes == 16 && copy_max == 32) > cur_mode = V4SImode; > > - aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode); > - /* A single block copy is 1 load + 1 store. */ > - nops += 2; > - n -= mode_bits; > + aarch64_copy_one_block (&load[nops], &store[nops], src, dst, offset, > cur_mode); > + size -= mode_bytes; > + offset += mode_bytes; > > /* Emit trailing copies using overlapping unaligned accesses > - (when !STRICT_ALIGNMENT) - this is smaller and faster. */ > - if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT) > + (when !STRICT_ALIGNMENT) - this is smaller and faster. */ > + if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT) > { > - machine_mode next_mode = smallest_mode_for_size (n, MODE_INT); > - int n_bits = GET_MODE_BITSIZE (next_mode).to_constant (); > - gcc_assert (n_bits <= mode_bits); > - src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT); > - dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT); > - n = n_bits; > + next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT); > + int n_bytes = GET_MODE_SIZE (next_mode).to_constant (); > + gcc_assert (n_bytes <= mode_bytes); > + offset -= n_bytes - size; > + size = n_bytes; > } > } > - rtx_insn *seq = get_insns (); > - end_sequence (); > - /* MOPS sequence requires 3 instructions for the memory copying + 1 to move > - the constant size into a register. */ > - unsigned mops_cost = 3 + 1; > - > - /* If MOPS is available at this point we don't consider the libcall as it's > - not a win even on code size. At this point only consider MOPS if > - optimizing for size. For speed optimizations we will have chosen > between > - the two based on copy size already. */ > - if (TARGET_MOPS) > - { > - if (size_p && mops_cost < nops) > - return aarch64_expand_cpymem_mops (operands); > - emit_insn (seq); > - return true; > - } > > - /* A memcpy libcall in the worst case takes 3 instructions to prepare the > - arguments + 1 for the call. When MOPS is not available and we're > - optimizing for size a libcall may be preferable. */ > - unsigned libcall_cost = 4; > - if (size_p && libcall_cost < nops) > - return false; > + /* Memcpy interleaves loads with stores, memmove emits all loads first. */ > + int i, j, m, inc; > + inc = is_memmove ? nops : 3; > + if (nops == inc + 1) > + inc = nops / 2; I realise this is personal preference, sorry, but I think this would be better as: inc = is_memmove ? nops : nops == 4 ? 2 : 3; since it separates the correctness requirement for is_memmove from the optimisation of interleaving two accesses for nops==4. > + for (i = 0; i < nops; i += inc) > + { > + m = inc; > + if (i + m > nops) > + m = nops - i; > > - emit_insn (seq); > + for (j = 0; j < m; j++) > + emit_insn (load[i + j]); > + for (j = 0; j < m; j++) > + emit_insn (store[i + j]); > + } Also personal preference, but how about: m = MIN (i + inc, nops); for (j = i; j < m; j++) emit_insn (load[j]); for (j = i; j < m; j++) emit_insn (store[j]); (adjusted for the comments above). Not a strong opinion though, so feel free to keep your version. > return true; > } > > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > index > 1cb3a01d6791a48dc0b08df5783d97805448c7f2..18dd629c2456041b1185eae6d39de074709b2a39 > 100644 > --- a/gcc/config/aarch64/aarch64.md > +++ b/gcc/config/aarch64/aarch64.md > @@ -1629,7 +1629,7 @@ (define_expand "cpymemdi" > (match_operand:DI 3 "immediate_operand")] > "" > { > - if (aarch64_expand_cpymem (operands)) > + if (aarch64_expand_cpymem (operands, false)) > DONE; > FAIL; > } > @@ -1673,17 +1673,9 @@ (define_expand "movmemdi" > (match_operand:BLK 1 "memory_operand") > (match_operand:DI 2 "general_operand") > (match_operand:DI 3 "immediate_operand")] > - "TARGET_MOPS" > + "" > { > - rtx sz_reg = operands[2]; > - /* For constant-sized memmoves check the threshold. > - FIXME: We should add a non-MOPS memmove expansion for smaller, > - constant-sized memmove to avoid going to a libcall. */ > - if (CONST_INT_P (sz_reg) > - && INTVAL (sz_reg) < aarch64_mops_memmove_size_threshold) > - FAIL; > - > - if (aarch64_expand_cpymem_mops (operands, true)) > + if (aarch64_expand_cpymem (operands, true)) > DONE; > FAIL; > } > diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt > index > f5a518202a157b5b5bc2b2aa14ac1177fded7d66..0ac9d8c578d706e7bf0f0ae399d84544f0c619dc > 100644 > --- a/gcc/config/aarch64/aarch64.opt > +++ b/gcc/config/aarch64/aarch64.opt > @@ -327,7 +327,7 @@ Target Joined UInteger > Var(aarch64_mops_memcpy_size_threshold) Init(256) Param > Constant memcpy size in bytes above which to start using MOPS sequence. > > -param=aarch64-mops-memmove-size-threshold= > -Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(0) Param > +Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(256) > Param > Constant memmove size in bytes above which to start using MOPS sequence. > > -param=aarch64-mops-memset-size-threshold= > diff --git a/gcc/testsuite/gcc.target/aarch64/memmove.c > b/gcc/testsuite/gcc.target/aarch64/memmove.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..6926a97761eb2578d3f1db7e6eb19dba17b888be > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/memmove.c > @@ -0,0 +1,22 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ Probably best to add: #pragma GCC target "+nomops" and have another copy of the test with -mstrict-align. Keeping the current test (as a third testcase) is fine too of course. Otherwise it looks good. You'd know the appropriate thresholds better than me. Thanks, Richard > + > +void > +copy1 (int *x, int *y) > +{ > + __builtin_memmove (x, y, 12); > +} > + > +void > +copy2 (int *x, int *y) > +{ > + __builtin_memmove (x, y, 128); > +} > + > +void > +copy3 (int *x, int *y) > +{ > + __builtin_memmove (x, y, 255); > +} > + > +/* { dg-final { scan-assembler-not {\tb\tmemmove} } } */