Hi,

This is the second of two patches to add unaligned-access support to the ARM backend. It builds on the first patch to provide support for unaligned accesses when expanding block moves (i.e. for builtin memcpy operations). It makes some effort to use load/store multiple instructions where appropriate (when accessing sufficiently-aligned source or destination addresses), and also makes some effort to generate fast code (for -O1/2/3) or small code (for -Os), though some of the heuristics may need tweaking still.
Examples: #include <string.h> void foo (char *dest, char *src) { memcpy (dest, src, AMOUNT); } char known[64]; void dst_aligned (char *src) { memcpy (known, src, AMOUNT); } void src_aligned (char *dst) { memcpy (dst, known, AMOUNT); } For "-mcpu=cortex-m4 -mthumb -O2 -DAMOUNT=15" we get: foo: ldr r2, [r1, #4] @ unaligned ldr r3, [r1, #8] @ unaligned push {r4} ldr r4, [r1, #0] @ unaligned str r2, [r0, #4] @ unaligned str r4, [r0, #0] @ unaligned str r3, [r0, #8] @ unaligned ldrh r2, [r1, #12] @ unaligned ldrb r3, [r1, #14] @ zero_extendqisi2 strh r2, [r0, #12] @ unaligned strb r3, [r0, #14] pop {r4} bx lr dst_aligned: push {r4} mov r4, r0 movw r3, #:lower16:known ldr r1, [r4, #4] @ unaligned ldr r2, [r4, #8] @ unaligned ldr r0, [r0, #0] @ unaligned movt r3, #:upper16:known stmia r3!, {r0, r1, r2} ldrh r1, [r4, #12] @ unaligned ldrb r2, [r4, #14] @ zero_extendqisi2 strh r1, [r3, #0] @ unaligned strb r2, [r3, #2] pop {r4} bx lr src_aligned: push {r4} movw r3, #:lower16:known movt r3, #:upper16:known mov r4, r0 ldmia r3!, {r0, r1, r2} str r0, [r4, #0] @ unaligned str r1, [r4, #4] @ unaligned str r2, [r4, #8] @ unaligned ldrh r2, [r3, #0] @ unaligned ldrb r3, [r3, #2] @ zero_extendqisi2 strh r2, [r4, #12] @ unaligned strb r3, [r4, #14] pop {r4} bx lr Whereas for "-mcpu=cortex-m4 -mthumb -Os -DAMOUNT=15", e.g.: foo: add r3, r1, #12 .L2: ldr r2, [r1], #4 @ unaligned cmp r1, r3 str r2, [r0], #4 @ unaligned bne .L2 ldrh r3, [r1, #0] @ unaligned strh r3, [r0, #0] @ unaligned ldrb r3, [r1, #2] @ zero_extendqisi2 strb r3, [r0, #2] bx lr Tested (alongside the first patch) with cross to ARM Linux. OK to apply? Thanks, Julian ChangeLog gcc/ * config/arm/arm.c (arm_block_move_unaligned_straight) (arm_adjust_block_mem, arm_block_move_unaligned_loop) (arm_movmemqi_unaligned): New. (arm_gen_movmemqi): Support unaligned block copies.
commit 16973f69fce37a2b347ea7daffd6f593aba843d5 Author: Julian Brown <jul...@henry7.codesourcery.com> Date: Wed May 4 11:26:01 2011 -0700 Optimize block moves when unaligned accesses are permitted. diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index a18aea6..b6df0d3 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -10362,6 +10362,335 @@ gen_const_stm_seq (rtx *operands, int nops) return true; } +/* Copy a block of memory using plain ldr/str/ldrh/strh instructions, to permit + unaligned copies on processors which support unaligned semantics for those + instructions. INTERLEAVE_FACTOR can be used to attempt to hide load latency + (using more registers) by doing e.g. load/load/store/store for a factor of 2. + An interleave factor of 1 (the minimum) will perform no interleaving. + Load/store multiple are used for aligned addresses where possible. */ + +static void +arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase, + HOST_WIDE_INT length, + unsigned int interleave_factor) +{ + rtx *regs = XALLOCAVEC (rtx, interleave_factor); + int *regnos = XALLOCAVEC (int, interleave_factor); + HOST_WIDE_INT block_size_bytes = interleave_factor * UNITS_PER_WORD; + HOST_WIDE_INT i, j; + HOST_WIDE_INT remaining = length, words; + rtx halfword_tmp = NULL, byte_tmp = NULL; + rtx dst, src; + bool src_aligned = MEM_ALIGN (srcbase) >= BITS_PER_WORD; + bool dst_aligned = MEM_ALIGN (dstbase) >= BITS_PER_WORD; + HOST_WIDE_INT srcoffset, dstoffset; + HOST_WIDE_INT src_autoinc, dst_autoinc; + rtx mem, addr; + + gcc_assert (1 <= interleave_factor && interleave_factor <= 4); + + /* Use hard registers if we have aligned source or destination so we can use + load/store multiple with contiguous registers. 
*/ + if (dst_aligned || src_aligned) + for (i = 0; i < interleave_factor; i++) + regs[i] = gen_rtx_REG (SImode, i); + else + for (i = 0; i < interleave_factor; i++) + regs[i] = gen_reg_rtx (SImode); + + dst = copy_addr_to_reg (XEXP (dstbase, 0)); + src = copy_addr_to_reg (XEXP (srcbase, 0)); + + srcoffset = dstoffset = 0; + + /* Calls to arm_gen_load_multiple and arm_gen_store_multiple update SRC/DST. + For copying the last bytes we want to subtract this offset again. */ + src_autoinc = dst_autoinc = 0; + + for (i = 0; i < interleave_factor; i++) + regnos[i] = i; + + /* Copy BLOCK_SIZE_BYTES chunks. */ + + for (i = 0; i + block_size_bytes <= length; i += block_size_bytes) + { + /* Load words. */ + if (src_aligned && interleave_factor > 1) + { + emit_insn (arm_gen_load_multiple (regnos, interleave_factor, src, + TRUE, srcbase, &srcoffset)); + src_autoinc += UNITS_PER_WORD * interleave_factor; + } + else + { + for (j = 0; j < interleave_factor; j++) + { + addr = plus_constant (src, srcoffset + j * UNITS_PER_WORD + - src_autoinc); + mem = adjust_automodify_address (srcbase, SImode, addr, + srcoffset + j * UNITS_PER_WORD); + emit_insn (gen_unaligned_loadsi (regs[j], mem)); + } + srcoffset += block_size_bytes; + } + + /* Store words. */ + if (dst_aligned && interleave_factor > 1) + { + emit_insn (arm_gen_store_multiple (regnos, interleave_factor, dst, + TRUE, dstbase, &dstoffset)); + dst_autoinc += UNITS_PER_WORD * interleave_factor; + } + else + { + for (j = 0; j < interleave_factor; j++) + { + addr = plus_constant (dst, dstoffset + j * UNITS_PER_WORD + - dst_autoinc); + mem = adjust_automodify_address (dstbase, SImode, addr, + dstoffset + j * UNITS_PER_WORD); + emit_insn (gen_unaligned_storesi (mem, regs[j])); + } + dstoffset += block_size_bytes; + } + + remaining -= block_size_bytes; + } + + /* Copy any whole words left (note these aren't interleaved with any + subsequent halfword/byte load/stores in the interests of simplicity). 
*/ + + words = remaining / UNITS_PER_WORD; + + gcc_assert (words < interleave_factor); + + if (src_aligned && words > 1) + { + emit_insn (arm_gen_load_multiple (regnos, words, src, TRUE, srcbase, + &srcoffset)); + src_autoinc += UNITS_PER_WORD * words; + } + else + { + for (j = 0; j < words; j++) + { + addr = plus_constant (src, + srcoffset + j * UNITS_PER_WORD - src_autoinc); + mem = adjust_automodify_address (srcbase, SImode, addr, + srcoffset + j * UNITS_PER_WORD); + emit_insn (gen_unaligned_loadsi (regs[j], mem)); + } + srcoffset += words * UNITS_PER_WORD; + } + + if (dst_aligned && words > 1) + { + emit_insn (arm_gen_store_multiple (regnos, words, dst, TRUE, dstbase, + &dstoffset)); + dst_autoinc += words * UNITS_PER_WORD; + } + else + { + for (j = 0; j < words; j++) + { + addr = plus_constant (dst, + dstoffset + j * UNITS_PER_WORD - dst_autoinc); + mem = adjust_automodify_address (dstbase, SImode, addr, + dstoffset + j * UNITS_PER_WORD); + emit_insn (gen_unaligned_storesi (mem, regs[j])); + } + dstoffset += words * UNITS_PER_WORD; + } + + remaining -= words * UNITS_PER_WORD; + + gcc_assert (remaining < 4); + + /* Copy a halfword if necessary. */ + + if (remaining >= 2) + { + halfword_tmp = gen_reg_rtx (SImode); + + addr = plus_constant (src, srcoffset - src_autoinc); + mem = adjust_automodify_address (srcbase, HImode, addr, srcoffset); + emit_insn (gen_unaligned_loadhiu (halfword_tmp, mem)); + + /* Either write out immediately, or delay until we've loaded the last + byte, depending on interleave factor. */ + if (interleave_factor == 1) + { + addr = plus_constant (dst, dstoffset - dst_autoinc); + mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset); + emit_insn (gen_unaligned_storehi (mem, + gen_lowpart (HImode, halfword_tmp))); + halfword_tmp = NULL; + dstoffset += 2; + } + + remaining -= 2; + srcoffset += 2; + } + + gcc_assert (remaining < 2); + + /* Copy last byte. 
*/ + + if ((remaining & 1) != 0) + { + byte_tmp = gen_reg_rtx (SImode); + + addr = plus_constant (src, srcoffset - src_autoinc); + mem = adjust_automodify_address (srcbase, QImode, addr, srcoffset); + emit_move_insn (gen_lowpart (QImode, byte_tmp), mem); + + if (interleave_factor == 1) + { + addr = plus_constant (dst, dstoffset - dst_autoinc); + mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset); + emit_move_insn (mem, gen_lowpart (QImode, byte_tmp)); + byte_tmp = NULL; + dstoffset++; + } + + remaining--; + srcoffset++; + } + + /* Store last halfword if we haven't done so already. */ + + if (halfword_tmp) + { + addr = plus_constant (dst, dstoffset - dst_autoinc); + mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset); + emit_insn (gen_unaligned_storehi (mem, + gen_lowpart (HImode, halfword_tmp))); + dstoffset += 2; + } + + /* Likewise for last byte. */ + + if (byte_tmp) + { + addr = plus_constant (dst, dstoffset - dst_autoinc); + mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset); + emit_move_insn (mem, gen_lowpart (QImode, byte_tmp)); + dstoffset++; + } + + gcc_assert (remaining == 0 && srcoffset == dstoffset); +} + +/* From mips_adjust_block_mem: + + Helper function for doing a loop-based block operation on memory + reference MEM. Each iteration of the loop will operate on LENGTH + bytes of MEM. + + Create a new base register for use within the loop and point it to + the start of MEM. Create a new memory reference that uses this + register. Store them in *LOOP_REG and *LOOP_MEM respectively. */ + +static void +arm_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg, + rtx *loop_mem) +{ + *loop_reg = copy_addr_to_reg (XEXP (mem, 0)); + + /* Although the new mem does not refer to a known location, + it does keep up to LENGTH bytes of alignment. 
*/ + *loop_mem = change_address (mem, BLKmode, *loop_reg); + set_mem_align (*loop_mem, MIN (MEM_ALIGN (mem), length * BITS_PER_UNIT)); +} + +/* From mips_block_move_loop: + + Move LENGTH bytes from SRC to DEST using a loop that moves BYTES_PER_ITER + bytes at a time. LENGTH must be at least BYTES_PER_ITER. Assume that + the memory regions do not overlap. */ + +static void +arm_block_move_unaligned_loop (rtx dest, rtx src, HOST_WIDE_INT length, + unsigned int interleave_factor, + HOST_WIDE_INT bytes_per_iter) +{ + rtx label, src_reg, dest_reg, final_src, test; + HOST_WIDE_INT leftover; + + leftover = length % bytes_per_iter; + length -= leftover; + + /* Create registers and memory references for use within the loop. */ + arm_adjust_block_mem (src, bytes_per_iter, &src_reg, &src); + arm_adjust_block_mem (dest, bytes_per_iter, &dest_reg, &dest); + + /* Calculate the value that SRC_REG should have after the last iteration of + the loop. */ + final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length), + 0, 0, OPTAB_WIDEN); + + /* Emit the start of the loop. */ + label = gen_label_rtx (); + emit_label (label); + + /* Emit the loop body. */ + arm_block_move_unaligned_straight (dest, src, bytes_per_iter, + interleave_factor); + + /* Move on to the next block. */ + emit_move_insn (src_reg, plus_constant (src_reg, bytes_per_iter)); + emit_move_insn (dest_reg, plus_constant (dest_reg, bytes_per_iter)); + + /* Emit the loop condition. */ + test = gen_rtx_NE (VOIDmode, src_reg, final_src); + emit_jump_insn (gen_cbranchsi4 (test, src_reg, final_src, label)); + + /* Mop up any left-over bytes. */ + if (leftover) + arm_block_move_unaligned_straight (dest, src, leftover, interleave_factor); +} + +/* Emit a block move when either the source or destination is unaligned (not + aligned to a four-byte boundary). This may need further tuning depending on + core type, optimize_size setting, etc. 
*/ + +static int +arm_movmemqi_unaligned (rtx *operands) +{ + HOST_WIDE_INT length = INTVAL (operands[2]); + + if (optimize_size) + { + bool src_aligned = MEM_ALIGN (operands[1]) >= BITS_PER_WORD; + bool dst_aligned = MEM_ALIGN (operands[0]) >= BITS_PER_WORD; + /* Inlined memcpy using ldr/str/ldrh/strh can be quite big: try to limit + size of code if optimizing for size. We'll use ldm/stm if src_aligned + or dst_aligned though: allow more interleaving in those cases since the + resulting code can be smaller. */ + unsigned int interleave_factor = (src_aligned || dst_aligned) ? 2 : 1; + HOST_WIDE_INT bytes_per_iter = (src_aligned || dst_aligned) ? 8 : 4; + + if (length > 12) + arm_block_move_unaligned_loop (operands[0], operands[1], length, + interleave_factor, bytes_per_iter); + else + arm_block_move_unaligned_straight (operands[0], operands[1], length, + interleave_factor); + } + else + { + /* Note that the loop created by arm_block_move_unaligned_loop may be + subject to loop unrolling, which makes tuning this condition a little + redundant. */ + if (length > 32) + arm_block_move_unaligned_loop (operands[0], operands[1], length, 4, 16); + else + arm_block_move_unaligned_straight (operands[0], operands[1], length, 4); + } + + return 1; +} + int arm_gen_movmemqi (rtx *operands) { @@ -10374,8 +10703,13 @@ arm_gen_movmemqi (rtx *operands) if (GET_CODE (operands[2]) != CONST_INT || GET_CODE (operands[3]) != CONST_INT - || INTVAL (operands[2]) > 64 - || INTVAL (operands[3]) & 3) + || INTVAL (operands[2]) > 64) + return 0; + + if (unaligned_access && (INTVAL (operands[3]) & 3) != 0) + return arm_movmemqi_unaligned (operands); + + if (INTVAL (operands[3]) & 3) return 0; dstbase = operands[0];