Hi All, 

This patch allows a larger bitsize to be used as the copy size in
copy_blkmode_to_reg when the target does not have SLOW_UNALIGNED_ACCESS.

It also provides an optimized path for MEM to REG copies in
aarch64_expand_movmem which avoids reconstructing the value piecewise
on the stack and instead uses a combination of shifts and ORs.
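
At the C level the idea is roughly the following sketch (illustrative
only, not the patch itself; the function name and the use of memcpy are
made up for the example):

#include <stdint.h>
#include <string.h>

/* Copy a 3-byte source into a register-sized value: load the widest
   chunks that cover it, shift each chunk into position and OR it into
   the result, rather than inserting it byte by byte via the stack.  */
static uint32_t
copy3_shift_or (const unsigned char *src)
{
  uint16_t lo;
  memcpy (&lo, src, 2);               /* 2-byte chunk at offset 0.  */
  uint32_t result = lo;
  result |= (uint32_t) src[2] << 16;  /* 1-byte chunk at offset 2.  */
  return result;
}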

This now generates

        adrp    x0, .LANCHOR0
        add     x0, x0, :lo12:.LANCHOR0
        sub     sp, sp, #16
        ldr     w1, [x0, 120]
        str     w1, [sp, 8]
        ldr     x0, [x0, 112]
        ldr     x1, [sp, 8]
        add     sp, sp, 16

instead of:

        adrp    x3, .LANCHOR0
        add     x3, x3, :lo12:.LANCHOR0
        mov     x0, 0
        mov     x1, 0
        sub     sp, sp, #16
        ldr     x2, [x3, 112]
        ldr     w3, [x3, 120]
        add     sp, sp, 16
        ubfx    x5, x2, 8, 8
        bfi     x0, x2, 0, 8
        ubfx    x4, x2, 16, 8
        lsr     w9, w2, 24
        bfi     x0, x5, 8, 8
        ubfx    x7, x2, 32, 8
        ubfx    x5, x2, 40, 8
        ubfx    x8, x3, 8, 8
        bfi     x0, x4, 16, 8
        bfi     x1, x3, 0, 8
        ubfx    x4, x2, 48, 8
        ubfx    x6, x3, 16, 8
        bfi     x0, x9, 24, 8
        bfi     x1, x8, 8, 8
        lsr     x2, x2, 56
        lsr     w3, w3, 24
        bfi     x0, x7, 32, 8
        bfi     x1, x6, 16, 8
        bfi     x0, x5, 40, 8
        bfi     x1, x3, 24, 8
        bfi     x0, x4, 48, 8
        bfi     x0, x2, 56, 8

To load a struct of twelve 1-byte elements.
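
For reference, the kind of source this corresponds to is roughly the
following (names made up, reconstructed from the assembly above), i.e.
a 12-byte BLKmode value that has to be copied from memory into the
return registers:

struct twelve { char c[12]; };
struct twelve global_twelve;

struct twelve
ret_twelve (void)
{
  return global_twelve;
}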

and

        adrp    x0, .LANCHOR0
        add     x0, x0, :lo12:.LANCHOR0
        sub     sp, sp, #16
        ldrb    w1, [x0, 18]
        ldrh    w0, [x0, 16]
        orr     w0, w0, w1, lsr 16
        str     w0, [sp, 8]
        add     sp, sp, 16

instead of

        adrp    x2, .LANCHOR0
        add     x2, x2, :lo12:.LANCHOR0
        mov     x0, 0
        sub     sp, sp, #16
        ldrh    w1, [x2, 16]
        ldrb    w2, [x2, 18]
        add     sp, sp, 16
        bfi     x0, x1, 0, 8
        ubfx    x1, x1, 8, 8
        bfi     x0, x1, 8, 8
        bfi     x0, x2, 16, 8
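
Again for reference, a made-up example along the lines of what produces
this second pair of sequences: a struct of three 1-byte elements copied
out of a global.

struct three { char c[3]; };
struct three global_three;

struct three
ret_three (void)
{
  return global_three;
}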

These changes only affect structures smaller than 16 bytes.

The remaining stack stores come from an existing, incomplete data-flow
analysis which thinks the value on the stack is still being used and
therefore does not mark it as dead.

Regression tested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu with
no regressions.

OK for trunk?

Thanks,
Tamar


gcc/
2017-06-07  Tamar Christina  <tamar.christ...@arm.com>

        * expr.c (copy_blkmode_to_reg): Fix bitsize for targets
        with fast unaligned access.
        * config/aarch64/aarch64.c (aarch64_expand_movmem):
        Add MEM to REG optimized case.
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4f769a40a4e9de83cb5aacfd3ff58301c2feeb78..8906d9a9445ed36f43302708d1f6212bcf017bdc 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13498,6 +13498,41 @@ aarch64_expand_movmem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
+  /* Optimize routines for MEM to REG copies.  */
+  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
+    {
+      unsigned int max_align = UINTVAL (operands[2]);
+      max_align = n < max_align ? max_align : n;
+      machine_mode mov_mode, dest_mode
+        = smallest_mode_for_size (max_align * BITS_PER_UNIT, MODE_INT);
+      rtx result = gen_reg_rtx (dest_mode);
+      emit_insn (gen_move_insn (result, GEN_INT (0)));
+
+      unsigned int shift_cnt = 0;
+      for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode))
+        {
+          int nearest = 0;
+          /* Find the mode to use, but limit the max to TI mode.  */
+          for (unsigned max = 1; max <= (n - shift_cnt) && max <= 16; max *= 2)
+            nearest = max;
+
+          mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT, MODE_INT);
+          rtx reg = gen_reg_rtx (mov_mode);
+
+          src = adjust_address (src, mov_mode, 0);
+          emit_insn (gen_move_insn (reg, src));
+          src = aarch64_progress_pointer (src);
+
+          reg = gen_rtx_ASHIFT (dest_mode, reg,
+                                GEN_INT (shift_cnt * BITS_PER_UNIT));
+          result = gen_rtx_IOR (dest_mode, reg, result);
+        }
+
+      dst = adjust_address (dst, dest_mode, 0);
+      emit_insn (gen_move_insn (dst, result));
+      return true;
+    }
+
   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
      1-byte chunk.  */
   if (n < 4)
diff --git a/gcc/expr.c b/gcc/expr.c
index 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce8ee5a19297ab16 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, tree src)
 
   n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
   dst_words = XALLOCAVEC (rtx, n_regs);
-  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
+  bitsize = BITS_PER_WORD;
+  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE (src))))
+    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
 
   /* Copy the structure BITSIZE bits at a time.  */
   for (bitpos = 0, xbitpos = padding_correction;
