Hi all,

When storing a 64-bit immediate that has equal bottom and top halves we
currently synthesize the repeating 32-bit pattern twice and perform a single
X-store.  With this patch we synthesize the 32-bit pattern once into a W
register and store that twice using an STP.  This reduces code size bloat from
synthesizing the same constant multiple times, at the expense of converting a
store into a store-pair.
It will only trigger if we can save two or more instructions, so it will only 
transform:
        mov     x1, 49370
        movk    x1, 0xc0da, lsl 32
        str     x1, [x0]

into:

        mov     w1, 49370
        stp     w1, w1, [x0]

when optimising for -Os, whereas it will always transform a 4-insn synthesis
sequence into a two-insn sequence + STP (see comments in the patch).

This transformation already triggers, but it will trigger more often with the
store merging pass that I'm working on, since that pass will generate more of
these repeating 64-bit constants.  It helps improve code generation on
456.hmmer, where store merging can sometimes create very complex repeating
constants that the target-specific expansion needs to break down.

Bootstrapped and tested on aarch64-none-linux-gnu.

Ok for trunk?

Thanks,
Kyrill

2016-10-24  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

    * config/aarch64/aarch64.md (mov<mode>): Call
    aarch64_split_dimode_const_store on DImode constant stores.
    * config/aarch64/aarch64-protos.h (aarch64_split_dimode_const_store):
    New prototype.
    * config/aarch64/aarch64.c (aarch64_split_dimode_const_store): New
    function.

2016-10-24  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

    * gcc.target/aarch64/store_repeating_constant_1.c: New test.
    * gcc.target/aarch64/store_repeating_constant_2.c: Likewise.
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 07a8cd0455d64a861cb919083a9d369bf23724b7..4c551ef143d3b32e94bd58989c85ebd3352cdd9b 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -337,6 +337,7 @@ bool aarch64_simd_scalar_immediate_valid_for_move (rtx, machine_mode);
 bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
 bool aarch64_simd_valid_immediate (rtx, machine_mode, bool,
 				   struct simd_immediate_info *);
+bool aarch64_split_dimode_const_store (rtx, rtx);
 bool aarch64_symbolic_address_p (rtx);
 bool aarch64_uimm12_shift (HOST_WIDE_INT);
 bool aarch64_use_return_insn_p (void);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 3861e7d9ed82ebaffe93471fec08a72c70cfd133..0980befa42be65a760ad06e43decec77bff1b762 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13141,6 +13141,61 @@ aarch64_expand_movmem (rtx *operands)
   return true;
 }
 
+/* Split a DImode store of a CONST_INT SRC to MEM DST as two
+   SImode stores.  Handle the case when the constant has identical
+   bottom and top halves.  This is beneficial when the two stores can be
+   merged into an STP and we avoid synthesising potentially expensive
+   immediates twice.  Return true if such a split is possible.  */
+
+bool
+aarch64_split_dimode_const_store (rtx dst, rtx src)
+{
+  rtx lo = gen_lowpart (SImode, src);
+  rtx hi = gen_highpart_mode (SImode, DImode, src);
+
+  bool size_p = optimize_function_for_size_p (cfun);
+
+  if (!rtx_equal_p (lo, hi))
+    return false;
+
+  unsigned int orig_cost
+    = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
+  unsigned int lo_cost
+    = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
+
+  /* We want to transform:
+     MOV	x1, 49370
+     MOVK	x1, 0x140, lsl 16
+     MOVK	x1, 0xc0da, lsl 32
+     MOVK	x1, 0x140, lsl 48
+     STR	x1, [x0]
+   into:
+     MOV	w1, 49370
+     MOVK	w1, 0x140, lsl 16
+     STP	w1, w1, [x0]
+   So we want to perform this only when we save two instructions
+   or more.  When optimizing for size, however, accept any code size
+   savings we can.  */
+  if (size_p && orig_cost <= lo_cost)
+    return false;
+
+  if (!size_p
+      && (orig_cost <= lo_cost + 1))
+    return false;
+
+  rtx mem_lo = adjust_address (dst, SImode, 0);
+  if (!aarch64_mem_pair_operand (mem_lo, SImode))
+    return false;
+
+  rtx tmp_reg = gen_reg_rtx (SImode);
+  aarch64_expand_mov_immediate (tmp_reg, lo);
+  rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
+
+  emit_insn (gen_store_pairsi (mem_lo, tmp_reg, mem_hi, tmp_reg));
+
+  return true;
+}
+
 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
 
 static unsigned HOST_WIDE_INT
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 8861ac18f4e33ada3bc6dde0e4667fd040b1c213..ec423eb4b048609daaf2f81b0ae874f2e4350f69 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1010,6 +1010,11 @@ (define_expand "mov<mode>"
 	(match_operand:GPI 1 "general_operand" ""))]
   ""
   "
+    if (MEM_P (operands[0]) && CONST_INT_P (operands[1])
+	&& <MODE>mode == DImode
+	&& aarch64_split_dimode_const_store (operands[0], operands[1]))
+      DONE;
+
     if (GET_CODE (operands[0]) == MEM && operands[1] != const0_rtx)
       operands[1] = force_reg (<MODE>mode, operands[1]);
 
diff --git a/gcc/testsuite/gcc.target/aarch64/store_repeating_constant_1.c b/gcc/testsuite/gcc.target/aarch64/store_repeating_constant_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..fc964272eb45dc067fac40a390cf67121b51c180
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/store_repeating_constant_1.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+void
+foo (unsigned long long *a)
+{
+  a[0] = 0x0140c0da0140c0daULL;
+}
+
+/* { dg-final { scan-assembler-times "movk\\tw.*" 1 } } */
+/* { dg-final { scan-assembler-times "stp\\tw\[0-9\]+, w\[0-9\]+.*" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/store_repeating_constant_2.c b/gcc/testsuite/gcc.target/aarch64/store_repeating_constant_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..c421277989adcf446ad8a7b3ab9060602c03a7ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/store_repeating_constant_2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-Os" } */
+
+/* Check that for -Os we synthesize only the bottom half and then
+   store it twice with an STP rather than synthesizing it twice in each
+   half of an X-reg.  */
+
+void
+foo (unsigned long long *a)
+{
+  a[0] = 0xc0da0000c0daULL;
+}
+
+/* { dg-final { scan-assembler-times "mov\\tw.*" 1 } } */
+/* { dg-final { scan-assembler-times "stp\\tw\[0-9\]+, w\[0-9\]+.*" 1 } } */

Reply via email to