Improve the dup pattern to prefer vector registers.  When doing a dup
after a load, the register allocator thinks the costs of the two
alternatives are identical and chooses an integer load.  However, a dup
from an integer register includes an int->fp transfer which is not
modelled.  Adding a '?' to the integer alternative increases its cost
slightly, so we prefer using a vector register.  This improves the
following example:

#include <arm_neon.h>
void f(unsigned *a, uint32x4_t *b)
{
  b[0] = vdupq_n_u32(a[1]);
  b[1] = vdupq_n_u32(a[2]);
}

Before:
        ldr     w2, [x0, 4]
        dup     v0.4s, w2
        str     q0, [x1]
        ldr     w0, [x0, 8]
        dup     v0.4s, w0
        str     q0, [x1, 16]
        ret

After:
        ldr     s0, [x0, 4]
        dup     v0.4s, v0.s[0]
        str     q0, [x1]
        ldr     s0, [x0, 8]
        dup     v0.4s, v0.s[0]
        str     q0, [x1, 16]
        ret

Passes regress & bootstrap, OK for commit?

ChangeLog:
2017-06-20  Wilco Dijkstra  <wdijk...@arm.com>

        * config/aarch64/aarch64-simd.md (aarch64_simd_dup<mode>):
        Swap alternatives, make integer dup more expensive.
--
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 24ef178b0de253aa2d49aef022d866266216a0d6..695011eae464d806a5cfeeb7253542c27c211c50 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -44,12 +44,12 @@ (define_expand "movmisalign<mode>"
 (define_insn "aarch64_simd_dup<mode>"
   [(set (match_operand:VDQ_I 0 "register_operand" "=w, w")
        (vec_duplicate:VDQ_I
-         (match_operand:<VEL> 1 "register_operand" "r, w")))]
+         (match_operand:<VEL> 1 "register_operand" "w,?r")))]
   "TARGET_SIMD"
   "@
-   dup\\t%0.<Vtype>, %<vw>1
-   dup\\t%0.<Vtype>, %1.<Vetype>[0]"
-  [(set_attr "type" "neon_from_gp<q>, neon_dup<q>")]
+   dup\\t%0.<Vtype>, %1.<Vetype>[0]
+   dup\\t%0.<Vtype>, %<vw>1"
+  [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
 )
 
 (define_insn "aarch64_simd_dup<mode>"

Reply via email to