Hi all,

We've a deficiency in our vec_set family of patterns.
We don't support directly loading a vector lane using LD1 for V2DImode and all 
the vector floating-point modes.
We do handle it correctly for the other integer vector modes (V4SI, V8HI etc) 
though.

The alternatives on the relevant floating-point patterns only allow a 
register-to-register INS instruction.
That means if we want to load a value into a vector lane we must first load it 
into a scalar register and then
perform an INS, which is wasteful.

There is also an explicit V2DI vec_set expander hanging around for no reason 
that I can see. It seems to do the
exact same thing as the other vec_set expanders. This patch removes that.
It now unifies all vec_set expansions into a single "vec_set<mode>" 
define_expand using the catch-all VALL_F16 iterator.
I decided to leave two aarch64_simd_vec_set<mode> define_insns. One for the 
integer vector modes (that now include V2DI)
and one for the floating-point vector modes. That is so that we can avoid specifying 
"w,r" alternatives for floating-point
modes in case the register-allocator gets confused and starts gratuitously 
moving registers between the two banks.
So the floating-point pattern has only two alternatives: one for SIMD-to-SIMD 
INS and one for LD1.

With this patch we avoid loading values into scalar registers and then doing an 
explicit INS on them to move them into
the desired vector lanes. For example for:

typedef float v4sf __attribute__ ((vector_size (16)));
typedef long long v2di __attribute__ ((vector_size (16)));

v2di
foo_v2di (long long *a, long long *b)
{
  v2di res = { *a, *b };
  return res;
}

v4sf
foo_v4sf (float *a, float *b, float *c, float *d)
{
  v4sf res = { *a, *b, *c, *d };
  return res;
}

we currently generate:

foo_v2di:
        ldr     d0, [x0]
        ldr     x0, [x1]
        ins     v0.d[1], x0
        ret

foo_v4sf:
        ldr     s0, [x0]
        ldr     s3, [x1]
        ldr     s2, [x2]
        ldr     s1, [x3]
        ins     v0.s[1], v3.s[0]
        ins     v0.s[2], v2.s[0]
        ins     v0.s[3], v1.s[0]
        ret

but with this patch we generate the much cleaner:
foo_v2di:
        ldr     d0, [x0]
        ld1     {v0.d}[1], [x1]
        ret

foo_v4sf:
        ldr     s0, [x0]
        ld1     {v0.s}[1], [x1]
        ld1     {v0.s}[2], [x2]
        ld1     {v0.s}[3], [x3]
        ret


Bootstrapped and tested on aarch64-none-linux-gnu and also tested on 
aarch64_be-none-elf.

Ok for trunk?
Thanks,
Kyrill

2018-05-15  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

    * config/aarch64/aarch64-simd.md (vec_set<mode>): Use VALL_F16 mode
    iterator.  Delete separate integer-mode vec_set<mode> expander.
    (aarch64_simd_vec_setv2di): Delete.
    (vec_setv2di): Delete.
    (aarch64_simd_vec_set<mode>, VQDF_F16): Move earlier in file.
    Add second memory alternative to emit an LD1.
    (aarch64_simd_vec_set<mode>, VDQ_BHSI): Change mode iterator to
    VDQ_I.  Use vwcore mode attribute in first alternative for operand 1
    constraint.

2018-05-15  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

    * gcc.target/aarch64/vect-init-ld1.c: New test.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 1154fc3d58deaa33413ea3050ff7feec37f092a6..df3fad2d71ed4096accdfdf725e194bf555d40d2 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -694,11 +694,11 @@ (define_insn "one_cmpl<mode>2"
 )
 
 (define_insn "aarch64_simd_vec_set<mode>"
-  [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w,w,w")
-        (vec_merge:VDQ_BHSI
-	    (vec_duplicate:VDQ_BHSI
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(vec_merge:VDQ_I
+	    (vec_duplicate:VDQ_I
 		(match_operand:<VEL> 1 "aarch64_simd_general_operand" "r,w,Utv"))
-	    (match_operand:VDQ_BHSI 3 "register_operand" "0,0,0")
+	    (match_operand:VDQ_I 3 "register_operand" "0,0,0")
 	    (match_operand:SI 2 "immediate_operand" "i,i,i")))]
   "TARGET_SIMD"
   {
@@ -707,7 +707,7 @@ (define_insn "aarch64_simd_vec_set<mode>"
    switch (which_alternative)
      {
      case 0:
-	return "ins\\t%0.<Vetype>[%p2], %w1";
+	return "ins\\t%0.<Vetype>[%p2], %<vwcore>1";
      case 1:
 	return "ins\\t%0.<Vetype>[%p2], %1.<Vetype>[0]";
      case 2:
@@ -719,6 +719,30 @@ (define_insn "aarch64_simd_vec_set<mode>"
   [(set_attr "type" "neon_from_gp<q>, neon_ins<q>, neon_load1_one_lane<q>")]
 )
 
+(define_insn "aarch64_simd_vec_set<mode>"
+  [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
+	(vec_merge:VDQF_F16
+	    (vec_duplicate:VDQF_F16
+		(match_operand:<VEL> 1 "aarch64_simd_general_operand" "w,Utv"))
+	    (match_operand:VDQF_F16 3 "register_operand" "0,0")
+	    (match_operand:SI 2 "immediate_operand" "i,i")))]
+  "TARGET_SIMD"
+  {
+   int elt = ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2])));
+   operands[2] = GEN_INT ((HOST_WIDE_INT) 1 << elt);
+   switch (which_alternative)
+     {
+     case 0:
+	return "ins\\t%0.<Vetype>[%p2], %1.<Vetype>[0]";
+     case 1:
+	return "ld1\\t{%0.<Vetype>}[%p2], %1";
+     default:
+	gcc_unreachable ();
+     }
+  }
+  [(set_attr "type" "neon_ins<q>, neon_load1_one_lane<q>")]
+)
+
 (define_insn "*aarch64_simd_vec_copy_lane<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
 	(vec_merge:VALL_F16
@@ -1030,19 +1054,6 @@ (define_expand "aarch64_lshr_simddi"
   }
 )
 
-(define_expand "vec_set<mode>"
-  [(match_operand:VDQ_BHSI 0 "register_operand")
-   (match_operand:<VEL> 1 "register_operand")
-   (match_operand:SI 2 "immediate_operand")]
-  "TARGET_SIMD"
-  {
-    HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << INTVAL (operands[2]);
-    emit_insn (gen_aarch64_simd_vec_set<mode> (operands[0], operands[1],
-					    GEN_INT (elem), operands[0]));
-    DONE;
-  }
-)
-
 ;; For 64-bit modes we use ushl/r, as this does not require a SIMD zero.
 (define_insn "vec_shr_<mode>"
   [(set (match_operand:VD 0 "register_operand" "=w")
@@ -1059,62 +1070,8 @@ (define_insn "vec_shr_<mode>"
   [(set_attr "type" "neon_shift_imm")]
 )
 
-(define_insn "aarch64_simd_vec_setv2di"
-  [(set (match_operand:V2DI 0 "register_operand" "=w,w")
-        (vec_merge:V2DI
-	    (vec_duplicate:V2DI
-		(match_operand:DI 1 "register_operand" "r,w"))
-	    (match_operand:V2DI 3 "register_operand" "0,0")
-	    (match_operand:SI 2 "immediate_operand" "i,i")))]
-  "TARGET_SIMD"
-  {
-    int elt = ENDIAN_LANE_N (2, exact_log2 (INTVAL (operands[2])));
-    operands[2] = GEN_INT ((HOST_WIDE_INT) 1 << elt);
-    switch (which_alternative)
-      {
-      case 0:
-	return "ins\\t%0.d[%p2], %1";
-      case 1:
-        return "ins\\t%0.d[%p2], %1.d[0]";
-      default:
-	gcc_unreachable ();
-      }
-  }
-  [(set_attr "type" "neon_from_gp, neon_ins_q")]
-)
-
-(define_expand "vec_setv2di"
-  [(match_operand:V2DI 0 "register_operand")
-   (match_operand:DI 1 "register_operand")
-   (match_operand:SI 2 "immediate_operand")]
-  "TARGET_SIMD"
-  {
-    HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << INTVAL (operands[2]);
-    emit_insn (gen_aarch64_simd_vec_setv2di (operands[0], operands[1],
-					  GEN_INT (elem), operands[0]));
-    DONE;
-  }
-)
-
-(define_insn "aarch64_simd_vec_set<mode>"
-  [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
-	(vec_merge:VDQF_F16
-	    (vec_duplicate:VDQF_F16
-		(match_operand:<VEL> 1 "register_operand" "w"))
-	    (match_operand:VDQF_F16 3 "register_operand" "0")
-	    (match_operand:SI 2 "immediate_operand" "i")))]
-  "TARGET_SIMD"
-  {
-    int elt = ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2])));
-
-    operands[2] = GEN_INT ((HOST_WIDE_INT)1 << elt);
-    return "ins\t%0.<Vetype>[%p2], %1.<Vetype>[0]";
-  }
-  [(set_attr "type" "neon_ins<q>")]
-)
-
 (define_expand "vec_set<mode>"
-  [(match_operand:VDQF_F16 0 "register_operand" "+w")
+  [(match_operand:VALL_F16 0 "register_operand" "+w")
    (match_operand:<VEL> 1 "register_operand" "w")
    (match_operand:SI 2 "immediate_operand" "")]
   "TARGET_SIMD"
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-init-ld1.c b/gcc/testsuite/gcc.target/aarch64/vect-init-ld1.c
new file mode 100644
index 0000000000000000000000000000000000000000..d7ff9af7d3285318be2d847ff1a4edbe072ef076
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-init-ld1.c
@@ -0,0 +1,69 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef int v2si __attribute__ ((vector_size (8)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef long long v2di __attribute__ ((vector_size (16)));
+
+typedef __fp16 v4hf __attribute__ ((vector_size (8)));
+typedef __fp16 v8hf __attribute__ ((vector_size (16)));
+typedef float v2sf __attribute__ ((vector_size (8)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef double v2df __attribute__ ((vector_size (16)));
+
+#define FUNC2(T, IT)    \
+T                       \
+foo_##T (IT *a, IT *b)  \
+{                       \
+  T res = { *a, *b };   \
+  return res;           \
+}
+
+FUNC2(v2di, long long)
+FUNC2(v2si, int)
+FUNC2(v2df, double)
+FUNC2(v2sf, float)
+
+#define FUNC4(T, IT)    \
+T                       \
+foo_##T (IT *a, IT *b, IT *c, IT *d)    \
+{                                       \
+  T res = { *a, *b, *c, *d };           \
+  return res;                           \
+}
+
+FUNC4(v4si, int)
+FUNC4(v4hi, short)
+FUNC4(v4sf, float)
+FUNC4(v4hf, __fp16)
+
+#define FUNC8(T, IT)    \
+T                       \
+foo_##T (IT *a, IT *b, IT *c, IT *d, IT *e, IT *f, IT *g, IT *h)        \
+{                                                                       \
+  T res = { *a, *b, *c, *d, *e, *f, *g, *h };                           \
+  return res;                                                           \
+}
+
+FUNC8(v8hi, short)
+FUNC8(v8qi, char)
+FUNC8(v8hf, __fp16)
+
+
+v16qi
+foo_v16qi (char *a, char *a1, char *a2, char *a3, char *a4, char *a5,
+     char *a6, char *a7, char *a8, char *a9, char *a10, char *a11, char *a12,
+     char *a13, char *a14, char *a15)
+{
+  v16qi res = { *a, *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8, *a9, *a10,
+               *a11, *a12, *a13, *a14, *a15 };
+  return res;
+}
+
+/* { dg-final { scan-assembler-not "ld1r\t" } } */
+/* { dg-final { scan-assembler-not "dup\t" } } */
+/* { dg-final { scan-assembler-not "ins\t" } } */

Reply via email to