[PATCH v3] aarch64: Recognize vector permute patterns suitable for FMOV [PR100165]

Pengxuan Zheng Tue, 18 Feb 2025 17:07:12 -0800

This patch optimizes certain vector permute expansions by using the FMOV
instruction when one of the input vectors is a vector of all zeros and the
result of the permute is as if every lane of the non-zero input vector other
than the lowest were set to zero while the lowest lane remains unchanged.


Note that the patch now also propagates zero_op0_p and zero_op1_p when the
permute is re-encoded (aarch64_evpc_reencode).  aarch64_evpc_fmov uses them to
check whether the input vectors are valid candidates.

        PR target/100165

gcc/ChangeLog:

        * config/aarch64/aarch64-protos.h (aarch64_lane0_mask_p): New.
        * config/aarch64/aarch64-simd.md
        (@aarch64_simd_vec_set_zero_fmov<mode>): New define_insn.
        * config/aarch64/aarch64.cc (aarch64_lane0_mask_p): New.
        (aarch64_evpc_reencode): Copy zero_op0_p and zero_op1_p.
        (aarch64_evpc_fmov): New.
        (aarch64_expand_vec_perm_const_1): Add call to aarch64_evpc_fmov.
        * config/aarch64/iterators.md (VALL_F16_NO_QI): New mode iterator.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/vec-set-zero.c: Update test accordingly.
        * gcc.target/aarch64/fmov-1.c: New test.
        * gcc.target/aarch64/fmov-2.c: New test.
        * gcc.target/aarch64/fmov-3.c: New test.
        * gcc.target/aarch64/fmov-be-1.c: New test.
        * gcc.target/aarch64/fmov-be-2.c: New test.
        * gcc.target/aarch64/fmov-be-3.c: New test.

Signed-off-by: Pengxuan Zheng <[email protected]>
---
 gcc/config/aarch64/aarch64-protos.h           |   2 +-
 gcc/config/aarch64/aarch64-simd.md            |  13 ++
 gcc/config/aarch64/aarch64.cc                 |  96 ++++++++++-
 gcc/config/aarch64/iterators.md               |   9 +
 gcc/testsuite/gcc.target/aarch64/fmov-1.c     | 158 ++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/fmov-2.c     |  52 ++++++
 gcc/testsuite/gcc.target/aarch64/fmov-3.c     | 144 ++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/fmov-be-1.c  | 144 ++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/fmov-be-2.c  |  52 ++++++
 gcc/testsuite/gcc.target/aarch64/fmov-be-3.c  | 144 ++++++++++++++++
 .../gcc.target/aarch64/vec-set-zero.c         |   6 +-
 11 files changed, 816 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-be-1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-be-2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-be-3.c

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 4235f4a0ca5..cba94914903 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1051,7 +1051,7 @@ void aarch64_subvti_scratch_regs (rtx, rtx, rtx *,
                                  rtx *, rtx *, rtx *);
 void aarch64_expand_subvti (rtx, rtx, rtx,
                            rtx, rtx, rtx, rtx, bool);
-
+bool aarch64_lane0_mask_p (unsigned int, rtx);
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index e2afe87e513..6ddc27c223e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1190,6 +1190,19 @@ (define_insn "@aarch64_simd_vec_set<mode>"
   [(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")]
 )
 
+(define_insn "@aarch64_simd_vec_set_zero_fmov<mode>"
+  [(set (match_operand:VALL_F16_NO_QI 0 "register_operand" "=w")
+       (vec_merge:VALL_F16_NO_QI
+           (match_operand:VALL_F16_NO_QI 1 "register_operand" "w")
+           (match_operand:VALL_F16_NO_QI 2 "aarch64_simd_imm_zero" "Dz")
+           (match_operand:SI 3 "immediate_operand" "i")))]
+  "TARGET_SIMD && aarch64_lane0_mask_p (<nunits>, operands[3])"
+  {
+    return "fmov\\t%<Vetype>0, %<Vetype>1";
+  }
+  [(set_attr "type" "fmov")]
+)
+
 (define_insn "aarch64_simd_vec_set_zero<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
        (vec_merge:VALL_F16
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f5f23f6ff4b..c29a43f2553 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23682,6 +23682,15 @@ aarch64_strided_registers_p (rtx *operands, unsigned int num_operands,
   return true;
 }
 
+/* Return TRUE if OP is a one-bit vec_merge mask for lane 0 of NELTS lanes.  */
+
+bool
+aarch64_lane0_mask_p (unsigned int nelts, rtx op)
+{
+  return exact_log2 (INTVAL (op)) >= 0
+        && (ENDIAN_LANE_N (nelts, exact_log2 (INTVAL (op))) == 0);
+}
+
 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
    HIGH (exclusive).  */
 void
@@ -26058,6 +26067,8 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d)
   newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
   newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
   newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
+  newd.zero_op0_p = d->zero_op0_p;
+  newd.zero_op1_p = d->zero_op1_p;
   newd.testing_p = d->testing_p;
   newd.one_vector_p = d->one_vector_p;
 
@@ -26643,6 +26654,87 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Recognize permutes that FMOV can do: keep one lane, zero all the others.  */
+static bool
+aarch64_evpc_fmov (struct expand_vec_perm_d *d)
+{
+  if (d->vec_flags != VEC_ADVSIMD)
+    return false;
+
+  /* Need two inputs, at least one of which is a vector of all zeros.  */
+  if (d->one_vector_p || (!d->zero_op0_p && !d->zero_op1_p))
+    return false;
+
+  HOST_WIDE_INT nelt, elt;
+
+  /* to_constant is safe since this routine is specific to Advanced SIMD
+     vectors.  */
+  nelt = d->perm.length ().to_constant ();
+
+  if (!BYTES_BIG_ENDIAN)
+    {
+      if (!d->perm[0].is_constant (&elt))
+       return false;
+
+      /* Lane 0 of the output vector should be lane 0 of the non-zero
+        vector.  */
+      if (elt != (d->zero_op0_p ? nelt : 0))
+       return false;
+
+      for (HOST_WIDE_INT i = 1; i < nelt; i++)
+       {
+         if (!d->perm[i].is_constant (&elt))
+           return false;
+
+         /* All lanes except lane 0 of the output vector should be chosen from
+            the zero vector.  */
+         if (d->zero_op0_p && elt >= nelt)
+           return false;
+
+         if (!d->zero_op0_p && elt < nelt)
+           return false;
+       }
+    }
+  else
+    {
+      if (!d->perm[nelt-1].is_constant (&elt))
+       return false;
+
+      /* Lane NELT-1 of the output vector should be lane NELT-1 of the non-zero
+        vector.  */
+      if (elt != (d->zero_op0_p ? 2 * nelt - 1 : nelt - 1))
+       return false;
+
+      for (HOST_WIDE_INT i = 0; i < nelt - 1; i++)
+       {
+         if (!d->perm[i].is_constant (&elt))
+           return false;
+
+         /* All lanes except lane NELT-1 of the output vector should be chosen
+            from the zero vector.  */
+         if (d->zero_op0_p && elt >= nelt)
+           return false;
+
+         if (!d->zero_op0_p && elt < nelt)
+           return false;
+       }
+    }
+
+  if (d->testing_p)
+    return true;
+
+  machine_mode mode = d->vmode;
+  insn_code icode = code_for_aarch64_simd_vec_set_zero_fmov (mode);
+  expand_operand ops[4];
+  create_output_operand (&ops[0], d->target, mode);
+  create_input_operand (&ops[1], d->zero_op0_p ? d->op1 : d->op0, mode);
+  create_input_operand (&ops[2], CONST0_RTX (mode), mode);
+  create_integer_operand (&ops[3], BYTES_BIG_ENDIAN ? 1 << (nelt - 1) : 1);
+  expand_insn (icode, 4, ops);
+
+  return true;
+}
+
 static bool
 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 {
@@ -26666,7 +26758,9 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
     {
       if (d->vmode == d->op_mode)
        {
-         if (aarch64_evpc_rev_local (d))
+         if (aarch64_evpc_fmov (d))
+           return true;
+         else if (aarch64_evpc_rev_local (d))
            return true;
          else if (aarch64_evpc_rev_global (d))
            return true;
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 5bfd6e7d362..17bb6f00abc 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -225,6 +225,15 @@ (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
 (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
                                V4HF V8HF V2SF V4SF])
 
+;; The VALL_F16 modes except the QI ones.
+(define_mode_iterator VALL_F16_NO_QI [(V4HI "TARGET_SIMD_F16INST")
+                                     (V8HI "TARGET_SIMD_F16INST")
+                                     (V4HF "TARGET_SIMD_F16INST")
+                                     (V8HF "TARGET_SIMD_F16INST")
+                                     (V4BF "TARGET_SIMD_F16INST")
+                                     (V8BF "TARGET_SIMD_F16INST")
+                                     V2SI V4SI V2DI V2SF V4SF V2DF])
+
 ;; All Advanced SIMD modes barring HF modes, plus DI.
 (define_mode_iterator VALLDI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF 
V2DF DI])
 
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-1.c b/gcc/testsuite/gcc.target/aarch64/fmov-1.c
new file mode 100644
index 00000000000..b87e2b4ca64
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-1.c
@@ -0,0 +1,158 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef int v2si __attribute__ ((vector_size (8)));
+typedef float v2sf __attribute__ ((vector_size (8)));
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef long v2di __attribute__ ((vector_size (16)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+
+/*
+** f_v4hi:
+**     fmov    s0, s0
+**     ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 1, 4, 5 });
+}
+
+/*
+** g_v4hi:
+**     uzp1    v([0-9]+).2d, v0.2d, v0.2d
+**     adrp    x([0-9]+), .LC0
+**     ldr     d([0-9]+), \[x\2, #:lo12:.LC0\]
+**     tbl     v0.8b, {v\1.16b}, v\3.8b
+**     ret
+*/
+v4hi
+g_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 3, 1, 4, 2 });
+}
+
+/*
+** f_v8hi:
+**     fmov    s0, s0
+**     ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+  return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 }, (v8hi){ 0, 1, 
8, 9, 10, 11, 12, 13 });
+}
+
+/*
+** f_v2si:
+**     fmov    s0, s0
+**     ret
+*/
+v2si
+f_v2si (v2si x)
+{
+  return __builtin_shuffle (x, (v2si){ 0, 0 }, (v2si){ 0, 3 });
+}
+
+/*
+** g_v2si:
+**     fmov    s0, s0
+**     ret
+*/
+v2si
+g_v2si (v2si x)
+{
+  return __builtin_shuffle ((v2si){ 0, 0 }, x, (v2si){ 2, 0 });
+}
+
+/*
+** f_v2sf:
+**     fmov    s0, s0
+**     ret
+*/
+v2sf
+f_v2sf (v2sf x)
+{
+  return __builtin_shuffle (x, (v2sf){ 0, 0 }, (v2si){ 0, 2 });
+}
+
+/*
+** f_v2di:
+**     fmov    d0, d0
+**     ret
+*/
+v2di
+f_v2di (v2di x)
+{
+  return __builtin_shuffle (x, (v2di){ 0, 0 }, (v2di){ 0, 3 });
+}
+
+/*
+** g_v2di:
+**     fmov    d0, d0
+**     ret
+*/
+v2di
+g_v2di (v2di x)
+{
+  return __builtin_shuffle ((v2di){ 0, 0 }, x, (v2di){ 2, 1 });
+}
+
+/*
+** f_v2df:
+**     fmov    d0, d0
+**     ret
+*/
+v2df
+f_v2df (v2df x)
+{
+  return __builtin_shuffle (x, (v2df){ 0, 0 }, (v2di){ 0, 2 });
+}
+
+/*
+** f_v4si:
+**     fmov    d0, d0
+**     ret
+*/
+v4si
+f_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 1, 4, 5 });
+}
+
+/*
+** g_v4si:
+**     fmov    d0, d0
+**     ret
+*/
+v4si
+g_v4si (v4si x)
+{
+  return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 4, 5, 2, 3 });
+}
+
+/*
+** h_v4si:
+**     fmov    s0, s0
+**     ret
+*/
+v4si
+h_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 5, 6 });
+}
+
+/*
+** f_v4sf:
+**     fmov    d0, d0
+**     ret
+*/
+v4sf
+f_v4sf (v4sf x)
+{
+  return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 1, 6, 7 });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-2.c b/gcc/testsuite/gcc.target/aarch64/fmov-2.c
new file mode 100644
index 00000000000..e0f1b6d05fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-2.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target ("arch=armv8.2-a+fp16")
+
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+
+/*
+** f_v4hi:
+**     fmov    h0, h0
+**     ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 4, 5, 6 });
+}
+
+/*
+** g_v4hi:
+**     fmov    h0, h0
+**     ret
+*/
+v4hi
+g_v4hi (v4hi x)
+{
+  return __builtin_shuffle ((v4hi){ 0, 0, 0, 0 }, x, (v4hi){ 4, 0, 1, 2 });
+}
+
+/*
+** f_v8hi:
+**     fmov    h0, h0
+**     ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+  return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 }, (v8hi){ 0, 8, 
9, 10, 11, 12, 13, 14 });
+}
+
+/*
+** g_v8hi:
+**     fmov    h0, h0
+**     ret
+*/
+v8hi
+g_v8hi (v8hi x)
+{
+  return __builtin_shuffle ((v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 }, x, (v8hi){ 8, 0, 
1, 2, 3, 4, 5, 6 });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-3.c b/gcc/testsuite/gcc.target/aarch64/fmov-3.c
new file mode 100644
index 00000000000..ebef6515722
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-3.c
@@ -0,0 +1,144 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target ("arch=armv8.2-a+fp16")
+
+typedef __fp16 v4hf __attribute__ ((vector_size (8)));
+typedef __fp16 v8hf __attribute__ ((vector_size (16)));
+typedef __bf16 v4bf __attribute__ ((vector_size (8)));
+typedef __bf16 v8bf __attribute__ ((vector_size (16)));
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+
+/*
+** f_v4hf:
+**     fmov    h0, h0
+**     ret
+*/
+v4hf
+f_v4hf (v4hf x)
+{
+  return __builtin_shuffle (x, (v4hf){ 0, 0, 0, 0 }, (v4hi){ 0, 4, 5, 6 });
+}
+
+/*
+** g_v4hf:
+**     fmov    h0, h0
+**     ret
+*/
+v4hf
+g_v4hf (v4hf x)
+{
+  return __builtin_shuffle ((v4hf){ 0, 0, 0, 0 }, x, (v4hi){ 4, 0, 1, 2 });
+}
+
+/*
+** h_v4hf:
+**     fmov    s0, s0
+**     ret
+*/
+v4hf
+h_v4hf (v4hf x)
+{
+  return __builtin_shuffle (x, (v4hf){ 0, 0, 0, 0 }, (v4hi){ 0, 1, 4, 5 });
+}
+
+/*
+** f_v8hf:
+**     fmov    h0, h0
+**     ret
+*/
+v8hf
+f_v8hf (v8hf x)
+{
+  return __builtin_shuffle (x, (v8hf){ 0, 0, 0, 0, 0, 0, 0, 0 }, (v8hi){ 0, 8, 
9, 10, 11, 12, 13, 14 });
+}
+
+/*
+** g_v8hf:
+**     fmov    h0, h0
+**     ret
+*/
+v8hf
+g_v8hf (v8hf x)
+{
+  return __builtin_shuffle ((v8hf){ 0, 0, 0, 0, 0, 0, 0, 0 }, x, (v8hi){ 8, 0, 
1, 2, 3, 4, 5, 6 });
+}
+
+/*
+** h_v8hf:
+**     fmov    s0, s0
+**     ret
+*/
+v8hf
+h_v8hf (v8hf x)
+{
+  return __builtin_shuffle (x, (v8hf){ 0, 0, 0, 0, 0, 0, 0, 0 }, (v8hi){ 0, 1, 
8, 9, 10, 11, 12, 13 });
+}
+
+/*
+** f_v4bf:
+**     fmov    h0, h0
+**     ret
+*/
+v4bf
+f_v4bf (v4bf x)
+{
+  return __builtin_shuffle (x, (v4bf){ 0, 0, 0, 0 }, (v4hi){ 0, 4, 5, 6 });
+}
+
+/*
+** g_v4bf:
+**     fmov    h0, h0
+**     ret
+*/
+v4bf
+g_v4bf (v4bf x)
+{
+  return __builtin_shuffle ((v4bf){ 0, 0, 0, 0 }, x, (v4hi){ 4, 0, 1, 2 });
+}
+
+/*
+** h_v4bf:
+**     fmov    s0, s0
+**     ret
+*/
+v4bf
+h_v4bf (v4bf x)
+{
+  return __builtin_shuffle (x, (v4bf){ 0, 0, 0, 0 }, (v4hi){ 0, 1, 4, 5 });
+}
+
+/*
+** f_v8bf:
+**     fmov    h0, h0
+**     ret
+*/
+v8bf
+f_v8bf (v8bf x)
+{
+  return __builtin_shuffle (x, (v8bf){ 0, 0, 0, 0, 0, 0, 0, 0 }, (v8hi){ 0, 8, 
9, 10, 11, 12, 13, 14 });
+}
+
+/*
+** g_v8bf:
+**     fmov    h0, h0
+**     ret
+*/
+v8bf
+g_v8bf (v8bf x)
+{
+  return __builtin_shuffle ((v8bf){ 0, 0, 0, 0, 0, 0, 0, 0 }, x, (v8hi){ 8, 0, 
1, 2, 3, 4, 5, 6 });
+}
+
+/*
+** h_v8bf:
+**     fmov    s0, s0
+**     ret
+*/
+v8bf
+h_v8bf (v8bf x)
+{
+  return __builtin_shuffle (x, (v8bf){ 0, 0, 0, 0, 0, 0, 0, 0 }, (v8hi){ 0, 1, 
8, 9, 10, 11, 12, 13 });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-be-1.c b/gcc/testsuite/gcc.target/aarch64/fmov-be-1.c
new file mode 100644
index 00000000000..1f070dc4800
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-be-1.c
@@ -0,0 +1,144 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbig-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef int v2si __attribute__ ((vector_size (8)));
+typedef float v2sf __attribute__ ((vector_size (8)));
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef long v2di __attribute__ ((vector_size (16)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+
+/*
+** f_v4hi:
+**     fmov    s0, s0
+**     ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 5, 2, 3 });
+}
+
+/*
+** f_v8hi:
+**     fmov    s0, s0
+**     ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+  return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 }, (v8hi){ 8, 9, 
10, 11, 12, 13, 6, 7 });
+}
+
+/*
+** f_v2si:
+**     fmov    s0, s0
+**     ret
+*/
+v2si
+f_v2si (v2si x)
+{
+  return __builtin_shuffle (x, (v2si){ 0, 0 }, (v2si){ 3, 1 });
+}
+
+/*
+** g_v2si:
+**     fmov    s0, s0
+**     ret
+*/
+v2si
+g_v2si (v2si x)
+{
+  return __builtin_shuffle ((v2si){ 0, 0 }, x, (v2si){ 0, 3 });
+}
+
+/*
+** f_v2sf:
+**     fmov    s0, s0
+**     ret
+*/
+v2sf
+f_v2sf (v2sf x)
+{
+  return __builtin_shuffle (x, (v2sf){ 0, 0 }, (v2si){ 2, 1 });
+}
+
+/*
+** f_v2di:
+**     fmov    d0, d0
+**     ret
+*/
+v2di
+f_v2di (v2di x)
+{
+  return __builtin_shuffle (x, (v2di){ 0, 0 }, (v2di){ 2, 1 });
+}
+
+/*
+** g_v2di:
+**     fmov    d0, d0
+**     ret
+*/
+v2di
+g_v2di (v2di x)
+{
+  return __builtin_shuffle ((v2di){ 0, 0 }, x, (v2di){ 0, 3 });
+}
+
+/*
+** f_v2df:
+**     fmov    d0, d0
+**     ret
+*/
+v2df
+f_v2df (v2df x)
+{
+  return __builtin_shuffle (x, (v2df){ 0, 0 }, (v2di){ 2, 1 });
+}
+
+/*
+** f_v4si:
+**     fmov    d0, d0
+**     ret
+*/
+v4si
+f_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 6, 7, 2, 3 });
+}
+
+/*
+** g_v4si:
+**     fmov    d0, d0
+**     ret
+*/
+v4si
+g_v4si (v4si x)
+{
+  return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 2, 3, 6, 7 });
+}
+
+/*
+** h_v4si:
+**     fmov    s0, s0
+**     ret
+*/
+v4si
+h_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 4, 5, 6, 3 });
+}
+
+/*
+** f_v4sf:
+**     fmov    d0, d0
+**     ret
+*/
+v4sf
+f_v4sf (v4sf x)
+{
+  return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 6, 7, 2, 3 });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-be-2.c b/gcc/testsuite/gcc.target/aarch64/fmov-be-2.c
new file mode 100644
index 00000000000..a7764019994
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-be-2.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbig-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target ("arch=armv8.2-a+fp16")
+
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+
+/*
+** f_v4hi:
+**     fmov    h0, h0
+**     ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 5, 6, 3 });
+}
+
+/*
+** g_v4hi:
+**     fmov    h0, h0
+**     ret
+*/
+v4hi
+g_v4hi (v4hi x)
+{
+  return __builtin_shuffle ((v4hi){ 0, 0, 0, 0 }, x, (v4hi){ 0, 1, 2, 7 });
+}
+
+/*
+** f_v8hi:
+**     fmov    h0, h0
+**     ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+  return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 }, (v8hi){ 8, 9, 
10, 11, 12, 13, 14, 7 });
+}
+
+/*
+** g_v8hi:
+**     fmov    h0, h0
+**     ret
+*/
+v8hi
+g_v8hi (v8hi x)
+{
+  return __builtin_shuffle ((v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 }, x, (v8hi){ 0, 1, 
2, 3, 4, 5, 6, 15 });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-be-3.c b/gcc/testsuite/gcc.target/aarch64/fmov-be-3.c
new file mode 100644
index 00000000000..de9f927da0c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-be-3.c
@@ -0,0 +1,144 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbig-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target ("arch=armv8.2-a+fp16")
+
+typedef __fp16 v4hf __attribute__ ((vector_size (8)));
+typedef __fp16 v8hf __attribute__ ((vector_size (16)));
+typedef __bf16 v4bf __attribute__ ((vector_size (8)));
+typedef __bf16 v8bf __attribute__ ((vector_size (16)));
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+
+/*
+** f_v4hf:
+**     fmov    h0, h0
+**     ret
+*/
+v4hf
+f_v4hf (v4hf x)
+{
+  return __builtin_shuffle (x, (v4hf){ 0, 0, 0, 0 }, (v4hi){ 4, 5, 6, 3 });
+}
+
+/*
+** g_v4hf:
+**     fmov    h0, h0
+**     ret
+*/
+v4hf
+g_v4hf (v4hf x)
+{
+  return __builtin_shuffle ((v4hf){ 0, 0, 0, 0 }, x, (v4hi){ 0, 1, 2, 7 });
+}
+
+/*
+** h_v4hf:
+**     fmov    s0, s0
+**     ret
+*/
+v4hf
+h_v4hf (v4hf x)
+{
+  return __builtin_shuffle (x, (v4hf){ 0, 0, 0, 0 }, (v4hi){ 4, 5, 2, 3 });
+}
+
+/*
+** f_v8hf:
+**     fmov    h0, h0
+**     ret
+*/
+v8hf
+f_v8hf (v8hf x)
+{
+  return __builtin_shuffle (x, (v8hf){ 0, 0, 0, 0, 0, 0, 0, 0 }, (v8hi){ 8, 9, 
10, 11, 12, 13, 14, 7 });
+}
+
+/*
+** g_v8hf:
+**     fmov    h0, h0
+**     ret
+*/
+v8hf
+g_v8hf (v8hf x)
+{
+  return __builtin_shuffle ((v8hf){ 0, 0, 0, 0, 0, 0, 0, 0 }, x, (v8hi){ 0, 1, 
2, 3, 4, 5, 6, 15 });
+}
+
+/*
+** h_v8hf:
+**     fmov    s0, s0
+**     ret
+*/
+v8hf
+h_v8hf (v8hf x)
+{
+  return __builtin_shuffle (x, (v8hf){ 0, 0, 0, 0, 0, 0, 0, 0 }, (v8hi){ 8, 9, 
10, 11, 12, 13, 6, 7 });
+}
+
+/*
+** f_v4bf:
+**     fmov    h0, h0
+**     ret
+*/
+v4bf
+f_v4bf (v4bf x)
+{
+  return __builtin_shuffle (x, (v4bf){ 0, 0, 0, 0 }, (v4hi){ 4, 5, 6, 3 });
+}
+
+/*
+** g_v4bf:
+**     fmov    h0, h0
+**     ret
+*/
+v4bf
+g_v4bf (v4bf x)
+{
+  return __builtin_shuffle ((v4bf){ 0, 0, 0, 0 }, x, (v4hi){ 0, 1, 2, 7 });
+}
+
+/*
+** h_v4bf:
+**     fmov    s0, s0
+**     ret
+*/
+v4bf
+h_v4bf (v4bf x)
+{
+  return __builtin_shuffle (x, (v4bf){ 0, 0, 0, 0 }, (v4hi){ 4, 5, 2, 3 });
+}
+
+/*
+** f_v8bf:
+**     fmov    h0, h0
+**     ret
+*/
+v8bf
+f_v8bf (v8bf x)
+{
+  return __builtin_shuffle (x, (v8bf){ 0, 0, 0, 0, 0, 0, 0, 0 }, (v8hi){ 8, 9, 
10, 11, 12, 13, 14, 7 });
+}
+
+/*
+** g_v8bf:
+**     fmov    h0, h0
+**     ret
+*/
+v8bf
+g_v8bf (v8bf x)
+{
+  return __builtin_shuffle ((v8bf){ 0, 0, 0, 0, 0, 0, 0, 0 }, x, (v8hi){ 0, 1, 
2, 3, 4, 5, 6, 15 });
+}
+
+/*
+** h_v8bf:
+**     fmov    s0, s0
+**     ret
+*/
+v8bf
+h_v8bf (v8bf x)
+{
+  return __builtin_shuffle (x, (v8bf){ 0, 0, 0, 0, 0, 0, 0, 0 }, (v8hi){ 8, 9, 
10, 11, 12, 13, 6, 7 });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c b/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c
index b34b902cf27..025350500c6 100644
--- a/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c
@@ -28,8 +28,10 @@ FOO(float64x2_t)
 
 /* { dg-final { scan-assembler-times {ins\tv[0-9]+\.b\[1\], wzr} 2 { target 
aarch64_little_endian } } } */
 /* { dg-final { scan-assembler-times {ins\tv[0-9]+\.h\[1\], wzr} 4 { target 
aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.s\[1\], wzr} 4 { target 
aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.d\[1\], xzr} 2 { target 
aarch64_little_endian } } } */
+/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.s\[1\], wzr} 2 { target 
aarch64_little_endian } } } */
+/* { dg-final { scan-assembler-not {ins\tv[0-9]+\.d\[1\], xzr} { target 
aarch64_little_endian } } } */
+/* { dg-final { scan-assembler-times {fmov\ts0, s0} 2 { target 
aarch64_little_endian } } } */
+/* { dg-final { scan-assembler-times {fmov\td0, d0} 2 { target 
aarch64_little_endian } } } */
 
 /* { dg-final { scan-assembler-times {ins\tv[0-9]+\.b\[6\], wzr} 1 { target 
aarch64_big_endian } } } */
 /* { dg-final { scan-assembler-times {ins\tv[0-9]+\.b\[14\], wzr} 1 { target 
aarch64_big_endian } } } */
-- 
2.17.1

[PATCH v3] aarch64: Recognize vector permute patterns suitable for FMOV [PR100165]

Reply via email to