This patch optimizes certain vector permute expansions with the FMOV instruction
when one of the input vectors is a vector of all zeros and the result of the
vector permute is as if the upper lane of the non-zero input vector were set to
zero while the lower lane remained unchanged.
Note that the patch also propagates zero_op0_p and zero_op1_p during
re-encoding now. They are used by aarch64_evpc_fmov to check whether the input
vectors are valid candidates.
PR target/100165
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md (aarch64_simd_vec_set_zero_fmov<mode>):
New define_insn.
* config/aarch64/aarch64.cc (aarch64_evpc_reencode): Copy zero_op0_p and
zero_op1_p.
(aarch64_evpc_fmov): New function.
(aarch64_expand_vec_perm_const_1): Add call to aarch64_evpc_fmov.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/vec-set-zero.c: Update test accordingly.
* gcc.target/aarch64/fmov.c: New test.
* gcc.target/aarch64/fmov-be.c: New test.
Signed-off-by: Pengxuan Zheng <[email protected]>
---
gcc/config/aarch64/aarch64-simd.md | 14 +++
gcc/config/aarch64/aarch64.cc | 74 +++++++++++-
gcc/testsuite/gcc.target/aarch64/fmov-be.c | 74 ++++++++++++
gcc/testsuite/gcc.target/aarch64/fmov.c | 110 ++++++++++++++++++
.../gcc.target/aarch64/vec-set-zero.c | 6 +-
5 files changed, 275 insertions(+), 3 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-be.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov.c
diff --git a/gcc/config/aarch64/aarch64-simd.md
b/gcc/config/aarch64/aarch64-simd.md
index e456f693d2f..543126948e7 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1190,6 +1190,20 @@ (define_insn "aarch64_simd_vec_set<mode>"
[(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")]
)
+(define_insn "aarch64_simd_vec_set_zero_fmov<mode>"
+ [(set (match_operand:VP_2E 0 "register_operand" "=w")
+ (vec_merge:VP_2E
+ (match_operand:VP_2E 1 "aarch64_simd_imm_zero" "Dz")
+ (match_operand:VP_2E 3 "register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")))]
+ "TARGET_SIMD
+ && (ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2]))) == 1)"
+ {
+ return "fmov\\t%<Vetype>0, %<Vetype>3";
+ }
+ [(set_attr "type" "fmov")]
+)
+
(define_insn "aarch64_simd_vec_set_zero<mode>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w")
(vec_merge:VALL_F16
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index a6cc00e74ab..64756920eda 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25950,6 +25950,8 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d)
newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
+ newd.zero_op0_p = d->zero_op0_p;
+ newd.zero_op1_p = d->zero_op1_p;
newd.testing_p = d->testing_p;
newd.one_vector_p = d->one_vector_p;
@@ -26434,6 +26436,74 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
return true;
}
+/* Recognize patterns suitable for the FMOV instructions. */
+static bool
+aarch64_evpc_fmov (struct expand_vec_perm_d *d)
+{
+ machine_mode mode = d->vmode;
+ unsigned HOST_WIDE_INT nelt;
+
+ if (d->vec_flags != VEC_ADVSIMD)
+ return false;
+
+ /* to_constant is safe since this routine is specific to Advanced SIMD
+ vectors. */
+ nelt = d->perm.length ().to_constant ();
+
+ /* Either d->op0 or d->op1 should be a vector of all zeros. */
+ if (nelt != 2 || d->one_vector_p || (!d->zero_op0_p && !d->zero_op1_p))
+ return false;
+
+ HOST_WIDE_INT elt0, elt1;
+ rtx in0 = d->op0;
+ rtx in1 = d->op1;
+
+ if (!d->perm[0].is_constant (&elt0))
+ return false;
+
+ if (!d->perm[1].is_constant (&elt1))
+ return false;
+
+ if (!BYTES_BIG_ENDIAN)
+ {
+ /* Lane 0 of the output vector should come from lane 0 of the non-zero
+ vector. */
+ if (elt0 != (d->zero_op0_p ? 2 : 0))
+ return false;
+
+ /* Lane 1 of the output vector should come from any lane of the zero
+ vector. */
+ if (elt1 != (d->zero_op0_p ? 0 : 2) && elt1 != (d->zero_op0_p ? 1 : 3))
+ return false;
+ }
+ else
+ {
+ /* Lane 0 of the output vector should come from any lane of the zero
+ vector. */
+ if (elt0 != (d->zero_op0_p ? 0 : 2) && elt0 != (d->zero_op0_p ? 1 : 3))
+ return false;
+
+ /* Lane 1 of the output vector should come from lane 1 of the non-zero
+ vector. */
+ if (elt1 != (d->zero_op0_p ? 3 : 1))
+ return false;
+ }
+
+ if (d->testing_p)
+ return true;
+
+ insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
+ expand_operand ops[5];
+ create_output_operand (&ops[0], d->target, mode);
+ create_input_operand (&ops[1], d->zero_op0_p ? in1 : in0, mode);
+ create_integer_operand (&ops[2], BYTES_BIG_ENDIAN ? 1 : 2);
+ create_input_operand (&ops[3], d->zero_op0_p ? in0 : in1, mode);
+ create_integer_operand (&ops[4], 0);
+ expand_insn (icode, 5, ops);
+
+ return true;
+}
+
static bool
aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
@@ -26457,7 +26527,9 @@ aarch64_expand_vec_perm_const_1 (struct
expand_vec_perm_d *d)
{
if (d->vmode == d->op_mode)
{
- if (aarch64_evpc_rev_local (d))
+ if (aarch64_evpc_fmov (d))
+ return true;
+ else if (aarch64_evpc_rev_local (d))
return true;
else if (aarch64_evpc_rev_global (d))
return true;
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-be.c
b/gcc/testsuite/gcc.target/aarch64/fmov-be.c
new file mode 100644
index 00000000000..f864d474a7a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-be.c
@@ -0,0 +1,74 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbig-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef int v2si __attribute__ ((vector_size (8)));
+typedef long v2di __attribute__ ((vector_size (16)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+
+/*
+** f_v2si:
+** fmov s0, s0
+** ret
+*/
+v2si
+f_v2si (v2si x)
+{
+ return __builtin_shuffle (x, (v2si){ 0, 0 }, (v2si){ 3, 1 });
+}
+
+/*
+** g_v2si:
+** fmov s0, s0
+** ret
+*/
+v2si
+g_v2si (v2si x)
+{
+ return __builtin_shuffle ((v2si){ 0, 0 }, x, (v2si){ 0, 3 });
+}
+
+/*
+** f_v2di:
+** fmov d0, d0
+** ret
+*/
+v2di
+f_v2di (v2di x)
+{
+ return __builtin_shuffle (x, (v2di){ 0, 0 }, (v2di){ 2, 1 });
+}
+
+/*
+** g_v2di:
+** fmov d0, d0
+** ret
+*/
+v2di
+g_v2di (v2di x)
+{
+ return __builtin_shuffle ((v2di){ 0, 0 }, x, (v2di){ 0, 3 });
+}
+
+/*
+** f_v4si:
+** fmov d0, d0
+** ret
+*/
+v4si
+f_v4si (v4si x)
+{
+ return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 6, 7, 2, 3 });
+}
+
+/*
+** g_v4si:
+** fmov d0, d0
+** ret
+*/
+v4si
+g_v4si (v4si x)
+{
+ return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 2, 3, 6, 7 });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov.c
b/gcc/testsuite/gcc.target/aarch64/fmov.c
new file mode 100644
index 00000000000..f40ec56c5f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov.c
@@ -0,0 +1,110 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef int v2si __attribute__ ((vector_size (8)));
+typedef float v2sf __attribute__ ((vector_size (8)));
+typedef long v2di __attribute__ ((vector_size (16)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+
+
+/*
+** f_v2si:
+** fmov s0, s0
+** ret
+*/
+v2si
+f_v2si (v2si x)
+{
+ return __builtin_shuffle (x, (v2si){ 0, 0 }, (v2si){ 0, 3 });
+}
+
+/*
+** g_v2si:
+** fmov s0, s0
+** ret
+*/
+v2si
+g_v2si (v2si x)
+{
+ return __builtin_shuffle ((v2si){ 0, 0 }, x, (v2si){ 2, 0 });
+}
+
+/*
+** f_v2sf:
+** fmov s0, s0
+** ret
+*/
+v2sf
+f_v2sf (v2sf x)
+{
+ return __builtin_shuffle (x, (v2sf){ 0, 0 }, (v2si){ 0, 2 });
+}
+
+/*
+** f_v2di:
+** fmov d0, d0
+** ret
+*/
+v2di
+f_v2di (v2di x)
+{
+ return __builtin_shuffle (x, (v2di){ 0, 0 }, (v2di){ 0, 3 });
+}
+
+/*
+** g_v2di:
+** fmov d0, d0
+** ret
+*/
+v2di
+g_v2di (v2di x)
+{
+ return __builtin_shuffle ((v2di){ 0, 0 }, x, (v2di){ 2, 1 });
+}
+
+/*
+** f_v2df:
+** fmov d0, d0
+** ret
+*/
+v2df
+f_v2df (v2df x)
+{
+ return __builtin_shuffle (x, (v2df){ 0, 0 }, (v2di){ 0, 2 });
+}
+
+/*
+** f_v4si:
+** fmov d0, d0
+** ret
+*/
+v4si
+f_v4si (v4si x)
+{
+ return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 1, 4, 5 });
+}
+
+/*
+** g_v4si:
+** fmov d0, d0
+** ret
+*/
+v4si
+g_v4si (v4si x)
+{
+ return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 4, 5, 2, 3 });
+}
+
+/*
+** f_v4sf:
+** fmov d0, d0
+** ret
+*/
+v4sf
+f_v4sf (v4sf x)
+{
+ return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 1, 6, 7 });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c
b/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c
index b34b902cf27..9040839931f 100644
--- a/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c
@@ -28,8 +28,10 @@ FOO(float64x2_t)
/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.b\[1\], wzr} 2 { target
aarch64_little_endian } } } */
/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.h\[1\], wzr} 4 { target
aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.s\[1\], wzr} 4 { target
aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.d\[1\], xzr} 2 { target
aarch64_little_endian } } } */
+/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.s\[1\], wzr} 2 { target
aarch64_little_endian } } } */
+/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.d\[1\], xzr} 0 { target
aarch64_little_endian } } } */
+/* { dg-final { scan-assembler-times {fmov\ts0, s0} 2 { target
aarch64_little_endian } } } */
+/* { dg-final { scan-assembler-times {fmov\td0, d0} 2 { target
aarch64_little_endian } } } */
/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.b\[6\], wzr} 1 { target
aarch64_big_endian } } } */
/* { dg-final { scan-assembler-times {ins\tv[0-9]+\.b\[14\], wzr} 1 { target
aarch64_big_endian } } } */
--
2.17.1