Hi All,
For IEEE 754 floating point formats we can replace a sequence of alternating
+/- operations with an fneg of a wider type followed by an fadd. This
eliminates the need for a permutation. This patch adds a match.pd rule to
recognize and perform this rewriting.
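To see why this works (a standalone sketch, not part of the patch): on a
little-endian IEEE 754 target the sign bit of the wider element coincides
with the sign bit of the narrow element sitting in its high half, so
negating the wider view flips the sign of only that lane of each pair, and
a subsequent add then produces the alternating add/sub:

/* Illustrative only; assumes a little-endian IEEE 754 target.  */
#include <stdio.h>
#include <string.h>

int main (void)
{
  float pair[2] = { 1.0f, 2.0f };
  double wide;
  memcpy (&wide, pair, sizeof wide);    /* view both floats as one double */
  wide = -wide;                         /* fneg of the wider element */
  memcpy (pair, &wide, sizeof pair);
  printf ("%f %f\n", pair[0], pair[1]); /* prints: 1.000000 -2.000000 */
  return 0;
}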
For
void f (float *restrict a, float *restrict b, float *res, int n)
{
for (int i = 0; i < (n & -4); i+=2)
{
res[i+0] = a[i+0] + b[i+0];
res[i+1] = a[i+1] - b[i+1];
}
}
we generate:
.L3:
ldr q1, [x1, x3]
ldr q0, [x0, x3]
fneg v1.2d, v1.2d
fadd v0.4s, v0.4s, v1.4s
str q0, [x2, x3]
add x3, x3, 16
cmp x3, x4
bne .L3
now instead of:
.L3:
ldr q1, [x0, x3]
ldr q2, [x1, x3]
fadd v0.4s, v1.4s, v2.4s
fsub v1.4s, v1.4s, v2.4s
tbl v0.16b, {v0.16b - v1.16b}, v3.16b
str q0, [x2, x3]
add x3, x3, 16
cmp x3, x4
bne .L3
Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
Thanks to George Steed for the idea.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
* match.pd: Add fneg/fadd rule.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/addsub_1.c: New test.
* gcc.target/aarch64/sve/addsub_1.c: New test.
--- inline copy of patch --
diff --git a/gcc/match.pd b/gcc/match.pd
index 51b0a1b562409af535e53828a10c30b8a3e1ae2e..af1c98d4a2831f38258d6fc1bbe811c8ee6c7c6e 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -7612,6 +7612,49 @@ and,
(simplify (reduc (op @0 VECTOR_CST@1))
(op (reduc:type @0) (reduc:type @1))))
+/* Simplify vector floating point operations of alternating sub/add pairs
+   into using an fneg of a wider element type followed by a normal add.
+   Under IEEE 754 the fneg of the wider type will negate every even entry
+   and when doing an add we get a sub of the even elements and an add of
+   the odd elements. */
+(simplify
+ (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)
+ (if (!VECTOR_INTEGER_TYPE_P (type) && !BYTES_BIG_ENDIAN)
+ (with
+ {
+ /* Build a vector of integers from the tree mask. */
+ vec_perm_builder builder;
+ if (!tree_to_vec_perm_builder (&builder, @2))
+ return NULL_TREE;
+
+ /* Create a vec_perm_indices for the integer vector. */
+ poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
+ vec_perm_indices sel (builder, 2, nelts);
+ }
+ (if (sel.series_p (0, 2, 0, 2))
+ (with
+ {
+ machine_mode vec_mode = TYPE_MODE (type);
+ auto elem_mode = GET_MODE_INNER (vec_mode);
+ auto nunits = exact_div (GET_MODE_NUNITS (vec_mode), 2);
+ tree stype;
+ switch (elem_mode)
+ {
+ case E_HFmode:
+ stype = float_type_node;
+ break;
+ case E_SFmode:
+ stype = double_type_node;
+ break;
+ default:
+ return NULL_TREE;
+ }
+ tree ntype = build_vector_type (stype, nunits);
+ if (!ntype)
+ return NULL_TREE;
+ }
+ (plus (view_convert:type (negate (view_convert:ntype @1))) @0))))))
+
(simplify
(vec_perm @0 @1 VECTOR_CST@2)
(with
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..1fb91a34c421bbd2894faa0dbbf1b47ad43310c4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
+/* { dg-options "-Ofast" } */
+/* { dg-add-options arm_v8_2a_fp16_neon } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#pragma GCC target "+nosve"
+
+/*
+** f1:
+** ...
+** fneg v[0-9]+.2d, v[0-9]+.2d
+** fadd v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void f1 (float *restrict a, float *restrict b, float *res, int n)
+{
+ for (int i = 0; i < (n & -4); i+=2)
+ {
+ res[i+0] = a[i+0] + b[i+0];
+ res[i+1] = a[i+1] - b[i+1];
+ }
+}
+
+/*
+** d1:
+** ...
+** fneg v[0-9]+.4s, v[0-9]+.4s
+** fadd v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
+{
+ for (int i = 0; i < (n & -8); i+=2)
+ {
+ res[i+0] = a[i+0] + b[i+0];
+ res[i+1] = a[i+1] - b[i+1];
+ }
+}
+
+/*
+** e1:
+** ...
+** fadd v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** fsub v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** ins v[0-9]+.d\[1\], v[0-9]+.d\[1\]
+** ...
+*/
+void e1 (double *restrict a, double *restrict b, double *res, int n)
+{
+ for (int i = 0; i < (n & -4); i+=2)
+ {
+ res[i+0] = a[i+0] + b[i+0];
+ res[i+1] = a[i+1] - b[i+1];
+ }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..ea7f9d9db2c8c9a3efe5c7951a314a29b7a7a922
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+/*
+** f1:
+** ...
+** fneg z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
+** fadd z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+void f1 (float *restrict a, float *restrict b, float *res, int n)
+{
+ for (int i = 0; i < (n & -4); i+=2)
+ {
+ res[i+0] = a[i+0] + b[i+0];
+ res[i+1] = a[i+1] - b[i+1];
+ }
+}
+
+/*
+** d1:
+** ...
+** fneg z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
+** fadd z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
+** ...
+*/
+void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
+{
+ for (int i = 0; i < (n & -8); i+=2)
+ {
+ res[i+0] = a[i+0] + b[i+0];
+ res[i+1] = a[i+1] - b[i+1];
+ }
+}
+
+/*
+** e1:
+** ...
+** fsub z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
+** movprfx z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
+** fadd z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+** ...
+*/
+void e1 (double *restrict a, double *restrict b, double *res, int n)
+{
+ for (int i = 0; i < (n & -4); i+=2)
+ {
+ res[i+0] = a[i+0] + b[i+0];
+ res[i+1] = a[i+1] - b[i+1];
+ }
+}
--