Hi All,
This patch adds an implementation of masked copysign, along with an optimized
expansion of masked copysign (x, -1), which needs only a single predicated ORR
of the sign bit.
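To recap the trick: copysign is a pure bitwise operation, and when the sign
source is a negative constant the usual AND/AND/ORR expansion collapses into a
single ORR with the sign-bit mask.  A minimal scalar sketch for illustration
only (the helper names are mine; the expander itself emits the predicated SVE
integer operations):

#include <stdint.h>
#include <string.h>

/* copysign (x, y): mantissa and exponent bits from x, sign bit from y,
   i.e. (y & signmask) | (x & ~signmask).  */
static float
copysignf_bits (float x, float y)
{
  uint32_t ix, iy, ires;
  float res;
  memcpy (&ix, &x, sizeof ix);
  memcpy (&iy, &y, sizeof iy);
  ires = (iy & 0x80000000u) | (ix & 0x7fffffffu);
  memcpy (&res, &ires, sizeof res);
  return res;
}

/* copysign (x, -1.0): the sign bit of the source is known to be set,
   so the two ANDs fold away and a single ORR suffices.  */
static float
copysignf_neg1 (float x)
{
  uint32_t ix;
  float res;
  memcpy (&ix, &x, sizeof ix);
  ix |= 0x80000000u;
  memcpy (&res, &ix, sizeof res);
  return res;
}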
Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
PR tree-optimization/109154
* config/aarch64/aarch64-sve.md (cond_copysign<mode>): New expander.
gcc/testsuite/ChangeLog:
PR tree-optimization/109154
* gcc.target/aarch64/sve/fneg-abs_5.c: New test.
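For reference, the cond_copysign optab computes, for each lane,
copysign (operand 2, operand 3) where the governing predicate (operand 1) is
true, and operand 4 (the fallback value) otherwise.  A scalar model of a
single lane, for illustration only (the function name is mine):

#include <math.h>

/* One lane of cond_copysign: active lanes get copysign (x, y),
   inactive lanes get the fallback value.  */
static double
cond_copysign_lane (int pred, double x, double y, double fallback)
{
  return pred ? copysign (x, y) : fallback;
}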
--- inline copy of patch --
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 071400c820a5b106ddf9dc9faebb117975d74ea0..00ca30c24624dc661254568f45b61a14aa11c305 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -6429,6 +6429,60 @@ (define_expand "copysign<mode>3"
}
)
+(define_expand "cond_copysign<mode>"
+ [(match_operand:SVE_FULL_F 0 "register_operand")
+ (match_operand:<VPRED> 1 "register_operand")
+ (match_operand:SVE_FULL_F 2 "register_operand")
+ (match_operand:SVE_FULL_F 3 "nonmemory_operand")
+ (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+ "TARGET_SVE"
+ {
+ rtx sign = gen_reg_rtx (<V_INT_EQUIV>mode);
+ rtx mant = gen_reg_rtx (<V_INT_EQUIV>mode);
+ rtx int_res = gen_reg_rtx (<V_INT_EQUIV>mode);
+ int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
+
+ rtx arg2 = lowpart_subreg (<V_INT_EQUIV>mode, operands[2], <MODE>mode);
+ rtx arg3 = lowpart_subreg (<V_INT_EQUIV>mode, operands[3], <MODE>mode);
+ rtx arg4 = lowpart_subreg (<V_INT_EQUIV>mode, operands[4], <MODE>mode);
+
+ rtx v_sign_bitmask
+ = aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
+ HOST_WIDE_INT_M1U << bits);
+
+ /* copysign (x, -1) should instead be expanded as an ORR with the
+    sign bit.  */
+ if (!REG_P (operands[3]))
+ {
+ rtx op = unwrap_const_vec_duplicate (operands[3]);
+ if (CONST_DOUBLE_P (op)
+     && real_to_integer (CONST_DOUBLE_REAL_VALUE (op)) == -1)
+ {
+ arg3 = force_reg (<V_INT_EQUIV>mode, v_sign_bitmask);
+ emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], arg2,
+ arg3, arg4));
+ emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
+ DONE;
+ }
+ }
+
+ /* The sign source may still be a constant here; force it into a
+    register for the integer AND below.  */
+ if (!REG_P (operands[3]))
+   arg3 = force_reg (<V_INT_EQUIV>mode, arg3);
+ emit_insn (gen_and<v_int_equiv>3 (sign, arg3, v_sign_bitmask));
+ emit_insn (gen_and<v_int_equiv>3
+ (mant, arg2,
+ aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
+ ~(HOST_WIDE_INT_M1U
+ << bits))));
+ emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], sign, mant,
+ arg4));
+ emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
+ DONE;
+ }
+)
+
(define_expand "xorsign<mode>3"
[(match_operand:SVE_FULL_F 0 "register_operand")
(match_operand:SVE_FULL_F 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
new file mode 100644
index 0000000000000000000000000000000000000000..f4ecbeecbe1290134e688f46a4389d17155e4a0a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+** ...
+**	orr	z[0-9]+\.s, p[0-9]+/m, z[0-9]+\.s, z[0-9]+\.s
+** ...
+*/
+void f1 (float32_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ if (a[i] > n)
+ a[i] = -fabsf (a[i]);
+ else
+ a[i] = n;
+}
+
+/*
+** f2:
+** ...
+**	orr	z[0-9]+\.d, p[0-9]+/m, z[0-9]+\.d, z[0-9]+\.d
+** ...
+*/
+void f2 (float64_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ if (a[i] > n)
+ a[i] = -fabs (a[i]);
+ else
+ a[i] = n;
+}
--