Hi all,
Given code:
/* Reduced reproducer: for each i in [0, ilast) stores
   (0.0f > w2[i] ? 0.0f : w2[i]) into w[i].
   MAX is kept exactly as in the failing input; its arguments are not
   parenthesized, which is harmless for the simple operands used below.  */
#define MAX(a, b) (a > b ? a : b)
void foo (int ilast, float* w, float* w2)
{
int i;
for (i = 0; i < ilast; ++i)
{
w[i] = MAX (0.0f, w2[i]);
}
}
compiled with
-O1 -funsafe-math-optimizations -ftree-vectorize -mfpu=neon -mfloat-abi=hard
on arm-none-eabi will cause an ICE when trying to expand the vcond pattern.
Looking at the vcond pattern in neon.md, the predicate for the
comparison operator (arm_comparison_operator) uses
maybe_get_arm_condition_code, which is not needed for vcond since we
don't care about the ARM condition code (we can handle all the
comparison cases ourselves in the expander).
Changing the predicate to comparison_operator allows the expander to proceed,
but it ICEs again because the pattern doesn't handle the floating-point
unordered cases (e.g. UNGT, UNORDERED, UNLE, etc.).
Adding support for the unordered cases is very similar to what was added
to the aarch64 port here:
http://gcc.gnu.org/ml/gcc-patches/2013-01/msg00957.html
This patch adapts that code to the arm port.
Added the testcase that exposed the ICE initially, and also the UNORDERED
and LTGT variations of it.
No regressions on arm-none-eabi.
Ok for trunk?
Thanks,
Kyrill
gcc/ChangeLog
2013-03-18 Kyrylo Tkachov <kyrylo.tkac...@arm.com>
* config/arm/iterators.md (v_cmp_result): New mode attribute.
* config/arm/neon.md (vcond<mode><mode>): Handle unordered cases.
gcc/testsuite/ChangeLog
2013-03-18 Kyrylo Tkachov <kyrylo.tkac...@arm.com>
* gcc.target/arm/neon-vcond-gt.c: New test.
* gcc.target/arm/neon-vcond-ltgt.c: Likewise.
* gcc.target/arm/neon-vcond-unordered.c: Likewise.
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 252f18b..b3ad42b 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -314,6 +314,12 @@
(V2SF "V2SI") (V4SF "V4SI")
(DI "DI") (V2DI "V2DI")])
+(define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")
+ (V4HI "v4hi") (V8HI "v8hi")
+ (V2SI "v2si") (V4SI "v4si")
+ (DI "di") (V2DI "v2di")
+ (V2SF "v2si") (V4SF "v4si")])
+
;; Get element type from double-width mode, for operations where we
;; don't care about signedness.
(define_mode_attr V_if_elem [(V8QI "i8") (V16QI "i8")
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 79b3f66..99fb5e8 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1721,80 +1721,144 @@
(define_expand "vcond<mode><mode>"
[(set (match_operand:VDQW 0 "s_register_operand" "")
(if_then_else:VDQW
- (match_operator 3 "arm_comparison_operator"
+ (match_operator 3 "comparison_operator"
[(match_operand:VDQW 4 "s_register_operand" "")
(match_operand:VDQW 5 "nonmemory_operand" "")])
(match_operand:VDQW 1 "s_register_operand" "")
(match_operand:VDQW 2 "s_register_operand" "")))]
"TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
{
- rtx mask;
- int inverse = 0, immediate_zero = 0;
- /* See the description of "magic" bits in the 'T' case of
- arm_print_operand. */
HOST_WIDE_INT magic_word = (<MODE>mode == V2SFmode || <MODE>mode == V4SFmode)
? 3 : 1;
rtx magic_rtx = GEN_INT (magic_word);
-
- mask = gen_reg_rtx (<V_cmp_result>mode);
-
- if (operands[5] == CONST0_RTX (<MODE>mode))
- immediate_zero = 1;
- else if (!REG_P (operands[5]))
- operands[5] = force_reg (<MODE>mode, operands[5]);
-
+ int inverse = 0;
+ int swap_bsl_operands = 0;
+ rtx mask = gen_reg_rtx (<V_cmp_result>mode);
+ rtx tmp = gen_reg_rtx (<V_cmp_result>mode);
+
+ rtx (*base_comparison) (rtx, rtx, rtx, rtx);
+ rtx (*complimentary_comparison) (rtx, rtx, rtx, rtx);
+
switch (GET_CODE (operands[3]))
{
case GE:
- emit_insn (gen_neon_vcge<mode> (mask, operands[4], operands[5],
- magic_rtx));
+ case LE:
+ case EQ:
+ if (!REG_P (operands[5])
+ && (operands[5] != CONST0_RTX (<MODE>mode)))
+ operands[5] = force_reg (<MODE>mode, operands[5]);
break;
-
+ default:
+ if (!REG_P (operands[5]))
+ operands[5] = force_reg (<MODE>mode, operands[5]);
+ }
+
+ switch (GET_CODE (operands[3]))
+ {
+ case LT:
+ case UNLT:
+ inverse = 1;
+ /* Fall through. */
+ case GE:
+ case UNGE:
+ case ORDERED:
+ case UNORDERED:
+ base_comparison = gen_neon_vcge<mode>;
+ complimentary_comparison = gen_neon_vcgt<mode>;
+ break;
+ case LE:
+ case UNLE:
+ inverse = 1;
+ /* Fall through. */
case GT:
- emit_insn (gen_neon_vcgt<mode> (mask, operands[4], operands[5],
- magic_rtx));
+ case UNGT:
+ base_comparison = gen_neon_vcgt<mode>;
+ complimentary_comparison = gen_neon_vcge<mode>;
break;
-
case EQ:
- emit_insn (gen_neon_vceq<mode> (mask, operands[4], operands[5],
- magic_rtx));
+ case NE:
+ case UNEQ:
+ base_comparison = gen_neon_vceq<mode>;
+ complimentary_comparison = gen_neon_vceq<mode>;
break;
-
+ default:
+ gcc_unreachable ();
+ }
+
+ switch (GET_CODE (operands[3]))
+ {
+ case LT:
case LE:
- if (immediate_zero)
- emit_insn (gen_neon_vcle<mode> (mask, operands[4], operands[5],
- magic_rtx));
+ case GT:
+ case GE:
+ case EQ:
+ /* The easy case. Here we emit one of vcge, vcgt or vceq.
+ As a LT b <=> b GT a && a LE b <=> b GE a. Our transformations are:
+ a GE b -> a GE b
+ a GT b -> a GT b
+ a LE b -> b GE a
+ a LT b -> b GT a
+ a EQ b -> a EQ b */
+
+ if (!inverse)
+ emit_insn (base_comparison (mask, operands[4], operands[5], magic_rtx));
else
- emit_insn (gen_neon_vcge<mode> (mask, operands[5], operands[4],
- magic_rtx));
+ emit_insn (complimentary_comparison (mask, operands[5], operands[4], magic_rtx));
break;
-
- case LT:
- if (immediate_zero)
- emit_insn (gen_neon_vclt<mode> (mask, operands[4], operands[5],
- magic_rtx));
+ case UNLT:
+ case UNLE:
+ case UNGT:
+ case UNGE:
+ case NE:
+ /* Vector compare returns false for lanes which are unordered, so if we use
+ the inverse of the comparison we actually want to emit, then
+ swap the operands to BSL, we will end up with the correct result.
+ Note that a NE NaN and NaN NE b are true for all a, b.
+
+ Our transformations are:
+ a UNGE b -> !(b GT a)
+ a UNGT b -> !(b GE a)
+ a UNLE b -> !(a GT b)
+ a UNLT b -> !(a GE b)
+ a NE b -> !(a EQ b) */
+
+ if (inverse)
+ emit_insn (base_comparison (mask, operands[4], operands[5], magic_rtx));
else
- emit_insn (gen_neon_vcgt<mode> (mask, operands[5], operands[4],
- magic_rtx));
+ emit_insn (complimentary_comparison (mask, operands[5], operands[4], magic_rtx));
+
+ swap_bsl_operands = 1;
break;
-
- case NE:
- emit_insn (gen_neon_vceq<mode> (mask, operands[4], operands[5],
- magic_rtx));
- inverse = 1;
+ case UNEQ:
+ /* We check (a > b || b > a). Combining these comparisons gives us
+ true iff (a != b && a ORDERED b); swapping the operands to BSL
+ will then give us (a == b || a UNORDERED b) as intended. */
+
+ emit_insn (gen_neon_vcgt<mode> (mask, operands[4], operands[5], magic_rtx));
+ emit_insn (gen_neon_vcgt<mode> (tmp, operands[5], operands[4], magic_rtx));
+ emit_insn (gen_ior<v_cmp_result>3 (mask, mask, tmp));
+ swap_bsl_operands = 1;
+ break;
+ case UNORDERED:
+ /* Operands are ORDERED iff (a > b || b >= a).
+ Swapping the operands to BSL will give the UNORDERED case. */
+ swap_bsl_operands = 1;
+ /* Fall through. */
+ case ORDERED:
+ emit_insn (gen_neon_vcgt<mode> (tmp, operands[4], operands[5], magic_rtx));
+ emit_insn (gen_neon_vcge<mode> (mask, operands[5], operands[4], magic_rtx));
+ emit_insn (gen_ior<v_cmp_result>3 (mask, mask, tmp));
break;
-
default:
gcc_unreachable ();
}
-
- if (inverse)
+
+ if (swap_bsl_operands)
emit_insn (gen_neon_vbsl<mode> (operands[0], mask, operands[2],
operands[1]));
else
emit_insn (gen_neon_vbsl<mode> (operands[0], mask, operands[1],
operands[2]));
-
DONE;
})
diff --git a/gcc/testsuite/gcc.target/arm/neon-vcond-gt.c b/gcc/testsuite/gcc.target/arm/neon-vcond-gt.c
new file mode 100644
index 0000000..86ccf95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vcond-gt.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O1 -funsafe-math-optimizations -ftree-vectorize" } */
+/* { dg-add-options arm_neon } */
+
+#define MAX(a, b) (a > b ? a : b)
+void foo (int ilast,float* w, float* w2)
+{
+ int i;
+ for (i = 0; i < ilast; ++i)
+ {
+ w[i] = MAX (0.0f, w2[i]);
+ }
+}
+
+/* { dg-final { scan-assembler "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
+/* { dg-final { scan-assembler "vbit\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vcond-ltgt.c b/gcc/testsuite/gcc.target/arm/neon-vcond-ltgt.c
new file mode 100644
index 0000000..acb23a9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vcond-ltgt.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O1 -funsafe-math-optimizations -ftree-vectorize" } */
+/* { dg-add-options arm_neon } */
+
+#define LTGT(a, b) (__builtin_islessgreater (a, b) ? a : b)
+void foo (int ilast,float* w, float* w2)
+{
+ int i;
+ for (i = 0; i < ilast; ++i)
+ {
+ w[i] = LTGT (0.0f, w2[i]);
+ }
+}
+
+/* { dg-final { scan-assembler-times "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" 2 } } */
+/* { dg-final { scan-assembler "vorr\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
+/* { dg-final { scan-assembler "vbsl\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vcond-unordered.c b/gcc/testsuite/gcc.target/arm/neon-vcond-unordered.c
new file mode 100644
index 0000000..c3e448d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vcond-unordered.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O1 -funsafe-math-optimizations -ftree-vectorize" } */
+/* { dg-add-options arm_neon } */
+
+#define UNORD(a, b) (__builtin_isunordered (a, b) ? a : b)
+void foo (int ilast,float* w, float* w2)
+{
+ int i;
+ for (i = 0; i < ilast; ++i)
+ {
+ w[i] = UNORD (0.0f, w2[i]);
+ }
+}
+
+/* { dg-final { scan-assembler "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
+/* { dg-final { scan-assembler "vcge\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
+/* { dg-final { scan-assembler "vorr\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
+/* { dg-final { scan-assembler "vbsl\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */