On Sat, Dec 28, 2019 at 02:20:09PM +0100, Uros Bizjak wrote:
> > The conditions are:
> > (define_expand "nearbyint<mode>2"
> >   [(use (match_operand:MODEF 0 "register_operand"))
> >    (use (match_operand:MODEF 1 "nonimmediate_operand"))]
> >   "(TARGET_USE_FANCY_MATH_387
> >     && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
> >           || TARGET_MIX_SSE_I387)
> >     && !flag_trapping_math)
> >    || (TARGET_SSE4_1 && TARGET_SSE_MATH)"
> > and:
> > (define_expand "rint<mode>2"
> >   [(use (match_operand:MODEF 0 "register_operand"))
> >    (use (match_operand:MODEF 1 "nonimmediate_operand"))]
> >   "TARGET_USE_FANCY_MATH_387
> >    || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
> > Only nearbyint tests flag_trapping_math, and only for the pre-sse4.1 case,
> 
> This is correct, since x87 frndint always generates precision
> (inexact) exceptions, but nearbyint should not generate any.
> 
> On a related note, trap on denormal is not an IEEE exception, and
> documentation explicitly says that -fno-trapping-math affects only
> division by zero, overflow, underflow, inexact result and invalid
> operation. So, do we need to check for flag_trapping_math in
> ix86_builtin_vectorized_function for other builtins involving ROUND
> insn? Also, perhaps floor/ceil/trunc can be reimplemented using
> standard named expander instead.

I'd say we should follow what we do in the scalar code because if users
don't complain about that, it should be fine for vectorized code too.
And yes, reimplementing floor/ceil/trunc is something I'll try to do
incrementally.  While that is less important than it was for rint, which
didn't have the 512-bit cases implemented, it will still result in fewer
decls that need to be created.

> Your patch with stuff removed from ix86_builtin_vectorized_function is OK.

Thanks, here is what I've committed after another bootstrap/regtest:

2019-12-29  Jakub Jelinek  <ja...@redhat.com>

        PR target/93078
        * config/i386/i386-builtins.c (ix86_builtin_vectorized_function):
        Remove CASE_CFN_RINT handling.
        * config/i386/i386-builtin.def (IX86_BUILTIN_RINTPD,
        IX86_BUILTIN_RINTPS, IX86_BUILTIN_RINTPD256, IX86_BUILTIN_RINTPS256):
        Remove.
        * config/i386/sse.md (nearbyint<mode>2, rint<mode>2): New expanders
        with VF iterator.

        * gcc.target/i386/sse4_1-pr93078.c: New test.
        * gcc.target/i386/avx-pr93078.c: New test.
        * gcc.target/i386/avx512f-pr93078.c: New test.

--- gcc/config/i386/i386-builtins.c.jj  2019-12-09 15:02:31.077273254 +0100
+++ gcc/config/i386/i386-builtins.c     2019-12-28 12:11:05.509289523 +0100
@@ -1661,27 +1661,6 @@ ix86_builtin_vectorized_function (unsign
        }
       break;
 
-    CASE_CFN_RINT:
-      /* The round insn does not trap on denormals.  */
-      if (flag_trapping_math || !TARGET_SSE4_1)
-       break;
-
-      if (out_mode == DFmode && in_mode == DFmode)
-       {
-         if (out_n == 2 && in_n == 2)
-           return ix86_get_builtin (IX86_BUILTIN_RINTPD);
-         else if (out_n == 4 && in_n == 4)
-           return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
-       }
-      if (out_mode == SFmode && in_mode == SFmode)
-       {
-         if (out_n == 4 && in_n == 4)
-           return ix86_get_builtin (IX86_BUILTIN_RINTPS);
-         else if (out_n == 8 && in_n == 8)
-           return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
-       }
-      break;
-
     CASE_CFN_FMA:
       if (out_mode == DFmode && in_mode == DFmode)
        {
--- gcc/config/i386/i386-builtin.def.jj 2019-12-09 15:02:31.110272755 +0100
+++ gcc/config/i386/i386-builtin.def    2019-12-28 12:07:10.352821780 +0100
@@ -913,7 +913,6 @@ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_F
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND)
-BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND)
 
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND)
@@ -924,7 +923,6 @@ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_F
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND)
-BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND)
 
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND)
@@ -1047,7 +1045,6 @@ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND)
-BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND)
 
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF)
@@ -1058,7 +1055,6 @@ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND)
-BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND)
 
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND)
--- gcc/config/i386/sse.md.jj   2019-12-21 00:12:54.000000000 +0100
+++ gcc/config/i386/sse.md      2019-12-27 18:16:48.146431083 +0100
@@ -17977,6 +17977,24 @@ (define_insn "ptesttf2"
    (set_attr "prefix" "orig,orig,vex")
    (set_attr "mode" "TI")])
 
+(define_expand "nearbyint<mode>2"
+  [(set (match_operand:VF 0 "register_operand")
+       (unspec:VF
+         [(match_operand:VF 1 "vector_operand")
+          (match_dup 2)]
+         UNSPEC_ROUND))]
+  "TARGET_SSE4_1"
+  "operands[2] = GEN_INT (ROUND_MXCSR | ROUND_NO_EXC);")
+
+(define_expand "rint<mode>2"
+  [(set (match_operand:VF 0 "register_operand")
+       (unspec:VF
+         [(match_operand:VF 1 "vector_operand")
+          (match_dup 2)]
+         UNSPEC_ROUND))]
+  "TARGET_SSE4_1"
+  "operands[2] = GEN_INT (ROUND_MXCSR);")
+
 (define_insn "<sse4_1>_round<ssemodesuffix><avxsizesuffix>"
   [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x")
        (unspec:VF_128_256
--- gcc/testsuite/gcc.target/i386/sse4_1-pr93078.c.jj   2019-12-27 18:26:05.436970472 +0100
+++ gcc/testsuite/gcc.target/i386/sse4_1-pr93078.c      2019-12-27 18:32:29.107147604 +0100
@@ -0,0 +1,42 @@
+/* PR target/93078 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msse4.1 -mno-sse4.2 -masm=att" } */
+/* { dg-final { scan-assembler "roundps\[ \t]\+\\\$12," } } */
+/* { dg-final { scan-assembler "roundps\[ \t]\+\\\$4," } } */
+/* { dg-final { scan-assembler "roundpd\[ \t]\+\\\$12," } } */
+/* { dg-final { scan-assembler "roundpd\[ \t]\+\\\$4," } } */
+
+float a[16], b[16];
+double c[8], d[8];
+
+void
+foo (void)
+{
+  int i;
+  for (i = 0; i < 16; ++i)
+    b[i] = __builtin_nearbyintf (a[i]);
+}
+
+void
+bar (void)
+{
+  int i;
+  for (i = 0; i < 16; ++i)
+    b[i] = __builtin_rintf (a[i]);
+}
+
+void
+baz (void)
+{
+  int i;
+  for (i = 0; i < 8; ++i)
+    d[i] = __builtin_nearbyint (c[i]);
+}
+
+void
+qux (void)
+{
+  int i;
+  for (i = 0; i < 8; ++i)
+    d[i] = __builtin_rint (c[i]);
+}
--- gcc/testsuite/gcc.target/i386/avx-pr93078.c.jj      2019-12-27 18:32:47.567867421 +0100
+++ gcc/testsuite/gcc.target/i386/avx-pr93078.c 2019-12-27 18:34:41.527137818 +0100
@@ -0,0 +1,9 @@
+/* PR target/93078 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -mavx -mno-avx2 -mprefer-vector-width=256 -masm=att" } */
+/* { dg-final { scan-assembler "vroundps\[ \t]\+\\\$12,\[^\n\r]*%y" } } */
+/* { dg-final { scan-assembler "vroundps\[ \t]\+\\\$4,\[^\n\r]*%y" } } */
+/* { dg-final { scan-assembler "vroundpd\[ \t]\+\\\$12,\[^\n\r]*%y" } } */
+/* { dg-final { scan-assembler "vroundpd\[ \t]\+\\\$4,\[^\n\r]*%y" } } */
+
+#include "sse4_1-pr93078.c"
--- gcc/testsuite/gcc.target/i386/avx512f-pr93078.c.jj  2019-12-27 18:34:56.632908546 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-pr93078.c     2019-12-27 18:35:38.650270831 +0100
@@ -0,0 +1,9 @@
+/* PR target/93078 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -mavx512f -mprefer-vector-width=512 -masm=att" } */
+/* { dg-final { scan-assembler "vrndscaleps\[ \t]\+\\\$12,\[^\n\r]*%z" } } */
+/* { dg-final { scan-assembler "vrndscaleps\[ \t]\+\\\$4,\[^\n\r]*%z" } } */
+/* { dg-final { scan-assembler "vrndscalepd\[ \t]\+\\\$12,\[^\n\r]*%z" } } */
+/* { dg-final { scan-assembler "vrndscalepd\[ \t]\+\\\$4,\[^\n\r]*%z" } } */
+
+#include "sse4_1-pr93078.c"


        Jakub

Reply via email to