On Sat, Dec 28, 2019 at 02:20:09PM +0100, Uros Bizjak wrote: > > The conditions are: > > (define_expand "nearbyint<mode>2" > > [(use (match_operand:MODEF 0 "register_operand")) > > (use (match_operand:MODEF 1 "nonimmediate_operand"))] > > "(TARGET_USE_FANCY_MATH_387 > > && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH) > > || TARGET_MIX_SSE_I387) > > && !flag_trapping_math) > > || (TARGET_SSE4_1 && TARGET_SSE_MATH)" > > and: > > (define_expand "rint<mode>2" > > [(use (match_operand:MODEF 0 "register_operand")) > > (use (match_operand:MODEF 1 "nonimmediate_operand"))] > > "TARGET_USE_FANCY_MATH_387 > > || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)" > > Only nearbyint tests flag_trapping_math, and only for the pre-sse4.1 case, > > This is correct, since x87 frndint always generates precision > (inexact) exceptions, but nearbyint should not generate any. > > On a related note, trap on denormal is not an IEEE exception, and the > documentation explicitly says that -fno-trapping-math affects only > division by zero, overflow, underflow, inexact result and invalid > operation. So, do we need to check for flag_trapping_math in > ix86_builtin_vectorized_function for other builtins involving the ROUND > insn? Also, perhaps floor/ceil/trunc can be reimplemented using > a standard named expander instead.
I'd say we should follow what we do in the scalar code because if users don't complain about that, it should be fine for vectorized code too. And yes, reimplementing floor/ceil/trunc is something I'll try to do incrementally, while it will be less important than rint which didn't have the 512-bit cases implemented, it will still result in fewer decls that need to be created. > Your patch with stuff removed from ix86_builtin_vectorized_function is OK. Thanks, here is what I've committed after another bootstrap/regtest: 2019-12-29 Jakub Jelinek <ja...@redhat.com> PR target/93078 * config/i386/i386-builtins.c (ix86_builtin_vectorized_function): Remove CASE_CFN_RINT handling. * config/i386/i386-builtin.def (IX86_BUILTIN_RINTPD, IX86_BUILTIN_RINTPS, IX86_BUILTIN_RINTPD256, IX86_BUILTIN_RINTPS256): Remove. * config/i386/sse.md (nearbyint<mode>2, rint<mode>2): New expanders with VF iterator. * gcc.target/i386/sse4_1-pr93078.c: New test. * gcc.target/i386/avx-pr93078.c: New test. * gcc.target/i386/avx512f-pr93078.c: New test. --- gcc/config/i386/i386-builtins.c.jj 2019-12-09 15:02:31.077273254 +0100 +++ gcc/config/i386/i386-builtins.c 2019-12-28 12:11:05.509289523 +0100 @@ -1661,27 +1661,6 @@ ix86_builtin_vectorized_function (unsign } break; - CASE_CFN_RINT: - /* The round insn does not trap on denormals. 
*/ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == DFmode && in_mode == DFmode) - { - if (out_n == 2 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_RINTPD); - else if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_RINTPD256); - } - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_RINTPS); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_RINTPS256); - } - break; - CASE_CFN_FMA: if (out_mode == DFmode && in_mode == DFmode) { --- gcc/config/i386/i386-builtin.def.jj 2019-12-09 15:02:31.110272755 +0100 +++ gcc/config/i386/i386-builtin.def 2019-12-28 12:07:10.352821780 +0100 @@ -913,7 +913,6 @@ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_F BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND) -BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND) @@ -924,7 +923,6 @@ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_F BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, 
"__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND) -BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND) @@ -1047,7 +1045,6 @@ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND) -BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) 
V8SI_FTYPE_V4DF_V4DF) @@ -1058,7 +1055,6 @@ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND) -BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND) --- gcc/config/i386/sse.md.jj 2019-12-21 00:12:54.000000000 +0100 +++ gcc/config/i386/sse.md 2019-12-27 18:16:48.146431083 +0100 @@ -17977,6 +17977,24 @@ (define_insn "ptesttf2" (set_attr "prefix" "orig,orig,vex") (set_attr "mode" "TI")]) +(define_expand "nearbyint<mode>2" + [(set (match_operand:VF 0 "register_operand") + (unspec:VF + [(match_operand:VF 1 "vector_operand") + (match_dup 2)] + UNSPEC_ROUND))] + "TARGET_SSE4_1" + "operands[2] = GEN_INT (ROUND_MXCSR | ROUND_NO_EXC);") + +(define_expand "rint<mode>2" + [(set (match_operand:VF 0 "register_operand") + (unspec:VF + [(match_operand:VF 1 "vector_operand") + (match_dup 2)] + UNSPEC_ROUND))] + "TARGET_SSE4_1" + "operands[2] = GEN_INT (ROUND_MXCSR);") + (define_insn "<sse4_1>_round<ssemodesuffix><avxsizesuffix>" [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") (unspec:VF_128_256 --- 
gcc/testsuite/gcc.target/i386/sse4_1-pr93078.c.jj 2019-12-27 18:26:05.436970472 +0100 +++ gcc/testsuite/gcc.target/i386/sse4_1-pr93078.c 2019-12-27 18:32:29.107147604 +0100 @@ -0,0 +1,42 @@ +/* PR target/93078 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msse4.1 -mno-sse4.2 -masm=att" } */ +/* { dg-final { scan-assembler "roundps\[ \t]\+\\\$12," } } */ +/* { dg-final { scan-assembler "roundps\[ \t]\+\\\$4," } } */ +/* { dg-final { scan-assembler "roundpd\[ \t]\+\\\$12," } } */ +/* { dg-final { scan-assembler "roundpd\[ \t]\+\\\$4," } } */ + +float a[16], b[16]; +double c[8], d[8]; + +void +foo (void) +{ + int i; + for (i = 0; i < 16; ++i) + b[i] = __builtin_nearbyintf (a[i]); +} + +void +bar (void) +{ + int i; + for (i = 0; i < 16; ++i) + b[i] = __builtin_rintf (a[i]); +} + +void +baz (void) +{ + int i; + for (i = 0; i < 8; ++i) + d[i] = __builtin_nearbyint (c[i]); +} + +void +qux (void) +{ + int i; + for (i = 0; i < 8; ++i) + d[i] = __builtin_rint (c[i]); +} --- gcc/testsuite/gcc.target/i386/avx-pr93078.c.jj 2019-12-27 18:32:47.567867421 +0100 +++ gcc/testsuite/gcc.target/i386/avx-pr93078.c 2019-12-27 18:34:41.527137818 +0100 @@ -0,0 +1,9 @@ +/* PR target/93078 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -mavx -mno-avx2 -mprefer-vector-width=256 -masm=att" } */ +/* { dg-final { scan-assembler "vroundps\[ \t]\+\\\$12,\[^\n\r]*%y" } } */ +/* { dg-final { scan-assembler "vroundps\[ \t]\+\\\$4,\[^\n\r]*%y" } } */ +/* { dg-final { scan-assembler "vroundpd\[ \t]\+\\\$12,\[^\n\r]*%y" } } */ +/* { dg-final { scan-assembler "vroundpd\[ \t]\+\\\$4,\[^\n\r]*%y" } } */ + +#include "sse4_1-pr93078.c" --- gcc/testsuite/gcc.target/i386/avx512f-pr93078.c.jj 2019-12-27 18:34:56.632908546 +0100 +++ gcc/testsuite/gcc.target/i386/avx512f-pr93078.c 2019-12-27 18:35:38.650270831 +0100 @@ -0,0 +1,9 @@ +/* PR target/93078 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -mavx512f -mprefer-vector-width=512 -masm=att" } */ +/* 
{ dg-final { scan-assembler "vrndscaleps\[ \t]\+\\\$12,\[^\n\r]*%z" } } */ +/* { dg-final { scan-assembler "vrndscaleps\[ \t]\+\\\$4,\[^\n\r]*%z" } } */ +/* { dg-final { scan-assembler "vrndscalepd\[ \t]\+\\\$12,\[^\n\r]*%z" } } */ +/* { dg-final { scan-assembler "vrndscalepd\[ \t]\+\\\$4,\[^\n\r]*%z" } } */ + +#include "sse4_1-pr93078.c" Jakub