On Tue, Dec 18, 2018 at 3:57 PM Jakub Jelinek <ja...@redhat.com> wrote: > > Hi! > > We weren't vectorizing with unconditional or masked scatters when > -mprefer-vector-width={128,256}. While for DI index and DF/DI > stores or SI index and SF/SI stores we even have the builtins, > for the remaining combinations I had to add a few alt builtins (with spaces > in names as in other cases). I've also renamed the other alt builtin > visible names so that they match the IX86_BUILTIN_* names; they were pretty > confusing before. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2018-12-18 Jakub Jelinek <ja...@redhat.com> > > PR target/88464 > * config/i386/i386-builtin-types.def > (VOID_FTYPE_PDOUBLE_QI_V8SI_V4DF_INT, > VOID_FTYPE_PFLOAT_QI_V4DI_V8SF_INT, > VOID_FTYPE_PLONGLONG_QI_V8SI_V4DI_INT, > VOID_FTYPE_PINT_QI_V4DI_V8SI_INT, > VOID_FTYPE_PDOUBLE_QI_V4SI_V2DF_INT, > VOID_FTYPE_PFLOAT_QI_V2DI_V4SF_INT, > VOID_FTYPE_PLONGLONG_QI_V4SI_V2DI_INT, > VOID_FTYPE_PINT_QI_V2DI_V4SI_INT): New builtin types. > * config/i386/i386.c (enum ix86_builtins): Add > IX86_BUILTIN_SCATTERALTSIV4DF, IX86_BUILTIN_SCATTERALTDIV8SF, > IX86_BUILTIN_SCATTERALTSIV4DI, IX86_BUILTIN_SCATTERALTDIV8SI, > IX86_BUILTIN_SCATTERALTSIV2DF, IX86_BUILTIN_SCATTERALTDIV4SF, > IX86_BUILTIN_SCATTERALTSIV2DI and IX86_BUILTIN_SCATTERALTDIV4SI. > (ix86_init_mmx_sse_builtins): Fix up names of IX86_BUILTIN_GATHERALT*, > IX86_BUILTIN_GATHER3ALT* and IX86_BUILTIN_SCATTERALT* builtins to > match the IX86_BUILTIN codes. Build IX86_BUILTIN_SCATTERALTSIV4DF, > IX86_BUILTIN_SCATTERALTDIV8SF, IX86_BUILTIN_SCATTERALTSIV4DI, > IX86_BUILTIN_SCATTERALTDIV8SI, IX86_BUILTIN_SCATTERALTSIV2DF, > IX86_BUILTIN_SCATTERALTDIV4SF, IX86_BUILTIN_SCATTERALTSIV2DI and > IX86_BUILTIN_SCATTERALTDIV4SI decls. > (ix86_vectorize_builtin_scatter): Expand those new builtins. > > * gcc.target/i386/avx512f-pr88464-5.c: New test. > * gcc.target/i386/avx512f-pr88464-6.c: New test. 
> * gcc.target/i386/avx512f-pr88464-7.c: New test. > * gcc.target/i386/avx512f-pr88464-8.c: New test. > * gcc.target/i386/avx512vl-pr88464-5.c: New test. > * gcc.target/i386/avx512vl-pr88464-6.c: New test. > * gcc.target/i386/avx512vl-pr88464-7.c: New test. > * gcc.target/i386/avx512vl-pr88464-8.c: New test. > * gcc.target/i386/avx512vl-pr88464-9.c: New test. > * gcc.target/i386/avx512vl-pr88464-10.c: New test. > * gcc.target/i386/avx512vl-pr88464-11.c: New test. > * gcc.target/i386/avx512vl-pr88464-12.c: New test. > * gcc.target/i386/avx512vl-pr88464-13.c: New test. > * gcc.target/i386/avx512vl-pr88464-14.c: New test. > * gcc.target/i386/avx512vl-pr88464-15.c: New test. > * gcc.target/i386/avx512vl-pr88464-16.c: New test.
LGTM. Thanks, Uros. > --- gcc/config/i386/i386-builtin-types.def.jj 2018-11-08 18:07:10.298826353 > +0100 > +++ gcc/config/i386/i386-builtin-types.def 2018-12-18 11:22:07.965503704 > +0100 > @@ -1068,7 +1068,14 @@ DEF_FUNCTION_TYPE (VOID, PFLOAT, HI, V8D > DEF_FUNCTION_TYPE (VOID, PDOUBLE, QI, V16SI, V8DF, INT) > DEF_FUNCTION_TYPE (VOID, PINT, HI, V8DI, V16SI, INT) > DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V16SI, V8DI, INT) > - > +DEF_FUNCTION_TYPE (VOID, PFLOAT, QI, V4DI, V8SF, INT) > +DEF_FUNCTION_TYPE (VOID, PDOUBLE, QI, V8SI, V4DF, INT) > +DEF_FUNCTION_TYPE (VOID, PINT, QI, V4DI, V8SI, INT) > +DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V8SI, V4DI, INT) > +DEF_FUNCTION_TYPE (VOID, PFLOAT, QI, V2DI, V4SF, INT) > +DEF_FUNCTION_TYPE (VOID, PDOUBLE, QI, V4SI, V2DF, INT) > +DEF_FUNCTION_TYPE (VOID, PINT, QI, V2DI, V4SI, INT) > +DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V4SI, V2DI, INT) > > DEF_FUNCTION_TYPE (V16SF, V16SF, PCVOID, V16SI, HI, INT) > DEF_FUNCTION_TYPE (V8DF, V8DF, PCVOID, V8SI, QI, INT) > --- gcc/config/i386/i386.c.jj 2018-12-18 10:23:58.751164982 +0100 > +++ gcc/config/i386/i386.c 2018-12-18 11:58:18.813311983 +0100 > @@ -30072,6 +30072,14 @@ enum ix86_builtins > IX86_BUILTIN_SCATTERALTDIV16SF, > IX86_BUILTIN_SCATTERALTSIV8DI, > IX86_BUILTIN_SCATTERALTDIV16SI, > + IX86_BUILTIN_SCATTERALTSIV4DF, > + IX86_BUILTIN_SCATTERALTDIV8SF, > + IX86_BUILTIN_SCATTERALTSIV4DI, > + IX86_BUILTIN_SCATTERALTDIV8SI, > + IX86_BUILTIN_SCATTERALTSIV2DF, > + IX86_BUILTIN_SCATTERALTDIV4SF, > + IX86_BUILTIN_SCATTERALTSIV2DI, > + IX86_BUILTIN_SCATTERALTDIV4SI, > IX86_BUILTIN_SCATTERDIV16SF, > IX86_BUILTIN_SCATTERDIV16SI, > IX86_BUILTIN_SCATTERDIV8DF, > @@ -30879,7 +30887,7 @@ ix86_init_mmx_sse_builtins (void) > V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT, > IX86_BUILTIN_GATHERALTSIV4DF); > > - def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 > ", > + def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv8sf ", > 
V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT, > IX86_BUILTIN_GATHERALTDIV8SF); > > @@ -30887,7 +30895,7 @@ ix86_init_mmx_sse_builtins (void) > V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT, > IX86_BUILTIN_GATHERALTSIV4DI); > > - def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 > ", > + def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv8si ", > V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT, > IX86_BUILTIN_GATHERALTDIV8SI); > > @@ -30924,19 +30932,19 @@ ix86_init_mmx_sse_builtins (void) > V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT, > IX86_BUILTIN_GATHER3DIV8DI); > > - def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df > ", > + def_builtin_pure (OPTION_MASK_ISA_AVX512F, > "__builtin_ia32_gather3altsiv8df ", > V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT, > IX86_BUILTIN_GATHER3ALTSIV8DF); > > - def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf > ", > + def_builtin_pure (OPTION_MASK_ISA_AVX512F, > "__builtin_ia32_gather3altdiv16sf ", > V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT, > IX86_BUILTIN_GATHER3ALTDIV16SF); > > - def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di > ", > + def_builtin_pure (OPTION_MASK_ISA_AVX512F, > "__builtin_ia32_gather3altsiv8di ", > V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT, > IX86_BUILTIN_GATHER3ALTSIV8DI); > > - def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si > ", > + def_builtin_pure (OPTION_MASK_ISA_AVX512F, > "__builtin_ia32_gather3altdiv16si ", > V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT, > IX86_BUILTIN_GATHER3ALTDIV16SI); > > @@ -31116,11 +31124,12 @@ ix86_init_mmx_sse_builtins (void) > def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di", > VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT, > IX86_BUILTIN_SCATTERDIV2DI); > + > def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ", > VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT, > IX86_BUILTIN_SCATTERALTSIV8DF); > > - def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf 
", > + def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv16sf ", > VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT, > IX86_BUILTIN_SCATTERALTDIV16SF); > > @@ -31128,10 +31137,42 @@ ix86_init_mmx_sse_builtins (void) > VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT, > IX86_BUILTIN_SCATTERALTSIV8DI); > > - def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ", > + def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv16si ", > VOID_FTYPE_PINT_HI_V8DI_V16SI_INT, > IX86_BUILTIN_SCATTERALTDIV16SI); > > + def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatteraltsiv4df ", > + VOID_FTYPE_PDOUBLE_QI_V8SI_V4DF_INT, > + IX86_BUILTIN_SCATTERALTSIV4DF); > + > + def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatteraltdiv8sf ", > + VOID_FTYPE_PFLOAT_QI_V4DI_V8SF_INT, > + IX86_BUILTIN_SCATTERALTDIV8SF); > + > + def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatteraltsiv4di ", > + VOID_FTYPE_PLONGLONG_QI_V8SI_V4DI_INT, > + IX86_BUILTIN_SCATTERALTSIV4DI); > + > + def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatteraltdiv8si ", > + VOID_FTYPE_PINT_QI_V4DI_V8SI_INT, > + IX86_BUILTIN_SCATTERALTDIV8SI); > + > + def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatteraltsiv2df ", > + VOID_FTYPE_PDOUBLE_QI_V4SI_V2DF_INT, > + IX86_BUILTIN_SCATTERALTSIV2DF); > + > + def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatteraltdiv4sf ", > + VOID_FTYPE_PFLOAT_QI_V2DI_V4SF_INT, > + IX86_BUILTIN_SCATTERALTDIV4SF); > + > + def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatteraltsiv2di ", > + VOID_FTYPE_PLONGLONG_QI_V4SI_V2DI_INT, > + IX86_BUILTIN_SCATTERALTSIV2DI); > + > + def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatteraltdiv4si ", > + VOID_FTYPE_PINT_QI_V2DI_V4SI_INT, > + IX86_BUILTIN_SCATTERALTDIV4SI); > + > /* AVX512PF */ > def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd", > VOID_FTYPE_QI_V8SI_PCVOID_INT_INT, > @@ -37529,6 +37570,30 @@ rdseed_step: > case 
IX86_BUILTIN_SCATTERALTDIV16SI: > icode = CODE_FOR_avx512f_scatterdiv16si; > goto scatter_gen; > + case IX86_BUILTIN_SCATTERALTSIV4DF: > + icode = CODE_FOR_avx512vl_scattersiv4df; > + goto scatter_gen; > + case IX86_BUILTIN_SCATTERALTDIV8SF: > + icode = CODE_FOR_avx512vl_scatterdiv8sf; > + goto scatter_gen; > + case IX86_BUILTIN_SCATTERALTSIV4DI: > + icode = CODE_FOR_avx512vl_scattersiv4di; > + goto scatter_gen; > + case IX86_BUILTIN_SCATTERALTDIV8SI: > + icode = CODE_FOR_avx512vl_scatterdiv8si; > + goto scatter_gen; > + case IX86_BUILTIN_SCATTERALTSIV2DF: > + icode = CODE_FOR_avx512vl_scattersiv2df; > + goto scatter_gen; > + case IX86_BUILTIN_SCATTERALTDIV4SF: > + icode = CODE_FOR_avx512vl_scatterdiv4sf; > + goto scatter_gen; > + case IX86_BUILTIN_SCATTERALTSIV2DI: > + icode = CODE_FOR_avx512vl_scattersiv2di; > + goto scatter_gen; > + case IX86_BUILTIN_SCATTERALTDIV4SI: > + icode = CODE_FOR_avx512vl_scatterdiv4si; > + goto scatter_gen; > case IX86_BUILTIN_GATHERPFDPS: > icode = CODE_FOR_avx512pf_gatherpfv16sisf; > goto vec_prefetch_gen; > @@ -37813,6 +37878,36 @@ rdseed_step: > emit_insn (gen (half, op3)); > op3 = half; > break; > + case IX86_BUILTIN_SCATTERALTSIV4DF: > + case IX86_BUILTIN_SCATTERALTSIV4DI: > + half = gen_reg_rtx (V4SImode); > + if (!nonimmediate_operand (op2, V8SImode)) > + op2 = copy_to_mode_reg (V8SImode, op2); > + emit_insn (gen_vec_extract_lo_v8si (half, op2)); > + op2 = half; > + break; > + case IX86_BUILTIN_SCATTERALTDIV8SF: > + case IX86_BUILTIN_SCATTERALTDIV8SI: > + half = gen_reg_rtx (mode3); > + if (mode3 == V4SFmode) > + gen = gen_vec_extract_lo_v8sf; > + else > + gen = gen_vec_extract_lo_v8si; > + if (!nonimmediate_operand (op3, GET_MODE (op3))) > + op3 = copy_to_mode_reg (GET_MODE (op3), op3); > + emit_insn (gen (half, op3)); > + op3 = half; > + break; > + case IX86_BUILTIN_SCATTERALTSIV2DF: > + case IX86_BUILTIN_SCATTERALTSIV2DI: > + if (!nonimmediate_operand (op2, V4SImode)) > + op2 = copy_to_mode_reg (V4SImode, op2); > + break; > 
+ case IX86_BUILTIN_SCATTERALTDIV4SF: > + case IX86_BUILTIN_SCATTERALTDIV4SI: > + if (!nonimmediate_operand (op3, GET_MODE (op3))) > + op3 = copy_to_mode_reg (GET_MODE (op3), op3); > + break; > default: > break; > } > @@ -38928,6 +39023,54 @@ ix86_vectorize_builtin_scatter (const_tr > case E_V16SImode: > code = si ? IX86_BUILTIN_SCATTERSIV16SI : > IX86_BUILTIN_SCATTERALTDIV16SI; > break; > + case E_V4DFmode: > + if (TARGET_AVX512VL) > + code = si ? IX86_BUILTIN_SCATTERALTSIV4DF : > IX86_BUILTIN_SCATTERDIV4DF; > + else > + return NULL_TREE; > + break; > + case E_V4DImode: > + if (TARGET_AVX512VL) > + code = si ? IX86_BUILTIN_SCATTERALTSIV4DI : > IX86_BUILTIN_SCATTERDIV4DI; > + else > + return NULL_TREE; > + break; > + case E_V8SFmode: > + if (TARGET_AVX512VL) > + code = si ? IX86_BUILTIN_SCATTERSIV8SF : > IX86_BUILTIN_SCATTERALTDIV8SF; > + else > + return NULL_TREE; > + break; > + case E_V8SImode: > + if (TARGET_AVX512VL) > + code = si ? IX86_BUILTIN_SCATTERSIV8SI : > IX86_BUILTIN_SCATTERALTDIV8SI; > + else > + return NULL_TREE; > + break; > + case E_V2DFmode: > + if (TARGET_AVX512VL) > + code = si ? IX86_BUILTIN_SCATTERALTSIV2DF : > IX86_BUILTIN_SCATTERDIV2DF; > + else > + return NULL_TREE; > + break; > + case E_V2DImode: > + if (TARGET_AVX512VL) > + code = si ? IX86_BUILTIN_SCATTERALTSIV2DI : > IX86_BUILTIN_SCATTERDIV2DI; > + else > + return NULL_TREE; > + break; > + case E_V4SFmode: > + if (TARGET_AVX512VL) > + code = si ? IX86_BUILTIN_SCATTERSIV4SF : > IX86_BUILTIN_SCATTERALTDIV4SF; > + else > + return NULL_TREE; > + break; > + case E_V4SImode: > + if (TARGET_AVX512VL) > + code = si ? 
IX86_BUILTIN_SCATTERSIV4SI : > IX86_BUILTIN_SCATTERALTDIV4SI; > + else > + return NULL_TREE; > + break; > default: > return NULL_TREE; > } > --- gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c.jj 2018-12-18 > 11:40:14.777856788 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c 2018-12-18 > 11:40:51.162266018 +0100 > @@ -0,0 +1,45 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 > -mtune=skylake-avx512 -fdump-tree-vect-details" } */ > +/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" > 4 "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 > "vect" } } */ > + > +__attribute__((noipa)) void > +f1 (long long * __restrict__ a, const long long * __restrict__ b, const int > * __restrict__ c, int n) > +{ > + int i; > +#pragma GCC ivdep > + for (i = 0; i < n; ++i) > + if (a[i] > 10) > + a[i] = b[c[i]]; > +} > + > +__attribute__((noipa)) void > +f2 (long long * __restrict__ a, const long long * __restrict__ b, const long > * __restrict__ c, int n) > +{ > + int i; > +#pragma GCC ivdep > + for (i = 0; i < n; ++i) > + if (a[i] > 10) > + a[i] = b[c[i]]; > +} > + > +__attribute__((noipa)) void > +f3 (int * __restrict__ a, const int * __restrict__ b, const int * > __restrict__ c, int n) > +{ > + int i; > +#pragma GCC ivdep > + for (i = 0; i < n; ++i) > + if (a[i] > 10) > + a[i] = b[c[i]]; > +} > + > +__attribute__((noipa)) void > +f4 (int * __restrict__ a, const int * __restrict__ b, const long * > __restrict__ c, int n) > +{ > + int i; > +#pragma GCC ivdep > + for (i = 0; i < n; ++i) > + if (a[i] > 10) > + a[i] = b[c[i]]; > +} > --- gcc/testsuite/gcc.target/i386/avx512f-pr88464-6.c.jj 2018-12-18 > 11:41:02.175087205 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-6.c 2018-12-18 > 11:41:58.410174126 +0100 > @@ -0,0 +1,61 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do run { target { avx512f } } } */ > +/* { 
dg-options "-O3 -mavx512f -mprefer-vector-width=512 > -mtune=skylake-avx512" } */ > + > +#include "avx512f-check.h" > + > +#include "avx512f-pr88464-5.c" > + > +static void > +avx512f_test (void) > +{ > + long long a[1024], b[1024]; > + int c[1024], f[1024]; > + int d[1024]; > + long e[1024]; > + int i; > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + a[i] = (i % 3) != 0 ? 15 : -5; > + b[i] = 2 * i; > + d[i] = (i % 3) ? 1023 - i : __INT_MAX__; > + } > + f1 (a, b, d, 1024); > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2 : -5)) > + abort (); > + a[i] = (i % 3) != 1 ? 15 : -5; > + b[i] = 3 * i; > + e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__; > + } > + f2 (a, b, e, 1024); > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3 : -5)) > + abort (); > + c[i] = (i % 3) != 2 ? 15 : -5; > + d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__; > + f[i] = 4 * i; > + } > + f3 (c, f, d, 1024); > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4: -5)) > + abort (); > + c[i] = (i % 3) != 0 ? 15 : -5; > + e[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__; > + f[i] = 5 * i; > + } > + f4 (c, f, e, 1024); > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + if (c[i] != ((i % 3) != 0 ? 
(1023 - i) * 5 : -5)) > + abort (); > + } > +} > --- gcc/testsuite/gcc.target/i386/avx512f-pr88464-7.c.jj 2018-12-18 > 11:42:21.761794974 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-7.c 2018-12-18 > 11:42:52.647293482 +0100 > @@ -0,0 +1,45 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 > -mtune=skylake-avx512 -fdump-tree-vect-details" } */ > +/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" > 4 "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 > "vect" } } */ > + > +__attribute__((noipa)) void > +f1 (long long * __restrict__ a, const long long * __restrict__ b, const int > * __restrict__ c, int n) > +{ > + int i; > +#pragma GCC ivdep > + for (i = 0; i < n; ++i) > + if (b[i] > -2) > + a[c[i]] = b[i]; > +} > + > +__attribute__((noipa)) void > +f2 (long long * __restrict__ a, const long long * __restrict__ b, const long > * __restrict__ c, int n) > +{ > + int i; > +#pragma GCC ivdep > + for (i = 0; i < n; ++i) > + if (b[i] > -2) > + a[c[i]] = b[i]; > +} > + > +__attribute__((noipa)) void > +f3 (int * __restrict__ a, const int * __restrict__ b, const int * > __restrict__ c, int n) > +{ > + int i; > +#pragma GCC ivdep > + for (i = 0; i < n; ++i) > + if (b[i] > -2) > + a[c[i]] = b[i]; > +} > + > +__attribute__((noipa)) void > +f4 (int * __restrict__ a, const int * __restrict__ b, const long * > __restrict__ c, int n) > +{ > + int i; > +#pragma GCC ivdep > + for (i = 0; i < n; ++i) > + if (b[i] > -2) > + a[c[i]] = b[i]; > +} > --- gcc/testsuite/gcc.target/i386/avx512f-pr88464-8.c.jj 2018-12-18 > 11:43:06.660065958 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512f-pr88464-8.c 2018-12-18 > 11:47:02.049248182 +0100 > @@ -0,0 +1,61 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do run { target { avx512f } } } */ > +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 > -mtune=skylake-avx512" } */ > + > +#include 
"avx512f-check.h" > + > +#include "avx512f-pr88464-7.c" > + > +static void > +avx512f_test (void) > +{ > + long long a[1024], b[1024]; > + int c[1024], f[1024]; > + int d[1024]; > + long e[1024]; > + int i; > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + a[i] = -5; > + b[i] = (i % 3) != 0 ? 2 * i : -5; > + d[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__; > + } > + f1 (a, b, d, 1024); > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2 : -5)) > + abort (); > + a[i] = -5; > + b[i] = (i % 3) != 1 ? 3 * i : -5; > + e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__; > + } > + f2 (a, b, e, 1024); > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + if (a[i] != ((i % 3) != 2 ? (1023 - i) * 3 : -5)) > + abort (); > + c[i] = -5; > + d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__; > + f[i] = (i % 3) != 2 ? 4 * i : -5; > + } > + f3 (c, f, d, 1024); > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + if (c[i] != ((i % 3) != 1 ? (1023 - i) * 4 : -5)) > + abort (); > + c[i] = -5; > + e[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__; > + f[i] = (i % 3) != 0 ? 5 * i : -5; > + } > + f4 (c, f, e, 1024); > + for (i = 0; i < 1024; i++) > + { > + asm volatile ("" : "+g" (i)); > + if (c[i] != ((i % 3) != 0 ? 
(1023 - i) * 5 : -5)) > + abort (); > + } > +} > --- gcc/testsuite/gcc.target/i386/avx512vl-pr88464-5.c.jj 2018-12-18 > 11:39:36.636476084 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-5.c 2018-12-18 > 11:38:44.944315401 +0100 > @@ -0,0 +1,7 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 > -mtune=skylake-avx512 -fdump-tree-vect-details" } */ > +/* { dg-final { scan-tree-dump-times "loop vectorized using 32 byte vectors" > 4 "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 > "vect" } } */ > + > +#include "avx512f-pr88464-3.c" > --- gcc/testsuite/gcc.target/i386/avx512vl-pr88464-6.c.jj 2018-12-18 > 11:39:39.676426727 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-6.c 2018-12-18 > 11:38:59.605077356 +0100 > @@ -0,0 +1,20 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do run { target { avx512vl } } } */ > +/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 > -mtune=skylake-avx512" } */ > + > +#define AVX512VL > +#define AVX512F_LEN 512 > +#define AVX512F_LEN_HALF 256 > + > +#include "avx512f-pr88464-4.c" > + > +static void > +test_256 (void) > +{ > + avx512f_test (); > +} > + > +static void > +test_128 (void) > +{ > +} > --- gcc/testsuite/gcc.target/i386/avx512vl-pr88464-7.c.jj 2018-12-18 > 11:39:42.568379768 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-7.c 2018-12-18 > 11:39:14.828830172 +0100 > @@ -0,0 +1,7 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 > -fno-vect-cost-model -mtune=skylake-avx512 -fdump-tree-vect-details" } */ > +/* { dg-final { scan-tree-dump-times "loop vectorized using 16 byte vectors" > 4 "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 > "vect" } } */ > + > +#include "avx512f-pr88464-3.c" > --- gcc/testsuite/gcc.target/i386/avx512vl-pr88464-8.c.jj 2018-12-18 > 
11:39:45.771327762 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-8.c 2018-12-18 > 11:39:30.091582351 +0100 > @@ -0,0 +1,20 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do run { target { avx512vl } } } */ > +/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 > -fno-vect-cost-model -mtune=skylake-avx512" } */ > + > +#define AVX512VL > +#define AVX512F_LEN 512 > +#define AVX512F_LEN_HALF 256 > + > +#include "avx512f-pr88464-4.c" > + > +static void > +test_256 (void) > +{ > + avx512f_test (); > +} > + > +static void > +test_128 (void) > +{ > +} > --- gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c.jj 2018-12-18 > 12:00:35.984087782 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c 2018-12-18 > 12:01:05.523608218 +0100 > @@ -0,0 +1,7 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 > -mtune=skylake-avx512 -fdump-tree-vect-details" } */ > +/* { dg-final { scan-tree-dump-times "loop vectorized using 32 byte vectors" > 4 "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 > "vect" } } */ > + > +#include "avx512f-pr88464-5.c" > --- gcc/testsuite/gcc.target/i386/avx512vl-pr88464-10.c.jj 2018-12-18 > 12:00:35.991087668 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-10.c 2018-12-18 > 12:01:28.353237592 +0100 > @@ -0,0 +1,20 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do run { target { avx512vl } } } */ > +/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 > -mtune=skylake-avx512" } */ > + > +#define AVX512VL > +#define AVX512F_LEN 512 > +#define AVX512F_LEN_HALF 256 > + > +#include "avx512f-pr88464-6.c" > + > +static void > +test_256 (void) > +{ > + avx512f_test (); > +} > + > +static void > +test_128 (void) > +{ > +} > --- gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c.jj 2018-12-18 > 12:00:35.998087554 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c 2018-12-18 > 
12:01:35.238125814 +0100 > @@ -0,0 +1,7 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 > -mtune=skylake-avx512 -fdump-tree-vect-details" } */ > +/* { dg-final { scan-tree-dump-times "loop vectorized using 16 byte vectors" > 4 "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 > "vect" } } */ > + > +#include "avx512f-pr88464-5.c" > --- gcc/testsuite/gcc.target/i386/avx512vl-pr88464-12.c.jj 2018-12-18 > 12:00:36.005087441 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-12.c 2018-12-18 > 12:01:41.982016328 +0100 > @@ -0,0 +1,20 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do run { target { avx512vl } } } */ > +/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 > -mtune=skylake-avx512" } */ > + > +#define AVX512VL > +#define AVX512F_LEN 512 > +#define AVX512F_LEN_HALF 256 > + > +#include "avx512f-pr88464-6.c" > + > +static void > +test_256 (void) > +{ > + avx512f_test (); > +} > + > +static void > +test_128 (void) > +{ > +} > --- gcc/testsuite/gcc.target/i386/avx512vl-pr88464-13.c.jj 2018-12-18 > 12:00:36.012087327 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-13.c 2018-12-18 > 12:01:48.157916062 +0100 > @@ -0,0 +1,7 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 > -mtune=skylake-avx512 -fdump-tree-vect-details" } */ > +/* { dg-final { scan-tree-dump-times "loop vectorized using 32 byte vectors" > 4 "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 > "vect" } } */ > + > +#include "avx512f-pr88464-7.c" > --- gcc/testsuite/gcc.target/i386/avx512vl-pr88464-14.c.jj 2018-12-18 > 12:00:36.019087213 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-14.c 2018-12-18 > 12:01:55.403798428 +0100 > @@ -0,0 +1,20 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do run { target { avx512vl } } } */ > +/* { 
dg-options "-O3 -mavx512vl -mprefer-vector-width=256 > -mtune=skylake-avx512" } */ > + > +#define AVX512VL > +#define AVX512F_LEN 512 > +#define AVX512F_LEN_HALF 256 > + > +#include "avx512f-pr88464-8.c" > + > +static void > +test_256 (void) > +{ > + avx512f_test (); > +} > + > +static void > +test_128 (void) > +{ > +} > --- gcc/testsuite/gcc.target/i386/avx512vl-pr88464-15.c.jj 2018-12-18 > 12:00:36.026087099 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-15.c 2018-12-18 > 12:02:06.141624102 +0100 > @@ -0,0 +1,7 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 > -fno-vect-cost-model -mtune=skylake-avx512 -fdump-tree-vect-details" } */ > +/* { dg-final { scan-tree-dump-times "loop vectorized using 16 byte vectors" > 4 "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 > "vect" } } */ > + > +#include "avx512f-pr88464-7.c" > --- gcc/testsuite/gcc.target/i386/avx512vl-pr88464-16.c.jj 2018-12-18 > 12:00:36.033086986 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-16.c 2018-12-18 > 12:02:12.693517738 +0100 > @@ -0,0 +1,20 @@ > +/* PR tree-optimization/88464 */ > +/* { dg-do run { target { avx512vl } } } */ > +/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 > -fno-vect-cost-model -mtune=skylake-avx512" } */ > + > +#define AVX512VL > +#define AVX512F_LEN 512 > +#define AVX512F_LEN_HALF 256 > + > +#include "avx512f-pr88464-8.c" > + > +static void > +test_256 (void) > +{ > + avx512f_test (); > +} > + > +static void > +test_128 (void) > +{ > +} > > Jakub