On Mon, Jan 27, 2020 at 2:17 PM H.J. Lu <hjl.to...@gmail.com> wrote: > > On Mon, Jan 27, 2020 at 12:26 PM Uros Bizjak <ubiz...@gmail.com> wrote: > > > > On Mon, Jan 27, 2020 at 7:23 PM H.J. Lu <hjl.to...@gmail.com> wrote: > > > > > > movaps/movups is one byte shorter than movdaq/movdqu. But it isn't the > > > case for AVX nor AVX512. We should disable TARGET_SSE_TYPELESS_STORES > > > for TARGET_AVX. > > > > > > gcc/ > > > > > > PR target/91461 > > > * config/i386/i386.h (TARGET_SSE_TYPELESS_STORES): Disable for > > > TARGET_AVX. > > > * config/i386/i386.md (*movoi_internal_avx): Remove > > > TARGET_SSE_TYPELESS_STORES check. > > > > > > gcc/testsuite/ > > > > > > PR target/91461 > > > * gcc.target/i386/pr91461-1.c: New test. > > > * gcc.target/i386/pr91461-2.c: Likewise. > > > * gcc.target/i386/pr91461-3.c: Likewise. > > > * gcc.target/i386/pr91461-4.c: Likewise. > > > * gcc.target/i386/pr91461-5.c: Likewise. > > > --- > > > gcc/config/i386/i386.h | 4 +- > > > gcc/config/i386/i386.md | 4 +- > > > gcc/testsuite/gcc.target/i386/pr91461-1.c | 66 ++++++++++++++++++++ > > > gcc/testsuite/gcc.target/i386/pr91461-2.c | 19 ++++++ > > > gcc/testsuite/gcc.target/i386/pr91461-3.c | 76 +++++++++++++++++++++++ > > > gcc/testsuite/gcc.target/i386/pr91461-4.c | 21 +++++++ > > > gcc/testsuite/gcc.target/i386/pr91461-5.c | 17 +++++ > > > 7 files changed, 203 insertions(+), 4 deletions(-) > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-1.c > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-2.c > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-3.c > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-4.c > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-5.c > > > > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > > > index 943e9a5c783..c134b04c5c4 100644 > > > --- a/gcc/config/i386/i386.h > > > +++ b/gcc/config/i386/i386.h > > > @@ -516,8 +516,10 @@ extern unsigned char > > > 
ix86_tune_features[X86_TUNE_LAST]; > > > #define TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL \ > > > ix86_tune_features[X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL] > > > #define TARGET_SSE_SPLIT_REGS > > > ix86_tune_features[X86_TUNE_SSE_SPLIT_REGS] > > > +/* NB: movaps/movups is one byte shorter than movdaq/movdqu. But it > > > + isn't the case for AVX nor AVX512. */ > > > #define TARGET_SSE_TYPELESS_STORES \ > > > - ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES] > > > + (!TARGET_AVX && ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES]) > > > > This is wrong place to disable the feature. >
Here is the updated patch, on top of https://gcc.gnu.org/ml/gcc-patches/2020-01/msg01742.html, so that set_ix86_tune_features can access the per-function settings. OK for the master branch? Thanks. -- H.J.
From 61482a7d4dff07075f2534840040bafa420e9f36 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" <hjl.to...@gmail.com> Date: Mon, 27 Jan 2020 09:35:11 -0800 Subject: [PATCH] i386: Disable TARGET_SSE_TYPELESS_STORES for TARGET_AVX movaps/movups are one byte shorter than movdqa/movdqu. But this isn't the case for AVX or AVX512. We should disable TARGET_SSE_TYPELESS_STORES for TARGET_AVX and adjust the vmovups checks in assembly outputs. gcc/ PR target/91461 * config/i386/i386-options.c (set_ix86_tune_features): Disable TARGET_SSE_TYPELESS_STORES for TARGET_AVX. * config/i386/i386.md (*movoi_internal_avx): Remove TARGET_SSE_TYPELESS_STORES check. gcc/testsuite/ PR target/91461 * gcc.target/i386/avx256-unaligned-store-3.c: Don't check vmovups. * gcc.target/i386/pieces-memcpy-4.c: Likewise. * gcc.target/i386/pieces-memcpy-5.c: Likewise. * gcc.target/i386/pieces-memcpy-6.c: Likewise. * gcc.target/i386/pieces-strcpy-2.c: Likewise. * gcc.target/i386/pr90980-1.c: Likewise. * gcc.target/i386/pr87317-4.c: Check "\tvmovd\t" instead of "vmovd" to avoid matching "vmovdqu". * gcc.target/i386/pr87317-5.c: Likewise. * gcc.target/i386/pr87317-7.c: Likewise. * gcc.target/i386/pr91461-1.c: New test. * gcc.target/i386/pr91461-2.c: Likewise. * gcc.target/i386/pr91461-3.c: Likewise. * gcc.target/i386/pr91461-4.c: Likewise. * gcc.target/i386/pr91461-5.c: Likewise. * gcc.target/i386/pr91461-6.c: Likewise. 
--- gcc/config/i386/i386-options.c | 5 ++ gcc/config/i386/i386.md | 4 +- .../i386/avx256-unaligned-store-3.c | 4 +- .../gcc.target/i386/pieces-memcpy-4.c | 3 +- .../gcc.target/i386/pieces-memcpy-5.c | 3 +- .../gcc.target/i386/pieces-memcpy-6.c | 3 +- .../gcc.target/i386/pieces-strcpy-2.c | 2 +- gcc/testsuite/gcc.target/i386/pr87317-4.c | 2 +- gcc/testsuite/gcc.target/i386/pr87317-5.c | 2 +- gcc/testsuite/gcc.target/i386/pr87317-7.c | 2 +- gcc/testsuite/gcc.target/i386/pr90980-1.c | 2 +- gcc/testsuite/gcc.target/i386/pr91461-1.c | 66 ++++++++++++++++ gcc/testsuite/gcc.target/i386/pr91461-2.c | 19 +++++ gcc/testsuite/gcc.target/i386/pr91461-3.c | 76 +++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr91461-4.c | 21 +++++ gcc/testsuite/gcc.target/i386/pr91461-5.c | 17 +++++ gcc/testsuite/gcc.target/i386/pr91461-6.c | 62 +++++++++++++++ 17 files changed, 277 insertions(+), 16 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr91461-6.c diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index e0be4932534..0ca15c91319 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -1599,6 +1599,11 @@ set_ix86_tune_features (struct gcc_options *opts, = !!(initial_ix86_tune_features[i] & ix86_tune_mask); } + /* NB: movaps/movups is one byte shorter than movdaq/movdqu. But it + isn't the case for AVX nor AVX512. 
*/ + if (TARGET_AVX_P (opts->x_ix86_isa_flags)) + ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES] = 0; + if (dump) { fprintf (stderr, "List of x86 specific tuning parameter names:\n"); diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 6e9c9bd2fb6..bb096133880 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -1980,9 +1980,7 @@ (and (eq_attr "alternative" "1") (match_test "TARGET_AVX512VL")) (const_string "XI") - (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") - (and (eq_attr "alternative" "3") - (match_test "TARGET_SSE_TYPELESS_STORES"))) + (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") (const_string "V8SF") ] (const_string "OI")))]) diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c index a439a66ff34..918028df9ed 100644 --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c @@ -17,6 +17,6 @@ avx_test (void) d[i] = c[i] * 20.0; } -/* { dg-final { scan-assembler-not "vmovups.*movv4df_internal/3" } } */ -/* { dg-final { scan-assembler "vmovups.*movv2df_internal/3" } } */ +/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/3" } } */ +/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/3" } } */ /* { dg-final { scan-assembler "vextractf128" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c index 64e8921abe2..6f20203a146 100644 --- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c @@ -9,5 +9,4 @@ foo (void) __builtin_memcpy (dst, src, 18); } -/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */ -/* { dg-final { scan-assembler-times "vmovups\[ \\t\]+\[^\n\]*%xmm" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c index 3c464c32f8e..5a1c7b3d512 100644 --- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c @@ -9,5 +9,4 @@ foo (void) __builtin_memcpy (dst, src, 19); } -/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */ -/* { dg-final { scan-assembler-times "vmovups\[ \\t\]+\[^\n\]*%xmm" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c index cdb00e05bc1..5f99cc98c47 100644 --- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c @@ -9,5 +9,4 @@ foo (void) __builtin_memcpy (dst, src, 33); } -/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */ -/* { dg-final { scan-assembler-times "vmovups\[ \\t\]+\[^\n\]*%xmm" 2 } } */ +/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 4 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c b/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c index 74212559508..90446edb4f3 100644 --- a/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c +++ b/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c @@ -12,4 +12,4 @@ foo (char *s) } /* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\[^\n\]*%xmm" 4 } } */ -/* { dg-final { scan-assembler-times "vmovups\[ \\t\]+\[^\n\]*%xmm" 4 } } */ +/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 4 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr87317-4.c b/gcc/testsuite/gcc.target/i386/pr87317-4.c index 2d4f24a89e9..d802575f4c5 100644 --- a/gcc/testsuite/gcc.target/i386/pr87317-4.c +++ b/gcc/testsuite/gcc.target/i386/pr87317-4.c @@ -1,7 +1,7 @@ /* { dg-do compile } */ /* { dg-options "-O2 -march=haswell" } */ /* { dg-final { scan-assembler-times "vpmovzxbd" 1 } } */ -/* { dg-final { scan-assembler-not "vmovd" } } */ +/* { 
dg-final { scan-assembler-not "\tvmovd\t" } } */ #include <immintrin.h> diff --git a/gcc/testsuite/gcc.target/i386/pr87317-5.c b/gcc/testsuite/gcc.target/i386/pr87317-5.c index 96f82847e5d..42cf7dc0ffe 100644 --- a/gcc/testsuite/gcc.target/i386/pr87317-5.c +++ b/gcc/testsuite/gcc.target/i386/pr87317-5.c @@ -1,7 +1,7 @@ /* { dg-do compile } */ /* { dg-options "-O2 -march=haswell" } */ /* { dg-final { scan-assembler-times "vpmovzxwq" 1 } } */ -/* { dg-final { scan-assembler-not "vmovd" } } */ +/* { dg-final { scan-assembler-not "\tvmovd\t" } } */ #include <immintrin.h> diff --git a/gcc/testsuite/gcc.target/i386/pr87317-7.c b/gcc/testsuite/gcc.target/i386/pr87317-7.c index 2c043d9eb26..c76af7efd5f 100644 --- a/gcc/testsuite/gcc.target/i386/pr87317-7.c +++ b/gcc/testsuite/gcc.target/i386/pr87317-7.c @@ -1,7 +1,7 @@ /* { dg-do compile { target { ! ia32 } } } */ /* { dg-options "-O2 -march=haswell" } */ /* { dg-final { scan-assembler-times "vpmovzxbd" 1 } } */ -/* { dg-final { scan-assembler-not "vmovd" } } */ +/* { dg-final { scan-assembler-not "\tvmovd\t" } } */ #include <immintrin.h> diff --git a/gcc/testsuite/gcc.target/i386/pr90980-1.c b/gcc/testsuite/gcc.target/i386/pr90980-1.c index 72a30dc8da2..885518984c5 100644 --- a/gcc/testsuite/gcc.target/i386/pr90980-1.c +++ b/gcc/testsuite/gcc.target/i386/pr90980-1.c @@ -1,6 +1,6 @@ /* { dg-do compile } */ /* { dg-options "-march=skylake-avx512 -O2" } */ -/* { dg-final { scan-assembler-times "(?:vmovups|vmovdqu)\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vmovdqu\[2346\]*\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)(?:\n|\[ \\t\]+#)" 2 } } */ #include <immintrin.h> diff --git a/gcc/testsuite/gcc.target/i386/pr91461-1.c b/gcc/testsuite/gcc.target/i386/pr91461-1.c new file mode 100644 index 00000000000..0c94b8e2b76 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr91461-1.c @@ -0,0 +1,66 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final 
{ scan-assembler "\tvmovdqa\t" } } */ +/* { dg-final { scan-assembler "\tvmovdqu\t" } } */ +/* { dg-final { scan-assembler "\tvmovapd\t" } } */ +/* { dg-final { scan-assembler "\tvmovupd\t" } } */ +/* { dg-final { scan-assembler-not "\tvmovaps\t" } } */ +/* { dg-final { scan-assembler-not "\tvmovups\t" } } */ + +#include <immintrin.h> + +void +foo1 (__m128i *p, __m128i x) +{ + *p = x; +} + +void +foo2 (__m128d *p, __m128d x) +{ + *p = x; +} + +void +foo3 (__float128 *p, __float128 x) +{ + *p = x; +} + +void +foo4 (__m128i_u *p, __m128i x) +{ + *p = x; +} + +void +foo5 (__m128d_u *p, __m128d x) +{ + *p = x; +} + +typedef __float128 __float128_u __attribute__ ((__aligned__ (1))); + +void +foo6 (__float128_u *p, __float128 x) +{ + *p = x; +} + +#ifdef __x86_64__ +typedef __int128 __int128_u __attribute__ ((__aligned__ (1))); + +extern __int128 int128; + +void +foo7 (__int128 *p) +{ + *p = int128; +} + +void +foo8 (__int128_u *p) +{ + *p = int128; +} +#endif diff --git a/gcc/testsuite/gcc.target/i386/pr91461-2.c b/gcc/testsuite/gcc.target/i386/pr91461-2.c new file mode 100644 index 00000000000..921cfaf9780 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr91461-2.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler "\tvmovdqa\t" } } */ +/* { dg-final { scan-assembler "\tvmovapd\t" } } */ +/* { dg-final { scan-assembler-not "\tvmovaps\t" } } */ + +#include <immintrin.h> + +void +foo1 (__m256i *p, __m256i x) +{ + *p = x; +} + +void +foo2 (__m256d *p, __m256d x) +{ + *p = x; +} diff --git a/gcc/testsuite/gcc.target/i386/pr91461-3.c b/gcc/testsuite/gcc.target/i386/pr91461-3.c new file mode 100644 index 00000000000..c67a48063bf --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr91461-3.c @@ -0,0 +1,76 @@ +/* { dg-do compile { target { ! 
ia32 } } } */ +/* { dg-options "-O2 -mavx512f -mavx512vl" } */ +/* { dg-final { scan-assembler-not "\tvmovaps\t" } } */ +/* { dg-final { scan-assembler-not "\tvmovups\t" } } */ + +#include <immintrin.h> + +void +foo1 (__m128i *p, __m128i a) +{ + register __m128i x __asm ("xmm16") = a; + asm volatile ("" : "+v" (x)); + *p = x; +} + +void +foo2 (__m128d *p, __m128d a) +{ + register __m128d x __asm ("xmm16") = a; + asm volatile ("" : "+v" (x)); + *p = x; +} + +void +foo3 (__float128 *p, __float128 a) +{ + register __float128 x __asm ("xmm16") = a; + asm volatile ("" : "+v" (x)); + *p = x; +} + +void +foo4 (__m128i_u *p, __m128i a) +{ + register __m128i x __asm ("xmm16") = a; + asm volatile ("" : "+v" (x)); + *p = x; +} + +void +foo5 (__m128d_u *p, __m128d a) +{ + register __m128d x __asm ("xmm16") = a; + asm volatile ("" : "+v" (x)); + *p = x; +} + +typedef __float128 __float128_u __attribute__ ((__aligned__ (1))); + +void +foo6 (__float128_u *p, __float128 a) +{ + register __float128 x __asm ("xmm16") = a; + asm volatile ("" : "+v" (x)); + *p = x; +} + +typedef __int128 __int128_u __attribute__ ((__aligned__ (1))); + +extern __int128 int128; + +void +foo7 (__int128 *p) +{ + register __int128 x __asm ("xmm16") = int128; + asm volatile ("" : "+v" (x)); + *p = x; +} + +void +foo8 (__int128_u *p) +{ + register __int128 x __asm ("xmm16") = int128; + asm volatile ("" : "+v" (x)); + *p = x; +} diff --git a/gcc/testsuite/gcc.target/i386/pr91461-4.c b/gcc/testsuite/gcc.target/i386/pr91461-4.c new file mode 100644 index 00000000000..69df590de3a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr91461-4.c @@ -0,0 +1,21 @@ +/* { dg-do compile { target { ! 
ia32 } } } */ +/* { dg-options "-O2 -mavx512f -mavx512vl" } */ +/* { dg-final { scan-assembler-not "\tvmovaps\t" } } */ + +#include <immintrin.h> + +void +foo1 (__m256i *p, __m256i a) +{ + register __m256i x __asm ("xmm16") = a; + asm volatile ("" : "+v" (x)); + *p = x; +} + +void +foo2 (__m256d *p, __m256d a) +{ + register __m256d x __asm ("xmm16") = a; + asm volatile ("" : "+v" (x)); + *p = x; +} diff --git a/gcc/testsuite/gcc.target/i386/pr91461-5.c b/gcc/testsuite/gcc.target/i386/pr91461-5.c new file mode 100644 index 00000000000..974263042f3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr91461-5.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512f" } */ +/* { dg-final { scan-assembler-not "\tvmovaps\t" } } */ + +#include <immintrin.h> + +void +foo1 (__m512i *p, __m512i x) +{ + *p = x; +} + +void +foo2 (__m512d *p, __m512d x) +{ + *p = x; +} diff --git a/gcc/testsuite/gcc.target/i386/pr91461-6.c b/gcc/testsuite/gcc.target/i386/pr91461-6.c new file mode 100644 index 00000000000..c1524e339d5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr91461-6.c @@ -0,0 +1,62 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx -mtune-ctrl=sse_typeless_stores" } */ +/* { dg-final { scan-assembler "\tvmovaps\t" } } */ +/* { dg-final { scan-assembler "\tvmovups\t" } } */ + +#include <immintrin.h> + +void +foo1 (__m128i *p, __m128i x) +{ + *p = x; +} + +void +foo2 (__m128d *p, __m128d x) +{ + *p = x; +} + +void +foo3 (__float128 *p, __float128 x) +{ + *p = x; +} + +void +foo4 (__m128i_u *p, __m128i x) +{ + *p = x; +} + +void +foo5 (__m128d_u *p, __m128d x) +{ + *p = x; +} + +typedef __float128 __float128_u __attribute__ ((__aligned__ (1))); + +void +foo6 (__float128_u *p, __float128 x) +{ + *p = x; +} + +#ifdef __x86_64__ +typedef __int128 __int128_u __attribute__ ((__aligned__ (1))); + +extern __int128 int128; + +void +foo7 (__int128 *p) +{ + *p = int128; +} + +void +foo8 (__int128_u *p) +{ + *p = int128; +} +#endif -- 2.24.1