Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680]

2021-05-12 Thread Hongtao Liu via Gcc-patches
There's a typo in the testcase; I've committed the patch as an obvious fix.

Fix typo in testcase.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-pr94680.c: Fix typo in testcase.

diff --git a/gcc/testsuite/gcc.target/i386/avx-pr94680.c
b/gcc/testsuite/gcc.target/i386/avx-pr94680.c
index a89e4967f64..cb5041b6af3 100644
--- a/gcc/testsuite/gcc.target/i386/avx-pr94680.c
+++ b/gcc/testsuite/gcc.target/i386/avx-pr94680.c
@@ -76,7 +76,7 @@ foo_v16hi_l (v16hi x)
 {
   return __builtin_shuffle ((v16hi)  { 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0 }, x,
-   (v16hi) { 16, 17, 18, 20, 21, 22, 23,
+   (v16hi) { 16, 17, 18, 19, 20, 21, 22, 23,
  15, 0, 13, 2, 11, 4, 9, 6 });
 }

-- 
BR,
Hongtao


Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680]

2021-05-12 Thread Hongtao Liu via Gcc-patches
On Wed, May 12, 2021 at 10:19 PM Jakub Jelinek  wrote:
>
> On Sun, Apr 25, 2021 at 02:57:08PM +0800, Hongtao Liu via Gcc-patches wrote:
> > gcc/ChangeLog:
> >
> >   PR target/94680
> >   * config/i386/sse.md (ssedoublevecmode): Add attribute for
> >   V64QI/V32HI/V16SI/V4DI.
> >   (ssehalfvecmode): Add attribute for V2DI/V2DF.
> >   (*vec_concatv4si_0): Extend to VI124_128.
> >   (*vec_concat<mode>_0): New pre-reload splitter.
> >   * config/i386/predicates.md (movq_parallel): New predicate.
> >
> > gcc/testsuite/ChangeLog:
> >
> >   PR target/94680
> >   * gcc.target/i386/avx-pr94680.c: New test.
> >   * gcc.target/i386/avx512f-pr94680.c: New test.
> >   * gcc.target/i386/sse2-pr94680.c: New test.
>
> Ok, thanks.  Sorry for the delay.

Thanks for the review.

>
> Jakub
>


-- 
BR,
Hongtao


Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680]

2021-05-12 Thread Jakub Jelinek via Gcc-patches
On Sun, Apr 25, 2021 at 02:57:08PM +0800, Hongtao Liu via Gcc-patches wrote:
> gcc/ChangeLog:
> 
>   PR target/94680
>   * config/i386/sse.md (ssedoublevecmode): Add attribute for
>   V64QI/V32HI/V16SI/V4DI.
>   (ssehalfvecmode): Add attribute for V2DI/V2DF.
>   (*vec_concatv4si_0): Extend to VI124_128.
>   (*vec_concat<mode>_0): New pre-reload splitter.
>   * config/i386/predicates.md (movq_parallel): New predicate.
> 
> gcc/testsuite/ChangeLog:
> 
>   PR target/94680
>   * gcc.target/i386/avx-pr94680.c: New test.
>   * gcc.target/i386/avx512f-pr94680.c: New test.
>   * gcc.target/i386/sse2-pr94680.c: New test.

Ok, thanks.  Sorry for the delay.

Jakub



Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680]

2021-05-12 Thread Hongtao Liu via Gcc-patches
ping.

On Sun, Apr 25, 2021 at 2:57 PM Hongtao Liu  wrote:
>
> On Fri, Apr 23, 2021 at 5:13 PM Jakub Jelinek  wrote:
> >
> > On Fri, Apr 23, 2021 at 12:53:58PM +0800, Hongtao Liu via Gcc-patches wrote:
> > > +  if (!CONST_INT_P (er))
> > > + return 0;
> > > +  ei = INTVAL (er);
> > > +  if (i < nelt2 && ei != i)
> > > + return 0;
> > > +  if (i >= nelt2
> > > +  && (ei < nelt || ei >= nelt<<1))
> >
> > Formatting:
> > 1) you have spaces followed by tab, remove the spaces; but,
> >   if (i >= nelt2 && (ei < nelt || ei >= nelt<<1))
> >    fits on one line, so keep it on one line.
> > 2) nelt<<1 should be nelt << 1 with spaces around the <<
> >
>
> Done.
>
> > > -(define_insn "*vec_concatv4si_0"
> > > -  [(set (match_operand:V4SI 0 "register_operand"   "=v,x")
> > > - (vec_concat:V4SI
> > > -   (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
> > > -   (match_operand:V2SI 2 "const0_operand"   " C,C")))]
> > > +(define_insn "*vec_concat<mode>_0"
> > > +  [(set (match_operand:VI124_128 0 "register_operand"   "=v,x")
> > > + (vec_concat:VI124_128
> > > +   (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y")
> > > +   (match_operand:<ssehalfvecmode> 2 "const0_operand"   " C,C")))]
> > >"TARGET_SSE2"
> > >"@
> > > %vmovq\t{%1, %0|%0, %1}
> > > @@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>"
> > > (set_attr "prefix" "maybe_evex")
> > > (set_attr "mode" "")])
> > >
> > > +(define_insn_and_split "*vec_concat<mode>_0"
> >
> > Would be better to use a different pattern name, *vec_concat<mode>_0
> > is already used in the above define_insn.
> > Use some additional suffix after _0?
> >
>
> Changed to "*vec_concat<mode>_0_1"
>
> > > +  return __builtin_shuffle (x, (v32qi) { 0, 0, 0, 0, 0, 0, 0, 0,
> > > +  0, 0, 0, 0, 0, 0, 0, 0,
> > > +  0, 0, 0, 0, 0, 0, 0, 0,
> > > +  0, 0, 0, 0, 0, 0, 0, 0 },
> > > +(v32qi) { 0, 1, 2, 3, 4, 5, 6, 7,
> > > +  8, 9, 10, 11, 12, 13, 14, 15,
> > > +  32, 49, 34, 58, 36, 53, 38, 39,
> > > +  40, 60, 42, 43, 63, 45, 46, 47 });
> >
> > In this testcase the shuffles in the part taking indexes from the zero
> > vector are nicely randomized.
> >
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
> > > @@ -0,0 +1,78 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-mavx512bw -mavx512vbmi -O2" } */
> > > +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 6} } */
> > > +/* { dg-final { scan-assembler-not "pxor" } } */
> > > +
> > > +
> > > +typedef float v16sf __attribute__((vector_size(64)));
> > > +typedef double v8df __attribute__ ((vector_size (64)));
> > > +typedef long long v8di __attribute__((vector_size(64)));
> > > +typedef int v16si __attribute__((vector_size(64)));
> > > +typedef short v32hi __attribute__ ((vector_size (64)));
> > > +typedef char v64qi __attribute__ ((vector_size (64)));
> > > +
> > > +v8df
> > > +foo_v8df (v8df x)
> > > +{
> > > +  return __builtin_shuffle (x, (v8df) { 0, 0, 0, 0, 0, 0, 0, 0 },
> > > + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
> > > +}
> > > +
> > > +v8di
> > > +foo_v8di (v8di x)
> > > +{
> > > +  return __builtin_shuffle (x, (v8di) { 0, 0, 0, 0, 0, 0, 0, 0 },
> > > + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
> > > +}
> > > +
> > > +v16sf
> > > +foo_v16sf (v16sf x)
> > > +{
> > > +  return __builtin_shuffle (x, (v16sf)  { 0, 0, 0, 0, 0, 0, 0, 0,
> > > +0, 0, 0, 0, 0, 0, 0, 0 },
> > > +(v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
> > > +  16, 17, 18, 19, 20, 21, 22, 23 });
> > > +}
> > > +
> > > +v16si
> > > +foo_v16si (v16si x)
> > > +{
> > > +return __builtin_shuffle (x, (v16si)  { 0, 0, 0, 0, 0, 0, 0, 0,
> > > +0, 0, 0, 0, 0, 0, 0, 0 },
> > > +(v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
> > > +  16, 17, 18, 19, 20, 21, 22, 23 });
> > > +}
> > > +
> > > +v32hi
> > > +foo_v32hi (v32hi x)
> > > +{
> > > +  return __builtin_shuffle (x, (v32hi) { 0, 0, 0, 0, 0, 0, 0, 0,
> > > +  0, 0, 0, 0, 0, 0, 0, 0,
> > > +  0, 0, 0, 0, 0, 0, 0, 0,
> > > +  0, 0, 0, 0, 0, 0, 0, 0 },
> > > +(v32hi) { 0, 1, 2, 3, 4, 5, 6, 7,
> > > +  8, 9, 10, 11, 12, 13, 14, 15,
> > > +  32, 33, 34, 35, 36, 37, 38, 39,
> > > +  40,41, 42, 43, 44, 45, 46, 47 });
> > > +}
> > > +
> > > +v64qi
> > > +foo_v64qi (v64qi x)
> > > +{
> 

Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680]

2021-04-25 Thread Hongtao Liu via Gcc-patches
On Fri, Apr 23, 2021 at 5:13 PM Jakub Jelinek  wrote:
>
> On Fri, Apr 23, 2021 at 12:53:58PM +0800, Hongtao Liu via Gcc-patches wrote:
> > +  if (!CONST_INT_P (er))
> > + return 0;
> > +  ei = INTVAL (er);
> > +  if (i < nelt2 && ei != i)
> > + return 0;
> > +  if (i >= nelt2
> > +  && (ei < nelt || ei >= nelt<<1))
>
> Formatting:
> 1) you have spaces followed by tab, remove the spaces; but,
>   if (i >= nelt2 && (ei < nelt || ei >= nelt<<1))
>    fits on one line, so keep it on one line.
> 2) nelt<<1 should be nelt << 1 with spaces around the <<
>

Done.

> > -(define_insn "*vec_concatv4si_0"
> > -  [(set (match_operand:V4SI 0 "register_operand"   "=v,x")
> > - (vec_concat:V4SI
> > -   (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
> > -   (match_operand:V2SI 2 "const0_operand"   " C,C")))]
> > +(define_insn "*vec_concat<mode>_0"
> > +  [(set (match_operand:VI124_128 0 "register_operand"   "=v,x")
> > + (vec_concat:VI124_128
> > +   (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y")
> > +   (match_operand:<ssehalfvecmode> 2 "const0_operand"   " C,C")))]
> >"TARGET_SSE2"
> >"@
> > %vmovq\t{%1, %0|%0, %1}
> > @@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>"
> > (set_attr "prefix" "maybe_evex")
> > (set_attr "mode" "")])
> >
> > +(define_insn_and_split "*vec_concat<mode>_0"
>
> Would be better to use a different pattern name, *vec_concat<mode>_0
> is already used in the above define_insn.
> Use some additional suffix after _0?
>

Changed to "*vec_concat<mode>_0_1"

> > +  return __builtin_shuffle (x, (v32qi) { 0, 0, 0, 0, 0, 0, 0, 0,
> > +  0, 0, 0, 0, 0, 0, 0, 0,
> > +  0, 0, 0, 0, 0, 0, 0, 0,
> > +  0, 0, 0, 0, 0, 0, 0, 0 },
> > +(v32qi) { 0, 1, 2, 3, 4, 5, 6, 7,
> > +  8, 9, 10, 11, 12, 13, 14, 15,
> > +  32, 49, 34, 58, 36, 53, 38, 39,
> > +  40, 60, 42, 43, 63, 45, 46, 47 });
>
> In this testcase the shuffles in the part taking indexes from the zero
> vector are nicely randomized.
>
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
> > @@ -0,0 +1,78 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512bw -mavx512vbmi -O2" } */
> > +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 6} } */
> > +/* { dg-final { scan-assembler-not "pxor" } } */
> > +
> > +
> > +typedef float v16sf __attribute__((vector_size(64)));
> > +typedef double v8df __attribute__ ((vector_size (64)));
> > +typedef long long v8di __attribute__((vector_size(64)));
> > +typedef int v16si __attribute__((vector_size(64)));
> > +typedef short v32hi __attribute__ ((vector_size (64)));
> > +typedef char v64qi __attribute__ ((vector_size (64)));
> > +
> > +v8df
> > +foo_v8df (v8df x)
> > +{
> > +  return __builtin_shuffle (x, (v8df) { 0, 0, 0, 0, 0, 0, 0, 0 },
> > + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
> > +}
> > +
> > +v8di
> > +foo_v8di (v8di x)
> > +{
> > +  return __builtin_shuffle (x, (v8di) { 0, 0, 0, 0, 0, 0, 0, 0 },
> > + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
> > +}
> > +
> > +v16sf
> > +foo_v16sf (v16sf x)
> > +{
> > +  return __builtin_shuffle (x, (v16sf)  { 0, 0, 0, 0, 0, 0, 0, 0,
> > +0, 0, 0, 0, 0, 0, 0, 0 },
> > +(v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
> > +  16, 17, 18, 19, 20, 21, 22, 23 });
> > +}
> > +
> > +v16si
> > +foo_v16si (v16si x)
> > +{
> > +return __builtin_shuffle (x, (v16si)  { 0, 0, 0, 0, 0, 0, 0, 0,
> > +0, 0, 0, 0, 0, 0, 0, 0 },
> > +(v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
> > +  16, 17, 18, 19, 20, 21, 22, 23 });
> > +}
> > +
> > +v32hi
> > +foo_v32hi (v32hi x)
> > +{
> > +  return __builtin_shuffle (x, (v32hi) { 0, 0, 0, 0, 0, 0, 0, 0,
> > +  0, 0, 0, 0, 0, 0, 0, 0,
> > +  0, 0, 0, 0, 0, 0, 0, 0,
> > +  0, 0, 0, 0, 0, 0, 0, 0 },
> > +(v32hi) { 0, 1, 2, 3, 4, 5, 6, 7,
> > +  8, 9, 10, 11, 12, 13, 14, 15,
> > +  32, 33, 34, 35, 36, 37, 38, 39,
> > +  40,41, 42, 43, 44, 45, 46, 47 });
> > +}
> > +
> > +v64qi
> > +foo_v64qi (v64qi x)
> > +{
> > +  return __builtin_shuffle (x, (v64qi) { 0, 0, 0, 0, 0, 0, 0, 0,
> > +  0, 0, 0, 0, 0, 0, 0, 0,
> > +  0, 0, 0, 0, 0, 0, 0, 0,
> > +  0, 0, 0, 0, 0, 0, 0, 0,
> > +  0, 0, 

Re: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680]

2021-04-23 Thread Jakub Jelinek via Gcc-patches
On Fri, Apr 23, 2021 at 12:53:58PM +0800, Hongtao Liu via Gcc-patches wrote:
> +  if (!CONST_INT_P (er))
> + return 0;
> +  ei = INTVAL (er);
> +  if (i < nelt2 && ei != i)
> + return 0;
> +  if (i >= nelt2
> +  && (ei < nelt || ei >= nelt<<1))

Formatting:
1) you have spaces followed by tab, remove the spaces; but,
  if (i >= nelt2 && (ei < nelt || ei >= nelt<<1))
   fits on one line, so keep it on one line.
2) nelt<<1 should be nelt << 1 with spaces around the <<

> -(define_insn "*vec_concatv4si_0"
> -  [(set (match_operand:V4SI 0 "register_operand"   "=v,x")
> - (vec_concat:V4SI
> -   (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
> -   (match_operand:V2SI 2 "const0_operand"   " C,C")))]
> +(define_insn "*vec_concat<mode>_0"
> +  [(set (match_operand:VI124_128 0 "register_operand"   "=v,x")
> + (vec_concat:VI124_128
> +   (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y")
> +   (match_operand:<ssehalfvecmode> 2 "const0_operand"   " C,C")))]
>"TARGET_SSE2"
>"@
> %vmovq\t{%1, %0|%0, %1}
> @@ -22154,6 +22157,24 @@ (define_insn "avx_vec_concat<mode>"
> (set_attr "prefix" "maybe_evex")
> (set_attr "mode" "")])
>  
> +(define_insn_and_split "*vec_concat<mode>_0"

Would be better to use a different pattern name, *vec_concat<mode>_0
is already used in the above define_insn.
Use some additional suffix after _0?

> +  return __builtin_shuffle (x, (v32qi) { 0, 0, 0, 0, 0, 0, 0, 0,
> +  0, 0, 0, 0, 0, 0, 0, 0,
> +  0, 0, 0, 0, 0, 0, 0, 0,
> +  0, 0, 0, 0, 0, 0, 0, 0 },
> +(v32qi) { 0, 1, 2, 3, 4, 5, 6, 7,
> +  8, 9, 10, 11, 12, 13, 14, 15,
> +  32, 49, 34, 58, 36, 53, 38, 39,
> +  40, 60, 42, 43, 63, 45, 46, 47 });

In this testcase the shuffles in the part taking indexes from the zero
vector are nicely randomized.

> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
> @@ -0,0 +1,78 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -mavx512vbmi -O2" } */
> +/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 6} } */
> +/* { dg-final { scan-assembler-not "pxor" } } */
> +
> +
> +typedef float v16sf __attribute__((vector_size(64)));
> +typedef double v8df __attribute__ ((vector_size (64)));
> +typedef long long v8di __attribute__((vector_size(64)));
> +typedef int v16si __attribute__((vector_size(64)));
> +typedef short v32hi __attribute__ ((vector_size (64)));
> +typedef char v64qi __attribute__ ((vector_size (64)));
> +
> +v8df
> +foo_v8df (v8df x)
> +{
> +  return __builtin_shuffle (x, (v8df) { 0, 0, 0, 0, 0, 0, 0, 0 },
> + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
> +}
> +
> +v8di
> +foo_v8di (v8di x)
> +{
> +  return __builtin_shuffle (x, (v8di) { 0, 0, 0, 0, 0, 0, 0, 0 },
> + (v8di) { 0, 1, 2, 3, 8, 9, 10, 11 });
> +}
> +
> +v16sf
> +foo_v16sf (v16sf x)
> +{
> +  return __builtin_shuffle (x, (v16sf)  { 0, 0, 0, 0, 0, 0, 0, 0,
> +0, 0, 0, 0, 0, 0, 0, 0 },
> +(v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
> +  16, 17, 18, 19, 20, 21, 22, 23 });
> +}
> +
> +v16si
> +foo_v16si (v16si x)
> +{
> +return __builtin_shuffle (x, (v16si)  { 0, 0, 0, 0, 0, 0, 0, 0,
> +0, 0, 0, 0, 0, 0, 0, 0 },
> +(v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
> +  16, 17, 18, 19, 20, 21, 22, 23 });
> +}
> +
> +v32hi
> +foo_v32hi (v32hi x)
> +{
> +  return __builtin_shuffle (x, (v32hi) { 0, 0, 0, 0, 0, 0, 0, 0,
> +  0, 0, 0, 0, 0, 0, 0, 0,
> +  0, 0, 0, 0, 0, 0, 0, 0,
> +  0, 0, 0, 0, 0, 0, 0, 0 },
> +(v32hi) { 0, 1, 2, 3, 4, 5, 6, 7,
> +  8, 9, 10, 11, 12, 13, 14, 15,
> +  32, 33, 34, 35, 36, 37, 38, 39,
> +  40,41, 42, 43, 44, 45, 46, 47 });
> +}
> +
> +v64qi
> +foo_v64qi (v64qi x)
> +{
> +  return __builtin_shuffle (x, (v64qi) { 0, 0, 0, 0, 0, 0, 0, 0,
> +  0, 0, 0, 0, 0, 0, 0, 0,
> +  0, 0, 0, 0, 0, 0, 0, 0,
> +  0, 0, 0, 0, 0, 0, 0, 0,
> +  0, 0, 0, 0, 0, 0, 0, 0,
> +  0, 0, 0, 0, 0, 0, 0, 0,
> +  0, 0, 0, 0, 0, 0, 0, 0,
> +  0, 0, 0, 0, 0, 0, 0, 0 },
> +(v64qi) {0, 1, 2, 3, 4, 5, 6, 7,
> +   8, 9, 

[PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the upper bits of the dest. [PR target/94680]

2021-04-22 Thread Hongtao Liu via Gcc-patches
Hi:
  If the second operand of __builtin_shuffle is a const zero vector and the
mask has a suitable form, the shuffle can be optimized to a plain
movq/vmovaps.

I.e.:
foo128:
-   vxorps  %xmm1, %xmm1, %xmm1
-   vmovlhps %xmm1, %xmm0, %xmm0
+   vmovq   %xmm0, %xmm0

 foo256:
-   vxorps  %xmm1, %xmm1, %xmm1
-   vshuff32x4  $0, %ymm1, %ymm0, %ymm0
+   vmovaps %xmm0, %xmm0

 foo512:
-   vxorps  %xmm1, %xmm1, %xmm1
-   vshuff32x4  $68, %zmm1, %zmm0, %zmm0
+   vmovaps %ymm0, %ymm0
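
For reference, here is a minimal sketch of the kind of source the diffs above
come from, compiled at -O2 with the appropriate -mavx/-mavx512f options (the
element types and masks are illustrative guesses matching the labels above,
not necessarily the exact functions behind them): the permutation must keep
the low half of x in order and take every remaining element from the zero
vector.

typedef float v4sf  __attribute__ ((vector_size (16)));
typedef int   v4si  __attribute__ ((vector_size (16)));
typedef float v8sf  __attribute__ ((vector_size (32)));
typedef int   v8si  __attribute__ ((vector_size (32)));
typedef float v16sf __attribute__ ((vector_size (64)));
typedef int   v16si __attribute__ ((vector_size (64)));

/* Keep the low 64 bits of x, zero the rest -> vmovq.  */
v4sf
foo128 (v4sf x)
{
  return __builtin_shuffle (x, (v4sf) { 0, 0, 0, 0 },
                            (v4si) { 0, 1, 4, 5 });
}

/* Keep the low 128 bits of x, zero the rest -> vmovaps %xmm0, %xmm0.  */
v8sf
foo256 (v8sf x)
{
  return __builtin_shuffle (x, (v8sf) { 0, 0, 0, 0, 0, 0, 0, 0 },
                            (v8si) { 0, 1, 2, 3, 8, 9, 10, 11 });
}

/* Keep the low 256 bits of x, zero the rest -> vmovaps %ymm0, %ymm0.  */
v16sf
foo512 (v16sf x)
{
  return __builtin_shuffle (x, (v16sf) { 0, 0, 0, 0, 0, 0, 0, 0,
                                         0, 0, 0, 0, 0, 0, 0, 0 },
                            (v16si) { 0, 1, 2, 3, 4, 5, 6, 7,
                                      16, 17, 18, 19, 20, 21, 22, 23 });
}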

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  Ok for trunk?

gcc/ChangeLog:

PR target/94680
* config/i386/sse.md (ssedoublevecmode): Add attribute for
V64QI/V32HI/V16SI/V4DI.
(ssehalfvecmode): Add attribute for V2DI/V2DF.
(*vec_concatv4si_0): Extend to VI124_128.
(*vec_concat<mode>_0): New pre-reload splitter.
* config/i386/predicates.md (movq_parallel): New predicate.

gcc/testsuite/ChangeLog:

PR target/94680
* gcc.target/i386/avx-pr94680.c: New test.
* gcc.target/i386/avx512f-pr94680.c: New test.
* gcc.target/i386/sse2-pr94680.c: New test.


-- 
BR,
Hongtao
From eec5469cdeecf0e6650e9d2963dea4117919c5d2 Mon Sep 17 00:00:00 2001
From: liuhongt 
Date: Thu, 22 Apr 2021 15:33:16 +0800
Subject: [PATCH] [i386] Optimize __builtin_shuffle when it's used to zero the
 upper bits of the dest. [PR target/94680]

If the second operand of __builtin_shuffle is a const zero vector and the
mask has a suitable form, the shuffle can be optimized to a plain
movq/vmovaps.

I.e.:
foo128:
-   vxorps  %xmm1, %xmm1, %xmm1
-   vmovlhps %xmm1, %xmm0, %xmm0
+   vmovq   %xmm0, %xmm0

 foo256:
-   vxorps  %xmm1, %xmm1, %xmm1
-   vshuff32x4  $0, %ymm1, %ymm0, %ymm0
+   vmovaps %xmm0, %xmm0

 foo512:
-   vxorps  %xmm1, %xmm1, %xmm1
-   vshuff32x4  $68, %zmm1, %zmm0, %zmm0
+   vmovaps %ymm0, %ymm0

gcc/ChangeLog:

	PR target/94680
	* config/i386/sse.md (ssedoublevecmode): Add attribute for
	V64QI/V32HI/V16SI/V4DI.
	(ssehalfvecmode): Add attribute for V2DI/V2DF.
	(*vec_concatv4si_0): Extend to VI124_128.
	(*vec_concat<mode>_0): New pre-reload splitter.
	* config/i386/predicates.md (movq_parallel): New predicate.

gcc/testsuite/ChangeLog:

	PR target/94680
	* gcc.target/i386/avx-pr94680.c: New test.
	* gcc.target/i386/avx512f-pr94680.c: New test.
	* gcc.target/i386/sse2-pr94680.c: New test.
---
 gcc/config/i386/predicates.md | 33 
 gcc/config/i386/sse.md| 37 +++--
 gcc/testsuite/gcc.target/i386/avx-pr94680.c   | 59 ++
 .../gcc.target/i386/avx512f-pr94680.c | 78 +++
 gcc/testsuite/gcc.target/i386/sse2-pr94680.c  | 51 
 5 files changed, 250 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-pr94680.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr94680.c

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index b1df8548af6..4b706003ed8 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1524,6 +1524,39 @@ (define_predicate "misaligned_operand"
   (and (match_code "mem")
(match_test "MEM_ALIGN (op) < GET_MODE_BITSIZE (mode)")))
 
+;; Return true if OP is a parallel for an mov{d,q,dqa,ps,pd} vec_select,
+;; where one of the two operands of the vec_concat is const0_operand.
+(define_predicate "movq_parallel"
+  (match_code "parallel")
+{
+  unsigned nelt = XVECLEN (op, 0);
+  unsigned nelt2 = nelt >> 1;
+  unsigned i;
+
+  if (nelt < 2)
+return false;
+
+  /* Validate that all of the elements are constants,
+ lower halves of permute are lower halves of the first operand,
+ upper halves of permute come from any of the second operand.  */
+  for (i = 0; i < nelt; ++i)
+{
+  rtx er = XVECEXP (op, 0, i);
+  unsigned HOST_WIDE_INT ei;
+
+  if (!CONST_INT_P (er))
+	return 0;
+  ei = INTVAL (er);
+  if (i < nelt2 && ei != i)
+	return 0;
+  if (i >= nelt2
+  	 && (ei < nelt || ei >= nelt<<1))
+	return 0;
+}
+
+  return 1;
+})
+
 ;; Return true if OP is a vzeroall operation, known to be a PARALLEL.
 (define_predicate "vzeroall_operation"
   (match_code "parallel")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 9d3728d1cb0..b55636a3e12 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -812,19 +812,22 @@ (define_mode_attr sseintvecmodelower
 
 ;; Mapping of vector modes to a vector mode of double size
 (define_mode_attr ssedoublevecmode
-  [(V32QI "V64QI") (V16HI "V32HI") (V8SI "V16SI") (V4DI "V8DI")
+  [(V64QI "V128QI") (V32HI "V64HI") (V16SI "V32SI") (V8DI "V16DI")
+   (V32QI "V64QI") (V16HI "V32HI") (V8SI "V16SI") (V4DI "V8DI")
(V16QI "V32QI") (V8HI "V16HI") (V4SI "V8SI") (V2DI "V4DI")
+   (V16SF "V32SF") (V8DF "V16DF")
(V8SF "V16SF") (V4DF "V8DF")
(V4SF "V8SF") (V2DF "V4DF")])
 
 ;;