PING: V2 [PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns

2018-11-20 Thread H.J. Lu
On Mon, Nov 5, 2018 at 2:02 PM H.J. Lu  wrote:
>
> Hi Richard, Jakub,
>
> Can you take a look at this patch?  The last review from Kirill was in
> June.
>
> Thanks.
>
>
> H.J.
> --
> There are many duplicated AVX2/AVX512 vec_dup patterns like:
>
> (define_insn "avx2_vec_dup"
>   [(set (match_operand:VF1_128_256 0 "register_operand" "=v")
> (vec_duplicate:VF1_128_256
>   (vec_select:SF
> (match_operand:V4SF 1 "register_operand" "v")
> (parallel [(const_int 0)]]
>   "TARGET_AVX2"
>   "vbroadcastss\t{%1, %0|%0, %1}"
>   [(set_attr "type" "sselog1")
> (set_attr "prefix" "maybe_evex")
> (set_attr "mode" "")])
>
> and
>
> (define_insn "vec_dup"
>   [(set (match_operand:AVX_VEC_DUP_MODE 0 "register_operand" "=x,x,x,v,x")
> (vec_duplicate:AVX_VEC_DUP_MODE
>   (match_operand: 1 "nonimmediate_operand" 
> "m,m,x,v,?x")))]
>   "TARGET_AVX"
>   "@
>vbroadcast\t{%1, %0|%0, %1}
>vbroadcast\t{%1, %0|%0, %1}
>vbroadcast\t{%x1, %0|%0, %x1}
>vbroadcast\t{%x1, %g0|%g0, %x1}
>#"
>   [(set_attr "type" "ssemov")
>(set_attr "prefix_extra" "1")
>(set_attr "prefix" "maybe_evex")
>(set_attr "isa" "avx2,noavx2,avx2,avx512f,noavx2")
>(set_attr "mode" ",V8SF,,,V8SF")])
>
> We can remove the duplicated AVX2/AVX512 vec_dup patterns and use the
> normal AVX2/AVX512 vec_dup patterns instead by changing source operand
> to subreg of the same register class of the base by generating
>
> (set (reg:V8SF 84)
>  (vec_duplicate:V8SF (subreg:SF (reg:V4SF 85) 0)))
>
> instead of
>
> (set (reg:V8SF 84)
>   (vec_duplicate:V8SF
> (vec_select:SF (reg:V4SF 85)
>   (parallel [(const_int 0 [0])]
>
> For integer vector broadcast, we generate
>
> (set (reg:V32QI 86)
>  (vec_duplicate:V32QI
> (vec_select:QI (subreg:V16QI (reg:V32QI 87) 0))
>   (parallel [(const_int 0 [0])]
>
> instead of
>
> (set (reg:V32QI 86)
>  (vec_duplicate:V32QI
> (vec_select:QI (reg:V32QI 87)
>   (parallel [(const_int 0 [0])]
>
> so that we can remove
>
> (define_insn "avx2_pbroadcast_1"
>   [(set (match_operand:VI_256 0 "register_operand" "=x,x,v,v")
> (vec_duplicate:VI_256
>   (vec_select:
> (match_operand:VI_256 1 "nonimmediate_operand" "m,x,m,v")
> (parallel [(const_int 0)]]
>   "TARGET_AVX2"
>   "@
>vpbroadcast\t{%1, %0|%0, %1}
>vpbroadcast\t{%x1, %0|%0, %x1}
>vpbroadcast\t{%1, %0|%0, %1}
>vpbroadcast\t{%x1, %0|%0, %x1}"
>   [(set_attr "isa" "*,*,,")
>(set_attr "type" "ssemov")
>(set_attr "prefix_extra" "1")
>(set_attr "prefix" "vex")
>(set_attr "mode" "")])
>
> and keep only
>
> (define_insn "avx2_pbroadcast"
>   [(set (match_operand:VI 0 "register_operand" "=x,v")
> (vec_duplicate:VI
>   (vec_select:
> (match_operand: 1 "nonimmediate_operand" "xm,vm")
> (parallel [(const_int 0)]]
>   "TARGET_AVX2"
>   "vpbroadcast\t{%1, %0|%0, %1}"
>   [(set_attr "isa" "*,")
>(set_attr "type" "ssemov")
>(set_attr "prefix_extra" "1")
>(set_attr "prefix" "vex,evex")
>(set_attr "mode" "")])
>
> gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by
>
>  avx2_test:
> .cfi_startproc
> -   vmovaps x(%rip), %xmm1
> -   vbroadcastss    %xmm1, %ymm0
> +   vbroadcastss    x(%rip), %ymm0
> vmovaps %ymm0, y(%rip)
> vzeroupper
> ret
> .cfi_endproc
>
> gcc.target/i386/avx512vl-vbroadcast-3.c is changed by
>
> @@ -113,7 +113,7 @@ f10:
> .cfi_startproc
> vmovaps %ymm0, %ymm16
> vpermilps   $85, %ymm16, %ymm16
> -   vbroadcastss    %xmm16, %ymm16
> +   vshuff32x4  $0x0, %ymm16, %ymm16, %ymm16
> vzeroupper
> ret
> .cfi_endproc
> @@ -153,8 +153,7 @@ f12:
>  f13:
>  .LFB12:
> .cfi_startproc
> -   vmovaps (%rdi), %ymm16
> -   vbroadcastss    %xmm16, %ymm16
> +   vbroadcastss    (%rdi), %ymm16
> vzeroupper
> ret
> .cfi_endproc
>
> gcc/
>
> * config/i386/i386-builtin.def: Replace CODE_FOR_avx2_vec_dupv4sf,
> CODE_FOR_avx2_vec_dupv8sf and CODE_FOR_avx2_vec_dupv4df with
> CODE_FOR_vec_dupv4sf, CODE_FOR_vec_dupv8sf and
> CODE_FOR_vec_dupv4df, respectively.
> * config/i386/i386.c (expand_vec_perm_1): Use subreg with vec_dup.
> * config/i386/i386.md (SF to DF splitter): Replace
> gen_avx512f_vec_dupv16sf_1 with gen_avx512f_vec_dupv16sf.
> * config/i386/sse.md (VF48_AVX512VL): New.
> (avx2_vec_dup): Removed.
> (avx2_vec_dupv8sf_1): Likewise.
> (avx512f_vec_dup_1): Likewise.
> (avx2_pbroadcast_1): Likewise.
> (avx2_vec_dupv4df): Likewise.
> (_vec_dup_1): Likewise.
> (*avx_vperm_broadcast_): Replace gen_avx2_vec_dupv8sf with
> gen_vec_dupv8sf.
>
> gcc/testsuite/
>
> * gcc.target/i386/avx2-vbroadcastss_ps256-1.c: Updated.

V2 [PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns

2018-11-05 Thread H.J. Lu
Hi Richard, Jakub,

Can you take a look at this patch?  The last review from Kirill was in
June.

Thanks.


H.J.
--
There are many duplicated AVX2/AVX512 vec_dup patterns like:

(define_insn "avx2_vec_dup"
  [(set (match_operand:VF1_128_256 0 "register_operand" "=v")
(vec_duplicate:VF1_128_256
  (vec_select:SF
(match_operand:V4SF 1 "register_operand" "v")
(parallel [(const_int 0)]]
  "TARGET_AVX2"
  "vbroadcastss\t{%1, %0|%0, %1}"
  [(set_attr "type" "sselog1")
(set_attr "prefix" "maybe_evex")
(set_attr "mode" "")])

and

(define_insn "vec_dup"
  [(set (match_operand:AVX_VEC_DUP_MODE 0 "register_operand" "=x,x,x,v,x")
(vec_duplicate:AVX_VEC_DUP_MODE
  (match_operand: 1 "nonimmediate_operand" 
"m,m,x,v,?x")))]
  "TARGET_AVX"
  "@
   vbroadcast\t{%1, %0|%0, %1}
   vbroadcast\t{%1, %0|%0, %1}
   vbroadcast\t{%x1, %0|%0, %x1}
   vbroadcast\t{%x1, %g0|%g0, %x1}
   #"
  [(set_attr "type" "ssemov")
   (set_attr "prefix_extra" "1")
   (set_attr "prefix" "maybe_evex")
   (set_attr "isa" "avx2,noavx2,avx2,avx512f,noavx2")
   (set_attr "mode" ",V8SF,,,V8SF")])

We can remove the duplicated AVX2/AVX512 vec_dup patterns and use the
normal AVX2/AVX512 vec_dup patterns instead by changing source operand
to subreg of the same register class of the base by generating

(set (reg:V8SF 84)
 (vec_duplicate:V8SF (subreg:SF (reg:V4SF 85) 0)))

instead of

(set (reg:V8SF 84)
  (vec_duplicate:V8SF
(vec_select:SF (reg:V4SF 85)
  (parallel [(const_int 0 [0])]

For integer vector broadcast, we generate

(set (reg:V32QI 86)
 (vec_duplicate:V32QI
(vec_select:QI (subreg:V16QI (reg:V32QI 87) 0))
  (parallel [(const_int 0 [0])]

instead of

(set (reg:V32QI 86)
 (vec_duplicate:V32QI
(vec_select:QI (reg:V32QI 87)
  (parallel [(const_int 0 [0])]

so that we can remove

(define_insn "avx2_pbroadcast_1"
  [(set (match_operand:VI_256 0 "register_operand" "=x,x,v,v")
(vec_duplicate:VI_256
  (vec_select:
(match_operand:VI_256 1 "nonimmediate_operand" "m,x,m,v")
(parallel [(const_int 0)]]
  "TARGET_AVX2"
  "@
   vpbroadcast\t{%1, %0|%0, %1}
   vpbroadcast\t{%x1, %0|%0, %x1}
   vpbroadcast\t{%1, %0|%0, %1}
   vpbroadcast\t{%x1, %0|%0, %x1}"
  [(set_attr "isa" "*,*,,")
   (set_attr "type" "ssemov")
   (set_attr "prefix_extra" "1")
   (set_attr "prefix" "vex")
   (set_attr "mode" "")])

and keep only

(define_insn "avx2_pbroadcast"
  [(set (match_operand:VI 0 "register_operand" "=x,v")
(vec_duplicate:VI
  (vec_select:
(match_operand: 1 "nonimmediate_operand" "xm,vm")
(parallel [(const_int 0)]]
  "TARGET_AVX2"
  "vpbroadcast\t{%1, %0|%0, %1}"
  [(set_attr "isa" "*,")
   (set_attr "type" "ssemov")
   (set_attr "prefix_extra" "1")
   (set_attr "prefix" "vex,evex")
   (set_attr "mode" "")])

gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by

 avx2_test:
.cfi_startproc
-   vmovaps x(%rip), %xmm1
-   vbroadcastss    %xmm1, %ymm0
+   vbroadcastss    x(%rip), %ymm0
vmovaps %ymm0, y(%rip)
vzeroupper
ret
.cfi_endproc

gcc.target/i386/avx512vl-vbroadcast-3.c is changed by

@@ -113,7 +113,7 @@ f10:
.cfi_startproc
vmovaps %ymm0, %ymm16
vpermilps   $85, %ymm16, %ymm16
-   vbroadcastss    %xmm16, %ymm16
+   vshuff32x4  $0x0, %ymm16, %ymm16, %ymm16
vzeroupper
ret
.cfi_endproc
@@ -153,8 +153,7 @@ f12:
 f13:
 .LFB12:
.cfi_startproc
-   vmovaps (%rdi), %ymm16
-   vbroadcastss    %xmm16, %ymm16
+   vbroadcastss    (%rdi), %ymm16
vzeroupper
ret
.cfi_endproc

gcc/

* config/i386/i386-builtin.def: Replace CODE_FOR_avx2_vec_dupv4sf,
CODE_FOR_avx2_vec_dupv8sf and CODE_FOR_avx2_vec_dupv4df with
CODE_FOR_vec_dupv4sf, CODE_FOR_vec_dupv8sf and
CODE_FOR_vec_dupv4df, respectively.
* config/i386/i386.c (expand_vec_perm_1): Use subreg with vec_dup.
* config/i386/i386.md (SF to DF splitter): Replace
gen_avx512f_vec_dupv16sf_1 with gen_avx512f_vec_dupv16sf.
* config/i386/sse.md (VF48_AVX512VL): New.
(avx2_vec_dup): Removed.
(avx2_vec_dupv8sf_1): Likewise.
(avx512f_vec_dup_1): Likewise.
(avx2_pbroadcast_1): Likewise.
(avx2_vec_dupv4df): Likewise.
(_vec_dup_1): Likewise.
(*avx_vperm_broadcast_): Replace gen_avx2_vec_dupv8sf with
gen_vec_dupv8sf.

gcc/testsuite/

* gcc.target/i386/avx2-vbroadcastss_ps256-1.c: Updated.
* gcc.target/i386/avx512vl-vbroadcast-3.c: Likewise.
---
 gcc/config/i386/i386-builtin.def  |  6 +-
 gcc/config/i386/i386.c            | 57 ++---
 gcc/config/i386/i386.md           |  2 +-
 gcc/config/i386/sse.md            | 83 +--
 

Re: [PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns

2018-11-05 Thread Uros Bizjak
On Sun, Nov 4, 2018 at 9:49 PM H.J. Lu  wrote:

> > > > Actually, we can achieve the same with pre-reload splitters. Please
> > > > see the attached patch for a couple of examples and a fix for
> > > > vbroadcastss that accesses the memory in wrong mode.
> > > >
> > >
> > > My patch removes a bunch of duplicated patterns from sse.md.  But
> > > yours adds a couple more patterns.   Isn't fewer patterns preferred?
> >
> > Playing SUBREG games before reload does not look safe to me. We would
>
> There is plenty of SUBREG usage in the i386 backend before reload.  It is
> perfectly safe to do so as long as we don't create SUBREG with a different
> register class from the base.  Do you have a testcase to show my SUBREG
> usage is unsafe?

No. However, the patch then substantially changes functionality in the
vector part of the i386 backend (expand_vec_perm_1), so it needs approval from
the relevant maintainer (Kirill).

Uros.


Re: [PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns

2018-11-04 Thread H.J. Lu
On Sun, Nov 4, 2018 at 11:45 AM Uros Bizjak  wrote:
>
> On Sun, Nov 4, 2018 at 8:17 PM H.J. Lu  wrote:
> >
> > On Sun, Nov 4, 2018 at 8:41 AM Uros Bizjak  wrote:
> > >
> > > On Fri, Nov 2, 2018 at 6:25 PM H.J. Lu  wrote:
> > > >
> > > > Remove duplicated AVX2/AVX512 vec_dup patterns and replace them with
> > > > subreg.  gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by
> > > >
> > > >  avx2_test:
> > > > .cfi_startproc
> > > > -   vmovaps x(%rip), %xmm1
> > > > -   vbroadcastss%xmm1, %ymm0
> > > > +   vbroadcastssx(%rip), %ymm0
> > > > vmovaps %ymm0, y(%rip)
> > > > vzeroupper
> > > > ret
> > > > .cfi_endproc
> > > >
> > > > gcc.target/i386/avx512vl-vbroadcast-3.c is changed by
> > > >
> > > > @@ -113,7 +113,7 @@ f10:
> > > > .cfi_startproc
> > > > vmovaps %ymm0, %ymm16
> > > > vpermilps   $85, %ymm16, %ymm16
> > > > -   vbroadcastss%xmm16, %ymm16
> > > > +   vshuff32x4  $0x0, %ymm16, %ymm16, %ymm16
> > > > vzeroupper
> > > > ret
> > > > .cfi_endproc
> > > > @@ -153,8 +153,7 @@ f12:
> > > >  f13:
> > > >  .LFB12:
> > > > .cfi_startproc
> > > > -   vmovaps (%rdi), %ymm16
> > > > -   vbroadcastss%xmm16, %ymm16
> > > > +   vbroadcastss(%rdi), %ymm16
> > > > vzeroupper
> > > > ret
> > > > .cfi_endproc
> > >
> > > Actually, we can achieve the same with pre-reload splitters. Please
> > > see the attached patch for a couple of examples and a fix for
> > > vbroadcastss that accesses the memory in wrong mode.
> > >
> >
> > My patch removes a bunch of duplicated patterns from sse.md.  But
> > yours adds a couple more patterns.   Isn't fewer patterns preferred?
>
> Playing SUBREG games before reload does not look safe to me. We would

There is plenty of SUBREG usage in the i386 backend before reload.  It is
perfectly safe to do so as long as we don't create SUBREG with a different
register class from the base.  Do you have a testcase to show my SUBREG
usage is unsafe?

> like to create a simpler instruction out of the combination of vector
> load and broadcast, so I think that combine+split is the right tool
> for this simplification.

Adding new patterns doesn't simplify the issue.

> BTW: Half of my proposed patch is a fix to the avx2_pbroadcast{_1}
> pattern, which models a wrong access to memory.
>

I will take a look at avx2_pbroadcast{_1}.


-- 
H.J.


Re: [PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns

2018-11-04 Thread Uros Bizjak
On Sun, Nov 4, 2018 at 8:17 PM H.J. Lu  wrote:
>
> On Sun, Nov 4, 2018 at 8:41 AM Uros Bizjak  wrote:
> >
> > On Fri, Nov 2, 2018 at 6:25 PM H.J. Lu  wrote:
> > >
> > > Remove duplicated AVX2/AVX512 vec_dup patterns and replace them with
> > > subreg.  gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by
> > >
> > >  avx2_test:
> > > .cfi_startproc
> > > -   vmovaps x(%rip), %xmm1
> > > -   vbroadcastss%xmm1, %ymm0
> > > +   vbroadcastssx(%rip), %ymm0
> > > vmovaps %ymm0, y(%rip)
> > > vzeroupper
> > > ret
> > > .cfi_endproc
> > >
> > > gcc.target/i386/avx512vl-vbroadcast-3.c is changed by
> > >
> > > @@ -113,7 +113,7 @@ f10:
> > > .cfi_startproc
> > > vmovaps %ymm0, %ymm16
> > > vpermilps   $85, %ymm16, %ymm16
> > > -   vbroadcastss%xmm16, %ymm16
> > > +   vshuff32x4  $0x0, %ymm16, %ymm16, %ymm16
> > > vzeroupper
> > > ret
> > > .cfi_endproc
> > > @@ -153,8 +153,7 @@ f12:
> > >  f13:
> > >  .LFB12:
> > > .cfi_startproc
> > > -   vmovaps (%rdi), %ymm16
> > > -   vbroadcastss%xmm16, %ymm16
> > > +   vbroadcastss(%rdi), %ymm16
> > > vzeroupper
> > > ret
> > > .cfi_endproc
> >
> > Actually, we can achieve the same with pre-reload splitters. Please
> > see the attached patch for a couple of examples and a fix for
> > vbroadcastss that accesses the memory in wrong mode.
> >
>
> My patch removes a bunch of duplicated patterns from sse.md.  But
> yours adds a couple more patterns.   Isn't fewer patterns preferred?

Playing SUBREG games before reload does not look safe to me. We would
like to create a simpler instruction out of the combination of vector
load and broadcast, so I think that combine+split is the right tool
for this simplification.

BTW: Half of my proposed patch is a fix to the avx2_pbroadcast{_1}
pattern, which models a wrong access to memory.

Uros.


Re: [PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns

2018-11-04 Thread H.J. Lu
On Sun, Nov 4, 2018 at 8:41 AM Uros Bizjak  wrote:
>
> On Fri, Nov 2, 2018 at 6:25 PM H.J. Lu  wrote:
> >
> > Remove duplicated AVX2/AVX512 vec_dup patterns and replace them with
> > subreg.  gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by
> >
> >  avx2_test:
> > .cfi_startproc
> > -   vmovaps x(%rip), %xmm1
> > -   vbroadcastss%xmm1, %ymm0
> > +   vbroadcastssx(%rip), %ymm0
> > vmovaps %ymm0, y(%rip)
> > vzeroupper
> > ret
> > .cfi_endproc
> >
> > gcc.target/i386/avx512vl-vbroadcast-3.c is changed by
> >
> > @@ -113,7 +113,7 @@ f10:
> > .cfi_startproc
> > vmovaps %ymm0, %ymm16
> > vpermilps   $85, %ymm16, %ymm16
> > -   vbroadcastss%xmm16, %ymm16
> > +   vshuff32x4  $0x0, %ymm16, %ymm16, %ymm16
> > vzeroupper
> > ret
> > .cfi_endproc
> > @@ -153,8 +153,7 @@ f12:
> >  f13:
> >  .LFB12:
> > .cfi_startproc
> > -   vmovaps (%rdi), %ymm16
> > -   vbroadcastss%xmm16, %ymm16
> > +   vbroadcastss(%rdi), %ymm16
> > vzeroupper
> > ret
> > .cfi_endproc
>
> Actually, we can achieve the same with pre-reload splitters. Please
> see the attached patch for a couple of examples and a fix for
> vbroadcastss that accesses the memory in wrong mode.
>

My patch removes a bunch of duplicated patterns from sse.md.  But
yours adds a couple more patterns.  Aren't fewer patterns preferred?

-- 
H.J.


Re: [PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns

2018-11-04 Thread Uros Bizjak
On Fri, Nov 2, 2018 at 6:25 PM H.J. Lu  wrote:
>
> Remove duplicated AVX2/AVX512 vec_dup patterns and replace them with
> subreg.  gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by
>
>  avx2_test:
> .cfi_startproc
> -   vmovaps x(%rip), %xmm1
> -   vbroadcastss%xmm1, %ymm0
> +   vbroadcastssx(%rip), %ymm0
> vmovaps %ymm0, y(%rip)
> vzeroupper
> ret
> .cfi_endproc
>
> gcc.target/i386/avx512vl-vbroadcast-3.c is changed by
>
> @@ -113,7 +113,7 @@ f10:
> .cfi_startproc
> vmovaps %ymm0, %ymm16
> vpermilps   $85, %ymm16, %ymm16
> -   vbroadcastss%xmm16, %ymm16
> +   vshuff32x4  $0x0, %ymm16, %ymm16, %ymm16
> vzeroupper
> ret
> .cfi_endproc
> @@ -153,8 +153,7 @@ f12:
>  f13:
>  .LFB12:
> .cfi_startproc
> -   vmovaps (%rdi), %ymm16
> -   vbroadcastss%xmm16, %ymm16
> +   vbroadcastss(%rdi), %ymm16
> vzeroupper
> ret
> .cfi_endproc

Actually, we can achieve the same with pre-reload splitters. Please
see the attached patch for a couple of examples and a fix for
vbroadcastss that accesses the memory in the wrong mode.

Uros.
Index: sse.md
===
--- sse.md  (revision 265740)
+++ sse.md  (working copy)
@@ -7129,6 +7129,20 @@
 (set_attr "prefix" "maybe_evex")
 (set_attr "mode" "")])
 
+(define_insn_and_split "*avx2_vec_dup_1"
+  [(set (match_operand:VF1_128_256 0 "register_operand")
+   (vec_duplicate:VF1_128_256
+ (vec_select:SF
+   (match_operand:V4SF 1 "memory_operand")
+   (parallel [(const_int 0)]]
+  "TARGET_AVX2
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (vec_duplicate:VF1_128_256 (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], SFmode, 0);")
+
 (define_insn "avx2_vec_dupv8sf_1"
   [(set (match_operand:V8SF 0 "register_operand" "=v")
(vec_duplicate:V8SF
@@ -7141,6 +7155,20 @@
 (set_attr "prefix" "maybe_evex")
 (set_attr "mode" "V8SF")])
 
+(define_insn_and_split "*avx2_vec_dupv8sf_1"
+  [(set (match_operand:V8SF 0 "register_operand")
+   (vec_duplicate:V8SF
+ (vec_select:SF
+   (match_operand:V4SF 1 "memory_operand")
+   (parallel [(const_int 0)]]
+  "TARGET_AVX2
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (vec_duplicate:VF1_128_256 (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], SFmode, 0);")
+
 (define_insn "avx512f_vec_dup_1"
   [(set (match_operand:VF_512 0 "register_operand" "=v")
(vec_duplicate:VF_512
@@ -17908,7 +17936,7 @@
   [(set (match_operand:VI 0 "register_operand" "=x,v")
(vec_duplicate:VI
  (vec_select:
-   (match_operand: 1 "nonimmediate_operand" "xm,vm")
+   (match_operand: 1 "register_operand" "x,v")
(parallel [(const_int 0)]]
   "TARGET_AVX2"
   "vpbroadcast\t{%1, %0|%0, %1}"
@@ -17918,24 +17946,64 @@
(set_attr "prefix" "vex,evex")
(set_attr "mode" "")])
 
+(define_insn_and_split "*avx2_pbroadcast_mem_1"
+  [(set (match_operand:VI 0 "register_operand")
+   (vec_duplicate:VI
+ (vec_select:
+   (match_operand: 1 "memory_operand")
+   (parallel [(const_int 0)]]
+  "TARGET_AVX2
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (vec_duplicate:VI (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], mode, 0);")
+
 (define_insn "avx2_pbroadcast_1"
-  [(set (match_operand:VI_256 0 "register_operand" "=x,x,v,v")
+  [(set (match_operand:VI_256 0 "register_operand" "=x,v")
(vec_duplicate:VI_256
  (vec_select:
-   (match_operand:VI_256 1 "nonimmediate_operand" "m,x,m,v")
+   (match_operand:VI_256 1 "register_operand" "x,v")
(parallel [(const_int 0)]]
   "TARGET_AVX2"
-  "@
-   vpbroadcast\t{%1, %0|%0, %1}
-   vpbroadcast\t{%x1, %0|%0, %x1}
-   vpbroadcast\t{%1, %0|%0, %1}
-   vpbroadcast\t{%x1, %0|%0, %x1}"
-  [(set_attr "isa" "*,*,,")
+  "vpbroadcast\t{%x1, %0|%0, %x1}"
+  [(set_attr "isa" "*,")
(set_attr "type" "ssemov")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "vex")
(set_attr "mode" "")])
 
+(define_insn_and_split "*avx2_pbroadcast_1_mem_1"
+  [(set (match_operand:VI_256 0 "register_operand" "=x,v")
+   (vec_duplicate:VI_256
+ (vec_select:
+   (match_operand:VI_256 1 "memory_operand" "m,m")
+   (parallel [(const_int 0)]]
+  "TARGET_AVX2
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (vec_duplicate:VI_256 (match_dup 1)))]
+  "operands[1] = adjust_address_nv (operands[1], mode, 0);")
+
+(define_insn "*avx2_pbroadcast_mem"
+  [(set (match_operand:VI 0 "register_operand" "=x,v")
+   (vec_duplicate:VI
+ (match_operand: 1 "memory_operand" "m,m")))]
+  "TARGET_AVX2"
+  "vpbroadcast\t{%1, 

[PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns

2018-11-02 Thread H.J. Lu
Remove duplicated AVX2/AVX512 vec_dup patterns and replace them with
subreg.  gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by

 avx2_test:
.cfi_startproc
-   vmovaps x(%rip), %xmm1
-   vbroadcastss    %xmm1, %ymm0
+   vbroadcastss    x(%rip), %ymm0
vmovaps %ymm0, y(%rip)
vzeroupper
ret
.cfi_endproc

gcc.target/i386/avx512vl-vbroadcast-3.c is changed by

@@ -113,7 +113,7 @@ f10:
.cfi_startproc
vmovaps %ymm0, %ymm16
vpermilps   $85, %ymm16, %ymm16
-   vbroadcastss    %xmm16, %ymm16
+   vshuff32x4  $0x0, %ymm16, %ymm16, %ymm16
vzeroupper
ret
.cfi_endproc
@@ -153,8 +153,7 @@ f12:
 f13:
 .LFB12:
.cfi_startproc
-   vmovaps (%rdi), %ymm16
-   vbroadcastss    %xmm16, %ymm16
+   vbroadcastss    (%rdi), %ymm16
vzeroupper
ret
.cfi_endproc

OK for trunk?

Thanks.

H.J.
--
gcc/

* config/i386/i386-builtin.def: Replace CODE_FOR_avx2_vec_dupv4sf,
CODE_FOR_avx2_vec_dupv8sf and CODE_FOR_avx2_vec_dupv4df with
CODE_FOR_vec_dupv4sf, CODE_FOR_vec_dupv8sf and
CODE_FOR_vec_dupv4df, respectively.
* config/i386/i386.c (expand_vec_perm_1): Use subreg with vec_dup.
* config/i386/i386.md (SF to DF splitter): Replace
gen_avx512f_vec_dupv16sf_1 with gen_avx512f_vec_dupv16sf.
* config/i386/sse.md (VF48_AVX512VL): New.
(avx2_vec_dup): Removed.
(avx2_vec_dupv8sf_1): Likewise.
(avx512f_vec_dup_1): Likewise.
(avx2_pbroadcast_1): Likewise.
(avx2_vec_dupv4df): Likewise.
(_vec_dup_1): Likewise.
(*avx_vperm_broadcast_): Replace gen_avx2_vec_dupv8sf with
gen_vec_dupv8sf.

gcc/testsuite/

* gcc.target/i386/avx2-vbroadcastss_ps256-1.c: Updated.
* gcc.target/i386/avx512vl-vbroadcast-3.c: Likewise.
---
 gcc/config/i386/i386-builtin.def             |  6 +-
 gcc/config/i386/i386.c                       | 57 ++---
 gcc/config/i386/i386.md                      |  2 +-
 gcc/config/i386/sse.md                       | 83 +--
 .../i386/avx2-vbroadcastss_ps256-1.c         |  3 +-
 .../gcc.target/i386/avx512vl-vbroadcast-3.c  |  5 +-
 6 files changed, 56 insertions(+), 100 deletions(-)

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index df0f7e975ac..d217add8ee2 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -1194,9 +1194,9 @@ BDESC (OPTION_MASK_ISA_AVX2, 
CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, 
"__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) 
V8SI_FTYPE_V8SI_V8SI)
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, 
"__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) 
V4DI_FTYPE_V4DI_V4DI)
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", 
IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI)
-BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, 
"__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) 
V4SF_FTYPE_V4SF)
-BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, 
"__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, 
(int) V8SF_FTYPE_V4SF)
-BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, 
"__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, 
(int) V4DF_FTYPE_V2DF)
+BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_vec_dupv4sf, 
"__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) 
V4SF_FTYPE_V4SF)
+BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_vec_dupv8sf, 
"__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, 
(int) V8SF_FTYPE_V4SF)
+BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_vec_dupv4df, 
"__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, 
(int) V4DF_FTYPE_V2DF)
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, 
"__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) 
V4DI_FTYPE_V2DI)
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, 
"__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) 
V4SI_FTYPE_V4SI_V4SI_INT)
 BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, 
"__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) 
V8SI_FTYPE_V8SI_V8SI_INT)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 963c7fcbb34..6b95d774ad1 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -45963,28 +45963,41 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
{
  /* Use vpbroadcast{b,w,d}.  */
  rtx (*gen) (rtx, rtx) = NULL;
+ machine_mode smode = VOIDmode;
  switch (d->vmode)
{
case E_V64QImode:
  if (TARGET_AVX512BW)
-   gen =