PING: V2 [PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns
On Mon, Nov 5, 2018 at 2:02 PM H.J. Lu wrote: > > Hi Richard, Jakub, > > Can you take a look at this patch? The last review from Kirill was in > June. > > Thanks. > > > H.J. > -- > There are many duplicated AVX2/AVX512 vec_dup patterns like: > > (define_insn "avx2_vec_dup" > [(set (match_operand:VF1_128_256 0 "register_operand" "=v") > (vec_duplicate:VF1_128_256 > (vec_select:SF > (match_operand:V4SF 1 "register_operand" "v") > (parallel [(const_int 0)]] > "TARGET_AVX2" > "vbroadcastss\t{%1, %0|%0, %1}" > [(set_attr "type" "sselog1") > (set_attr "prefix" "maybe_evex") > (set_attr "mode" "")]) > > and > > (define_insn "vec_dup" > [(set (match_operand:AVX_VEC_DUP_MODE 0 "register_operand" "=x,x,x,v,x") > (vec_duplicate:AVX_VEC_DUP_MODE > (match_operand: 1 "nonimmediate_operand" > "m,m,x,v,?x")))] > "TARGET_AVX" > "@ >vbroadcast\t{%1, %0|%0, %1} >vbroadcast\t{%1, %0|%0, %1} >vbroadcast\t{%x1, %0|%0, %x1} >vbroadcast\t{%x1, %g0|%g0, %x1} >#" > [(set_attr "type" "ssemov") >(set_attr "prefix_extra" "1") >(set_attr "prefix" "maybe_evex") >(set_attr "isa" "avx2,noavx2,avx2,avx512f,noavx2") >(set_attr "mode" ",V8SF,,,V8SF")]) > > We can remove the duplicated AVX2/AVX512 vec_dup patterns and use the > normal AVX2/AVX512 vec_dup patterns instead by changing source operand > to subreg of the same register class of the base by generating > > (set (reg:V8SF 84) > (vec_duplicate:V8SF (subreg:SF (reg:V4SF 85) 0))) > > instead of > > (set (reg:V8SF 84) > (vec_duplicate:V8SF > (vec_select:SF (reg:V4SF 85) > (parallel [(const_int 0 [0])] > > For integer vector broadcast, we generate > > (set (reg:V32QI 86) > (vec_duplicate:V32QI > (vec_select:QI (subreg:V16QI (reg:V32QI 87) 0)) > (parallel [(const_int 0 [0])] > > instead of > > (set (reg:V32QI 86) > (vec_duplicate:V32QI > (vec_select:QI (reg:V32QI 87) > (parallel [(const_int 0 [0])] > > so that we can remove > > (define_insn "avx2_pbroadcast_1" > [(set (match_operand:VI_256 0 "register_operand" "=x,x,v,v") > (vec_duplicate:VI_256 > 
(vec_select: > (match_operand:VI_256 1 "nonimmediate_operand" "m,x,m,v") > (parallel [(const_int 0)]] > "TARGET_AVX2" > "@ >vpbroadcast\t{%1, %0|%0, %1} >vpbroadcast\t{%x1, %0|%0, %x1} >vpbroadcast\t{%1, %0|%0, %1} >vpbroadcast\t{%x1, %0|%0, %x1}" > [(set_attr "isa" "*,*,,") >(set_attr "type" "ssemov") >(set_attr "prefix_extra" "1") >(set_attr "prefix" "vex") >(set_attr "mode" "")]) > > and keep only > > (define_insn "avx2_pbroadcast" > [(set (match_operand:VI 0 "register_operand" "=x,v") > (vec_duplicate:VI > (vec_select: > (match_operand: 1 "nonimmediate_operand" "xm,vm") > (parallel [(const_int 0)]] > "TARGET_AVX2" > "vpbroadcast\t{%1, %0|%0, %1}" > [(set_attr "isa" "*,") >(set_attr "type" "ssemov") >(set_attr "prefix_extra" "1") >(set_attr "prefix" "vex,evex") >(set_attr "mode" "")]) > > gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by > > avx2_test: > .cfi_startproc > - vmovaps x(%rip), %xmm1 > - vbroadcastss%xmm1, %ymm0 > + vbroadcastssx(%rip), %ymm0 > vmovaps %ymm0, y(%rip) > vzeroupper > ret > .cfi_endproc > > gcc.target/i386/avx512vl-vbroadcast-3.c is changed by > > @@ -113,7 +113,7 @@ f10: > .cfi_startproc > vmovaps %ymm0, %ymm16 > vpermilps $85, %ymm16, %ymm16 > - vbroadcastss%xmm16, %ymm16 > + vshuff32x4 $0x0, %ymm16, %ymm16, %ymm16 > vzeroupper > ret > .cfi_endproc > @@ -153,8 +153,7 @@ f12: > f13: > .LFB12: > .cfi_startproc > - vmovaps (%rdi), %ymm16 > - vbroadcastss%xmm16, %ymm16 > + vbroadcastss(%rdi), %ymm16 > vzeroupper > ret > .cfi_endproc > > gcc/ > > * config/i386/i386-builtin.def: Replace CODE_FOR_avx2_vec_dupv4sf, > CODE_FOR_avx2_vec_dupv8sf and CODE_FOR_avx2_vec_dupv4df with > CODE_FOR_vec_dupv4sf, CODE_FOR_vec_dupv8sf and > CODE_FOR_vec_dupv4df, respectively. > * config/i386/i386.c (expand_vec_perm_1): Use subreg with vec_dup. > * config/i386/i386.md (SF to DF splitter): Replace > gen_avx512f_vec_dupv16sf_1 with gen_avx512f_vec_dupv16sf. > * config/i386/sse.md (VF48_AVX512VL): New. > (avx2_vec_dup): Removed. 
> (avx2_vec_dupv8sf_1): Likewise. > (avx512f_vec_dup_1): Likewise. > (avx2_pbroadcast_1): Likewise. > (avx2_vec_dupv4df): Likewise. > (_vec_dup_1): Likewise. > (*avx_vperm_broadcast_): Replace gen_avx2_vec_dupv8sf with > gen_vec_dupv8sf. > > gcc/testsuite/ > > * gcc.target/i386/avx2-vbroadcastss_ps256-1.c: Updated.
V2 [PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns
Hi Richard, Jakub, Can you take a look at this patch? The last review from Kirill was in June. Thanks. H.J. -- There are many duplicated AVX2/AVX512 vec_dup patterns like: (define_insn "avx2_vec_dup" [(set (match_operand:VF1_128_256 0 "register_operand" "=v") (vec_duplicate:VF1_128_256 (vec_select:SF (match_operand:V4SF 1 "register_operand" "v") (parallel [(const_int 0)]] "TARGET_AVX2" "vbroadcastss\t{%1, %0|%0, %1}" [(set_attr "type" "sselog1") (set_attr "prefix" "maybe_evex") (set_attr "mode" "")]) and (define_insn "vec_dup" [(set (match_operand:AVX_VEC_DUP_MODE 0 "register_operand" "=x,x,x,v,x") (vec_duplicate:AVX_VEC_DUP_MODE (match_operand: 1 "nonimmediate_operand" "m,m,x,v,?x")))] "TARGET_AVX" "@ vbroadcast\t{%1, %0|%0, %1} vbroadcast\t{%1, %0|%0, %1} vbroadcast\t{%x1, %0|%0, %x1} vbroadcast\t{%x1, %g0|%g0, %x1} #" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "maybe_evex") (set_attr "isa" "avx2,noavx2,avx2,avx512f,noavx2") (set_attr "mode" ",V8SF,,,V8SF")]) We can remove the duplicated AVX2/AVX512 vec_dup patterns and use the normal AVX2/AVX512 vec_dup patterns instead by changing source operand to subreg of the same register class of the base by generating (set (reg:V8SF 84) (vec_duplicate:V8SF (subreg:SF (reg:V4SF 85) 0))) instead of (set (reg:V8SF 84) (vec_duplicate:V8SF (vec_select:SF (reg:V4SF 85) (parallel [(const_int 0 [0])] For integer vector broadcast, we generate (set (reg:V32QI 86) (vec_duplicate:V32QI (vec_select:QI (subreg:V16QI (reg:V32QI 87) 0)) (parallel [(const_int 0 [0])] instead of (set (reg:V32QI 86) (vec_duplicate:V32QI (vec_select:QI (reg:V32QI 87) (parallel [(const_int 0 [0])] so that we can remove (define_insn "avx2_pbroadcast_1" [(set (match_operand:VI_256 0 "register_operand" "=x,x,v,v") (vec_duplicate:VI_256 (vec_select: (match_operand:VI_256 1 "nonimmediate_operand" "m,x,m,v") (parallel [(const_int 0)]] "TARGET_AVX2" "@ vpbroadcast\t{%1, %0|%0, %1} vpbroadcast\t{%x1, %0|%0, %x1} vpbroadcast\t{%1, 
%0|%0, %1} vpbroadcast\t{%x1, %0|%0, %x1}" [(set_attr "isa" "*,*,,") (set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "vex") (set_attr "mode" "")]) and keep only (define_insn "avx2_pbroadcast" [(set (match_operand:VI 0 "register_operand" "=x,v") (vec_duplicate:VI (vec_select: (match_operand: 1 "nonimmediate_operand" "xm,vm") (parallel [(const_int 0)]] "TARGET_AVX2" "vpbroadcast\t{%1, %0|%0, %1}" [(set_attr "isa" "*,") (set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "vex,evex") (set_attr "mode" "")]) gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by avx2_test: .cfi_startproc - vmovaps x(%rip), %xmm1 - vbroadcastss%xmm1, %ymm0 + vbroadcastssx(%rip), %ymm0 vmovaps %ymm0, y(%rip) vzeroupper ret .cfi_endproc gcc.target/i386/avx512vl-vbroadcast-3.c is changed by @@ -113,7 +113,7 @@ f10: .cfi_startproc vmovaps %ymm0, %ymm16 vpermilps $85, %ymm16, %ymm16 - vbroadcastss%xmm16, %ymm16 + vshuff32x4 $0x0, %ymm16, %ymm16, %ymm16 vzeroupper ret .cfi_endproc @@ -153,8 +153,7 @@ f12: f13: .LFB12: .cfi_startproc - vmovaps (%rdi), %ymm16 - vbroadcastss%xmm16, %ymm16 + vbroadcastss(%rdi), %ymm16 vzeroupper ret .cfi_endproc gcc/ * config/i386/i386-builtin.def: Replace CODE_FOR_avx2_vec_dupv4sf, CODE_FOR_avx2_vec_dupv8sf and CODE_FOR_avx2_vec_dupv4df with CODE_FOR_vec_dupv4sf, CODE_FOR_vec_dupv8sf and CODE_FOR_vec_dupv4df, respectively. * config/i386/i386.c (expand_vec_perm_1): Use subreg with vec_dup. * config/i386/i386.md (SF to DF splitter): Replace gen_avx512f_vec_dupv16sf_1 with gen_avx512f_vec_dupv16sf. * config/i386/sse.md (VF48_AVX512VL): New. (avx2_vec_dup): Removed. (avx2_vec_dupv8sf_1): Likewise. (avx512f_vec_dup_1): Likewise. (avx2_pbroadcast_1): Likewise. (avx2_vec_dupv4df): Likewise. (_vec_dup_1): Likewise. (*avx_vperm_broadcast_): Replace gen_avx2_vec_dupv8sf with gen_vec_dupv8sf. gcc/testsuite/ * gcc.target/i386/avx2-vbroadcastss_ps256-1.c: Updated. * gcc.target/i386/avx512vl-vbroadcast-3.c: Likewise. 
--- gcc/config/i386/i386-builtin.def | 6 +- gcc/config/i386/i386.c| 57 ++--- gcc/config/i386/i386.md | 2 +- gcc/config/i386/sse.md| 83 +--
Re: [PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns
On Sun, Nov 4, 2018 at 9:49 PM H.J. Lu wrote: > > > > Actually, we can achieve the same with pre-reload splitters. Please > > > > see the attached patch for a couple of examples and a fix for > > > > vbroadcastss that accesses the memory in wrong mode. > > > > > > > > > > My patch removes a bunch of duplicated patterns from sse.md. But > > > yours adds a couple more patterns. Isn't fewer patterns preferred? > > > > Playing SUBREG games before reload does not look safe to me. We would > > There are plenty of SUBREG usage in i386 backend before preload. It is > perfectly safe to do so as long as we don't create SUBREG with a different > register class from the base. Do you have a testcase to show my SUBREG > usage is unsafe? No. However, the patch then substantially changes functionality in the vector part of the i386 backend (expand_vec_perm_1), so it needs approval from the relevant maintainer (Kirill). Uros.
Re: [PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns
On Sun, Nov 4, 2018 at 11:45 AM Uros Bizjak wrote: > > On Sun, Nov 4, 2018 at 8:17 PM H.J. Lu wrote: > > > > On Sun, Nov 4, 2018 at 8:41 AM Uros Bizjak wrote: > > > > > > On Fri, Nov 2, 2018 at 6:25 PM H.J. Lu wrote: > > > > > > > > Remove duplicated AVX2/AVX512 vec_dup patterns and replace them with > > > > subreg. gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by > > > > > > > > avx2_test: > > > > .cfi_startproc > > > > - vmovaps x(%rip), %xmm1 > > > > - vbroadcastss%xmm1, %ymm0 > > > > + vbroadcastssx(%rip), %ymm0 > > > > vmovaps %ymm0, y(%rip) > > > > vzeroupper > > > > ret > > > > .cfi_endproc > > > > > > > > gcc.target/i386/avx512vl-vbroadcast-3.c is changed by > > > > > > > > @@ -113,7 +113,7 @@ f10: > > > > .cfi_startproc > > > > vmovaps %ymm0, %ymm16 > > > > vpermilps $85, %ymm16, %ymm16 > > > > - vbroadcastss%xmm16, %ymm16 > > > > + vshuff32x4 $0x0, %ymm16, %ymm16, %ymm16 > > > > vzeroupper > > > > ret > > > > .cfi_endproc > > > > @@ -153,8 +153,7 @@ f12: > > > > f13: > > > > .LFB12: > > > > .cfi_startproc > > > > - vmovaps (%rdi), %ymm16 > > > > - vbroadcastss%xmm16, %ymm16 > > > > + vbroadcastss(%rdi), %ymm16 > > > > vzeroupper > > > > ret > > > > .cfi_endproc > > > > > > Actually, we can achieve the same with pre-reload splitters. Please > > > see the attached patch for a couple of examples and a fix for > > > vbroadcastss that accesses the memory in wrong mode. > > > > > > > My patch removes a bunch of duplicated patterns from sse.md. But > > yours adds a couple more patterns. Isn't fewer patterns preferred? > > Playing SUBREG games before reload does not look safe to me. We would There is plenty of SUBREG usage in the i386 backend before reload. It is perfectly safe to do so as long as we don't create a SUBREG with a different register class from the base. Do you have a testcase to show my SUBREG usage is unsafe? 
> like to create a simpler instruction out of the combination of vector > load and broadcast, so I think that combine+split is the right tool > for this simplification. Adding new patterns doesn't simplify the issue. > BTW: Half of my proposed patch is a fix to a avx2_pbroadcast{_1} > pattern, which models wrong access to memory. > I will take a look at avx2_pbroadcast{_1}. -- H.J.
Re: [PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns
On Sun, Nov 4, 2018 at 8:17 PM H.J. Lu wrote: > > On Sun, Nov 4, 2018 at 8:41 AM Uros Bizjak wrote: > > > > On Fri, Nov 2, 2018 at 6:25 PM H.J. Lu wrote: > > > > > > Remove duplicated AVX2/AVX512 vec_dup patterns and replace them with > > > subreg. gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by > > > > > > avx2_test: > > > .cfi_startproc > > > - vmovaps x(%rip), %xmm1 > > > - vbroadcastss%xmm1, %ymm0 > > > + vbroadcastssx(%rip), %ymm0 > > > vmovaps %ymm0, y(%rip) > > > vzeroupper > > > ret > > > .cfi_endproc > > > > > > gcc.target/i386/avx512vl-vbroadcast-3.c is changed by > > > > > > @@ -113,7 +113,7 @@ f10: > > > .cfi_startproc > > > vmovaps %ymm0, %ymm16 > > > vpermilps $85, %ymm16, %ymm16 > > > - vbroadcastss%xmm16, %ymm16 > > > + vshuff32x4 $0x0, %ymm16, %ymm16, %ymm16 > > > vzeroupper > > > ret > > > .cfi_endproc > > > @@ -153,8 +153,7 @@ f12: > > > f13: > > > .LFB12: > > > .cfi_startproc > > > - vmovaps (%rdi), %ymm16 > > > - vbroadcastss%xmm16, %ymm16 > > > + vbroadcastss(%rdi), %ymm16 > > > vzeroupper > > > ret > > > .cfi_endproc > > > > Actually, we can achieve the same with pre-reload splitters. Please > > see the attached patch for a couple of examples and a fix for > > vbroadcastss that accesses the memory in wrong mode. > > > > My patch removes a bunch of duplicated patterns from sse.md. But > yours adds a couple more patterns. Isn't fewer patterns preferred? Playing SUBREG games before reload does not look safe to me. We would like to create a simpler instruction out of the combination of vector load and broadcast, so I think that combine+split is the right tool for this simplification. BTW: Half of my proposed patch is a fix for the avx2_pbroadcast{_1} pattern, which models the memory access incorrectly. Uros.
Re: [PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns
On Sun, Nov 4, 2018 at 8:41 AM Uros Bizjak wrote: > > On Fri, Nov 2, 2018 at 6:25 PM H.J. Lu wrote: > > > > Remove duplicated AVX2/AVX512 vec_dup patterns and replace them with > > subreg. gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by > > > > avx2_test: > > .cfi_startproc > > - vmovaps x(%rip), %xmm1 > > - vbroadcastss%xmm1, %ymm0 > > + vbroadcastssx(%rip), %ymm0 > > vmovaps %ymm0, y(%rip) > > vzeroupper > > ret > > .cfi_endproc > > > > gcc.target/i386/avx512vl-vbroadcast-3.c is changed by > > > > @@ -113,7 +113,7 @@ f10: > > .cfi_startproc > > vmovaps %ymm0, %ymm16 > > vpermilps $85, %ymm16, %ymm16 > > - vbroadcastss%xmm16, %ymm16 > > + vshuff32x4 $0x0, %ymm16, %ymm16, %ymm16 > > vzeroupper > > ret > > .cfi_endproc > > @@ -153,8 +153,7 @@ f12: > > f13: > > .LFB12: > > .cfi_startproc > > - vmovaps (%rdi), %ymm16 > > - vbroadcastss%xmm16, %ymm16 > > + vbroadcastss(%rdi), %ymm16 > > vzeroupper > > ret > > .cfi_endproc > > Actually, we can achieve the same with pre-reload splitters. Please > see the attached patch for a couple of examples and a fix for > vbroadcastss that accesses the memory in wrong mode. > My patch removes a bunch of duplicated patterns from sse.md. But yours adds a couple more patterns. Aren't fewer patterns preferred? -- H.J.
Re: [PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns
On Fri, Nov 2, 2018 at 6:25 PM H.J. Lu wrote: > > Remove duplicated AVX2/AVX512 vec_dup patterns and replace them with > subreg. gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by > > avx2_test: > .cfi_startproc > - vmovaps x(%rip), %xmm1 > - vbroadcastss%xmm1, %ymm0 > + vbroadcastssx(%rip), %ymm0 > vmovaps %ymm0, y(%rip) > vzeroupper > ret > .cfi_endproc > > gcc.target/i386/avx512vl-vbroadcast-3.c is changed by > > @@ -113,7 +113,7 @@ f10: > .cfi_startproc > vmovaps %ymm0, %ymm16 > vpermilps $85, %ymm16, %ymm16 > - vbroadcastss%xmm16, %ymm16 > + vshuff32x4 $0x0, %ymm16, %ymm16, %ymm16 > vzeroupper > ret > .cfi_endproc > @@ -153,8 +153,7 @@ f12: > f13: > .LFB12: > .cfi_startproc > - vmovaps (%rdi), %ymm16 > - vbroadcastss%xmm16, %ymm16 > + vbroadcastss(%rdi), %ymm16 > vzeroupper > ret > .cfi_endproc Actually, we can achieve the same with pre-reload splitters. Please see the attached patch for a couple of examples and a fix for vbroadcastss that accesses the memory in the wrong mode. Uros. 
Index: sse.md === --- sse.md (revision 265740) +++ sse.md (working copy) @@ -7129,6 +7129,20 @@ (set_attr "prefix" "maybe_evex") (set_attr "mode" "")]) +(define_insn_and_split "*avx2_vec_dup_1" + [(set (match_operand:VF1_128_256 0 "register_operand") + (vec_duplicate:VF1_128_256 + (vec_select:SF + (match_operand:V4SF 1 "memory_operand") + (parallel [(const_int 0)]] + "TARGET_AVX2 + && can_create_pseudo_p ()" + "#" + "&& 1" + [(set (match_dup 0) + (vec_duplicate:VF1_128_256 (match_dup 1)))] + "operands[1] = adjust_address_nv (operands[1], SFmode, 0);") + (define_insn "avx2_vec_dupv8sf_1" [(set (match_operand:V8SF 0 "register_operand" "=v") (vec_duplicate:V8SF @@ -7141,6 +7155,20 @@ (set_attr "prefix" "maybe_evex") (set_attr "mode" "V8SF")]) +(define_insn_and_split "*avx2_vec_dupv8sf_1" + [(set (match_operand:V8SF 0 "register_operand") + (vec_duplicate:V8SF + (vec_select:SF + (match_operand:V4SF 1 "memory_operand") + (parallel [(const_int 0)]] + "TARGET_AVX2 + && can_create_pseudo_p ()" + "#" + "&& 1" + [(set (match_dup 0) + (vec_duplicate:VF1_128_256 (match_dup 1)))] + "operands[1] = adjust_address_nv (operands[1], SFmode, 0);") + (define_insn "avx512f_vec_dup_1" [(set (match_operand:VF_512 0 "register_operand" "=v") (vec_duplicate:VF_512 @@ -17908,7 +17936,7 @@ [(set (match_operand:VI 0 "register_operand" "=x,v") (vec_duplicate:VI (vec_select: - (match_operand: 1 "nonimmediate_operand" "xm,vm") + (match_operand: 1 "register_operand" "x,v") (parallel [(const_int 0)]] "TARGET_AVX2" "vpbroadcast\t{%1, %0|%0, %1}" @@ -17918,24 +17946,64 @@ (set_attr "prefix" "vex,evex") (set_attr "mode" "")]) +(define_insn_and_split "*avx2_pbroadcast_mem_1" + [(set (match_operand:VI 0 "register_operand") + (vec_duplicate:VI + (vec_select: + (match_operand: 1 "memory_operand") + (parallel [(const_int 0)]] + "TARGET_AVX2 + && can_create_pseudo_p ()" + "#" + "&& 1" + [(set (match_dup 0) + (vec_duplicate:VI (match_dup 1)))] + "operands[1] = adjust_address_nv (operands[1], mode, 0);") + 
(define_insn "avx2_pbroadcast_1" - [(set (match_operand:VI_256 0 "register_operand" "=x,x,v,v") + [(set (match_operand:VI_256 0 "register_operand" "=x,v") (vec_duplicate:VI_256 (vec_select: - (match_operand:VI_256 1 "nonimmediate_operand" "m,x,m,v") + (match_operand:VI_256 1 "register_operand" "x,v") (parallel [(const_int 0)]] "TARGET_AVX2" - "@ - vpbroadcast\t{%1, %0|%0, %1} - vpbroadcast\t{%x1, %0|%0, %x1} - vpbroadcast\t{%1, %0|%0, %1} - vpbroadcast\t{%x1, %0|%0, %x1}" - [(set_attr "isa" "*,*,,") + "vpbroadcast\t{%x1, %0|%0, %x1}" + [(set_attr "isa" "*,") (set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "vex") (set_attr "mode" "")]) +(define_insn_and_split "*avx2_pbroadcast_1_mem_1" + [(set (match_operand:VI_256 0 "register_operand" "=x,v") + (vec_duplicate:VI_256 + (vec_select: + (match_operand:VI_256 1 "memory_operand" "m,m") + (parallel [(const_int 0)]] + "TARGET_AVX2 + && can_create_pseudo_p ()" + "#" + "&& 1" + [(set (match_dup 0) + (vec_duplicate:VI_256 (match_dup 1)))] + "operands[1] = adjust_address_nv (operands[1], mode, 0);") + +(define_insn "*avx2_pbroadcast_mem" + [(set (match_operand:VI 0 "register_operand" "=x,v") + (vec_duplicate:VI + (match_operand: 1 "memory_operand" "m,m")))] + "TARGET_AVX2" + "vpbroadcast\t{%1,
[PATCH] i386: Remove duplicated AVX2/AVX512 vec_dup patterns
Remove duplicated AVX2/AVX512 vec_dup patterns and replace them with subreg. gcc.target/i386/avx2-vbroadcastss_ps256-1.c is changed by avx2_test: .cfi_startproc - vmovaps x(%rip), %xmm1 - vbroadcastss %xmm1, %ymm0 + vbroadcastss x(%rip), %ymm0 vmovaps %ymm0, y(%rip) vzeroupper ret .cfi_endproc gcc.target/i386/avx512vl-vbroadcast-3.c is changed by @@ -113,7 +113,7 @@ f10: .cfi_startproc vmovaps %ymm0, %ymm16 vpermilps $85, %ymm16, %ymm16 - vbroadcastss %xmm16, %ymm16 + vshuff32x4 $0x0, %ymm16, %ymm16, %ymm16 vzeroupper ret .cfi_endproc @@ -153,8 +153,7 @@ f12: f13: .LFB12: .cfi_startproc - vmovaps (%rdi), %ymm16 - vbroadcastss %xmm16, %ymm16 + vbroadcastss (%rdi), %ymm16 vzeroupper ret .cfi_endproc OK for trunk? Thanks. H.J. -- gcc/ * config/i386/i386-builtin.def: Replace CODE_FOR_avx2_vec_dupv4sf, CODE_FOR_avx2_vec_dupv8sf and CODE_FOR_avx2_vec_dupv4df with CODE_FOR_vec_dupv4sf, CODE_FOR_vec_dupv8sf and CODE_FOR_vec_dupv4df, respectively. * config/i386/i386.c (expand_vec_perm_1): Use subreg with vec_dup. * config/i386/i386.md (SF to DF splitter): Replace gen_avx512f_vec_dupv16sf_1 with gen_avx512f_vec_dupv16sf. * config/i386/sse.md (VF48_AVX512VL): New. (avx2_vec_dup): Removed. (avx2_vec_dupv8sf_1): Likewise. (avx512f_vec_dup_1): Likewise. (avx2_pbroadcast_1): Likewise. (avx2_vec_dupv4df): Likewise. (_vec_dup_1): Likewise. (*avx_vperm_broadcast_): Replace gen_avx2_vec_dupv8sf with gen_vec_dupv8sf. gcc/testsuite/ * gcc.target/i386/avx2-vbroadcastss_ps256-1.c: Updated. * gcc.target/i386/avx512vl-vbroadcast-3.c: Likewise. 
--- gcc/config/i386/i386-builtin.def | 6 +- gcc/config/i386/i386.c| 57 ++--- gcc/config/i386/i386.md | 2 +- gcc/config/i386/sse.md| 83 +-- .../i386/avx2-vbroadcastss_ps256-1.c | 3 +- .../gcc.target/i386/avx512vl-vbroadcast-3.c | 5 +- 6 files changed, 56 insertions(+), 100 deletions(-) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index df0f7e975ac..d217add8ee2 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -1194,9 +1194,9 @@ BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_ BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI) BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI) BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI) -BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF) -BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF) -BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF) +BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF) +BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF) +BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF) BDESC (OPTION_MASK_ISA_AVX2, 
CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI) BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT) BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 963c7fcbb34..6b95d774ad1 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -45963,28 +45963,41 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) { /* Use vpbroadcast{b,w,d}. */ rtx (*gen) (rtx, rtx) = NULL; + machine_mode smode = VOIDmode; switch (d->vmode) { case E_V64QImode: if (TARGET_AVX512BW) - gen =