> Am 04.03.2022 um 03:30 schrieb Hongtao Liu via Gcc-patches
> <gcc-patches@gcc.gnu.org>:
>
> On Fri, Mar 4, 2022 at 10:29 AM liuhongt via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> This is incremental patch based on [1], it enables optimization as below
>>
>> - vbroadcastss .LC1(%rip), %xmm0
>> + movl $-45, %edx
>> + vmovd %edx, %xmm0
>> + vpshufd $0, %xmm0, %xmm0
>>
>> According to microbenchmark, it's faster than broadcast from memory

Is that true even on AMD uarchs?

>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-March/591162.html.
>>
>> Bootstrapped and regtest on x86_64-linux-gnu{-m32,}.
>> Ok for trunk?
>>
>> gcc/ChangeLog:
>>
>> PR target/104704
>> * config/i386/sse.md (*vec_dupv4si): Add alternative $r and
>> corresponding post_reload splitter.
>>
>> gcc/testsuite/ChangeLog:
>>
>> * gcc.target/i386/pr100865-8a.c: Adjust testcase.
>> * gcc.target/i386/pr100865-8c.c: Ditto.
>> * gcc.target/i386/pr100865-9c.c: Ditto.
>> ---
>> gcc/config/i386/sse.md | 41 ++++++++++++++++-----
>> gcc/testsuite/gcc.target/i386/pr100865-8a.c | 2 +-
>> gcc/testsuite/gcc.target/i386/pr100865-8c.c | 2 +-
>> gcc/testsuite/gcc.target/i386/pr100865-9c.c | 2 +-
>> 4 files changed, 35 insertions(+), 12 deletions(-)
>>
>> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
>> index 3066ea3734a..d124545aa5d 100644
>> --- a/gcc/config/i386/sse.md
>> +++ b/gcc/config/i386/sse.md
>> @@ -25121,20 +25121,43 @@ (define_insn "vec_dupv4sf"
>> (set_attr "mode" "V4SF")])
>>
>> (define_insn "*vec_dupv4si"
>> - [(set (match_operand:V4SI 0 "register_operand" "=v,v,x")
>> + [(set (match_operand:V4SI 0 "register_operand" "=v,v,x,v")
>> (vec_duplicate:V4SI
>> - (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0")))]
>> + (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0,$r")))]
>> "TARGET_SSE"
>> "@
>> %vpshufd\t{$0, %1, %0|%0, %1, 0}
>> vbroadcastss\t{%1, %0|%0, %1}
>> - shufps\t{$0, %0, %0|%0, %0, 0}"
>> - [(set_attr "isa" "sse2,avx,noavx")
>> - (set_attr "type" "sselog1,ssemov,sselog1")
>> - (set_attr "length_immediate" "1,0,1")
>> - (set_attr "prefix_extra" "0,1,*")
>> - (set_attr "prefix" "maybe_vex,maybe_evex,orig")
>> - (set_attr "mode" "TI,V4SF,V4SF")])
>> + shufps\t{$0, %0, %0|%0, %0, 0}
>> + #"
>> + [(set_attr "isa" "sse2,avx,noavx,noavx512vl")
>> + (set_attr "type" "sselog1,ssemov,sselog1,sselog1")
>> + (set_attr "length_immediate" "1,0,1,1")
>> + (set_attr "prefix_extra" "0,1,*,0")
>> + (set_attr "prefix" "maybe_vex,maybe_evex,orig,maybe_vex")
>> + (set_attr "mode" "TI,V4SF,V4SF,TI")
>> + (set (attr "preferred_for_speed")
>> + (cond [(eq_attr "alternative" "3")
>> + (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC")
>> + ]
>> + (symbol_ref "true")))])
>> +
>> +(define_split
>> + [(set (match_operand:V4SI 0 "sse_reg_operand")
>> + (vec_duplicate:V4SI
>> + (match_operand:SI 1 "general_reg_operand")))]
>> + "TARGET_SSE && reload_completed
>> + /* Disable this splitter if avx512vl_vec_dup_gprv4si insn is
>> + available, because then we can broadcast from GPRs directly. */
>> + && !TARGET_AVX512VL"
>> + [(const_int 0)]
>> +{
>> + emit_insn (gen_vec_setv4si_0 (gen_lowpart (V4SImode, operands[0]),
>> + CONST0_RTX (V4SImode),
>> + gen_lowpart (SImode, operands[1])));
>> + emit_insn (gen_vec_duplicatev4si (operands[0], operands[0]));
>> + DONE;
>> +})
>>
>> (define_insn "*vec_dupv2di"
>> [(set (match_operand:V2DI 0 "register_operand" "=x,v,v,x")
>> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8a.c
>> b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
>> index 911b14d4a25..544a14db6f7 100644
>> --- a/gcc/testsuite/gcc.target/i386/pr100865-8a.c
>> +++ b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
>> @@ -20,5 +20,5 @@ foo (void)
>> array[i] = MK_CONST128_BROADCAST_SIGNED (-45);
>> }
>>
>> -/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t
>> \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
>> +/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t
>> \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
>> /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } }
>> */
>> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8c.c
>> b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
>> index 00682edb8c9..efee0488614 100644
>> --- a/gcc/testsuite/gcc.target/i386/pr100865-8c.c
>> +++ b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
>> @@ -3,5 +3,5 @@
>>
>> #include "pr100865-8a.c"
>>
>> -/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*,
>> %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
>> +/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*,
>> %xmm\[0-9\]+" 1 } } */
>> /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } }
>> */
>> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-9c.c
>> b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
>> index 8ffcdc1629d..e6f25902c1d 100644
>> --- a/gcc/testsuite/gcc.target/i386/pr100865-9c.c
>> +++ b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
>> @@ -3,5 +3,5 @@
>>
>> #include "pr100865-9a.c"
>>
>> -/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*,
>> %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
>> +/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*,
>> %xmm\[0-9\]+" 1 } } */
>> /* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } }
>> */
>> --
>> 2.18.1
>>
>
>
> --
> BR,
> Hongtao
Re: [PATCH] [i386] Optimize v4si broadcast for noavx512vl.
Richard Biener via Gcc-patches Fri, 04 Mar 2022 08:40:11 -0800
- [PATCH] [i386] Optimize v4si broadcast for ... liuhongt via Gcc-patches
- Re: [PATCH] [i386] Optimize v4si broad... Hongtao Liu via Gcc-patches
- Re: [PATCH] [i386] Optimize v4si b... Richard Biener via Gcc-patches
- Re: [PATCH] [i386] Optimize v4... H.J. Lu via Gcc-patches
- Re: [PATCH] [i386] Optimize v4si broad... Uros Bizjak via Gcc-patches
- [PATCH V2] [i386] Optimize v4si br... liuhongt via Gcc-patches
- Re: [PATCH V2] [i386] Optimize... Hongtao Liu via Gcc-patches
- Re: [PATCH V2] [i386] Optimize... Uros Bizjak via Gcc-patches