> Considering the ICE in PR104976, it's better to force_reg before lowpart_subreg. > i.e. > op0 = lowpart_subreg (V4SFmode, force_reg (V8HFmode, operands[0]), V8HFmode); > if (!MEM_P (operands[1])) > operands[1] = force_reg (V8HFmode, operands[1]); > op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); > rtx dest = gen_reg_rtx (V4SFmode); > emit_insn (gen_avx512f_movsf_mask (dest, op1, op0, op1, operands[4])); > emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, V4SFmode));
I think this is different from PR104976, since operands[0] and operands[1] here are strictly V8HF operands coming from the builtin's input. I suppose there is no way for a subreg of a different size to reach the expander; otherwise the (__v8hf) conversion in the builtin would fail first. Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> 于2022年3月21日周一 20:53写道: > > On Mon, Mar 21, 2022 at 7:52 PM Hongyu Wang via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: > > > > Hi, > > > > For complex scalar intrinsics like _mm_mask_fcmadd_sch, the > > mask should be ANDed with 1 to ensure the mask is bound to the lowest bit. > > Use masked vmovss to perform the same operation, which ignores the higher bits > > of the mask. > > > > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde. > > > > Ok for master? > > > > gcc/ChangeLog: > > > > PR target/104978 > > * config/i386/sse.md > > (avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>): > > Use avx512f_movsf_mask instead of vmovaps or vblend. > > (avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>): Likewise. > > > > gcc/testsuite/ChangeLog: > > > > PR target/104978 > > * gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c: Adjust scan. > > * gcc.target/i386/avx512fp16-vfmaddcsh-1a.c: Ditto. > > * gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c: Removed. > > * gcc.target/i386/avx512fp16-vfmaddcsh-1c.c: Ditto. > > * gcc.target/i386/pr104978.c: New test. 
> > --- > > gcc/config/i386/sse.md | 48 ++++--------------- > > .../i386/avx512fp16-vfcmaddcsh-1a.c | 4 +- > > .../i386/avx512fp16-vfcmaddcsh-1c.c | 13 ----- > > .../gcc.target/i386/avx512fp16-vfmaddcsh-1a.c | 4 +- > > .../gcc.target/i386/avx512fp16-vfmaddcsh-1c.c | 13 ----- > > gcc/testsuite/gcc.target/i386/pr104978.c | 18 +++++++ > > 6 files changed, 30 insertions(+), 70 deletions(-) > > delete mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c > > delete mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c > > > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > > index 21bf3c55c95..1087a37812f 100644 > > --- a/gcc/config/i386/sse.md > > +++ b/gcc/config/i386/sse.md > > @@ -6586,26 +6586,10 @@ (define_expand > > "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>" > > emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (operands[0], > > operands[1], operands[2], operands[3], operands[4])); > > > > - if (TARGET_AVX512VL) > > - { > > - op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); > > - op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); > > - emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4])); > > - } > > - else > > - { > > - rtx mask, tmp, vec_mask; > > - mask = lowpart_subreg (SImode, operands[4], QImode), > > - tmp = gen_reg_rtx (SImode); > > - emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31))); > > - vec_mask = gen_reg_rtx (V4SImode); > > - emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode))); > > - emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp)); > > - vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode); > > - op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); > > - op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); > > - emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask)); > > - } > > + op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); > > + op1 = lowpart_subreg (V4SFmode, operands[1], 
V8HFmode); > > + emit_insn (gen_avx512f_movsf_mask (op1, op1, op0, op1, operands[4])); > > + emit_move_insn (op0, op1); > Considering the ICE in PR104976, it's better to force_reg before lowpart_subreg. > i.e. > op0 = lowpart_subreg (V4SFmode, force_reg (V8HFmode, operands[0]), V8HFmode); > if (!MEM_P (operands[1])) > operands[1] = force_reg (V8HFmode, operands[1]); > op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); > rtx dest = gen_reg_rtx (V4SFmode); > emit_insn (gen_avx512f_movsf_mask (dest, op1, op0, op1, operands[4])); > emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, V4SFmode)); > > > DONE; > > }) > > > > @@ -6641,26 +6625,10 @@ (define_expand > > "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>" > > emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (operands[0], > > operands[1], operands[2], operands[3], operands[4])); > > > > - if (TARGET_AVX512VL) > > - { > > - op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); > > - op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); > > - emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4])); > > - } > > - else > > - { > > - rtx mask, tmp, vec_mask; > > - mask = lowpart_subreg (SImode, operands[4], QImode), > > - tmp = gen_reg_rtx (SImode); > > - emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31))); > > - vec_mask = gen_reg_rtx (V4SImode); > > - emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode))); > > - emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp)); > > - vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode); > > - op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); > > - op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); > > - emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask)); > > - } > > + op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); > > + op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); > > + emit_insn (gen_avx512f_movsf_mask (op1, op1, op0, op1, operands[4])); > > + emit_move_insn (op0, op1); > > DONE; > > }) > > > > diff --git 
a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c > > b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c > > index eb96588df39..0f87861f09b 100644 > > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c > > @@ -1,13 +1,13 @@ > > /* { dg-do compile } */ > > -/* { dg-options "-mavx512fp16 -mno-avx512vl -O2" } */ > > +/* { dg-options "-mavx512fp16 -O2" } */ > > /* { dg-final { scan-assembler-times "vfcmaddcsh\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ > > \\t\]+#)" 1 } } */ > > /* { dg-final { scan-assembler-times "vfcmaddcsh\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ > > \\t\]+#)" 2 } } */ > > /* { dg-final { scan-assembler-times "vfcmaddcsh\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ > > \\t\]+#)" 1 } } */ > > /* { dg-final { scan-assembler-times "vfcmaddcsh\[ > > \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ > > \\t\]+#)" 1 } } */ > > /* { dg-final { scan-assembler-times "vfcmaddcsh\[ > > \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ > > \\t\]+#)" 2 } } */ > > /* { dg-final { scan-assembler-times "vfcmaddcsh\[ > > \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ > > \\t\]+#)" 1 } } */ > > -/* { dg-final { scan-assembler-times "vblendvps\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ > > \\t\]+#)" 2 } } */ > > /* { dg-final { scan-assembler-times "vmovss\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ > > \\t\]+#)" 2 } } */ > > +/* { dg-final { scan-assembler-times "vmovss\[ > > 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}+(?:\n|\[ > > \\t\]+#)" 2 } } */ > > > > #include <immintrin.h> > > > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c > > b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c > > deleted file mode 100644 > > index 79a295f722c..00000000000 > > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c > > +++ /dev/null > > @@ -1,13 +0,0 @@ > > -/* { dg-do compile } */ > > -/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */ > > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ > > \\t\]+#)" 1 } } */ > > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ > > \\t\]+#)" 2 } } */ > > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ > > \\t\]+#)" 1 } } */ > > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ > > \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ > > \\t\]+#)" 1 } } */ > > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ > > \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ > > \\t\]+#)" 2 } } */ > > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ > > \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ > > \\t\]+#)" 1 } } */ > > -/* { dg-final { scan-assembler-times "vblendmps\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ > > \\t\]+#)" 2 } } */ > > -/* { dg-final { scan-assembler-times "vmovss\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ > > \\t\]+#)" 2 } } */ > > - > > 
-#include "avx512fp16-vfcmaddcsh-1a.c" > > - > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c > > b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c > > index 288d1c12a10..6b07957a8bb 100644 > > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c > > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c > > @@ -1,13 +1,13 @@ > > /* { dg-do compile } */ > > -/* { dg-options "-mavx512fp16 -mno-avx512vl -O2" } */ > > +/* { dg-options "-mavx512fp16 -O2" } */ > > /* { dg-final { scan-assembler-times "vfmaddcsh\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ > > \\t\]+#)" 1 } } */ > > /* { dg-final { scan-assembler-times "vfmaddcsh\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ > > \\t\]+#)" 2 } } */ > > /* { dg-final { scan-assembler-times "vfmaddcsh\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ > > \\t\]+#)" 1 } } */ > > /* { dg-final { scan-assembler-times "vfmaddcsh\[ > > \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ > > \\t\]+#)" 1 } } */ > > /* { dg-final { scan-assembler-times "vfmaddcsh\[ > > \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ > > \\t\]+#)" 2 } } */ > > /* { dg-final { scan-assembler-times "vfmaddcsh\[ > > \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ > > \\t\]+#)" 1 } } */ > > -/* { dg-final { scan-assembler-times "vblendvps\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ > > \\t\]+#)" 2 } } */ > > /* { dg-final { scan-assembler-times "vmovss\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ > > \\t\]+#)" 2 } } */ > > +/* { dg-final { scan-assembler-times "vmovss\[ > > 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}+(?:\n|\[ > > \\t\]+#)" 2 } } */ > > > > #include <immintrin.h> > > > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c > > b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c > > deleted file mode 100644 > > index 7863f8f9af9..00000000000 > > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c > > +++ /dev/null > > @@ -1,13 +0,0 @@ > > -/* { dg-do compile } */ > > -/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */ > > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ > > \\t\]+#)" 1 } } */ > > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ > > \\t\]+#)" 2 } } */ > > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ > > \\t\]+#)" 1 } } */ > > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ > > \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ > > \\t\]+#)" 1 } } */ > > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ > > \\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ > > \\t\]+#)" 2 } } */ > > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ > > \\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ > > \\t\]+#)" 1 } } */ > > -/* { dg-final { scan-assembler-times "vblendmps\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[ > > \\t\]+#)" 2 } } */ > > -/* { dg-final { scan-assembler-times "vmovss\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ > > \\t\]+#)" 2 } } */ > > - > > -#include 
"avx512fp16-vfmaddcsh-1a.c" > > - > > diff --git a/gcc/testsuite/gcc.target/i386/pr104978.c > > b/gcc/testsuite/gcc.target/i386/pr104978.c > > new file mode 100644 > > index 00000000000..54788171aff > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr104978.c > > @@ -0,0 +1,18 @@ > > +/* PR target/104978 */ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mavx512fp16" } */ > > +/* { dg-final { scan-assembler-times "vmovss\[ > > \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}+(?:\n|\[ > > \\t\]+#)" 2 } } */ > > + > > +#include<immintrin.h> > > + > > +__m128h > > +foo (__m128h a, __m128h b, __m128h c, __mmask8 m) > > +{ > > + return _mm_mask_fmadd_round_sch (a, m, b, c, 8); > > +} > > + > > +__m128h > > +foo2 (__m128h a, __m128h b, __m128h c, __mmask8 m) > > +{ > > + return _mm_mask_fcmadd_round_sch (a, m, b, c, 8); > > +} > > -- > > 2.18.1 > > > > > -- > BR, > Hongtao