> -----Original Message----- > From: Wang, Hongyu <hongyu.w...@intel.com> > Sent: Tuesday, March 22, 2022 11:28 AM > To: Liu, Hongtao <hongtao....@intel.com> > Cc: gcc-patches@gcc.gnu.org > Subject: [PATCH v3] AVX512FP16: Fix wrong code for _mm_mask_f[c]madd.*sch > [PR 104978] > > Hi, here is the patch with force_reg before lowpart_subreg. > > Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde. > > Ok for master? > > For complex scalar intrinsics like _mm_mask_fcmadd_sch, the mask should be > ANDed with 1 to ensure the mask is bound to the lowest byte. > Use masked vmovss to perform the same operation, which omits the higher bits of the mask. > > gcc/ChangeLog: > > PR target/104978 > * config/i386/sse.md > (avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>): > Use avx512f_movsf_mask instead of vmovaps or vblend, and > force_reg before lowpart_subreg. > (avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>): Likewise. LGTM. > > gcc/testsuite/ChangeLog: > > PR target/104978 > * gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c: Adjust asm scan. > * gcc.target/i386/avx512fp16-vfmaddcsh-1a.c: Ditto. > * gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c: Removed. > * gcc.target/i386/avx512fp16-vfmaddcsh-1c.c: Ditto. > * gcc.target/i386/pr104978.c: New test. 
> > V3 > --- > gcc/config/i386/sse.md | 62 ++++++------------- > .../i386/avx512fp16-vfcmaddcsh-1a.c | 4 +- > .../i386/avx512fp16-vfcmaddcsh-1c.c | 13 ---- > .../gcc.target/i386/avx512fp16-vfmaddcsh-1a.c | 4 +- > .../gcc.target/i386/avx512fp16-vfmaddcsh-1c.c | 13 ---- > gcc/testsuite/gcc.target/i386/pr104978.c | 18 ++++++ > 6 files changed, 42 insertions(+), 72 deletions(-) delete mode 100644 > gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c > delete mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index > 21bf3c55c95..6f7af2f21d6 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -6576,7 +6576,7 @@ (define_expand > "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>" > (match_operand:QI 4 "register_operand")] > "TARGET_AVX512FP16 && <round_mode512bit_condition>" > { > - rtx op0, op1; > + rtx op0, op1, dest; > > if (<round_embedded_complex>) > emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask<round_expand_name> > ( @@ -6586,26 +6586,15 @@ (define_expand > "avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>" > emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (operands[0], > operands[1], operands[2], operands[3], operands[4])); > > - if (TARGET_AVX512VL) > - { > - op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); > - op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); > - emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4])); > - } > - else > - { > - rtx mask, tmp, vec_mask; > - mask = lowpart_subreg (SImode, operands[4], QImode), > - tmp = gen_reg_rtx (SImode); > - emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31))); > - vec_mask = gen_reg_rtx (V4SImode); > - emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode))); > - emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp)); > - vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode); > - op0 = lowpart_subreg (V4SFmode, 
operands[0], V8HFmode); > - op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); > - emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask)); > - } > + op0 = lowpart_subreg (V4SFmode, force_reg (V8HFmode, operands[0]), > + V8HFmode); > + if (!MEM_P (operands[1])) > + operands[1] = force_reg (V8HFmode, operands[1]); > + op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); > + dest = gen_reg_rtx (V4SFmode); > + emit_insn (gen_avx512f_movsf_mask (dest, op1, op0, op1, > +operands[4])); > + emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, > + V4SFmode)); > DONE; > }) > > @@ -6631,7 +6620,7 @@ (define_expand > "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>" > (match_operand:QI 4 "register_operand")] > "TARGET_AVX512FP16 && <round_mode512bit_condition>" > { > - rtx op0, op1; > + rtx op0, op1, dest; > > if (<round_embedded_complex>) > emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask<round_expand_name> > ( @@ -6641,26 +6630,15 @@ (define_expand > "avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>" > emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (operands[0], > operands[1], operands[2], operands[3], operands[4])); > > - if (TARGET_AVX512VL) > - { > - op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); > - op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); > - emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4])); > - } > - else > - { > - rtx mask, tmp, vec_mask; > - mask = lowpart_subreg (SImode, operands[4], QImode), > - tmp = gen_reg_rtx (SImode); > - emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31))); > - vec_mask = gen_reg_rtx (V4SImode); > - emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode))); > - emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp)); > - vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode); > - op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); > - op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); > - emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask)); > - } > + op0 = 
lowpart_subreg (V4SFmode, force_reg (V8HFmode, operands[0]), > + V8HFmode); > + if (!MEM_P (operands[1])) > + operands[1] = force_reg (V8HFmode, operands[1]); > + op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); > + dest = gen_reg_rtx (V4SFmode); > + emit_insn (gen_avx512f_movsf_mask (dest, op1, op0, op1, > +operands[4])); > + emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, > + V4SFmode)); > DONE; > }) > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c > b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c > index eb96588df39..0f87861f09b 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c > @@ -1,13 +1,13 @@ > /* { dg-do compile } */ > -/* { dg-options "-mavx512fp16 -mno-avx512vl -O2" } */ > +/* { dg-options "-mavx512fp16 -O2" } */ > /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ > /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0- > 9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ > /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0- > 9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ > /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn- > sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0- > 9\]+(?:\n|\[ \\t\]+#)" 1 } } */ > /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn- > sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0- > 9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ > /* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rz- > sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0- > 9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ > -/* { dg-final { scan-assembler-times "vblendvps\[ 
\\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0- > 9\]+(?:\n|\[ \\t\]+#)" 2 } } */ > /* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ > +/* { dg-final { scan-assembler-times "vmovss\[ > +\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\ > +{%k\[0-9\]\}+(?:\n|\[ \\t\]+#)" 2 } } */ > > #include <immintrin.h> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c > b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c > deleted file mode 100644 > index 79a295f722c..00000000000 > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c > +++ /dev/null > @@ -1,13 +0,0 @@ > -/* { dg-do compile } */ > -/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */ > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0- > 9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0- > 9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn- > sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0- > 9\]+(?:\n|\[ \\t\]+#)" 1 } } */ > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rn- > sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0- > 9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ > -/* { dg-final { scan-assembler-times "vfcmaddcsh\[ \\t\]+\{rz- > sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0- > 9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ > -/* { dg-final { scan-assembler-times "vblendmps\[ \\t\]+\[^\{\n\]*%xmm\[0- 
> 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0- > 9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ > -/* { dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ > - > -#include "avx512fp16-vfcmaddcsh-1a.c" > - > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c > b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c > index 288d1c12a10..6b07957a8bb 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c > +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c > @@ -1,13 +1,13 @@ > /* { dg-do compile } */ > -/* { dg-options "-mavx512fp16 -mno-avx512vl -O2" } */ > +/* { dg-options "-mavx512fp16 -O2" } */ > /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ > /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0- > 9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ > /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0- > 9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ > /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn- > sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0- > 9\]+(?:\n|\[ \\t\]+#)" 1 } } */ > /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn- > sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0- > 9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ > /* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rz- > sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0- > 9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ > -/* { dg-final { scan-assembler-times "vblendvps\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0- > 9\]+(?:\n|\[ \\t\]+#)" 2 } } */ > /* { 
dg-final { scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ > +/* { dg-final { scan-assembler-times "vmovss\[ > +\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\ > +{%k\[0-9\]\}+(?:\n|\[ \\t\]+#)" 2 } } */ > > #include <immintrin.h> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c > b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c > deleted file mode 100644 > index 7863f8f9af9..00000000000 > --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c > +++ /dev/null > @@ -1,13 +0,0 @@ > -/* { dg-do compile } */ > -/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */ > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0- > 9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0- > 9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn- > sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0- > 9\]+(?:\n|\[ \\t\]+#)" 1 } } */ > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rn- > sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0- > 9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ > -/* { dg-final { scan-assembler-times "vfmaddcsh\[ \\t\]+\{rz- > sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0- > 9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ > -/* { dg-final { scan-assembler-times "vblendmps\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0- > 9\]\}\[^\{\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ > -/* { dg-final { 
scan-assembler-times "vmovss\[ \\t\]+\[^\{\n\]*%xmm\[0- > 9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ > - > -#include "avx512fp16-vfmaddcsh-1a.c" > - > diff --git a/gcc/testsuite/gcc.target/i386/pr104978.c > b/gcc/testsuite/gcc.target/i386/pr104978.c > new file mode 100644 > index 00000000000..54788171aff > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr104978.c > @@ -0,0 +1,18 @@ > +/* PR target/104978 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx512fp16" } */ > +/* { dg-final { scan-assembler-times "vmovss\[ > +\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\ > +{%k\[0-9\]\}+(?:\n|\[ \\t\]+#)" 2 } } */ > + > +#include<immintrin.h> > + > +__m128h > +foo (__m128h a, __m128h b, __m128h c, __mmask8 m) { > + return _mm_mask_fmadd_round_sch (a, m, b, c, 8); } > + > +__m128h > +foo2 (__m128h a, __m128h b, __m128h c, __mmask8 m) { > + return _mm_mask_fcmadd_round_sch (a, m, b, c, 8); } > -- > 2.18.1
RE: [PATCH v3] AVX512FP16: Fix wrong code for _mm_mask_f[c]madd.*sch [PR 104978]
Liu, Hongtao via Gcc-patches Mon, 21 Mar 2022 20:44:05 -0700
- [PATCH v3] AVX512FP16: Fix wrong code for _mm... Hongyu Wang via Gcc-patches
- RE: [PATCH v3] AVX512FP16: Fix wrong cod... Liu, Hongtao via Gcc-patches