Hi, here is the patch with force_reg before lowpart_subreg.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.

Ok for master?

For complex scalar intrinsics like _mm_mask_fcmadd_sch, the
mask should be ANDed with 1 to ensure the mask is bound to the lowest byte.
Use a masked vmovss to perform the same operation, since it ignores the
higher bits of the mask.

gcc/ChangeLog:

        PR target/104978
        * config/i386/sse.md
        (avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>):
        Use avx512f_movsf_mask instead of vmovaps or vblend, and
        force_reg before lowpart_subreg.
        (avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>): Likewise.

gcc/testsuite/ChangeLog:

        PR target/104978
        * gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c: Adjust asm scan.
        * gcc.target/i386/avx512fp16-vfmaddcsh-1a.c: Ditto.
        * gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c: Removed.
        * gcc.target/i386/avx512fp16-vfmaddcsh-1c.c: Ditto.
        * gcc.target/i386/pr104978.c: New test.

V3
---
 gcc/config/i386/sse.md                        | 62 ++++++-------------
 .../i386/avx512fp16-vfcmaddcsh-1a.c           |  4 +-
 .../i386/avx512fp16-vfcmaddcsh-1c.c           | 13 ----
 .../gcc.target/i386/avx512fp16-vfmaddcsh-1a.c |  4 +-
 .../gcc.target/i386/avx512fp16-vfmaddcsh-1c.c | 13 ----
 gcc/testsuite/gcc.target/i386/pr104978.c      | 18 ++++++
 6 files changed, 42 insertions(+), 72 deletions(-)
 delete mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
 delete mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104978.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 21bf3c55c95..6f7af2f21d6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6576,7 +6576,7 @@ (define_expand 
"avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
    (match_operand:QI 4 "register_operand")]
   "TARGET_AVX512FP16 && <round_mode512bit_condition>"
 {
-  rtx op0, op1;
+  rtx op0, op1, dest;
 
   if (<round_embedded_complex>)
     emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask<round_expand_name> (
@@ -6586,26 +6586,15 @@ (define_expand 
"avx512fp16_fmaddcsh_v8hf_mask1<round_expand_name>"
     emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (operands[0],
       operands[1], operands[2], operands[3], operands[4]));
 
-  if (TARGET_AVX512VL)
-  {
-    op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
-    op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
-  }
-  else
-  {
-    rtx mask, tmp, vec_mask;
-    mask = lowpart_subreg (SImode, operands[4], QImode),
-    tmp = gen_reg_rtx (SImode);
-    emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
-    vec_mask = gen_reg_rtx (V4SImode);
-    emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode)));
-    emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp));
-    vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode);
-    op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
-    op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-    emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask));
-  }
+  op0 = lowpart_subreg (V4SFmode, force_reg (V8HFmode, operands[0]),
+                       V8HFmode);
+  if (!MEM_P (operands[1]))
+    operands[1] = force_reg (V8HFmode, operands[1]);
+  op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
+  dest = gen_reg_rtx (V4SFmode);
+  emit_insn (gen_avx512f_movsf_mask (dest, op1, op0, op1, operands[4]));
+  emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest,
+                                              V4SFmode));
   DONE;
 })
 
@@ -6631,7 +6620,7 @@ (define_expand 
"avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
    (match_operand:QI 4 "register_operand")]
   "TARGET_AVX512FP16 && <round_mode512bit_condition>"
 {
-  rtx op0, op1;
+  rtx op0, op1, dest;
 
   if (<round_embedded_complex>)
     emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask<round_expand_name> (
@@ -6641,26 +6630,15 @@ (define_expand 
"avx512fp16_fcmaddcsh_v8hf_mask1<round_expand_name>"
     emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (operands[0],
       operands[1], operands[2], operands[3], operands[4]));
 
-  if (TARGET_AVX512VL)
-  {
-    op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
-    op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-    emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4]));
-  }
-  else
-  {
-    rtx mask, tmp, vec_mask;
-    mask = lowpart_subreg (SImode, operands[4], QImode),
-    tmp = gen_reg_rtx (SImode);
-    emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31)));
-    vec_mask = gen_reg_rtx (V4SImode);
-    emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode)));
-    emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp));
-    vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode);
-    op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode);
-    op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
-    emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask));
-  }
+  op0 = lowpart_subreg (V4SFmode, force_reg (V8HFmode, operands[0]),
+                       V8HFmode);
+  if (!MEM_P (operands[1]))
+    operands[1] = force_reg (V8HFmode, operands[1]);
+  op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode);
+  dest = gen_reg_rtx (V4SFmode);
+  emit_insn (gen_avx512f_movsf_mask (dest, op1, op0, op1, operands[4]));
+  emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest,
+                                              V4SFmode));
   DONE;
 })
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
index eb96588df39..0f87861f09b 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1a.c
@@ -1,13 +1,13 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512fp16 -mno-avx512vl -O2" } */
+/* { dg-options "-mavx512fp16 -O2" } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[
 \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[
 \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vblendvps\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[
 \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vmovss\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vmovss\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}+(?:\n|\[
 \\t\]+#)" 2 } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
deleted file mode 100644
index 79a295f722c..00000000000
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfcmaddcsh-1c.c
+++ /dev/null
@@ -1,13 +0,0 @@
-/* { dg-do compile } */
-/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[
 \\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[
 \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vfcmaddcsh\[ 
\\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vblendmps\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[
 \\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vmovss\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 2 } } */
-
-#include "avx512fp16-vfcmaddcsh-1a.c"
-
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
index 288d1c12a10..6b07957a8bb 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1a.c
@@ -1,13 +1,13 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512fp16 -mno-avx512vl -O2" } */
+/* { dg-options "-mavx512fp16 -O2" } */
 /* { dg-final { scan-assembler-times "vfmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[
 \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vfmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfmaddcsh\[ 
\\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[
 \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vfmaddcsh\[ 
\\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vfmaddcsh\[ 
\\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vblendvps\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[
 \\t\]+#)" 2 } } */
 /* { dg-final { scan-assembler-times "vmovss\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vmovss\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}+(?:\n|\[
 \\t\]+#)" 2 } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c
deleted file mode 100644
index 7863f8f9af9..00000000000
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-vfmaddcsh-1c.c
+++ /dev/null
@@ -1,13 +0,0 @@
-/* { dg-do compile } */
-/* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "vfmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vfmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[
 \\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vfmaddcsh\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vfmaddcsh\[ 
\\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[
 \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vfmaddcsh\[ 
\\t\]+\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vfmaddcsh\[ 
\\t\]+\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[
 \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vblendmps\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\{\n\r]*(?:\n|\[
 \\t\]+#)" 2 } } */
-/* { dg-final { scan-assembler-times "vmovss\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ 
\\t\]+#)" 2 } } */
-
-#include "avx512fp16-vfmaddcsh-1a.c"
-
diff --git a/gcc/testsuite/gcc.target/i386/pr104978.c 
b/gcc/testsuite/gcc.target/i386/pr104978.c
new file mode 100644
index 00000000000..54788171aff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104978.c
@@ -0,0 +1,18 @@
+/* PR target/104978 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16" } */
+/* { dg-final { scan-assembler-times "vmovss\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}+(?:\n|\[
 \\t\]+#)" 2 } } */
+
+#include<immintrin.h>
+
+__m128h
+foo (__m128h a, __m128h b, __m128h c, __mmask8 m)
+{ 
+  return _mm_mask_fmadd_round_sch (a, m, b, c, 8);
+}
+
+__m128h
+foo2 (__m128h a, __m128h b, __m128h c, __mmask8 m)
+{ 
+  return _mm_mask_fcmadd_round_sch (a, m, b, c, 8);
+}
-- 
2.18.1

Reply via email to