On Mon, May 25, 2020 at 1:55 AM Uros Bizjak <ubiz...@gmail.com> wrote:
>
> On Sun, May 24, 2020 at 9:26 AM Hongtao Liu <crazy...@gmail.com> wrote:
> >
> > On Sat, May 23, 2020 at 6:11 PM Uros Bizjak <ubiz...@gmail.com> wrote:
> > >
> > > On Sat, May 23, 2020 at 9:25 AM Hongtao Liu <crazy...@gmail.com> wrote:
> > > >
> > > > Hi:
> > > >   This patch fix non-conforming expander for
> > > > floatv2div2sf2,floatunsv2div2sf2,fix_truncv2sfv2di,fixuns_truncv2sfv2di,
> > > > refer to PR95211, PR95256.
> > > >   bootstrap ok, regression test on i386/x86-64 backend is ok.
> > > >
> > > > gcc/ChangeLog:
> > > >         PR target/95211 PR target/95256
> > >
> > Changed.
> > > Please put every PR reference in a separate line.
> > >
> > > >         * config/i386/sse.md <floatunssuffix>v2div2sf2): New expander.
> > > >         (fix<fixunssuffix>_truncv2sfv2di2): Ditto.
> > > >         (float<floatunssuffix>v2div2sf2_internal): Renaming from
> > > >         float<floatunssuffix>v2div2sf2.
> > > >         (fix<fixunssuffix>_truncv2sfv2di2<mask_name>_internal):
> > >
> > > The convention throughout sse,md is to prefix a standard pattern that
> > > is used through builtins with avx512<something>_ instead of suffixing
> > > the pattern name with _internal.
> > >
> > Changed.
> > > >         Renaming from fix<fixunssuffix>_truncv2sfv2di2<mask_name>.
> > > >         (vec_pack<floatprefix>_float_<mode>): Adjust icode name.
> > > >         (vec_unpack_<fixprefix>fix_trunc_lo_<mode>): Ditto.
> > > >         * config/i386/i386-builtin.def: Ditto.
> > >
> > > Uros.
> >
> > Update patch.
>
> The patch is wrong, and the correct way to fix these patterns is more complex:
>
> a) the pattern should not access register in mode, narrower than 128
> bits, as this implies MMX register in non-TARGET-MMX-WITH-SSE targets.
> So, the correct way to define insn with narrow mode is to use
> vec_select, something like:
>
> (define_insn "sse4_1_<code>v8qiv8hi2<mask_name>"
>   [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v")
>     (any_extend:V8HI
>       (vec_select:V8QI
>         (match_operand:V16QI 1 "register_operand" "Yr,*x,v")
>         (parallel [(const_int 0) (const_int 1)
>                (const_int 2) (const_int 3)
>                (const_int 4) (const_int 5)
>                (const_int 6) (const_int 7)]))))]
>
> The instruction accesses the memory in the correct mode, so the memory
> operand is:
>
> (define_insn "*sse4_1_<code>v8qiv8hi2<mask_name>_1"
>   [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v")
>     (any_extend:V8HI
>       (match_operand:V8QI 1 "memory_operand" "m,m,m")))]
>
> and a pre-reload split has to be introduced to convert insn from
> register form to memory form, when memory gets propagated to the insn:
>
> (define_insn_and_split "*sse4_1_<code>v8qiv8hi2<mask_name>_2"
>   [(set (match_operand:V8HI 0 "register_operand")
>     (any_extend:V8HI
>       (vec_select:V8QI
>         (subreg:V16QI
>           (vec_concat:V2DI
>             (match_operand:DI 1 "memory_operand")
>         (const_int 0)) 0)
>         (parallel [(const_int 0) (const_int 1)
>                (const_int 2) (const_int 3)
>                (const_int 4) (const_int 5)
>                (const_int 6) (const_int 7)]))))]
>
> For a middle end to use this insn, an expander is used:
>
> (define_expand "<code>v8qiv8hi2"
>   [(set (match_operand:V8HI 0 "register_operand")
>     (any_extend:V8HI
>       (match_operand:V8QI 1 "nonimmediate_operand")))]
>
> b) Similar approach is used when an output is narrower than 128 bits:
>
> (define_insn "*float<floatunssuffix>v2div2sf2"
>   [(set (match_operand:V4SF 0 "register_operand" "=v")
>     (vec_concat:V4SF
>         (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm"))
>         (match_operand:V2SF 2 "const0_operand" "C")))]
>
> In your concrete case,
>
> (define_insn "fix<fixunssuffix>_truncv2sfv2di2<mask_name>"
>   [(set (match_operand:V2DI 0 "register_operand" "=v")
>     (any_fix:V2DI
>       (vec_select:V2SF
>         (match_operand:V4SF 1 "nonimmediate_operand" "vm")
>         (parallel [(const_int 0) (const_int 1)]))))]
>
> is already _NOT_ defined in a correct way as far as memory operand is
> concerned, see a) above. But, <shrug>, we will apparently have to live
> with that. The problem is, that it is named as a standard named
> pattern, so middle-end discovers it and tries to use it. It should be
> renamed with avx512dq_... prefix. Let's give middle-end something
> correct, similar to:
>
> (define_expand "<code>v8qiv8hi2"
>   [(set (match_operand:V8HI 0 "register_operand")
>     (any_extend:V8HI
>       (match_operand:V8QI 1 "nonimmediate_operand")))]
>   "TARGET_SSE4_1"
> {
>   if (!MEM_P (operands[1]))
>     {
>       operands[1] = simplify_gen_subreg (V16QImode, operands[1], V8QImode, 0);
>       emit_insn (gen_sse4_1_<code>v8qiv8hi2 (operands[0], operands[1]));
>       DONE;
>     }
> })
>
> The second case is with v2sf output, less than 128 bits wide:
>
> (define_insn "*float<floatunssuffix>v2div2sf2"
>   [(set (match_operand:V4SF 0 "register_operand" "=v")
>     (vec_concat:V4SF
>         (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm"))
>         (match_operand:V2SF 2 "const0_operand" "C")))]
>
> The above insn pattern is OK, we access the output register with
> 128bit access, so we are sure no MMX reg will be generated. The
> problem is with the existing expander
>

Oh, i missed that, thx for explanation.

> (define_expand "float<floatunssuffix>v2div2sf2"
>   [(set (match_operand:V4SF 0 "register_operand" "=v")
>     (vec_concat:V4SF
>         (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm"))
>         (match_dup 2)))]
>   "TARGET_AVX512DQ && TARGET_AVX512VL"
>   "operands[2] = CONST0_RTX (V2SFmode);")
>
> again, this expander has standard name, and is discovered by
> middle-end. Unfortunatelly, it doesn't have conforming operand type.
> So, rename it with avx512dq_ prefix, since it is needed by builtins.
>
> We have to introduce a new expander, that will have conforming mode of
> output operand (V2SF) and will produce RTX that will match
> *float<floatunssuffix>v2div2sf2. A paradoxical output subreg from
> V2SFmode V4SFmode is needed, generated by simplify_gen_subreg as is
> the case with paradoxical input subreg.

Problem`is simplify_gen_subreg (V4SFmode, operands[0], V2SFmode, 0)
will return NULL since
----
948  /* Subregs involving floating point modes are not allowed to
949     change size.  Therefore (subreg:DI (reg:DF) 0) is fine, but
950     (subreg:SI (reg:DF) 0) isn't.  */
----
Maybe we could define expander under TARGET_MMX_WITH_SSE.

+(define_insn "fix<fixunssuffix>_truncv2sfv2di2"
+  [(set (match_operand:V2DI 0 "register_operand")
+ (any_fix:V2DI
+   (match_operand:V2SF 1 "nonimmediate_operand")))]
+  "TARGET_AVX512DQ && TARGET_AVX512VL && TARGET_MMX_WITH_SSE"
+  "vcvttps2<fixsuffix>qq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "TI")])

and

+(define_insn "float<floatunssuffix>v2div2sf2"
+  [(set (match_operand:V2SF 0 "register_operand" "=v")
+   (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512DQ && TARGET_AVX512VL && TARGET_MMX_WITH_SSE"
+  "vcvt<floatsuffix>qq2ps{x}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V4SF")])
>
> Uros.

Update patch.

-- 
BR,
Hongtao
From bb78598e33c3deebfe5d16aaca84c72a96253b9c Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao....@intel.com>
Date: Fri, 22 May 2020 21:56:03 +0800
Subject: [PATCH] Fix non-comforming expander for
 floatv2div2sf2,floatunsv2div2sf2,fix_truncv2sfv2di,fixuns_truncv2sfv2di.

gcc/ChangeLog:
	PR target/95211
	PR target/95256
	* config/i386/sse.md <floatunssuffix>v2div2sf2): New expander.
	(fix<fixunssuffix>_truncv2sfv2di2): Ditto.
	(avx512dq_float<floatunssuffix>v2div2sf2): Renaming from
	float<floatunssuffix>v2div2sf2.
	(avx512dq_fix<fixunssuffix>_truncv2sfv2di2<mask_name>):
	Renaming from fix<fixunssuffix>_truncv2sfv2di2<mask_name>.
	(vec_pack<floatprefix>_float_<mode>): Adjust icode name.
	(vec_unpack_<fixprefix>fix_trunc_lo_<mode>): Ditto.
	(vec_unpack_<fixprefix>fix_trunc_hi_<mode>): Ditto.
	* config/i386/i386-builtin.def: Ditto.

gcc/testsuite/ChangeLog
	* gcc.target/i386/pr95211.c: New test.
---
 gcc/config/i386/i386-builtin.def        |  4 +--
 gcc/config/i386/sse.md                  | 32 ++++++++++++++++++---
 gcc/testsuite/gcc.target/i386/pr95211.c | 38 +++++++++++++++++++++++++
 3 files changed, 68 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95211.c

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index fa123788a8e..b873498f3ab 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -1649,9 +1649,9 @@ BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufix_not
 BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufix_notruncv4dfv4si2_mask, "__builtin_ia32_cvtpd2udq256_mask", IX86_BUILTIN_CVTPD2UDQ256_MASK, UNKNOWN, (int) V4SI_FTYPE_V4DF_V4SI_UQI)
 BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufix_notruncv2dfv2si2_mask, "__builtin_ia32_cvtpd2udq128_mask", IX86_BUILTIN_CVTPD2UDQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V2DF_V4SI_UQI)
 BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fix_truncv4sfv4di2_mask, "__builtin_ia32_cvttps2qq256_mask", IX86_BUILTIN_CVTTPS2QQ256, UNKNOWN, (int) V4DI_FTYPE_V4SF_V4DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fix_truncv2sfv2di2_mask, "__builtin_ia32_cvttps2qq128_mask", IX86_BUILTIN_CVTTPS2QQ128, UNKNOWN, (int) V2DI_FTYPE_V4SF_V2DI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512dq_fix_truncv2sfv2di2_mask, "__builtin_ia32_cvttps2qq128_mask", IX86_BUILTIN_CVTTPS2QQ128, UNKNOWN, (int) V2DI_FTYPE_V4SF_V2DI_UQI)
 BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fixuns_truncv4sfv4di2_mask, "__builtin_ia32_cvttps2uqq256_mask", IX86_BUILTIN_CVTTPS2UQQ256, UNKNOWN, (int) V4DI_FTYPE_V4SF_V4DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fixuns_truncv2sfv2di2_mask, "__builtin_ia32_cvttps2uqq128_mask", IX86_BUILTIN_CVTTPS2UQQ128, UNKNOWN, (int) V2DI_FTYPE_V4SF_V2DI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512dq_fixuns_truncv2sfv2di2_mask, "__builtin_ia32_cvttps2uqq128_mask", IX86_BUILTIN_CVTTPS2UQQ128, UNKNOWN, (int) V2DI_FTYPE_V4SF_V2DI_UQI)
 BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fix_truncv8sfv8si2_mask, "__builtin_ia32_cvttps2dq256_mask", IX86_BUILTIN_CVTTPS2DQ256_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SF_V8SI_UQI)
 BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fix_truncv4sfv4si2_mask, "__builtin_ia32_cvttps2dq128_mask", IX86_BUILTIN_CVTTPS2DQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SF_V4SI_UQI)
 BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufix_truncv8sfv8si2_mask, "__builtin_ia32_cvttps2udq256_mask", IX86_BUILTIN_CVTTPS2UDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF_V8SI_UQI)
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5071fb2895a..92b5e378aef 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5795,7 +5795,16 @@
    (set_attr "prefix" "evex")
    (set_attr "mode" "<MODE>")])
 
-(define_expand "float<floatunssuffix>v2div2sf2"
+(define_insn "float<floatunssuffix>v2div2sf2"
+  [(set (match_operand:V2SF 0 "register_operand" "=v")
+	  (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512DQ && TARGET_AVX512VL && TARGET_MMX_WITH_SSE"
+  "vcvt<floatsuffix>qq2ps{x}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V4SF")])
+
+(define_expand "avx512dq_float<floatunssuffix>v2div2sf2"
   [(set (match_operand:V4SF 0 "register_operand" "=v")
 	(vec_concat:V4SF
 	    (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm"))
@@ -5831,6 +5840,8 @@
   rtx r1 = gen_reg_rtx (<vpckfloat_temp_mode>mode);
   rtx r2 = gen_reg_rtx (<vpckfloat_temp_mode>mode);
   rtx (*gen) (rtx, rtx) = gen_float<floatunssuffix><mode><vpckfloat_op_mode>2;
+  if (<MODE>mode == V2DImode)
+    gen = gen_avx512dq_float<floatunssuffix>v2div2sf2;
   emit_insn (gen (r1, operands[1]));
   emit_insn (gen (r2, operands[2]));
   if (<MODE>mode == V2DImode)
@@ -6217,7 +6228,17 @@
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseintvecmode3>")])
 
-(define_insn "fix<fixunssuffix>_truncv2sfv2di2<mask_name>"
+(define_insn "fix<fixunssuffix>_truncv2sfv2di2"
+  [(set (match_operand:V2DI 0 "register_operand")
+	(any_fix:V2DI
+	  (match_operand:V2SF 1 "nonimmediate_operand")))]
+  "TARGET_AVX512DQ && TARGET_AVX512VL && TARGET_MMX_WITH_SSE"
+  "vcvttps2<fixsuffix>qq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "TI")])
+
+(define_insn "avx512dq_fix<fixunssuffix>_truncv2sfv2di2<mask_name>"
   [(set (match_operand:V2DI 0 "register_operand" "=v")
 	(any_fix:V2DI
 	  (vec_select:V2SF
@@ -6251,6 +6272,8 @@
     }
   rtx (*gen) (rtx, rtx)
     = gen_fix<fixunssuffix>_trunc<ssehalfvecmodelower><vunpckfixt_model>2;
+  if (<MODE>mode == V4SFmode)
+    gen = gen_avx512dq_fix<fixunssuffix>_truncv2sfv2di2;
   emit_insn (gen (operands[0], tem));
   DONE;
 })
@@ -6262,19 +6285,20 @@
   "TARGET_AVX512DQ"
 {
   rtx tem;
+  rtx (*gen) (rtx, rtx);
   if (<MODE>mode != V4SFmode)
     {
       tem = gen_reg_rtx (<ssehalfvecmode>mode);
       emit_insn (gen_vec_extract_hi_<vunpckfixt_extract_mode> (tem,
 							       operands[1]));
+      gen = gen_fix<fixunssuffix>_trunc<ssehalfvecmodelower><vunpckfixt_model>2;
     }
   else
     {
       tem = gen_reg_rtx (V4SFmode);
       emit_insn (gen_avx_vpermilv4sf (tem, operands[1], GEN_INT (0x4e)));
+      gen  = gen_avx512dq_fix<fixunssuffix>_truncv2sfv2di2;
     }
-  rtx (*gen) (rtx, rtx)
-    = gen_fix<fixunssuffix>_trunc<ssehalfvecmodelower><vunpckfixt_model>2;
   emit_insn (gen (operands[0], tem));
   DONE;
 })
diff --git a/gcc/testsuite/gcc.target/i386/pr95211.c b/gcc/testsuite/gcc.target/i386/pr95211.c
new file mode 100644
index 00000000000..dc10f8f4659
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95211.c
@@ -0,0 +1,38 @@
+/* PR target/95211 target/95256 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -ftree-slp-vectorize -march=skylake-avx512" } */
+
+extern float f[4];
+extern long long l[2];
+extern long long ul[2];
+
+void
+fix_128 (void)
+{
+  l[0] = f[0];
+  l[1] = f[1];
+}
+
+void
+fixuns_128 (void)
+{
+  ul[0] = f[0];
+  ul[1] = f[1];
+}
+
+void
+float_128 (void)
+{
+  f[0] = l[0];
+  f[1] = l[1];
+}
+
+void
+floatuns_128 (void)
+{
+  f[0] = ul[0];
+  f[1] = ul[1];
+}
+
+/* { dg-final { scan-assembler-times "vcvttps2qq" 2 } } */
+/* { dg-final { scan-assembler-times "vcvtqq2ps" 2 } } */
-- 
2.18.1

Reply via email to