On Wed, Feb 26, 2020 at 4:24 PM Jeff Law <l...@redhat.com> wrote:
>
> On Wed, 2020-02-26 at 16:02 -0800, H.J. Lu wrote:
> > On Wed, Feb 26, 2020 at 2:42 PM Jeff Law <l...@redhat.com> wrote:
> > > On Sat, 2020-02-15 at 07:26 -0800, H.J. Lu wrote:
> > > > On x86, when AVX and AVX512 are enabled, vector move instructions can
> > > > be encoded with either 2-byte/3-byte VEX (AVX) or 4-byte EVEX (AVX512):
> > > >
> > > >    0: c5 f9 6f d1             vmovdqa %xmm1,%xmm2
> > > >    4: 62 f1 fd 08 6f d1       vmovdqa64 %xmm1,%xmm2
> > > >
> > > > We prefer VEX encoding over EVEX since VEX is shorter.  Also AVX512F
> > > > only supports 512-bit vector moves.  AVX512F + AVX512VL supports 128-bit
> > > > and 256-bit vector moves.  Mode attributes on x86 vector move patterns
> > > > indicate target preferences of vector move encoding.  For vector
> > > > register to vector register move, we can use 512-bit vector move
> > > > instructions to move 128-bit/256-bit vector if AVX512VL isn't
> > > > available.  With AVX512F and AVX512VL, we should use VEX encoding
> > > > for 128-bit/256-bit vector moves if upper 16 vector registers
> > > > aren't used.  This patch adds a function,
> > > > ix86_output_ssemov, to generate vector moves:
> > > >
> > > > 1. If zmm registers are used, use EVEX encoding.
> > > > 2. If xmm16-xmm31/ymm16-ymm31 registers aren't used, SSE or VEX encoding
> > > > will be generated.
> > > > 3. If xmm16-xmm31/ymm16-ymm31 registers are used:
> > > >    a. With AVX512VL, AVX512VL vector moves will be generated.
> > > >    b. Without AVX512VL, xmm16-xmm31/ymm16-ymm31 register to register
> > > >       move will be done with zmm register move.
> > > >
> > > >
> > > [ ... ]
> > >
> > > > +/* Return the opcode of the TYPE_SSEMOV instruction.  To move from
> > > > +   or to xmm16-xmm31/ymm16-ymm31 registers, we either require
> > > > +   TARGET_AVX512VL or it is a register to register move which can
> > > > +   be done with zmm register move. */
> > > > +
> > > > +static const char *
> > > > +ix86_get_ssemov (rtx *operands, unsigned size,
> > > > +              enum attr_mode insn_mode, machine_mode mode)
> > > > +{
> > > > +  char buf[128];
> > > > +  bool misaligned_p = (misaligned_operand (operands[0], mode)
> > > > +                    || misaligned_operand (operands[1], mode));
> > > > +  bool evex_reg_p = (EXT_REX_SSE_REG_P (operands[0])
> > > > +                  || EXT_REX_SSE_REG_P (operands[1]));
> > > > +  machine_mode scalar_mode;
> > > > +
> > > [ ... ]
> > > > +  else if (SCALAR_INT_MODE_P (scalar_mode))
> > > > +    {
> > > > +      switch (scalar_mode)
> > > > +     {
> > > > +     case E_QImode:
> > > > +       if (size == 64)
> > > > +         opcode = (misaligned_p
> > > > +                   ? (TARGET_AVX512BW
> > > > +                      ? "vmovdqu8"
> > > > +                      : "vmovdqu64")
> > > > +                   : "vmovdqa64");
> > > > +       else if (evex_reg_p)
> > > > +         {
> > > > +           if (TARGET_AVX512VL)
> > > > +             opcode = (misaligned_p
> > > > +                       ? (TARGET_AVX512BW
> > > > +                          ? "vmovdqu8"
> > > > +                          : "vmovdqu64")
> > > > +                       : "vmovdqa64");
> > > > +         }
> > > > +       else
> > > > +         opcode = (misaligned_p
> > > > +                   ? (TARGET_AVX512BW
> > > > +                      ? "vmovdqu8"
> > > > +                      : "%vmovdqu")
> > > > +                   : "%vmovdqa");
> > > > +       break;
> > > > +     case E_HImode:
> > > > +       if (size == 64)
> > > > +         opcode = (misaligned_p
> > > > +                   ? (TARGET_AVX512BW
> > > > +                      ? "vmovdqu16"
> > > > +                      : "vmovdqu64")
> > > > +                   : "vmovdqa64");
> > > > +       else if (evex_reg_p)
> > > > +         {
> > > > +           if (TARGET_AVX512VL)
> > > > +             opcode = (misaligned_p
> > > > +                       ? (TARGET_AVX512BW
> > > > +                          ? "vmovdqu16"
> > > > +                          : "vmovdqu64")
> > > > +                       : "vmovdqa64");
> > > > +         }
> > > > +       else
> > > > +         opcode = (misaligned_p
> > > > +                   ? (TARGET_AVX512BW
> > > > +                      ? "vmovdqu16"
> > > > +                      : "%vmovdqu")
> > > > +                   : "%vmovdqa");
> > > > +       break;
> > > > +     case E_SImode:
> > > > +       if (size == 64)
> > > > +         opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> > > > +       else if (evex_reg_p)
> > > > +         {
> > > > +           if (TARGET_AVX512VL)
> > > > +             opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> > > > +         }
> > > > +       else
> > > > +         opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > > > +       break;
> > > > +     case E_DImode:
> > > > +     case E_TImode:
> > > > +     case E_OImode:
> > > > +       if (size == 64)
> > > > +         opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > > > +       else if (evex_reg_p)
> > > > +         {
> > > > +           if (TARGET_AVX512VL)
> > > > +             opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > > > +         }
> > > > +       else
> > > > +         opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > > > +       break;
> > > > +     case E_XImode:
> > > > +       opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > > > +       break;
> > > > +     default:
> > > > +       gcc_unreachable ();
> > > > +     }
> > > > +    }
> > > > +  else
> > > > +    gcc_unreachable ();
> > > > +
> > > > +  if (!opcode)
> > > > +    {
> > > > +      /* NB: We get here only because we move xmm16-xmm31/ymm16-ymm31
> > > > +      registers without AVX512VL by using zmm register move.  */
> > > So the overall flow control in here is rather convoluted.  I hate the
> > > way you don't set OPCODE above and then do it down here.  I would
> > > suggest breaking the !opcode bits into its own little function.  Then
> > > above in those places where you do
> > >
> > > if (TARGET_AVX512VL)
> > >    opcode = <whatever>;
> > >
> > >
> > > Instead change those to something like
> > >
> > > if (TARGET_AVX512VL)
> > >    opcode = <whatever>;
> > > else
> > >    opcode = new_function (...)
> > >
> > > That way opcode is set on every path through the major if-else in this
> > > function.
> > >
> > > Second, when I suggested you break the patch up on a per-pattern
> > > basis, I probably should have also said that I would start with the
> > > minimal support in ix86_get_ssemov and ix86_output_ssemov needed by
> > > the pattern you just converted.  That way the mapping from current
> > > code to new code is more obvious.
> >
> > I will do these.  On x86, different instructions can move vector
> > registers.  They all do the same thing, but some are preferred over
> > others depending on tuning options.
> I know.
>
> >
> > > As it stands, the breaking into separate patches didn't really help
> > > much because we still have all the complexity in ix86_get_ssemov and
> > > ix86_output_ssemov in patch #1, and that's the code I'm most worried
> > > about verifying we get right, particularly at this stage.  I
> > > literally can't take any patch and map from the old code to the new
> > > code without having to understand all of patch #1.
> >
> > The old code is very convoluted and wrong in some cases.  I am trying to
> > clean it up.  I will update my patches based on your feedback.
> Thanks.  I was going to try and break those two functions down on my own, but
> you're more likely to get it right than I am :-)
>

How about this?  If it looks OK, I will post the whole patch set.

Thanks.

-- 
H.J.
From 3964b63d5ef086fa7466992f703bc1ec6de085dc Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.to...@gmail.com>
Date: Tue, 12 Feb 2019 13:25:41 -0800
Subject: [PATCH 01/10] i386: Properly encode vector registers in vector move

On x86, when AVX and AVX512 are enabled, vector move instructions can
be encoded with either 2-byte/3-byte VEX (AVX) or 4-byte EVEX (AVX512):

   0:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
   4:	62 f1 fd 08 6f d1    	vmovdqa64 %xmm1,%xmm2

We prefer the VEX encoding over EVEX since VEX is shorter.  Also, AVX512F
only supports 512-bit vector moves; 128-bit and 256-bit vector moves
require AVX512F plus AVX512VL.  Mode attributes on x86 vector move
patterns indicate the target's preferred vector move encoding.  For a
vector register to vector register move, we can use a 512-bit vector move
instruction to move a 128-bit/256-bit vector if AVX512VL isn't available.
With AVX512F and AVX512VL, we should use the VEX encoding for
128-bit/256-bit vector moves if the upper 16 vector registers aren't
used.  This patch adds a function, ix86_output_ssemov, to generate
vector moves:

1. If zmm registers are used, use EVEX encoding.
2. If xmm16-xmm31/ymm16-ymm31 registers aren't used, SSE or VEX encoding
will be generated.
3. If xmm16-xmm31/ymm16-ymm31 registers are used:
   a. With AVX512VL, AVX512VL vector moves will be generated.
   b. Without AVX512VL, xmm16-xmm31/ymm16-ymm31 register to register
      move will be done with zmm register move.
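
As a rough illustration of case 3b (a hypothetical example, not part of
this patch; the register pinning and the exact mnemonic shown are
assumptions), pinning a 128-bit value to xmm16 without AVX512VL should
make the register-to-register move go through the corresponding zmm
registers:

   typedef int __v4si __attribute__ ((__vector_size__ (16)));

   __v4si
   move_via_xmm16 (__v4si x)
   {
     /* Local register variable pinning tmp to xmm16.  */
     register __v4si tmp __asm__ ("xmm16") = x;
     /* An empty asm keeps tmp live in xmm16.  */
     __asm__ ("" : "+v" (tmp));
     /* With -mavx512f but not -mavx512vl, the return move is expected
        to be emitted as a zmm move, e.g. vmovdqa32 %zmm16, %zmm0.  */
     return tmp;
   }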

Tested on AVX2 and AVX512 with and without --with-arch=native.

gcc/

	PR target/89229
	PR target/89346
	* config/i386/i386-protos.h (ix86_output_ssemov): New prototype.
	* config/i386/i386.c (ix86_get_ssemov): New function.
	(ix86_output_ssemov): Likewise.
	* config/i386/sse.md (VMOVE:mov<mode>_internal): Call
	ix86_output_ssemov for TYPE_SSEMOV.  Remove TARGET_AVX512VL
	check.

gcc/testsuite/

	PR target/89229
	PR target/89346
	* gcc.target/i386/avx512vl-vmovdqa64-1.c: Updated.
	* gcc.target/i386/pr89346.c: New test.
---
 gcc/config/i386/i386-protos.h                 |   2 +
 gcc/config/i386/i386.c                        | 203 ++++++++++++++++++
 gcc/config/i386/sse.md                        |  98 +--------
 .../gcc.target/i386/avx512vl-vmovdqa64-1.c    |   7 +-
 gcc/testsuite/gcc.target/i386/pr89346.c       |  15 ++
 5 files changed, 225 insertions(+), 100 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89346.c

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 266381ca5a6..39fcaa0ad5f 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -38,6 +38,8 @@ extern void ix86_expand_split_stack_prologue (void);
 extern void ix86_output_addr_vec_elt (FILE *, int);
 extern void ix86_output_addr_diff_elt (FILE *, int, int);
 
+extern const char *ix86_output_ssemov (rtx_insn *, rtx *);
+
 extern enum calling_abi ix86_cfun_abi (void);
 extern enum calling_abi ix86_function_type_abi (const_tree);
 
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index dac7a3fc5fd..4602149e10c 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -4915,6 +4915,209 @@ ix86_pre_reload_split (void)
 	  && !(cfun->curr_properties & PROP_rtl_split_insns));
 }
 
+/* Return the opcode of the TYPE_SSEMOV instruction.  To move from
+   or to xmm16-xmm31/ymm16-ymm31 registers, we require either
+   TARGET_AVX512VL or a register-to-register move, which can be
+   done with a zmm register move.  */
+
+static const char *
+ix86_get_ssemov (rtx *operands, unsigned size,
+		 enum attr_mode insn_mode, machine_mode mode)
+{
+  char buf[128];
+  bool misaligned_p = (misaligned_operand (operands[0], mode)
+		       || misaligned_operand (operands[1], mode));
+  bool evex_reg_p = (size == 64
+		     || EXT_REX_SSE_REG_P (operands[0])
+		     || EXT_REX_SSE_REG_P (operands[1]));
+  machine_mode scalar_mode;
+
+  const char *opcode = NULL;
+  enum
+    {
+      opcode_int,
+      opcode_float,
+      opcode_double
+    } type = opcode_int;
+
+  switch (insn_mode)
+    {
+    case MODE_V16SF:
+    case MODE_V8SF:
+    case MODE_V4SF:
+      scalar_mode = E_SFmode;
+      type = opcode_float;
+      break;
+    case MODE_V8DF:
+    case MODE_V4DF:
+    case MODE_V2DF:
+      scalar_mode = E_DFmode;
+      type = opcode_double;
+      break;
+    case MODE_XI:
+    case MODE_OI:
+    case MODE_TI:
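+      /* Integer vector modes: the element mode picks the opcode below.  */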
+      scalar_mode = GET_MODE_INNER (mode);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* NB: Without AVX512VL, xmm16-xmm31/ymm16-ymm31 registers can only
+     be moved with a zmm register move; no memory operand is allowed.  */
+  if (evex_reg_p
+      && !TARGET_AVX512VL
+      && GET_MODE_SIZE (mode) < 64)
+    {
+      if (memory_operand (operands[0], mode)
+	  || memory_operand (operands[1], mode))
+	gcc_unreachable ();
+      size = 64;
+      switch (type)
+	{
+	case opcode_int:
+	  opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
+	  break;
+	case opcode_float:
+	  opcode = misaligned_p ? "vmovups" : "vmovaps";
+	  break;
+	case opcode_double:
+	  opcode = misaligned_p ? "vmovupd" : "vmovapd";
+	  break;
+	}
+    }
+  else if (SCALAR_FLOAT_MODE_P (scalar_mode))
+    {
+      switch (scalar_mode)
+	{
+	case E_SFmode:
+	  opcode = misaligned_p ? "%vmovups" : "%vmovaps";
+	  break;
+	case E_DFmode:
+	  opcode = misaligned_p ? "%vmovupd" : "%vmovapd";
+	  break;
+	case E_TFmode:
+	  if (evex_reg_p)
+	    opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+	  else
+	    opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+    }
+  else if (SCALAR_INT_MODE_P (scalar_mode))
+    {
+      switch (scalar_mode)
+	{
+	case E_QImode:
+	  if (evex_reg_p)
+	    opcode = (misaligned_p
+		      ? (TARGET_AVX512BW
+			 ? "vmovdqu8"
+			 : "vmovdqu64")
+		      : "vmovdqa64");
+	  else
+	    opcode = (misaligned_p
+		      ? (TARGET_AVX512BW
+			 ? "vmovdqu8"
+			 : "%vmovdqu")
+		      : "%vmovdqa");
+	  break;
+	case E_HImode:
+	  if (evex_reg_p)
+	    opcode = (misaligned_p
+		      ? (TARGET_AVX512BW
+			 ? "vmovdqu16"
+			 : "vmovdqu64")
+		      : "vmovdqa64");
+	  else
+	    opcode = (misaligned_p
+		      ? (TARGET_AVX512BW
+			 ? "vmovdqu16"
+			 : "%vmovdqu")
+		      : "%vmovdqa");
+	  break;
+	case E_SImode:
+	  if (evex_reg_p)
+	    opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
+	  else
+	    opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
+	  break;
+	case E_DImode:
+	case E_TImode:
+	case E_OImode:
+	  if (evex_reg_p)
+	    opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+	  else
+	    opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
+	  break;
+	case E_XImode:
+	  opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+    }
+  else
+    gcc_unreachable ();
+
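+  /* NB: The %g, %t and %x operand modifiers print the zmm, ymm and
+     xmm names of the register operand, respectively.  */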
+  switch (size)
+    {
+    case 64:
+      snprintf (buf, sizeof (buf), "%s\t{%%g1, %%g0|%%g0, %%g1}",
+		opcode);
+      break;
+    case 32:
+      snprintf (buf, sizeof (buf), "%s\t{%%t1, %%t0|%%t0, %%t1}",
+		opcode);
+      break;
+    case 16:
+      snprintf (buf, sizeof (buf), "%s\t{%%x1, %%x0|%%x0, %%x1}",
+		opcode);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  output_asm_insn (buf, operands);
+  return "";
+}
+
+/* Return the template of the TYPE_SSEMOV instruction to move
+   operands[1] into operands[0].  */
+
+const char *
+ix86_output_ssemov (rtx_insn *insn, rtx *operands)
+{
+  machine_mode mode = GET_MODE (operands[0]);
+  if (get_attr_type (insn) != TYPE_SSEMOV
+      || mode != GET_MODE (operands[1]))
+    gcc_unreachable ();
+
+  enum attr_mode insn_mode = get_attr_mode (insn);
+
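+  /* The mode attribute determines the move size: 64, 32 or 16 bytes.  */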
+  switch (insn_mode)
+    {
+    case MODE_XI:
+    case MODE_V8DF:
+    case MODE_V16SF:
+      return ix86_get_ssemov (operands, 64, insn_mode, mode);
+
+    case MODE_OI:
+    case MODE_V4DF:
+    case MODE_V8SF:
+      return ix86_get_ssemov (operands, 32, insn_mode, mode);
+
+    case MODE_TI:
+    case MODE_V2DF:
+    case MODE_V4SF:
+      return ix86_get_ssemov (operands, 16, insn_mode, mode);
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
 /* Returns true if OP contains a symbol reference */
 
 bool
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index ee1f138d1af..8f5902292c6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1013,98 +1013,7 @@ (define_insn "mov<mode>_internal"
       return standard_sse_constant_opcode (insn, operands);
 
     case TYPE_SSEMOV:
-      /* There is no evex-encoded vmov* for sizes smaller than 64-bytes
-	 in avx512f, so we need to use workarounds, to access sse registers
-	 16-31, which are evex-only. In avx512vl we don't need workarounds.  */
-      if (TARGET_AVX512F && <MODE_SIZE> < 64 && !TARGET_AVX512VL
-	  && (EXT_REX_SSE_REG_P (operands[0])
-	      || EXT_REX_SSE_REG_P (operands[1])))
-	{
-	  if (memory_operand (operands[0], <MODE>mode))
-	    {
-	      if (<MODE_SIZE> == 32)
-		return "vextract<shuffletype>64x4\t{$0x0, %g1, %0|%0, %g1, 0x0}";
-	      else if (<MODE_SIZE> == 16)
-		return "vextract<shuffletype>32x4\t{$0x0, %g1, %0|%0, %g1, 0x0}";
-	      else
-		gcc_unreachable ();
-	    }
-	  else if (memory_operand (operands[1], <MODE>mode))
-	    {
-	      if (<MODE_SIZE> == 32)
-		return "vbroadcast<shuffletype>64x4\t{%1, %g0|%g0, %1}";
-	      else if (<MODE_SIZE> == 16)
-		return "vbroadcast<shuffletype>32x4\t{%1, %g0|%g0, %1}";
-	      else
-		gcc_unreachable ();
-	    }
-	  else
-	    /* Reg -> reg move is always aligned.  Just use wider move.  */
-	    switch (get_attr_mode (insn))
-	      {
-	      case MODE_V8SF:
-	      case MODE_V4SF:
-		return "vmovaps\t{%g1, %g0|%g0, %g1}";
-	      case MODE_V4DF:
-	      case MODE_V2DF:
-		return "vmovapd\t{%g1, %g0|%g0, %g1}";
-	      case MODE_OI:
-	      case MODE_TI:
-		return "vmovdqa64\t{%g1, %g0|%g0, %g1}";
-	      default:
-		gcc_unreachable ();
-	      }
-	}
-
-      switch (get_attr_mode (insn))
-	{
-	case MODE_V16SF:
-	case MODE_V8SF:
-	case MODE_V4SF:
-	  if (misaligned_operand (operands[0], <MODE>mode)
-	      || misaligned_operand (operands[1], <MODE>mode))
-	    return "%vmovups\t{%1, %0|%0, %1}";
-	  else
-	    return "%vmovaps\t{%1, %0|%0, %1}";
-
-	case MODE_V8DF:
-	case MODE_V4DF:
-	case MODE_V2DF:
-	  if (misaligned_operand (operands[0], <MODE>mode)
-	      || misaligned_operand (operands[1], <MODE>mode))
-	    return "%vmovupd\t{%1, %0|%0, %1}";
-	  else
-	    return "%vmovapd\t{%1, %0|%0, %1}";
-
-	case MODE_OI:
-	case MODE_TI:
-	  if (misaligned_operand (operands[0], <MODE>mode)
-	      || misaligned_operand (operands[1], <MODE>mode))
-	    return TARGET_AVX512VL
-		   && (<MODE>mode == V4SImode
-		       || <MODE>mode == V2DImode
-		       || <MODE>mode == V8SImode
-		       || <MODE>mode == V4DImode
-		       || TARGET_AVX512BW)
-		   ? "vmovdqu<ssescalarsize>\t{%1, %0|%0, %1}"
-		   : "%vmovdqu\t{%1, %0|%0, %1}";
-	  else
-	    return TARGET_AVX512VL ? "vmovdqa64\t{%1, %0|%0, %1}"
-				   : "%vmovdqa\t{%1, %0|%0, %1}";
-	case MODE_XI:
-	  if (misaligned_operand (operands[0], <MODE>mode)
-	      || misaligned_operand (operands[1], <MODE>mode))
-	    return (<MODE>mode == V16SImode
-		    || <MODE>mode == V8DImode
-		    || TARGET_AVX512BW)
-		   ? "vmovdqu<ssescalarsize>\t{%1, %0|%0, %1}"
-		   : "vmovdqu64\t{%1, %0|%0, %1}";
-	  else
-	    return "vmovdqa64\t{%1, %0|%0, %1}";
-
-	default:
-	  gcc_unreachable ();
-	}
+      return ix86_output_ssemov (insn, operands);
 
     default:
       gcc_unreachable ();
@@ -1113,10 +1022,7 @@ (define_insn "mov<mode>_internal"
   [(set_attr "type" "sselog1,sselog1,ssemov,ssemov")
    (set_attr "prefix" "maybe_vex")
    (set (attr "mode")
-	(cond [(and (eq_attr "alternative" "1")
-		    (match_test "TARGET_AVX512VL"))
-		 (const_string "<sseinsnmode>")
-	       (match_test "TARGET_AVX")
+	(cond [(match_test "TARGET_AVX")
 		 (const_string "<sseinsnmode>")
 	       (ior (not (match_test "TARGET_SSE2"))
 		    (match_test "optimize_function_for_size_p (cfun)"))
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c
index 14fe4b84544..db4d9d14875 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c
@@ -4,14 +4,13 @@
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\\(\[^\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
-/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\\(\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\\(\[^\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\\(\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\nxy\]*\\(.{5,6}(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
-/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\nxy\]*\\((?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\nxy\]*\\(.{5,6}(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 
diff --git a/gcc/testsuite/gcc.target/i386/pr89346.c b/gcc/testsuite/gcc.target/i386/pr89346.c
new file mode 100644
index 00000000000..cdc9accf521
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89346.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+#include <immintrin.h>
+
+long long *p;
+volatile __m256i y;
+
+void
+foo (void)
+{
+   _mm256_store_epi64 (p, y);
+}
+
+/* { dg-final { scan-assembler-not "vmovdqa64" } } */
-- 
2.24.1
