On Tue, Nov 24, 2020 at 9:00 PM Jakub Jelinek <ja...@redhat.com> wrote:
>
> On Tue, Nov 24, 2020 at 10:36:49AM +0800, Hongtao Liu via Gcc-patches wrote:
> > > >       * gcc.target/i386/pr97642-2.c: New test.
> > > So in the BZ Jakub asked for the all-ones mask case to be specially
> > > handled to emit a normal load.  I don't see where we're handling that.
> > > ISTM that we'd want a test for that too.  Right?
> > >
> >
> > all-ones mask would be simplified to a simple load but with unspec in
> > set_src and would be handled by the following
> >
> > +(define_insn_and_split "*<avx512>_load<mode>"
> > +  [(set (match_operand:V48_AVX512VL 0 "register_operand")
> > + (unspec:V48_AVX512VL
> > +   [(match_operand:V48_AVX512VL 1 "memory_operand")]
> > +   UNSPEC_MASKLOAD))]
> > +  "TARGET_AVX512F"
> > +  "#"
> > +  "&& 1"
> > +  [(set (match_dup 0) (match_dup 1))])
>
> It is certainly good to have these insns and splitters, but
> the reason I think it is very much desirable to handle this during expansion
> too is to avoid creating the UNSPECs in the very common case where
> either already from the user, or from all the GIMPLE optimizations the
> mask is optimized into all ones.  The UNSPECs then can prevent various
> optimizations before splitting is performed, because generic RTL optimizers
> don't know anything about what it does.  While for all ones mask it is
> easily expressible in RTL by having the most simple RTL for that, plain
> load.
>
>         Jakub
>

Update patch:
  1. ix86_expand_special_args_builtin is used for expanding mask load
intrinsics, this function will always convert the constant mask
operands into reg. So for the situation of all-ones mask, keep this
constant, and also change the mask operand predicate(of corresponding
expander) to register_or_constm1_operand.
  2. Delete last_arg_constant which is not used in
ix86_expand_special_args_builtin(maybe should be in a separate patch?)
  3. Still keep (define_insn_and_split "*<avx512>_load<mode>" to catch
some optimization opportunities exposed by rtl passes.


gcc/ChangeLog:

        PR target/97642
        * config/i386/i386-expand.c
        (ix86_expand_special_args_builtin): Delete last_arg_constant
        since it's never used, also don't move all-ones mask operands
        into register.
        * config/i386/sse.md (UNSPEC_MASKLOAD): New unspec.
        (*<avx512>_load<mode>_mask): New define_insns for masked load
        instructions.
        (<avx512>_load<mode>_mask): Changed to define_expands which
        specifically handle memory or all-ones mask operands.
        (<avx512>_blendm<mode>): Changed to define_insns which are same
        as original <avx512>_load<mode>_mask with adjustment of
        operands order.
        (*<avx512>_load<mode>): New define_insn_and_split which is
        used to optimize for masked load with all one mask.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/avx512bw-vmovdqu16-1.c: Adjust testcase to
        make sure only masked load instruction is generated.
        * gcc.target/i386/avx512bw-vmovdqu8-1.c: Ditto.
        * gcc.target/i386/avx512f-vmovapd-1.c: Ditto.
        * gcc.target/i386/avx512f-vmovaps-1.c: Ditto.
        * gcc.target/i386/avx512f-vmovdqa32-1.c: Ditto.
        * gcc.target/i386/avx512f-vmovdqa64-1.c: Ditto.
        * gcc.target/i386/avx512vl-vmovapd-1.c: Ditto.
        * gcc.target/i386/avx512vl-vmovaps-1.c: Ditto.
        * gcc.target/i386/avx512vl-vmovdqa32-1.c: Ditto.
        * gcc.target/i386/avx512vl-vmovdqa64-1.c: Ditto.
        * gcc.target/i386/pr97642-1.c: New test.
        * gcc.target/i386/pr97642-2.c: New test.

-- 
BR,
Hongtao
From 1ff0944eee7ab23bb00f889a444833f56d7cefe7 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao....@intel.com>
Date: Tue, 3 Nov 2020 17:26:43 +0800
Subject: [PATCH] Fix incorrect replacement of vmovdqu32 with vpblendd which
 can cause fault.

gcc/ChangeLog:

	PR target/97642
	* config/i386/i386-expand.c
	(ix86_expand_special_args_builtin): Delete last_arg_constant
	since it's never used, also don't move all-ones mask operands
	into register.
	* config/i386/sse.md (UNSPEC_MASKLOAD): New unspec.
	(*<avx512>_load<mode>_mask): New define_insns for masked load
	instructions.
	(<avx512>_load<mode>_mask): Changed to define_expands which
	specifically handle memory or all-ones mask operands.
	(<avx512>_blendm<mode>): Changed to define_insns which are same
	as original <avx512>_load<mode>_mask with adjustment of
	operands order.
	(*<avx512>_load<mode>): New define_insn_and_split which is
	used to optimize for masked load with all one mask.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512bw-vmovdqu16-1.c: Adjust testcase to
	make sure only masked load instruction is generated.
	* gcc.target/i386/avx512bw-vmovdqu8-1.c: Ditto.
	* gcc.target/i386/avx512f-vmovapd-1.c: Ditto.
	* gcc.target/i386/avx512f-vmovaps-1.c: Ditto.
	* gcc.target/i386/avx512f-vmovdqa32-1.c: Ditto.
	* gcc.target/i386/avx512f-vmovdqa64-1.c: Ditto.
	* gcc.target/i386/avx512vl-vmovapd-1.c: Ditto.
	* gcc.target/i386/avx512vl-vmovaps-1.c: Ditto.
	* gcc.target/i386/avx512vl-vmovdqa32-1.c: Ditto.
	* gcc.target/i386/avx512vl-vmovdqa64-1.c: Ditto.
	* gcc.target/i386/pr97642-1.c: New test.
	* gcc.target/i386/pr97642-2.c: New test.
---
 gcc/config/i386/i386-expand.c                 |  68 ++++----
 gcc/config/i386/sse.md                        | 148 ++++++++++++++----
 .../gcc.target/i386/avx512bw-vmovdqu16-1.c    |   6 +-
 .../gcc.target/i386/avx512bw-vmovdqu8-1.c     |   6 +-
 .../gcc.target/i386/avx512f-vmovapd-1.c       |   2 +-
 .../gcc.target/i386/avx512f-vmovaps-1.c       |   2 +-
 .../gcc.target/i386/avx512f-vmovdqa32-1.c     |   2 +-
 .../gcc.target/i386/avx512f-vmovdqa64-1.c     |   2 +-
 .../gcc.target/i386/avx512vl-vmovapd-1.c      |   4 +-
 .../gcc.target/i386/avx512vl-vmovaps-1.c      |   4 +-
 .../gcc.target/i386/avx512vl-vmovdqa32-1.c    |   4 +-
 .../gcc.target/i386/avx512vl-vmovdqa64-1.c    |   4 +-
 gcc/testsuite/gcc.target/i386/pr97642-1.c     |  41 +++++
 gcc/testsuite/gcc.target/i386/pr97642-2.c     |  77 +++++++++
 14 files changed, 287 insertions(+), 83 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr97642-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr97642-2.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 73e3358b290..c6ea8ae6f82 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -10494,7 +10494,6 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
       machine_mode mode;
     } args[3];
   enum insn_code icode = d->icode;
-  bool last_arg_constant = false;
   const struct insn_data_d *insn_p = &insn_data[icode];
   machine_mode tmode = insn_p->operand[0].mode;
   enum { load, store } klass;
@@ -10824,48 +10823,43 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
       op = expand_normal (arg);
       match = insn_p->operand[i + 1].predicate (op, mode);
 
-      if (last_arg_constant && (i + 1) == nargs)
+      if (i == memory)
 	{
-	  if (!match)
-	    {
-	      error ("the last argument must be an 8-bit immediate");
-	      return const0_rtx;
-	    }
+	  /* This must be the memory operand.  */
+	  op = ix86_zero_extend_to_Pmode (op);
+	  op = gen_rtx_MEM (mode, op);
+	  /* op at this point has just BITS_PER_UNIT MEM_ALIGN
+	     on it.  Try to improve it using get_pointer_alignment,
+	     and if the special builtin is one that requires strict
+	     mode alignment, also from it's GET_MODE_ALIGNMENT.
+	     Failure to do so could lead to ix86_legitimate_combined_insn
+	     rejecting all changes to such insns.  */
+	  unsigned int align = get_pointer_alignment (arg);
+	  if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
+	    align = GET_MODE_ALIGNMENT (mode);
+	  if (MEM_ALIGN (op) < align)
+	    set_mem_align (op, align);
 	}
       else
 	{
-	  if (i == memory)
-	    {
-	      /* This must be the memory operand.  */
-	      op = ix86_zero_extend_to_Pmode (op);
-	      op = gen_rtx_MEM (mode, op);
-	      /* op at this point has just BITS_PER_UNIT MEM_ALIGN
-		 on it.  Try to improve it using get_pointer_alignment,
-		 and if the special builtin is one that requires strict
-		 mode alignment, also from it's GET_MODE_ALIGNMENT.
-		 Failure to do so could lead to ix86_legitimate_combined_insn
-		 rejecting all changes to such insns.  */
-	      unsigned int align = get_pointer_alignment (arg);
-	      if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
-		align = GET_MODE_ALIGNMENT (mode);
-	      if (MEM_ALIGN (op) < align)
-		set_mem_align (op, align);
-	    }
-	  else
-	    {
-	      /* This must be register.  */
-	      if (VECTOR_MODE_P (mode))
-		op = safe_vector_operand (op, mode);
+	  /* This must be register.  */
+	  if (VECTOR_MODE_P (mode))
+	    op = safe_vector_operand (op, mode);
 
-	      op = fixup_modeless_constant (op, mode);
+	  op = fixup_modeless_constant (op, mode);
 
-	      if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
-		op = copy_to_mode_reg (mode, op);
-	      else
-	        {
-	          op = copy_to_reg (op);
-	          op = lowpart_subreg (mode, op, GET_MODE (op));
-	        }
+	  /* NB: 3-operands load implied it's an mask load,
+	     and mask operand is assumed to be last.
+	     Keep all-ones mask which could be simplied by the expander.  */
+	  if (nargs == 3 && i == 2 && klass == load
+	      && constm1_operand (op, mode))
+	    ;
+	  else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
+	    op = copy_to_mode_reg (mode, op);
+	  else
+	    {
+	      op = copy_to_reg (op);
+	      op = lowpart_subreg (mode, op, GET_MODE (op));
 	    }
 	}
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 11936809561..c7f7aeec51d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -111,6 +111,8 @@ (define_c_enum "unspec" [
   UNSPEC_MASKOP
   UNSPEC_KORTEST
   UNSPEC_KTEST
+  ;; Mask load
+  UNSPEC_MASKLOAD
 
   ;; For embed. rounding feature
   UNSPEC_EMBEDDED_ROUNDING
@@ -1065,18 +1067,39 @@ (define_insn "mov<mode>_internal"
 	      ]
 	      (symbol_ref "true")))])
 
-(define_insn "<avx512>_load<mode>_mask"
-  [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v,v")
+;; If mem_addr points to a memory region with less than whole vector size bytes
+;; of accessible memory and k is a mask that would prevent reading the inaccessible
+;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
+;; See pr97642.
+(define_expand "<avx512>_load<mode>_mask"
+  [(set (match_operand:V48_AVX512VL 0 "register_operand")
 	(vec_merge:V48_AVX512VL
-	  (match_operand:V48_AVX512VL 1 "nonimmediate_operand" "vm,vm")
-	  (match_operand:V48_AVX512VL 2 "nonimm_or_0_operand" "0C,v")
-	  (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")))]
+	  (match_operand:V48_AVX512VL 1 "nonimmediate_operand")
+	  (match_operand:V48_AVX512VL 2 "nonimm_or_0_operand")
+	  (match_operand:<avx512fmaskmode> 3 "register_or_constm1_operand")))]
   "TARGET_AVX512F"
 {
-  if (REG_P (operands[2])
-     && REGNO (operands[2]) != REGNO (operands[0]))
-    return "v<sseintprefix>blendm<ssemodesuffix>\t{%1, %2, %0%{%3%}|%0%{%3%}, %2, %1}";
+  if (CONST_INT_P (operands[3]))
+    {
+      emit_insn (gen_rtx_SET (operands[0], operands[1]));
+      DONE;
+    }
+  else if (MEM_P (operands[1]))
+    operands[1] = gen_rtx_UNSPEC (<MODE>mode,
+				 gen_rtvec(1, operands[1]),
+				 UNSPEC_MASKLOAD);
+})
 
+(define_insn "*<avx512>_load<mode>_mask"
+  [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v")
+	(vec_merge:V48_AVX512VL
+	  (unspec:V48_AVX512VL
+	    [(match_operand:V48_AVX512VL 1 "memory_operand" "m")]
+	    UNSPEC_MASKLOAD)
+	  (match_operand:V48_AVX512VL 2 "nonimm_or_0_operand" "0C")
+	  (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+{
   if (FLOAT_MODE_P (GET_MODE_INNER (<MODE>mode)))
     {
       if (misaligned_operand (operands[1], <MODE>mode))
@@ -1096,20 +1119,60 @@ (define_insn "<avx512>_load<mode>_mask"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "<avx512>_load<mode>_mask"
-  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v,v")
+(define_insn_and_split "*<avx512>_load<mode>"
+  [(set (match_operand:V48_AVX512VL 0 "register_operand")
+	(unspec:V48_AVX512VL
+	  [(match_operand:V48_AVX512VL 1 "memory_operand")]
+	  UNSPEC_MASKLOAD))]
+  "TARGET_AVX512F"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (match_dup 1))])
+
+(define_expand "<avx512>_load<mode>_mask"
+  [(set (match_operand:VI12_AVX512VL 0 "register_operand")
 	(vec_merge:VI12_AVX512VL
-	  (match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "vm,vm")
-	  (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C,v")
-	  (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")))]
+	  (match_operand:VI12_AVX512VL 1 "nonimmediate_operand")
+	  (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand")
+	  (match_operand:<avx512fmaskmode> 3 "register_or_constm1_operand")))]
   "TARGET_AVX512BW"
-  "@
-    vmovdqu<ssescalarsize>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}
-    vpblendm<ssemodesuffix>\t{%1, %2, %0%{%3%}|%0%{%3%}, %2, %1}"
+{
+  if (CONST_INT_P (operands[3]))
+    {
+      emit_insn (gen_rtx_SET (operands[0], operands[1]));
+      DONE;
+    }
+  else if (MEM_P (operands[1]))
+    operands[1] = gen_rtx_UNSPEC (<MODE>mode,
+				 gen_rtvec(1, operands[1]),
+				 UNSPEC_MASKLOAD);
+
+})
+
+(define_insn "*<avx512>_load<mode>_mask"
+  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
+	(vec_merge:VI12_AVX512VL
+	  (unspec:VI12_AVX512VL
+	    [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")]
+	    UNSPEC_MASKLOAD)
+	  (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C")
+	  (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))]
+  "TARGET_AVX512BW"
+  "vmovdqu<ssescalarsize>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*<avx512>_load<mode>"
+  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
+	(unspec:VI12_AVX512VL
+	  [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")]
+	  UNSPEC_MASKLOAD))]
+  "TARGET_AVX512BW"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (match_dup 1))])
+
 (define_insn "avx512f_mov<ssescalarmodelower>_mask"
   [(set (match_operand:VF_128 0 "register_operand" "=v")
 	(vec_merge:VF_128
@@ -1171,21 +1234,50 @@ (define_insn "avx512f_store<mode>_mask"
    (set_attr "memory" "store")
    (set_attr "mode" "<MODE>")])
 
-(define_expand "<avx512>_blendm<mode>"
-  [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v")
+(define_insn "<avx512>_blendm<mode>"
+  [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v,v")
 	(vec_merge:V48_AVX512VL
-	  (match_operand:V48_AVX512VL 2 "nonimmediate_operand" "vm")
-	  (match_operand:V48_AVX512VL 1 "register_operand" "v")
-	  (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))]
-  "TARGET_AVX512F")
+	  (match_operand:V48_AVX512VL 2 "nonimmediate_operand" "vm,vm")
+	  (match_operand:V48_AVX512VL 1 "nonimm_or_0_operand" "0C,v")
+	  (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")))]
+  "TARGET_AVX512F"
+{
+  if (REG_P (operands[1])
+     && REGNO (operands[1]) != REGNO (operands[0]))
+    return "v<sseintprefix>blendm<ssemodesuffix>\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}";
 
-(define_expand "<avx512>_blendm<mode>"
-  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
+  if (FLOAT_MODE_P (GET_MODE_INNER (<MODE>mode)))
+    {
+      if (misaligned_operand (operands[2], <MODE>mode))
+	return "vmovu<ssemodesuffix>\t{%2, %0%{%3%}%N1|%0%{%3%}%N1, %2}";
+      else
+	return "vmova<ssemodesuffix>\t{%2, %0%{%3%}%N1|%0%{%3%}%N1, %2}";
+    }
+  else
+    {
+      if (misaligned_operand (operands[2], <MODE>mode))
+	return "vmovdqu<ssescalarsize>\t{%2, %0%{%3%}%N1|%0%{%3%}%N1, %2}";
+      else
+	return "vmovdqa<ssescalarsize>\t{%2, %0%{%3%}%N1|%0%{%3%}%N1, %2}";
+    }
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<avx512>_blendm<mode>"
+  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v,v")
 	(vec_merge:VI12_AVX512VL
-	  (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
-	  (match_operand:VI12_AVX512VL 1 "register_operand" "v")
-	  (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))]
-  "TARGET_AVX512BW")
+	  (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm,vm")
+	  (match_operand:VI12_AVX512VL 1 "nonimm_or_0_operand" "0C,v")
+	  (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")))]
+  "TARGET_AVX512BW"
+  "@
+    vmovdqu<ssescalarsize>\t{%2, %0%{%3%}%N1|%0%{%3%}%N1, %2}
+    vpblendm<ssemodesuffix>\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "<avx512>_store<mode>_mask"
   [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m")
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu16-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu16-1.c
index dcb8caaa73e..8603a1909c7 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu16-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu16-1.c
@@ -1,8 +1,8 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bw -mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovdqu16|vpblendmw)\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "(?:vmovdqu16|vpblendmw)\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "(?:vmovdqu16|vpblendmw)\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu8-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu8-1.c
index a335bcab3b2..d1e33926c81 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu8-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu8-1.c
@@ -1,8 +1,8 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bw -mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovdqu8|vpblendmb)\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "(?:vmovdqu8|vpblendmb)\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "(?:vmovdqu8|vpblendmb)\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vmovapd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vmovapd-1.c
index 7fc84b16e2b..e869f70665a 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vmovapd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vmovapd-1.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovapd|vblendmpd)\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vmovaps-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vmovaps-1.c
index c2e2655fda6..a7635a3ebf2 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vmovaps-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vmovaps-1.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovaps|vblendmps)\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vmovdqa32-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vmovdqa32-1.c
index 8fb816c1317..b93727d9ef2 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vmovdqa32-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vmovdqa32-1.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovdqa32|vpblendmd)\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vmovdqa64-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vmovdqa64-1.c
index 4352b12b6e7..1c372c4f92a 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vmovdqa64-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vmovdqa64-1.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovdqa64|vpblendmq)\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vmovapd-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vmovapd-1.c
index fd59660f932..89c3ebefe35 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vmovapd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vmovapd-1.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovapd|vblendmpd)\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "(?:vmovapd|vblendmpd)\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vmovaps-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vmovaps-1.c
index 455b1a9dc37..2196ebb55d9 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vmovaps-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vmovaps-1.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovaps|vblendmps)\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "(?:vmovaps|vblendmps)\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa32-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa32-1.c
index 5c6a3d0bfb4..9f991dbaca2 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa32-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa32-1.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovdqa32|vpblendmd)\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "(?:vmovdqa32|vpblendmd)\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c
index 592541aeb8e..d20b4a7b997 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovdqa64|vpblendmq)\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "(?:vmovdqa64|vpblendmq)\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\\(\[^\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 { target nonpic } } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr97642-1.c b/gcc/testsuite/gcc.target/i386/pr97642-1.c
new file mode 100644
index 00000000000..f511440e577
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr97642-1.c
@@ -0,0 +1,41 @@
+/* PR target/97642 */
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -O2" } */
+/* { dg-final { scan-assembler-not { k[0-8] } } } */
+
+#include <immintrin.h>
+__m128i
+foo1 (__m128i src, void const* P)
+{
+  return _mm_mask_loadu_epi32 (src, 15, P);
+}
+
+__m256i
+foo2 (__m256i src, void const* P)
+{
+  return _mm256_mask_loadu_epi32 (src, 255, P);
+}
+
+__m512i
+foo3 (__m512i src, void const* P)
+{
+  return _mm512_mask_loadu_epi32 (src, 65535 , P);
+}
+
+__m128i
+foo4 (__m128i src, void const* P)
+{
+  return _mm_mask_loadu_epi32 (src, -1, P);
+}
+
+__m256i
+foo5 (__m256i src, void const* P)
+{
+  return _mm256_mask_loadu_epi32 (src, -1, P);
+}
+
+__m512i
+foo6 (__m512i src, void const* P)
+{
+  return _mm512_mask_loadu_epi32 (src, -1 , P);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr97642-2.c b/gcc/testsuite/gcc.target/i386/pr97642-2.c
new file mode 100644
index 00000000000..eb06a2739b4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr97642-2.c
@@ -0,0 +1,77 @@
+/* PR target/97642 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512dq -mavx512vl -mavx512bw" } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512dq } */
+/* { dg-require-effective-target avx512bw } */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <stdint.h>
+#include <sys/mman.h>
+
+#define N 5
+
+// Faults with GCC because usage of vpblendd
+__m256i __attribute__((noinline)) mask_load(uint32_t * arr) {
+  __m256i tmp;
+  return _mm256_mask_loadu_epi32(tmp, (1 << N) - 1, arr);
+}
+
+// Faults
+__m256i __attribute__((noinline)) blend_load_asm(uint32_t * arr) {
+  __m256i tmp = _mm256_set1_epi64x(0);
+  asm volatile("vpblendd %[m], (%[arr]), %[tmp], %[tmp]\n\t"
+	       : [ tmp ] "+x"(tmp)
+	       : [ arr ] "r"(arr), [ m ] "i"(((1 << N) - 1))
+	       :);
+  return tmp;
+}
+
+// Does not fault
+__m256i __attribute__((noinline)) mask_load_asm(uint32_t * arr) {
+  __m256i           tmp;
+  asm volatile(
+	       "movb %[m], %%al\n\t"
+	       "kmovb %%eax, %%k1\n\t"
+	       "vmovdqu32 (%[arr]), %[tmp] %{%%k1} %{z%}\n\t"
+	       : [ tmp ] "+x"(tmp)
+	       : [ arr ] "r"(arr), [ m ] "i"(((1 << N) - 1))
+	       : "eax", "k1");
+  return tmp;
+}
+
+
+void __attribute__((noinline)) mask_store(uint32_t * arr, __m256i v) {
+  return _mm256_mask_storeu_epi32(arr, (1 << N) - 1, v);
+}
+
+
+#define NPAGES      (2)
+#define END_OF_PAGE (1024 - N)
+
+#ifndef LOAD_METHOD
+#define LOAD_METHOD mask_load // mask_load_asm does not fault
+#endif
+
+
+int
+main() {
+  if (!(__builtin_cpu_supports ("avx512dq")
+	&& __builtin_cpu_supports ("avx512vl")
+	&& __builtin_cpu_supports ("avx512bw")))
+    return 0;
+
+  uint32_t * addr =
+    (uint32_t *)mmap(NULL, NPAGES * 4096, PROT_READ | PROT_WRITE,
+		     MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+  for (uint32_t i = 0; i < NPAGES; i += 2) {
+
+    uint32_t page_offset      = 1024 * i + END_OF_PAGE;
+    uint32_t next_page_offset = 1024 * (i + 1);
+
+    assert(!mprotect(addr + next_page_offset, 4096, PROT_NONE));
+    mask_store(addr + page_offset, LOAD_METHOD(addr + page_offset));
+  }
+}
-- 
2.18.1

Reply via email to