Re: [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'

2023-07-10 Thread Hongtao Liu via Gcc-patches
On Tue, Jul 11, 2023 at 12:24 AM Alexander Monakov via Gcc-patches
 wrote:
>
>
> On Mon, 10 Jul 2023, liuhongt via Gcc-patches wrote:
>
> > False dependency happens when destination is only updated by
> > pternlog. There is no false dependency when destination is also used
> > in source. So either a pxor should be inserted, or input operand
> > should be set with constraint '0'.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ready to push to trunk.
>
> Shouldn't this patch also remove uses of vpternlog in
> standard_sse_constant_opcode?
It's still needed when !optimize_function_for_speed_p (cfun).
>
> A couple more questions below:
>
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -1382,6 +1382,29 @@ (define_insn "mov_internal"
> > ]
> > (symbol_ref "true")))])
> >
> > +; False dependency happens on destination register which is not really
> > +; used when moving all ones to vector register
> > +(define_split
> > +  [(set (match_operand:VMOVE 0 "register_operand")
> > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
> > +  "TARGET_AVX512F && reload_completed
> > +  && ( == 64 || EXT_REX_SSE_REG_P (operands[0]))
> > +  && optimize_function_for_speed_p (cfun)"
>
> Yan's patch used optimize_insn_for_speed_p (), which looks more appropriate.
> Doesn't it work here as well?
I'm just aligning with the lzcnt/popcnt case. The difference between
optimize_insn_for_speed_p and optimize_function_for_speed_p is that the
former also considers
!crtl->maybe_hot_insn_p, while the latter just returns
!optimize_function_for_size_p (cfun). It looks like
optimize_insn_for_speed_p() is more reasonable for a single insn.

 350 optimize_insn_for_size_p (void)
 351 {
 352   enum optimize_size_level ret = optimize_function_for_size_p (cfun);
 353   if (ret < OPTIMIZE_SIZE_BALANCED && !crtl->maybe_hot_insn_p)
 354     ret = OPTIMIZE_SIZE_BALANCED;
 355   return ret;

>
> > +  [(set (match_dup 0) (match_dup 2))
> > +   (parallel
> > + [(set (match_dup 0) (match_dup 1))
> > +  (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> > +  "operands[2] = CONST0_RTX (mode);")
> > +
> > +(define_insn "*vmov_constm1_pternlog_false_dep"
> > +  [(set (match_operand:VMOVE 0 "register_operand" "=v")
> > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" 
> > ""))
> > +   (unspec [(match_operand:VMOVE 2 "register_operand" "0")] 
> > UNSPEC_INSN_FALSE_DEP)]
> > +   "TARGET_AVX512VL ||  == 64"
> > +   "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
> > +  [(set_attr "type" "sselog1")
> > +   (set_attr "prefix" "evex")])
> > +
> >  ;; If mem_addr points to a memory region with less than whole vector size 
> > bytes
> >  ;; of accessible memory and k is a mask that would prevent reading the 
> > inaccessible
> >  ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be 
> > transformed to vpblendd
> > @@ -9336,7 +9359,7 @@ (define_expand 
> > "_cvtmask2"
> >  operands[3] = CONST0_RTX (mode);
> >}")
> >
> > -(define_insn "*_cvtmask2"
> > +(define_insn_and_split "*_cvtmask2"
> >[(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
> >   (vec_merge:VI48_AVX512VL
> > (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
> > @@ -9346,11 +9369,35 @@ (define_insn 
> > "*_cvtmask2"
> >"@
> > vpmovm2\t{%1, %0|%0, %1}
> > vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, 
> > %0, %0, 0x81}"
> > +  "&& !TARGET_AVX512DQ && reload_completed
> > +   && optimize_function_for_speed_p (cfun)"
> > +  [(set (match_dup 0) (match_dup 4))
> > +   (parallel
> > +[(set (match_dup 0)
> > +   (vec_merge:VI48_AVX512VL
> > + (match_dup 2)
> > + (match_dup 3)
> > + (match_dup 1)))
> > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> > +  "operands[4] = CONST0_RTX (mode);"
> >[(set_attr "isa" "avx512dq,*")
> > (set_attr "length_immediate" "0,1")
> > (set_attr "prefix" "evex")
> > (set_attr "mode" "")])
> >
> > +(define_insn "*_cvtmask2_pternlog_false_dep"
> > +  [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
> > + (vec_merge:VI48_AVX512VL
> > +   (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
> > +   (match_operand:VI48_AVX512VL 3 "const0_operand")
> > +   (match_operand: 1 "register_operand" "Yk")))
> > +   (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] 
> > UNSPEC_INSN_FALSE_DEP)]
> > +  "TARGET_AVX512F && !TARGET_AVX512DQ"
> > +  "vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, 
> > %0, %0, 0x81}"
> > +  [(set_attr "length_immediate" "1")
> > +   (set_attr "prefix" "evex")
> > +   (set_attr "mode" "")])
> > +
> >  (define_expand "extendv2sfv2df2"
> >[(set (match_operand:V2DF 0 "register_operand")
> >   (float_extend:V2DF
> > @@ -17166,20 +17213,32 @@ (define_expand "one_cmpl2"
> >  operands[2] = force_reg (mode, operands[2]);
> >  })
> >
> > -(define_insn "one_cmpl2"
> > -  [(set 

Re: [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'

2023-07-10 Thread Alexander Monakov via Gcc-patches


On Mon, 10 Jul 2023, liuhongt via Gcc-patches wrote:

> False dependency happens when destination is only updated by
> pternlog. There is no false dependency when destination is also used
> in source. So either a pxor should be inserted, or input operand
> should be set with constraint '0'.
> 
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ready to push to trunk.

Shouldn't this patch also remove uses of vpternlog in
standard_sse_constant_opcode?

A couple more questions below:

> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -1382,6 +1382,29 @@ (define_insn "mov_internal"
> ]
> (symbol_ref "true")))])
>  
> +; False dependency happens on destination register which is not really
> +; used when moving all ones to vector register
> +(define_split
> +  [(set (match_operand:VMOVE 0 "register_operand")
> + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
> +  "TARGET_AVX512F && reload_completed
> +  && ( == 64 || EXT_REX_SSE_REG_P (operands[0]))
> +  && optimize_function_for_speed_p (cfun)"

Yan's patch used optimize_insn_for_speed_p (), which looks more appropriate.
Doesn't it work here as well?

> +  [(set (match_dup 0) (match_dup 2))
> +   (parallel
> + [(set (match_dup 0) (match_dup 1))
> +  (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> +  "operands[2] = CONST0_RTX (mode);")
> +
> +(define_insn "*vmov_constm1_pternlog_false_dep"
> +  [(set (match_operand:VMOVE 0 "register_operand" "=v")
> + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" 
> ""))
> +   (unspec [(match_operand:VMOVE 2 "register_operand" "0")] 
> UNSPEC_INSN_FALSE_DEP)]
> +   "TARGET_AVX512VL ||  == 64"
> +   "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
> +  [(set_attr "type" "sselog1")
> +   (set_attr "prefix" "evex")])
> +
>  ;; If mem_addr points to a memory region with less than whole vector size 
> bytes
>  ;; of accessible memory and k is a mask that would prevent reading the 
> inaccessible
>  ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed 
> to vpblendd
> @@ -9336,7 +9359,7 @@ (define_expand "_cvtmask2"
>  operands[3] = CONST0_RTX (mode);
>}")
>  
> -(define_insn "*_cvtmask2"
> +(define_insn_and_split "*_cvtmask2"
>[(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
>   (vec_merge:VI48_AVX512VL
> (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
> @@ -9346,11 +9369,35 @@ (define_insn "*_cvtmask2"
>"@
> vpmovm2\t{%1, %0|%0, %1}
> vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, 
> %0, %0, 0x81}"
> +  "&& !TARGET_AVX512DQ && reload_completed
> +   && optimize_function_for_speed_p (cfun)"
> +  [(set (match_dup 0) (match_dup 4))
> +   (parallel
> +[(set (match_dup 0)
> +   (vec_merge:VI48_AVX512VL
> + (match_dup 2)
> + (match_dup 3)
> + (match_dup 1)))
> + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> +  "operands[4] = CONST0_RTX (mode);"
>[(set_attr "isa" "avx512dq,*")
> (set_attr "length_immediate" "0,1")
> (set_attr "prefix" "evex")
> (set_attr "mode" "")])
>  
> +(define_insn "*_cvtmask2_pternlog_false_dep"
> +  [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
> + (vec_merge:VI48_AVX512VL
> +   (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
> +   (match_operand:VI48_AVX512VL 3 "const0_operand")
> +   (match_operand: 1 "register_operand" "Yk")))
> +   (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] 
> UNSPEC_INSN_FALSE_DEP)]
> +  "TARGET_AVX512F && !TARGET_AVX512DQ"
> +  "vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, 
> %0, %0, 0x81}"
> +  [(set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set_attr "mode" "")])
> +
>  (define_expand "extendv2sfv2df2"
>[(set (match_operand:V2DF 0 "register_operand")
>   (float_extend:V2DF
> @@ -17166,20 +17213,32 @@ (define_expand "one_cmpl2"
>  operands[2] = force_reg (mode, operands[2]);
>  })
>  
> -(define_insn "one_cmpl2"
> -  [(set (match_operand:VI 0 "register_operand" "=v,v")
> - (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m")
> - (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))]
> +(define_insn_and_split "one_cmpl2"
> +  [(set (match_operand:VI 0 "register_operand" "=v,v,v")
> + (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br")
> + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))]
>"TARGET_AVX512F
> && (!
> || mode == SImode
> || mode == DImode)"
>  {
> +  if (! && which_alternative
> +  && optimize_function_for_speed_p (cfun))
> +return "#";
> +
>if (TARGET_AVX512VL)
>  return "vpternlog\t{$0x55, %1, %0, 
> %0|%0, %0, %1, 0x55}";
>else
>  return "vpternlog\t{$0x55, %g1, %g0, 
> %g0|%g0, %g0, %g1, 0x55}";
>  }
> +  "&& reload_completed && !REG_P (operands[1]) && !
> +   && optimize_function_for_speed_p (cfun)"
> 

[PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'

2023-07-09 Thread liuhongt via Gcc-patches
False dependency happens when destination is only updated by
pternlog. There is no false dependency when destination is also used
in source. So either a pxor should be inserted, or input operand
should be set with constraint '0'.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/110438
PR target/110202
* config/i386/predicates.md
(int_float_vector_all_ones_operand): New predicate.
* config/i386/sse.md (*vmov_constm1_pternlog_false_dep): New
define_insn.
(*_cvtmask2_pternlog_false_dep):
Ditto.
(*_cvtmask2_pternlog_false_dep):
Ditto.
(*_cvtmask2): Adjust to
define_insn_and_split to avoid false dependence.
(*_cvtmask2): Ditto.
(one_cmpl2): Adjust constraint
of operands 1 to '0' to avoid false dependence.
(*andnot3): Ditto.
(iornot3): Ditto.
(*3): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110438.c: New test.
---
 gcc/config/i386/predicates.md|   8 +-
 gcc/config/i386/sse.md   | 113 ---
 gcc/testsuite/gcc.target/i386/pr110438.c |  30 ++
 3 files changed, 135 insertions(+), 16 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 7ddbe01a6f9..37d20c6303a 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand"
 return false;
 })
 
-/* Return true if operand is a vector constant that is all ones. */
+/* Return true if operand is an integral vector constant that is all ones. */
 (define_predicate "vector_all_ones_operand"
   (and (match_code "const_vector")
(match_test "INTEGRAL_MODE_P (GET_MODE (op))")
(match_test "op == CONSTM1_RTX (GET_MODE (op))")))
 
+/* Return true if operand is a vector constant that is all ones. */
+(define_predicate "int_float_vector_all_ones_operand"
+  (ior (match_operand 0 "vector_all_ones_operand")
+   (match_operand 0 "float_vector_all_ones_operand")
+   (match_test "op == constm1_rtx")))
+
 /* Return true if operand is an 128/256bit all ones vector
that zero-extends to 256/512bit.  */
 (define_predicate "vector_all_ones_zero_extend_half_operand"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 418c337a775..56920a3e1d3 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1382,6 +1382,29 @@ (define_insn "mov_internal"
  ]
  (symbol_ref "true")))])
 
+; False dependency happens on destination register which is not really
+; used when moving all ones to vector register
+(define_split
+  [(set (match_operand:VMOVE 0 "register_operand")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
+  "TARGET_AVX512F && reload_completed
+  && ( == 64 || EXT_REX_SSE_REG_P (operands[0]))
+  && optimize_function_for_speed_p (cfun)"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel
+ [(set (match_dup 0) (match_dup 1))
+  (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[2] = CONST0_RTX (mode);")
+
+(define_insn "*vmov_constm1_pternlog_false_dep"
+  [(set (match_operand:VMOVE 0 "register_operand" "=v")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" 
""))
+   (unspec [(match_operand:VMOVE 2 "register_operand" "0")] 
UNSPEC_INSN_FALSE_DEP)]
+   "TARGET_AVX512VL ||  == 64"
+   "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix" "evex")])
+
 ;; If mem_addr points to a memory region with less than whole vector size bytes
 ;; of accessible memory and k is a mask that would prevent reading the 
inaccessible
 ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to 
vpblendd
@@ -9336,7 +9359,7 @@ (define_expand "_cvtmask2"
 operands[3] = CONST0_RTX (mode);
   }")
 
-(define_insn "*_cvtmask2"
+(define_insn_and_split "*_cvtmask2"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VI48_AVX512VL
  (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
@@ -9346,11 +9369,35 @@ (define_insn "*_cvtmask2"
   "@
vpmovm2\t{%1, %0|%0, %1}
vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, 
%0, 0x81}"
+  "&& !TARGET_AVX512DQ && reload_completed
+   && optimize_function_for_speed_p (cfun)"
+  [(set (match_dup 0) (match_dup 4))
+   (parallel
+[(set (match_dup 0)
+ (vec_merge:VI48_AVX512VL
+   (match_dup 2)
+   (match_dup 3)
+   (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[4] = CONST0_RTX (mode);"
   [(set_attr "isa" "avx512dq,*")
(set_attr "length_immediate" "0,1")
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn "*_cvtmask2_pternlog_false_dep"
+  [(set 

Re: [PATCH] Break false dependence for vpternlog by inserting vpxor.

2023-07-07 Thread Hongtao Liu via Gcc-patches
On Thu, Jul 6, 2023 at 11:46 PM  wrote:
>
> > +; False dependency happens on destination register which is not really
> > +; used when moving all ones to vector register
> > +(define_split
> > +  [(set (match_operand:VMOVE 0 "register_operand")
> > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
> > +  "TARGET_AVX512F && reload_completed
> > +  && ( == 64 || EXT_REX_SSE_REG_P (operands[0]))"
> > +  [(set (match_dup 0) (match_dup 2))
> > +   (parallel
> > + [(set (match_dup 0) (match_dup 1))
> > +  (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> > +  "operands[2] = CONST0_RTX (mode);")
>
> I think we shouldn't emit PXOR when optimizing for size. So we should change
> the define_split to:
> define_split
>[(set (match_operand:VMOVE 0 "register_operand")
> (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
>"TARGET_AVX512F && reload_completed
>&& ( == 64 || EXT_REX_SSE_REG_P (operands[0]))
>&& optimize_insn_for_speed_p ()"
>[(set (match_dup 0) (match_dup 2))
> (parallel
>   [(set (match_dup 0) (match_dup 1))
>(unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
>"operands[2] = CONST0_RTX (mode);")
Yes, will do. I'm still working on breaking the false dependence for
pternlog in the newly added patterns *iornot3,*xnor3 and
*3.
I will repost the patch when it's done.



-- 
BR,
Hongtao


Re: [PATCH] Break false dependence for vpternlog by inserting vpxor.

2023-07-06 Thread simonaytes.yan--- via Gcc-patches

+; False dependency happens on destination register which is not really
+; used when moving all ones to vector register
+(define_split
+  [(set (match_operand:VMOVE 0 "register_operand")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
+  "TARGET_AVX512F && reload_completed
+  && ( == 64 || EXT_REX_SSE_REG_P (operands[0]))"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel
+ [(set (match_dup 0) (match_dup 1))
+  (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[2] = CONST0_RTX (mode);")


I think we shouldn't emit PXOR when optimizing for size. So we should change
the define_split to:

define_split
  [(set (match_operand:VMOVE 0 "register_operand")
(match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
  "TARGET_AVX512F && reload_completed
  && ( == 64 || EXT_REX_SSE_REG_P (operands[0]))
  && optimize_insn_for_speed_p ()"
  [(set (match_dup 0) (match_dup 2))
   (parallel
 [(set (match_dup 0) (match_dup 1))
  (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
  "operands[2] = CONST0_RTX (mode);")


[PATCH] Break false dependence for vpternlog by inserting vpxor.

2023-07-03 Thread liuhongt via Gcc-patches
vpternlog is also used for optimizations that don't need any valid
input operand. In that case, the destination register is used as an input
of the instruction, which creates a false dependence.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/110438
* config/i386/predicates.md
(int_float_vector_all_ones_operand): New predicate.
* config/i386/sse.md (*vmov_constm1_pternlog): New
define_insn.
(*_cvtmask2): Adjust to
define_insn_and_split to avoid false dependence.
(*_cvtmask2_pternlog): New
define_insn.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110438.c: New test.
---
 gcc/config/i386/predicates.md|  8 ++-
 gcc/config/i386/sse.md   | 69 +++-
 gcc/testsuite/gcc.target/i386/pr110438.c | 30 +++
 3 files changed, 94 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index fb07707dcba..df0d9e20def 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand"
 return false;
 })
 
-/* Return true if operand is a vector constant that is all ones. */
+/* Return true if operand is an integral vector constant that is all ones. */
 (define_predicate "vector_all_ones_operand"
   (and (match_code "const_vector")
(match_test "INTEGRAL_MODE_P (GET_MODE (op))")
(match_test "op == CONSTM1_RTX (GET_MODE (op))")))
 
+/* Return true if operand is a vector constant that is all ones. */
+(define_predicate "int_float_vector_all_ones_operand"
+  (ior (match_operand 0 "vector_all_ones_operand")
+   (match_operand 0 "float_vector_all_ones_operand")
+   (match_test "op == constm1_rtx")))
+
 /* Return true if operand is an 128/256bit all ones vector
that zero-extends to 256/512bit.  */
 (define_predicate "vector_all_ones_zero_extend_half_operand"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 812cfca4b92..93cdd844026 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1382,6 +1382,28 @@ (define_insn "mov_internal"
  ]
  (symbol_ref "true")))])
 
+; False dependency happens on destination register which is not really
+; used when moving all ones to vector register
+(define_split
+  [(set (match_operand:VMOVE 0 "register_operand")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
+  "TARGET_AVX512F && reload_completed
+  && ( == 64 || EXT_REX_SSE_REG_P (operands[0]))"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel
+ [(set (match_dup 0) (match_dup 1))
+  (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[2] = CONST0_RTX (mode);")
+
+(define_insn "*vmov_constm1_pternlog"
+  [(set (match_operand:VMOVE 0 "register_operand" "=v")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" 
""))
+   (unspec [(match_operand:VMOVE 2 "register_operand" "0")] 
UNSPEC_INSN_FALSE_DEP)]
+   "TARGET_AVX512VL ||  == 64"
+   "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix" "evex")])
+
 ;; If mem_addr points to a memory region with less than whole vector size bytes
 ;; of accessible memory and k is a mask that would prevent reading the 
inaccessible
 ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to 
vpblendd
@@ -9336,7 +9358,7 @@ (define_expand "_cvtmask2"
 operands[3] = CONST0_RTX (mode);
   }")
 
-(define_insn "*_cvtmask2"
+(define_insn_and_split "*_cvtmask2"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VI48_AVX512VL
  (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
@@ -9345,12 +9367,35 @@ (define_insn "*_cvtmask2"
   "TARGET_AVX512F"
   "@
vpmovm2\t{%1, %0|%0, %1}
-   vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, 
%0, 0x81}"
+   #"
+  "&& !TARGET_AVX512DQ && reload_completed"
+  [(set (match_dup 0) (match_dup 4))
+   (parallel
+[(set (match_dup 0)
+ (vec_merge:VI48_AVX512VL
+   (match_dup 2)
+   (match_dup 3)
+   (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[4] = CONST0_RTX (mode);"
   [(set_attr "isa" "avx512dq,*")
(set_attr "length_immediate" "0,1")
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn "*_cvtmask2_pternlog"
+  [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
+   (vec_merge:VI48_AVX512VL
+ (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
+ (match_operand:VI48_AVX512VL 3 "const0_operand")
+ (match_operand: 1 "register_operand" "Yk")))
+   (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] 
UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_AVX512F && !TARGET_AVX512DQ"
+