Re: [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'
On Tue, Jul 11, 2023 at 12:24 AM Alexander Monakov via Gcc-patches wrote: > > > On Mon, 10 Jul 2023, liuhongt via Gcc-patches wrote: > > > False dependency happens when destination is only updated by > > pternlog. There is no false dependency when destination is also used > > in source. So either a pxor should be inserted, or input operand > > should be set with constraint '0'. > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > > Ready to push to trunk. > > Shouldn't this patch also remove uses of vpternlog in > standard_sse_constant_opcode? It's still needed when !optimize_function_for_speed_p (cfun). > > A couple more questions below: > > > --- a/gcc/config/i386/sse.md > > +++ b/gcc/config/i386/sse.md > > @@ -1382,6 +1382,29 @@ (define_insn "mov_internal" > > ] > > (symbol_ref "true")))]) > > > > +; False dependency happens on destination register which is not really > > +; used when moving all ones to vector register > > +(define_split > > + [(set (match_operand:VMOVE 0 "register_operand") > > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] > > + "TARGET_AVX512F && reload_completed > > + && ( == 64 || EXT_REX_SSE_REG_P (operands[0])) > > + && optimize_function_for_speed_p (cfun)" > > Yan's patch used optimize_insn_for_speed_p (), which looks more appropriate. > Doesn't it work here as well? I'm just aligned with lzcnt/popcnt case, the difference between option_insn_for_speed_p and optimized_function_for_speed_p is the former will consider !crtl->maybe_hot_insn_p but the latter just returns !optimize_function_for_size_p (cfun). It looks optimize_insn_for_speed_p() is more reasonable for single insn. 350optimize_insn_for_size_p (void) 351{ 352 enum optimize_size_level ret = optimize_function_for_size_p (cfun); 353 if (ret < OPTIMIZE_SIZE_BALANCED && !crtl->maybe_hot_insn_p) 354ret = OPTIMIZE_SIZE_BALANCED; 355 return ret; > > > + [(set (match_dup 0) (match_dup 2)) > > + (parallel > > + [(set (match_dup 0) (match_dup 1)) > > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > > + "operands[2] = CONST0_RTX (mode);") > > + > > +(define_insn "*vmov_constm1_pternlog_false_dep" > > + [(set (match_operand:VMOVE 0 "register_operand" "=v") > > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" > > "")) > > + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] > > UNSPEC_INSN_FALSE_DEP)] > > + "TARGET_AVX512VL || == 64" > > + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}" > > + [(set_attr "type" "sselog1") > > + (set_attr "prefix" "evex")]) > > + > > ;; If mem_addr points to a memory region with less than whole vector size > > bytes > > ;; of accessible memory and k is a mask that would prevent reading the > > inaccessible > > ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be > > transformed to vpblendd > > @@ -9336,7 +9359,7 @@ (define_expand > > "_cvtmask2" > > operands[3] = CONST0_RTX (mode); > >}") > > > > -(define_insn "*_cvtmask2" > > +(define_insn_and_split "*_cvtmask2" > >[(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v") > > (vec_merge:VI48_AVX512VL > > (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") > > @@ -9346,11 +9369,35 @@ (define_insn > > "*_cvtmask2" > >"@ > > vpmovm2\t{%1, %0|%0, %1} > > vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, > > %0, %0, 0x81}" > > + "&& !TARGET_AVX512DQ && reload_completed > > + && optimize_function_for_speed_p (cfun)" > > + [(set (match_dup 0) (match_dup 4)) > > + (parallel > > +[(set (match_dup 0) > > + (vec_merge:VI48_AVX512VL > > + (match_dup 2) > > + (match_dup 3) > > + (match_dup 1))) > > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > > + "operands[4] = CONST0_RTX (mode);" > >[(set_attr "isa" "avx512dq,*") > > (set_attr "length_immediate" "0,1") > > (set_attr "prefix" "evex") > > (set_attr "mode" "")]) > > > > +(define_insn "*_cvtmask2_pternlog_false_dep" > > + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") > > + (vec_merge:VI48_AVX512VL > > + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") > > + (match_operand:VI48_AVX512VL 3 "const0_operand") > > + (match_operand: 1 "register_operand" "Yk"))) > > + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] > > UNSPEC_INSN_FALSE_DEP)] > > + "TARGET_AVX512F && !TARGET_AVX512DQ" > > + "vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, > > %0, %0, 0x81}" > > + [(set_attr "length_immediate" "1") > > + (set_attr "prefix" "evex") > > + (set_attr "mode" "")]) > > + > > (define_expand "extendv2sfv2df2" > >[(set (match_operand:V2DF 0 "register_operand") > > (float_extend:V2DF > > @@ -17166,20 +17213,32 @@ (define_expand "one_cmpl2" > > operands[2] = force_reg (mode, operands[2]); > > }) > > > > -(define_insn "one_cmpl2" > > - [(set
Re: [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'
On Mon, 10 Jul 2023, liuhongt via Gcc-patches wrote: > False dependency happens when destination is only updated by > pternlog. There is no false dependency when destination is also used > in source. So either a pxor should be inserted, or input operand > should be set with constraint '0'. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ready to push to trunk. Shouldn't this patch also remove uses of vpternlog in standard_sse_constant_opcode? A couple more questions below: > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -1382,6 +1382,29 @@ (define_insn "mov_internal" > ] > (symbol_ref "true")))]) > > +; False dependency happens on destination register which is not really > +; used when moving all ones to vector register > +(define_split > + [(set (match_operand:VMOVE 0 "register_operand") > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] > + "TARGET_AVX512F && reload_completed > + && ( == 64 || EXT_REX_SSE_REG_P (operands[0])) > + && optimize_function_for_speed_p (cfun)" Yan's patch used optimize_insn_for_speed_p (), which looks more appropriate. Doesn't it work here as well? > + [(set (match_dup 0) (match_dup 2)) > + (parallel > + [(set (match_dup 0) (match_dup 1)) > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > + "operands[2] = CONST0_RTX (mode);") > + > +(define_insn "*vmov_constm1_pternlog_false_dep" > + [(set (match_operand:VMOVE 0 "register_operand" "=v") > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" > "")) > + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] > UNSPEC_INSN_FALSE_DEP)] > + "TARGET_AVX512VL || == 64" > + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}" > + [(set_attr "type" "sselog1") > + (set_attr "prefix" "evex")]) > + > ;; If mem_addr points to a memory region with less than whole vector size > bytes > ;; of accessible memory and k is a mask that would prevent reading the > inaccessible > ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed > to vpblendd > @@ -9336,7 +9359,7 @@ (define_expand "_cvtmask2" > operands[3] = CONST0_RTX (mode); >}") > > -(define_insn "*_cvtmask2" > +(define_insn_and_split "*_cvtmask2" >[(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v") > (vec_merge:VI48_AVX512VL > (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") > @@ -9346,11 +9369,35 @@ (define_insn "*_cvtmask2" >"@ > vpmovm2\t{%1, %0|%0, %1} > vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, > %0, %0, 0x81}" > + "&& !TARGET_AVX512DQ && reload_completed > + && optimize_function_for_speed_p (cfun)" > + [(set (match_dup 0) (match_dup 4)) > + (parallel > +[(set (match_dup 0) > + (vec_merge:VI48_AVX512VL > + (match_dup 2) > + (match_dup 3) > + (match_dup 1))) > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > + "operands[4] = CONST0_RTX (mode);" >[(set_attr "isa" "avx512dq,*") > (set_attr "length_immediate" "0,1") > (set_attr "prefix" "evex") > (set_attr "mode" "")]) > > +(define_insn "*_cvtmask2_pternlog_false_dep" > + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") > + (vec_merge:VI48_AVX512VL > + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") > + (match_operand:VI48_AVX512VL 3 "const0_operand") > + (match_operand: 1 "register_operand" "Yk"))) > + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] > UNSPEC_INSN_FALSE_DEP)] > + "TARGET_AVX512F && !TARGET_AVX512DQ" > + "vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, > %0, %0, 0x81}" > + [(set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "")]) > + > (define_expand "extendv2sfv2df2" >[(set (match_operand:V2DF 0 "register_operand") > (float_extend:V2DF > @@ -17166,20 +17213,32 @@ (define_expand "one_cmpl2" > operands[2] = force_reg (mode, operands[2]); > }) > > -(define_insn "one_cmpl2" > - [(set (match_operand:VI 0 "register_operand" "=v,v") > - (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m") > - (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))] > +(define_insn_and_split "one_cmpl2" > + [(set (match_operand:VI 0 "register_operand" "=v,v,v") > + (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br") > + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))] >"TARGET_AVX512F > && (! > || mode == SImode > || mode == DImode)" > { > + if (! && which_alternative > + && optimize_function_for_speed_p (cfun)) > +return "#"; > + >if (TARGET_AVX512VL) > return "vpternlog\t{$0x55, %1, %0, > %0|%0, %0, %1, 0x55}"; >else > return "vpternlog\t{$0x55, %g1, %g0, > %g0|%g0, %g0, %g1, 0x55}"; > } > + "&& reload_completed && !REG_P (operands[1]) && ! > + && optimize_function_for_speed_p (cfun)" >
[PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'
False dependency happens when destination is only updated by pternlog. There is no false dependency when destination is also used in source. So either a pxor should be inserted, or input operand should be set with constraint '0'. Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ready to push to trunk. gcc/ChangeLog: PR target/110438 PR target/110202 * config/i386/predicates.md (int_float_vector_all_ones_operand): New predicate. * config/i386/sse.md (*vmov_constm1_pternlog_false_dep): New define_insn. (*_cvtmask2_pternlog_false_dep): Ditto. (*_cvtmask2_pternlog_false_dep): Ditto. (*_cvtmask2): Adjust to define_insn_and_split to avoid false dependence. (*_cvtmask2): Ditto. (one_cmpl2): Adjust constraint of operands 1 to '0' to avoid false dependence. (*andnot3): Ditto. (iornot3): Ditto. (*3): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/pr110438.c: New test. --- gcc/config/i386/predicates.md| 8 +- gcc/config/i386/sse.md | 113 --- gcc/testsuite/gcc.target/i386/pr110438.c | 30 ++ 3 files changed, 135 insertions(+), 16 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 7ddbe01a6f9..37d20c6303a 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand" return false; }) -/* Return true if operand is a vector constant that is all ones. */ +/* Return true if operand is an integral vector constant that is all ones. */ (define_predicate "vector_all_ones_operand" (and (match_code "const_vector") (match_test "INTEGRAL_MODE_P (GET_MODE (op))") (match_test "op == CONSTM1_RTX (GET_MODE (op))"))) +/* Return true if operand is a vector constant that is all ones. */ +(define_predicate "int_float_vector_all_ones_operand" + (ior (match_operand 0 "vector_all_ones_operand") + (match_operand 0 "float_vector_all_ones_operand") + (match_test "op == constm1_rtx"))) + /* Return true if operand is an 128/256bit all ones vector that zero-extends to 256/512bit. */ (define_predicate "vector_all_ones_zero_extend_half_operand" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 418c337a775..56920a3e1d3 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1382,6 +1382,29 @@ (define_insn "mov_internal" ] (symbol_ref "true")))]) +; False dependency happens on destination register which is not really +; used when moving all ones to vector register +(define_split + [(set (match_operand:VMOVE 0 "register_operand") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] + "TARGET_AVX512F && reload_completed + && ( == 64 || EXT_REX_SSE_REG_P (operands[0])) + && optimize_function_for_speed_p (cfun)" + [(set (match_dup 0) (match_dup 2)) + (parallel + [(set (match_dup 0) (match_dup 1)) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[2] = CONST0_RTX (mode);") + +(define_insn "*vmov_constm1_pternlog_false_dep" + [(set (match_operand:VMOVE 0 "register_operand" "=v") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "")) + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512VL || == 64" + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "evex")]) + ;; If mem_addr points to a memory region with less than whole vector size bytes ;; of accessible memory and k is a mask that would prevent reading the inaccessible ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd @@ -9336,7 +9359,7 @@ (define_expand "_cvtmask2" operands[3] = CONST0_RTX (mode); }") -(define_insn "*_cvtmask2" +(define_insn_and_split "*_cvtmask2" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VI48_AVX512VL (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") @@ -9346,11 +9369,35 @@ (define_insn "*_cvtmask2" "@ vpmovm2\t{%1, %0|%0, %1} vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" + "&& !TARGET_AVX512DQ && reload_completed + && optimize_function_for_speed_p (cfun)" + [(set (match_dup 0) (match_dup 4)) + (parallel +[(set (match_dup 0) + (vec_merge:VI48_AVX512VL + (match_dup 2) + (match_dup 3) + (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[4] = CONST0_RTX (mode);" [(set_attr "isa" "avx512dq,*") (set_attr "length_immediate" "0,1") (set_attr "prefix" "evex") (set_attr "mode" "")]) +(define_insn "*_cvtmask2_pternlog_false_dep" + [(set
Re: [PATCH] Break false dependence for vpternlog by inserting vpxor.
On Thu, Jul 6, 2023 at 11:46 PM wrote: > > > +; False dependency happens on destination register which is not really > > +; used when moving all ones to vector register > > +(define_split > > + [(set (match_operand:VMOVE 0 "register_operand") > > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] > > + "TARGET_AVX512F && reload_completed > > + && ( == 64 || EXT_REX_SSE_REG_P (operands[0]))" > > + [(set (match_dup 0) (match_dup 2)) > > + (parallel > > + [(set (match_dup 0) (match_dup 1)) > > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > > + "operands[2] = CONST0_RTX (mode);") > > I think we shouldnt emit PXOR when optimizing for size. So should change > define_split: > define_split >[(set (match_operand:VMOVE 0 "register_operand") > (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] >"TARGET_AVX512F && reload_completed >&& ( == 64 || EXT_REX_SSE_REG_P (operands[0])) >&& optimize_insn_for_speed_p ()" >[(set (match_dup 0) (match_dup 2)) > (parallel > [(set (match_dup 0) (match_dup 1)) >(unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] >"operands[2] = CONST0_RTX (mode);") Yes, will do. I'm still working on breaking the false depence for pternlog in newly added pattern *iornot3,*xnor3 and *3. Will repost the patch when it's done. -- BR, Hongtao
Re: [PATCH] Break false dependence for vpternlog by inserting vpxor.
+; False dependency happens on destination register which is not really +; used when moving all ones to vector register +(define_split + [(set (match_operand:VMOVE 0 "register_operand") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] + "TARGET_AVX512F && reload_completed + && ( == 64 || EXT_REX_SSE_REG_P (operands[0]))" + [(set (match_dup 0) (match_dup 2)) + (parallel + [(set (match_dup 0) (match_dup 1)) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[2] = CONST0_RTX (mode);") I think we shouldnt emit PXOR when optimizing for size. So should change define_split: define_split [(set (match_operand:VMOVE 0 "register_operand") (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] "TARGET_AVX512F && reload_completed && ( == 64 || EXT_REX_SSE_REG_P (operands[0])) && optimize_insn_for_speed_p ()" [(set (match_dup 0) (match_dup 2)) (parallel [(set (match_dup 0) (match_dup 1)) (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] "operands[2] = CONST0_RTX (mode);")
[PATCH] Break false dependence for vpternlog by inserting vpxor.
vpternlog is also used for optimization which doesn't need any valid input operand, in that case, the destination is used as input in the instruction and that creates a false dependence. Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ready to push to trunk. gcc/ChangeLog: PR target/110438 * config/i386/predicates.md (int_float_vector_all_ones_operand): New predicate. * config/i386/sse.md (*vmov_constm1_pternlog): New define_insn. (*_cvtmask2): Adjust to define_insn_and_split to avoid false dependence. (*_cvtmask2_pternlog): New define_insn. gcc/testsuite/ChangeLog: * gcc.target/i386/pr110438.c: New test. --- gcc/config/i386/predicates.md| 8 ++- gcc/config/i386/sse.md | 69 +++- gcc/testsuite/gcc.target/i386/pr110438.c | 30 +++ 3 files changed, 94 insertions(+), 13 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index fb07707dcba..df0d9e20def 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand" return false; }) -/* Return true if operand is a vector constant that is all ones. */ +/* Return true if operand is an integral vector constant that is all ones. */ (define_predicate "vector_all_ones_operand" (and (match_code "const_vector") (match_test "INTEGRAL_MODE_P (GET_MODE (op))") (match_test "op == CONSTM1_RTX (GET_MODE (op))"))) +/* Return true if operand is a vector constant that is all ones. */ +(define_predicate "int_float_vector_all_ones_operand" + (ior (match_operand 0 "vector_all_ones_operand") + (match_operand 0 "float_vector_all_ones_operand") + (match_test "op == constm1_rtx"))) + /* Return true if operand is an 128/256bit all ones vector that zero-extends to 256/512bit. */ (define_predicate "vector_all_ones_zero_extend_half_operand" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 812cfca4b92..93cdd844026 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1382,6 +1382,28 @@ (define_insn "mov_internal" ] (symbol_ref "true")))]) +; False dependency happens on destination register which is not really +; used when moving all ones to vector register +(define_split + [(set (match_operand:VMOVE 0 "register_operand") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] + "TARGET_AVX512F && reload_completed + && ( == 64 || EXT_REX_SSE_REG_P (operands[0]))" + [(set (match_dup 0) (match_dup 2)) + (parallel + [(set (match_dup 0) (match_dup 1)) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[2] = CONST0_RTX (mode);") + +(define_insn "*vmov_constm1_pternlog" + [(set (match_operand:VMOVE 0 "register_operand" "=v") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "")) + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512VL || == 64" + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "evex")]) + ;; If mem_addr points to a memory region with less than whole vector size bytes ;; of accessible memory and k is a mask that would prevent reading the inaccessible ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd @@ -9336,7 +9358,7 @@ (define_expand "_cvtmask2" operands[3] = CONST0_RTX (mode); }") -(define_insn "*_cvtmask2" +(define_insn_and_split "*_cvtmask2" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VI48_AVX512VL (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") @@ -9345,12 +9367,35 @@ (define_insn "*_cvtmask2" "TARGET_AVX512F" "@ vpmovm2\t{%1, %0|%0, %1} - vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" + #" + "&& !TARGET_AVX512DQ && reload_completed" + [(set (match_dup 0) (match_dup 4)) + (parallel +[(set (match_dup 0) + (vec_merge:VI48_AVX512VL + (match_dup 2) + (match_dup 3) + (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[4] = CONST0_RTX (mode);" [(set_attr "isa" "avx512dq,*") (set_attr "length_immediate" "0,1") (set_attr "prefix" "evex") (set_attr "mode" "")]) +(define_insn "*_cvtmask2_pternlog" + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") + (vec_merge:VI48_AVX512VL + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") + (match_operand:VI48_AVX512VL 3 "const0_operand") + (match_operand: 1 "register_operand" "Yk"))) + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512F && !TARGET_AVX512DQ" +