Re: [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'
On Tue, Jul 11, 2023 at 12:24 AM Alexander Monakov via Gcc-patches wrote: > > > On Mon, 10 Jul 2023, liuhongt via Gcc-patches wrote: > > > False dependency happens when destination is only updated by > > pternlog. There is no false dependency when destination is also used > > in source. So either a pxor should be inserted, or input operand > > should be set with constraint '0'. > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > > Ready to push to trunk. > > Shouldn't this patch also remove uses of vpternlog in > standard_sse_constant_opcode? It's still needed when !optimize_function_for_speed_p (cfun). > > A couple more questions below: > > > --- a/gcc/config/i386/sse.md > > +++ b/gcc/config/i386/sse.md > > @@ -1382,6 +1382,29 @@ (define_insn "mov_internal" > > ] > > (symbol_ref "true")))]) > > > > +; False dependency happens on destination register which is not really > > +; used when moving all ones to vector register > > +(define_split > > + [(set (match_operand:VMOVE 0 "register_operand") > > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] > > + "TARGET_AVX512F && reload_completed > > + && ( == 64 || EXT_REX_SSE_REG_P (operands[0])) > > + && optimize_function_for_speed_p (cfun)" > > Yan's patch used optimize_insn_for_speed_p (), which looks more appropriate. > Doesn't it work here as well? I'm just aligned with lzcnt/popcnt case, the difference between option_insn_for_speed_p and optimized_function_for_speed_p is the former will consider !crtl->maybe_hot_insn_p but the latter just returns !optimize_function_for_size_p (cfun). It looks optimize_insn_for_speed_p() is more reasonable for single insn. 350optimize_insn_for_size_p (void) 351{ 352 enum optimize_size_level ret = optimize_function_for_size_p (cfun); 353 if (ret < OPTIMIZE_SIZE_BALANCED && !crtl->maybe_hot_insn_p) 354ret = OPTIMIZE_SIZE_BALANCED; 355 return ret; > > > + [(set (match_dup 0) (match_dup 2)) > > + (parallel > > + [(set (match_dup 0) (match_dup 1)) > > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > > + "operands[2] = CONST0_RTX (mode);") > > + > > +(define_insn "*vmov_constm1_pternlog_false_dep" > > + [(set (match_operand:VMOVE 0 "register_operand" "=v") > > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" > > "")) > > + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] > > UNSPEC_INSN_FALSE_DEP)] > > + "TARGET_AVX512VL || == 64" > > + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}" > > + [(set_attr "type" "sselog1") > > + (set_attr "prefix" "evex")]) > > + > > ;; If mem_addr points to a memory region with less than whole vector size > > bytes > > ;; of accessible memory and k is a mask that would prevent reading the > > inaccessible > > ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be > > transformed to vpblendd > > @@ -9336,7 +9359,7 @@ (define_expand > > "_cvtmask2" > > operands[3] = CONST0_RTX (mode); > >}") > > > > -(define_insn "*_cvtmask2" > > +(define_insn_and_split "*_cvtmask2" > >[(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v") > > (vec_merge:VI48_AVX512VL > > (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") > > @@ -9346,11 +9369,35 @@ (define_insn > > "*_cvtmask2" > >"@ > > vpmovm2\t{%1, %0|%0, %1} > > vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, > > %0, %0, 0x81}" > > + "&& !TARGET_AVX512DQ && reload_completed > > + && optimize_function_for_speed_p (cfun)" > > + [(set (match_dup 0) (match_dup 4)) > > + (parallel > > +[(set (match_dup 0) > > + (vec_merge:VI48_AVX512VL > > + (match_dup 2) > > + (match_dup 3) > > + (match_dup 1))) > > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > > + "operands[4] = CONST0_RTX (mode);" > >[(set_attr "isa" "avx512dq,*") > > (set_attr "length_immediate" "0,1") > > (set_attr "prefix" "evex") > > (set_attr "mode" "")]) > > > > +(define_insn "*_cvtmask2_pternlog_false_dep" > > + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") > > + (vec_merge:VI48_AVX512VL > > + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") > > + (match_operand:VI48_AVX512VL 3 "const0_operand") > > + (match_operand: 1 "register_operand" "Yk"))) > > + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] > > UNSPEC_INSN_FALSE_DEP)] > > + "TARGET_AVX512F && !TARGET_AVX512DQ" > > + "vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, > > %0, %0, 0x81}" > > + [(set_attr "length_immediate" "1") > > + (set_attr "prefix" "evex") > > + (set_attr "mode" "")]) > > + > > (define_expand "extendv2sfv2df2" > >[(set (match_operand:V2DF 0 "register_operand") > > (float_extend:V2DF > > @@ -17166,20 +17213,32 @@ (define_expand "one_cmpl2" > > operands[2] = force_reg (mode, operands[2]); > > }) > > > > -(define_insn "one_cmpl2" > > - [(set (match_operand:VI
Re: [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'
On Mon, 10 Jul 2023, liuhongt via Gcc-patches wrote: > False dependency happens when destination is only updated by > pternlog. There is no false dependency when destination is also used > in source. So either a pxor should be inserted, or input operand > should be set with constraint '0'. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ready to push to trunk. Shouldn't this patch also remove uses of vpternlog in standard_sse_constant_opcode? A couple more questions below: > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -1382,6 +1382,29 @@ (define_insn "mov_internal" > ] > (symbol_ref "true")))]) > > +; False dependency happens on destination register which is not really > +; used when moving all ones to vector register > +(define_split > + [(set (match_operand:VMOVE 0 "register_operand") > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] > + "TARGET_AVX512F && reload_completed > + && ( == 64 || EXT_REX_SSE_REG_P (operands[0])) > + && optimize_function_for_speed_p (cfun)" Yan's patch used optimize_insn_for_speed_p (), which looks more appropriate. Doesn't it work here as well? > + [(set (match_dup 0) (match_dup 2)) > + (parallel > + [(set (match_dup 0) (match_dup 1)) > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > + "operands[2] = CONST0_RTX (mode);") > + > +(define_insn "*vmov_constm1_pternlog_false_dep" > + [(set (match_operand:VMOVE 0 "register_operand" "=v") > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" > "")) > + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] > UNSPEC_INSN_FALSE_DEP)] > + "TARGET_AVX512VL || == 64" > + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}" > + [(set_attr "type" "sselog1") > + (set_attr "prefix" "evex")]) > + > ;; If mem_addr points to a memory region with less than whole vector size > bytes > ;; of accessible memory and k is a mask that would prevent reading the > inaccessible > ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed > to vpblendd > @@ -9336,7 +9359,7 @@ (define_expand "_cvtmask2" > operands[3] = CONST0_RTX (mode); >}") > > -(define_insn "*_cvtmask2" > +(define_insn_and_split "*_cvtmask2" >[(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v") > (vec_merge:VI48_AVX512VL > (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") > @@ -9346,11 +9369,35 @@ (define_insn "*_cvtmask2" >"@ > vpmovm2\t{%1, %0|%0, %1} > vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, > %0, %0, 0x81}" > + "&& !TARGET_AVX512DQ && reload_completed > + && optimize_function_for_speed_p (cfun)" > + [(set (match_dup 0) (match_dup 4)) > + (parallel > +[(set (match_dup 0) > + (vec_merge:VI48_AVX512VL > + (match_dup 2) > + (match_dup 3) > + (match_dup 1))) > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] > + "operands[4] = CONST0_RTX (mode);" >[(set_attr "isa" "avx512dq,*") > (set_attr "length_immediate" "0,1") > (set_attr "prefix" "evex") > (set_attr "mode" "")]) > > +(define_insn "*_cvtmask2_pternlog_false_dep" > + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") > + (vec_merge:VI48_AVX512VL > + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") > + (match_operand:VI48_AVX512VL 3 "const0_operand") > + (match_operand: 1 "register_operand" "Yk"))) > + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] > UNSPEC_INSN_FALSE_DEP)] > + "TARGET_AVX512F && !TARGET_AVX512DQ" > + "vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, > %0, %0, 0x81}" > + [(set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set_attr "mode" "")]) > + > (define_expand "extendv2sfv2df2" >[(set (match_operand:V2DF 0 "register_operand") > (float_extend:V2DF > @@ -17166,20 +17213,32 @@ (define_expand "one_cmpl2" > operands[2] = force_reg (mode, operands[2]); > }) > > -(define_insn "one_cmpl2" > - [(set (match_operand:VI 0 "register_operand" "=v,v") > - (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m") > - (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))] > +(define_insn_and_split "one_cmpl2" > + [(set (match_operand:VI 0 "register_operand" "=v,v,v") > + (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br") > + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))] >"TARGET_AVX512F > && (! > || mode == SImode > || mode == DImode)" > { > + if (! && which_alternative > + && optimize_function_for_speed_p (cfun)) > +return "#"; > + >if (TARGET_AVX512VL) > return "vpternlog\t{$0x55, %1, %0, > %0|%0, %0, %1, 0x55}"; >else > return "vpternlog\t{$0x55, %g1, %g0, > %g0|%g0, %g0, %g1, 0x55}"; > } > + "&& reload_completed && !REG_P (operands[1]) && ! > + && optimize_function_for_speed_p (cfun)" > +
[PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'
False dependency happens when destination is only updated by pternlog. There is no false dependency when destination is also used in source. So either a pxor should be inserted, or input operand should be set with constraint '0'. Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ready to push to trunk. gcc/ChangeLog: PR target/110438 PR target/110202 * config/i386/predicates.md (int_float_vector_all_ones_operand): New predicate. * config/i386/sse.md (*vmov_constm1_pternlog_false_dep): New define_insn. (*_cvtmask2_pternlog_false_dep): Ditto. (*_cvtmask2_pternlog_false_dep): Ditto. (*_cvtmask2): Adjust to define_insn_and_split to avoid false dependence. (*_cvtmask2): Ditto. (one_cmpl2): Adjust constraint of operands 1 to '0' to avoid false dependence. (*andnot3): Ditto. (iornot3): Ditto. (*3): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/pr110438.c: New test. --- gcc/config/i386/predicates.md| 8 +- gcc/config/i386/sse.md | 113 --- gcc/testsuite/gcc.target/i386/pr110438.c | 30 ++ 3 files changed, 135 insertions(+), 16 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 7ddbe01a6f9..37d20c6303a 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand" return false; }) -/* Return true if operand is a vector constant that is all ones. */ +/* Return true if operand is an integral vector constant that is all ones. */ (define_predicate "vector_all_ones_operand" (and (match_code "const_vector") (match_test "INTEGRAL_MODE_P (GET_MODE (op))") (match_test "op == CONSTM1_RTX (GET_MODE (op))"))) +/* Return true if operand is a vector constant that is all ones. */ +(define_predicate "int_float_vector_all_ones_operand" + (ior (match_operand 0 "vector_all_ones_operand") + (match_operand 0 "float_vector_all_ones_operand") + (match_test "op == constm1_rtx"))) + /* Return true if operand is an 128/256bit all ones vector that zero-extends to 256/512bit. */ (define_predicate "vector_all_ones_zero_extend_half_operand" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 418c337a775..56920a3e1d3 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1382,6 +1382,29 @@ (define_insn "mov_internal" ] (symbol_ref "true")))]) +; False dependency happens on destination register which is not really +; used when moving all ones to vector register +(define_split + [(set (match_operand:VMOVE 0 "register_operand") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] + "TARGET_AVX512F && reload_completed + && ( == 64 || EXT_REX_SSE_REG_P (operands[0])) + && optimize_function_for_speed_p (cfun)" + [(set (match_dup 0) (match_dup 2)) + (parallel + [(set (match_dup 0) (match_dup 1)) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[2] = CONST0_RTX (mode);") + +(define_insn "*vmov_constm1_pternlog_false_dep" + [(set (match_operand:VMOVE 0 "register_operand" "=v") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "")) + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512VL || == 64" + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "evex")]) + ;; If mem_addr points to a memory region with less than whole vector size bytes ;; of accessible memory and k is a mask that would prevent reading the inaccessible ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd @@ -9336,7 +9359,7 @@ (define_expand "_cvtmask2" operands[3] = CONST0_RTX (mode); }") -(define_insn "*_cvtmask2" +(define_insn_and_split "*_cvtmask2" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VI48_AVX512VL (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") @@ -9346,11 +9369,35 @@ (define_insn "*_cvtmask2" "@ vpmovm2\t{%1, %0|%0, %1} vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" + "&& !TARGET_AVX512DQ && reload_completed + && optimize_function_for_speed_p (cfun)" + [(set (match_dup 0) (match_dup 4)) + (parallel +[(set (match_dup 0) + (vec_merge:VI48_AVX512VL + (match_dup 2) + (match_dup 3) + (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[4] = CONST0_RTX (mode);" [(set_attr "isa" "avx512dq,*") (set_attr "length_immediate" "0,1") (set_attr "prefix" "evex") (set_attr "mode" "")]) +(define_insn "*_cvtmask2_pternlog_false_dep" + [(set (match_operand:VI