On Tue, Aug 24, 2021 at 9:36 AM liuhongt <hongtao....@intel.com> wrote: > > Also optimize below 3 forms to vpternlog, op1, op2, op3 are > register_operand or unary_p as (not reg) > > A: (any_logic (any_logic op1 op2) op3) > B: (any_logic (any_logic op1 op2) (any_logic op3 op4)) op3/op4 should > be equal to op1/op2 > C: (any_logic (any_logic (any_logic:op1 op2) op3) op4) op3/op4 should > be equal to op1/op2 > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. > > gcc/ChangeLog: > > PR target/101989 > * config/i386/i386-protos.h > (ix86_strip_reg_or_notreg_operand): New declare. > * config/i386/i386.c (ix86_rtx_costs): Define cost for > UNSPEC_VTERNLOG. > (ix86_strip_reg_or_notreg_operand): New function. Pushed to trunk after changing ix86_strip_reg_or_notreg_operand to a macro; a function call seems too inefficient for such a simple unary strip. > * config/i386/predicates.md (reg_or_notreg_operand): New > predicate. > * config/i386/sse.md (*<avx512>_vternlog<mode>_all): New define_insn. > (*<avx512>_vternlog<mode>_1): New pre_reload > define_insn_and_split. > (*<avx512>_vternlog<mode>_2): Ditto. > (*<avx512>_vternlog<mode>_3): Ditto. > (any_logic1,any_logic2): New code iterator. > (logic_op): New code attribute. > (ternlogsuffix): Extend to VNxDF and VNxSF. > > gcc/testsuite/ChangeLog: > > PR target/101989 > * gcc.target/i386/pr101989-1.c: New test. > * gcc.target/i386/pr101989-2.c: New test. > * gcc.target/i386/avx512bw-shiftqihi-constant-1.c: Adjust testcase. 
> --- > gcc/config/i386/i386-protos.h | 1 + > gcc/config/i386/i386.c | 13 + > gcc/config/i386/predicates.md | 7 + > gcc/config/i386/sse.md | 234 ++++++++++++++++++ > .../i386/avx512bw-shiftqihi-constant-1.c | 4 +- > gcc/testsuite/gcc.target/i386/pr101989-1.c | 51 ++++ > gcc/testsuite/gcc.target/i386/pr101989-2.c | 102 ++++++++ > 7 files changed, 410 insertions(+), 2 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-2.c > > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > index 2fd13074c81..2bdaadcf4f3 100644 > --- a/gcc/config/i386/i386-protos.h > +++ b/gcc/config/i386/i386-protos.h > @@ -60,6 +60,7 @@ extern rtx standard_80387_constant_rtx (int); > extern int standard_sse_constant_p (rtx, machine_mode); > extern const char *standard_sse_constant_opcode (rtx_insn *, rtx *); > extern bool ix86_standard_x87sse_constant_load_p (const rtx_insn *, rtx); > +extern rtx ix86_strip_reg_or_notreg_operand (rtx); > extern bool ix86_pre_reload_split (void); > extern bool symbolic_reference_mentioned_p (rtx); > extern bool extended_reg_mentioned_p (rtx); > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 46844fab08f..a69225ccc81 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -5236,6 +5236,14 @@ ix86_standard_x87sse_constant_load_p (const rtx_insn > *insn, rtx dst) > return true; > } > > +/* If OP is a unary operator wrapping a register (e.g. (not reg)), > + return the inner register; otherwise return OP unchanged. */ > +rtx > +ix86_strip_reg_or_notreg_operand (rtx op) > +{ > + return UNARY_P (op) ? 
XEXP (op, 0) : op; > +} > + > /* Predicate for pre-reload splitters with associated instructions, > which can match any time before the split1 pass (usually combine), > then are unconditionally split in that pass and should not be > @@ -20544,6 +20552,11 @@ ix86_rtx_costs (rtx x, machine_mode mode, int > outer_code_i, int opno, > case UNSPEC: > if (XINT (x, 1) == UNSPEC_TP) > *total = 0; > + else if (XINT(x, 1) == UNSPEC_VTERNLOG) > + { > + *total = cost->sse_op; > + return true; > + } > return false; > > case VEC_SELECT: > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md > index 9321f332ef9..df5acb425d4 100644 > --- a/gcc/config/i386/predicates.md > +++ b/gcc/config/i386/predicates.md > @@ -1044,6 +1044,13 @@ (define_predicate "reg_or_pm1_operand" > (ior (match_test "op == const1_rtx") > (match_test "op == constm1_rtx"))))) > > +;; True for registers, or (not: registers). Used to optimize 3-operand > +;; bitwise operation. > +(define_predicate "reg_or_notreg_operand" > + (ior (match_operand 0 "register_operand") > + (and (match_code "not") > + (match_test "register_operand (XEXP (op, 0), mode)")))) > + > ;; True if OP is acceptable as operand of DImode shift expander. 
> (define_predicate "shiftdi_operand" > (if_then_else (match_test "TARGET_64BIT") > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index 13889687793..0acd749d21c 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -933,7 +933,9 @@ (define_mode_attr iptr > ;; Mapping of vector modes to VPTERNLOG suffix > (define_mode_attr ternlogsuffix > [(V8DI "q") (V4DI "q") (V2DI "q") > + (V8DF "q") (V4DF "q") (V2DF "q") > (V16SI "d") (V8SI "d") (V4SI "d") > + (V16SF "d") (V8SF "d") (V4SF "d") > (V32HI "d") (V16HI "d") (V8HI "d") > (V64QI "d") (V32QI "d") (V16QI "d")]) > > @@ -10041,6 +10043,238 @@ (define_insn > "<avx512>_vternlog<mode><sd_maskz_name>" > (set_attr "prefix" "evex") > (set_attr "mode" "<sseinsnmode>")]) > > +(define_insn "*<avx512>_vternlog<mode>_all" > + [(set (match_operand:V 0 "register_operand" "=v") > + (unspec:V > + [(match_operand:V 1 "register_operand" "0") > + (match_operand:V 2 "register_operand" "v") > + (match_operand:V 3 "nonimmediate_operand" "vm") > + (match_operand:SI 4 "const_0_to_255_operand")] > + UNSPEC_VTERNLOG))] > + "TARGET_AVX512F" > + "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}" > + [(set_attr "type" "sselog") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > +;; There must be lots of other combinations like > +;; > +;; (any_logic:V > +;; (any_logic:V op1 op2) > +;; (any_logic:V op1 op3)) > +;; > +;; (any_logic:V > +;; (any_logic:V > +;; (any_logic:V op1, op2) > +;; op3) > +;; op1) > +;; > +;; and so on. 
> + > +(define_code_iterator any_logic1 [and ior xor]) > +(define_code_iterator any_logic2 [and ior xor]) > +(define_code_attr logic_op [(and "&") (ior "|") (xor "^")]) > + > +(define_insn_and_split "*<avx512>_vpternlog<mode>_1" > + [(set (match_operand:V 0 "register_operand") > + (any_logic:V > + (any_logic1:V > + (match_operand:V 1 "reg_or_notreg_operand") > + (match_operand:V 2 "reg_or_notreg_operand")) > + (any_logic2:V > + (match_operand:V 3 "reg_or_notreg_operand") > + (match_operand:V 4 "reg_or_notreg_operand"))))] > + "(<MODE_SIZE> == 64 || TARGET_AVX512VL) > + && ix86_pre_reload_split () > + && (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]), > + ix86_strip_reg_or_notreg_operand (operands[4])) > + || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]), > + ix86_strip_reg_or_notreg_operand (operands[4])) > + || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]), > + ix86_strip_reg_or_notreg_operand (operands[3])) > + || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]), > + ix86_strip_reg_or_notreg_operand (operands[3])))" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (unspec:V > + [(match_dup 6) > + (match_dup 2) > + (match_dup 1) > + (match_dup 5)] > + UNSPEC_VTERNLOG))] > +{ > + /* VPTERNLOGD reg6, reg2, reg1, imm8. 
*/ > + int reg6 = 0xF0; > + int reg2 = 0xCC; > + int reg1 = 0xAA; > + int reg3 = 0; > + int reg4 = 0; > + int reg_mask, tmp1, tmp2; > + if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]), > + ix86_strip_reg_or_notreg_operand (operands[4]))) > + { > + reg4 = reg1; > + reg3 = reg6; > + operands[6] = operands[3]; > + } > + else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]), > + ix86_strip_reg_or_notreg_operand (operands[4]))) > + { > + reg4 = reg2; > + reg3 = reg6; > + operands[6] = operands[3]; > + } > + else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]), > + ix86_strip_reg_or_notreg_operand (operands[3]))) > + { > + reg4 = reg6; > + reg3 = reg1; > + operands[6] = operands[4]; > + } > + else > + { > + reg4 = reg6; > + reg3 = reg2; > + operands[6] = operands[4]; > + } > + > + reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1; > + reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2; > + reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3; > + reg4 = UNARY_P (operands[4]) ? 
~reg4 : reg4; > + > + tmp1 = reg1 <any_logic1:logic_op> reg2; > + tmp2 = reg3 <any_logic2:logic_op> reg4; > + reg_mask = tmp1 <any_logic:logic_op> tmp2; > + reg_mask &= 0xFF; > + > + operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]); > + operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]); > + operands[6] = ix86_strip_reg_or_notreg_operand (operands[6]); > + operands[5] = GEN_INT (reg_mask); > +}) > + > +(define_insn_and_split "*<avx512>_vpternlog<mode>_2" > + [(set (match_operand:V 0 "register_operand") > + (any_logic:V > + (any_logic1:V > + (any_logic2:V > + (match_operand:V 1 "reg_or_notreg_operand") > + (match_operand:V 2 "reg_or_notreg_operand")) > + (match_operand:V 3 "reg_or_notreg_operand")) > + (match_operand:V 4 "reg_or_notreg_operand")))] > + "(<MODE_SIZE> == 64 || TARGET_AVX512VL) > + && ix86_pre_reload_split () > + && (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]), > + ix86_strip_reg_or_notreg_operand (operands[4])) > + || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]), > + ix86_strip_reg_or_notreg_operand (operands[4])) > + || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]), > + ix86_strip_reg_or_notreg_operand (operands[3])) > + || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]), > + ix86_strip_reg_or_notreg_operand (operands[3])))" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (unspec:V > + [(match_dup 6) > + (match_dup 2) > + (match_dup 1) > + (match_dup 5)] > + UNSPEC_VTERNLOG))] > +{ > + /* VPTERNLOGD reg6, reg2, reg1, imm8. 
*/ > + int reg6 = 0xF0; > + int reg2 = 0xCC; > + int reg1 = 0xAA; > + int reg3 = 0; > + int reg4 = 0; > + int reg_mask, tmp1, tmp2; > + if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]), > + ix86_strip_reg_or_notreg_operand (operands[4]))) > + { > + reg4 = reg1; > + reg3 = reg6; > + operands[6] = operands[3]; > + } > + else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]), > + ix86_strip_reg_or_notreg_operand (operands[4]))) > + { > + reg4 = reg2; > + reg3 = reg6; > + operands[6] = operands[3]; > + } > + else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]), > + ix86_strip_reg_or_notreg_operand (operands[3]))) > + { > + reg4 = reg6; > + reg3 = reg1; > + operands[6] = operands[4]; > + } > + else > + { > + reg4 = reg6; > + reg3 = reg2; > + operands[6] = operands[4]; > + } > + > + reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1; > + reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2; > + reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3; > + reg4 = UNARY_P (operands[4]) ? 
~reg4 : reg4; > + > + tmp1 = reg1 <any_logic2:logic_op> reg2; > + tmp2 = tmp1 <any_logic1:logic_op> reg3; > + reg_mask = tmp2 <any_logic:logic_op> reg4; > + reg_mask &= 0xFF; > + > + operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]); > + operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]); > + operands[6] = ix86_strip_reg_or_notreg_operand (operands[6]); > + operands[5] = GEN_INT (reg_mask); > +}) > + > +(define_insn_and_split "*<avx512>_vpternlog<mode>_3" > + [(set (match_operand:V 0 "register_operand") > + (any_logic:V > + (any_logic1:V > + (match_operand:V 1 "reg_or_notreg_operand") > + (match_operand:V 2 "reg_or_notreg_operand")) > + (match_operand:V 3 "reg_or_notreg_operand")))] > + "(<MODE_SIZE> == 64 || TARGET_AVX512VL) > + && ix86_pre_reload_split ()" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (unspec:V > + [(match_dup 3) > + (match_dup 2) > + (match_dup 1) > + (match_dup 4)] > + UNSPEC_VTERNLOG))] > +{ > + /* VPTERNLOGD reg3, reg2, reg1, imm8. */ > + int reg3 = 0xF0; > + int reg2 = 0xCC; > + int reg1 = 0xAA; > + int reg_mask, tmp1; > + > + reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1; > + reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2; > + reg3 = UNARY_P (operands[3]) ? 
~reg3 : reg3; > + > + tmp1 = reg1 <any_logic1:logic_op> reg2; > + reg_mask = tmp1 <any_logic:logic_op> reg3; > + reg_mask &= 0xFF; > + > + operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]); > + operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]); > + operands[3] = ix86_strip_reg_or_notreg_operand (operands[3]); > + operands[4] = GEN_INT (reg_mask); > +}) > + > + > (define_insn "<avx512>_vternlog<mode>_mask" > [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") > (vec_merge:VI48_AVX512VL > diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c > b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c > index 78bf5d33689..fbc3de08119 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c > +++ b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c > @@ -1,7 +1,8 @@ > /* PR target/95524 */ > /* { dg-do compile } */ > /* { dg-options "-O2 -mavx512bw" } */ > -/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 3 } } */ > +/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 2 } } */ > +/* { dg-final { scan-assembler-times "vpternlogd\[^\n\]*%zmm" 1 } } */ > typedef char v64qi __attribute__ ((vector_size (64))); > typedef unsigned char v64uqi __attribute__ ((vector_size (64))); > > @@ -11,7 +12,6 @@ foo_ashiftrt_512 (v64qi a) > return a >> 2; > } > /* { dg-final { scan-assembler-times "vpsraw\[^\n\]*%zmm" 1 } } */ > -/* { dg-final { scan-assembler-times "vpxor\[^\n\]*%zmm" 1 } } */ > /* { dg-final { scan-assembler-times "vpsubb\[^\n\]*%zmm" 1 } } */ > > __attribute__((noipa)) v64qi > diff --git a/gcc/testsuite/gcc.target/i386/pr101989-1.c > b/gcc/testsuite/gcc.target/i386/pr101989-1.c > new file mode 100644 > index 00000000000..594093ecdde > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr101989-1.c > @@ -0,0 +1,51 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > +/* { dg-final { scan-assembler-times "vpternlog" 6 } } */ > +/* { dg-final { 
scan-assembler-not "vpxor" } } */ > +/* { dg-final { scan-assembler-not "vpor" } } */ > +/* { dg-final { scan-assembler-not "vpand" } } */ > + > +#include<immintrin.h> > +__m256d > +__attribute__((noipa, target("avx512vl"))) > +copysign2_pd(__m256d from, __m256d to) { > + __m256i a = _mm256_castpd_si256(from); > + __m256d avx_signbit = > _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63)); > + /* (avx_signbit & from) | (~avx_signbit & to) */ > + return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), > _mm256_andnot_pd(avx_signbit, to)); > +} > + > +__m256i > +__attribute__((noipa, target("avx512vl"))) > +foo (__m256i src1, __m256i src2, __m256i src3) > +{ > + return (src2 & ~src1) | (src3 & src1); > +} > + > +__m256i > +__attribute__ ((noipa, target("avx512vl"))) > +foo1 (__m256i src1, __m256i src2, __m256i src3) > +{ > + return (src2 & src1) | (src3 & ~src1); > +} > + > +__m256i > +__attribute__ ((noipa, target("avx512vl"))) > +foo2 (__m256i src1, __m256i src2, __m256i src3) > +{ > + return (src2 & src1) | (~src3 & src1); > +} > + > +__m256i > +__attribute__ ((noipa, target("avx512vl"))) > +foo3 (__m256i src1, __m256i src2, __m256i src3) > +{ > + return (~src2 & src1) | (src3 & src1); > +} > + > +__m256i > +__attribute__ ((noipa, target("avx512vl"))) > +foo4 (__m256i src1, __m256i src2, __m256i src3) > +{ > + return src3 & src2 ^ src1; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr101989-2.c > b/gcc/testsuite/gcc.target/i386/pr101989-2.c > new file mode 100644 > index 00000000000..9d9759a8e1d > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr101989-2.c > @@ -0,0 +1,102 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */ > +/* { dg-require-effective-target avx512vl } */ > + > +#define AVX512VL > + > +#include "avx512f-helper.h" > + > +#include "pr101989-1.c" > +__m256d > +avx2_copysign2_pd (__m256d from, __m256d to) { > + __m256i a = _mm256_castpd_si256(from); > + __m256d avx_signbit = > 
_mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63)); > + /* (avx_signbit & from) | (~avx_signbit & to) */ > + return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), > _mm256_andnot_pd(avx_signbit, to)); > +} > + > +__m256i > +avx2_foo (__m256i src1, __m256i src2, __m256i src3) > +{ > + return (src2 & ~src1) | (src3 & src1); > +} > + > +__m256i > +avx2_foo1 (__m256i src1, __m256i src2, __m256i src3) > +{ > + return (src2 & src1) | (src3 & ~src1); > +} > + > +__m256i > +avx2_foo2 (__m256i src1, __m256i src2, __m256i src3) > +{ > + return (src2 & src1) | (~src3 & src1); > +} > + > +__m256i > +avx2_foo3 (__m256i src1, __m256i src2, __m256i src3) > +{ > + return (~src2 & src1) | (src3 & src1); > +} > + > +__m256i > +avx2_foo4 (__m256i src1, __m256i src2, __m256i src3) > +{ > + return src3 & src2 ^ src1; > +} > + > + > +void > +test_256 (void) > +{ > + union256i_q q1, q2, q3, res2, exp2; > + union256d d1, d2, res1, exp1; > + int i, sign = 1; > + > + for (i = 0; i < 4; i++) > + { > + d1.a[i] = 12.34 * (i + 2000) * sign; > + d2.a[i] = 56.78 * (i - 30) * sign; > + q1.a[i] = 12 * (i + 2000) * sign; > + q2.a[i] = 56 * (i - 30) * sign; > + q3.a[i] = 90 * (i + 40) * sign; > + res1.a[i] = DEFAULT_VALUE; > + exp1.a[i] = DEFAULT_VALUE; > + res2.a[i] = exp2.a[i] = -1; > + sign = -sign; > + } > + > + exp1.x = avx2_copysign2_pd (d1.x, d2.x); > + res1.x = copysign2_pd (d1.x, d2.x); > + if (UNION_CHECK (256, d) (res1, exp1.a)) > + abort (); > + > + exp2.x = avx2_foo1 (q1.x, q2.x, q3.x); > + res2.x = foo1 (q1.x, q2.x, q3.x); > + if (UNION_CHECK (256, i_q) (res2, exp2.a)) > + abort (); > + > + exp2.x = avx2_foo2 (q1.x, q2.x, q3.x); > + res2.x = foo2 (q1.x, q2.x, q3.x); > + if (UNION_CHECK (256, i_q) (res2, exp2.a)) > + abort (); > + > + exp2.x = avx2_foo3 (q1.x, q2.x, q3.x); > + res2.x = foo3 (q1.x, q2.x, q3.x); > + if (UNION_CHECK (256, i_q) (res2, exp2.a)) > + abort (); > + > + exp2.x = avx2_foo4 (q1.x, q2.x, q3.x); > + res2.x = foo4 (q1.x, q2.x, q3.x); > + if 
(UNION_CHECK (256, i_q) (res2, exp2.a)) > + abort (); > + > + exp2.x = avx2_foo (q1.x, q2.x, q3.x); > + res2.x = foo (q1.x, q2.x, q3.x); > + if (UNION_CHECK (256, i_q) (res2, exp2.a)) > + abort (); > +} > + > +static void > +test_128 () > +{} > -- > 2.18.1 >
-- BR, Hongtao