Also optimize the 3 forms below to vpternlog; op1, op2 and op3 are register_operand or (not reg) expressions (unary_p):
A: (any_logic (any_logic op1 op2) op3) B: (any_logic (any_logic op1 op2) (any_logic op3 op4)) op3/op4 should be equal to op1/op2 C: (any_logic (any_logic (any_logic op1 op2) op3) op4) op3/op4 should be equal to op1/op2 Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. gcc/ChangeLog: PR target/101989 * config/i386/i386-protos.h (ix86_strip_reg_or_notreg_operand): New declaration. * config/i386/i386.c (ix86_rtx_costs): Define cost for UNSPEC_VTERNLOG. (ix86_strip_reg_or_notreg_operand): New function. * config/i386/predicates.md (reg_or_notreg_operand): New predicate. * config/i386/sse.md (*<avx512>_vternlog<mode>_all): New define_insn. (*<avx512>_vpternlog<mode>_1): New pre-reload define_insn_and_split. (*<avx512>_vpternlog<mode>_2): Ditto. (*<avx512>_vpternlog<mode>_3): Ditto. (any_logic1, any_logic2): New code iterators. (logic_op): New code attribute. (ternlogsuffix): Extend to DF and SF vector modes. gcc/testsuite/ChangeLog: PR target/101989 * gcc.target/i386/pr101989-1.c: New test. * gcc.target/i386/pr101989-2.c: New test. * gcc.target/i386/avx512bw-shiftqihi-constant-1.c: Adjust testcase.
--- gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/i386.c | 13 + gcc/config/i386/predicates.md | 7 + gcc/config/i386/sse.md | 234 ++++++++++++++++++ .../i386/avx512bw-shiftqihi-constant-1.c | 4 +- gcc/testsuite/gcc.target/i386/pr101989-1.c | 51 ++++ gcc/testsuite/gcc.target/i386/pr101989-2.c | 102 ++++++++ 7 files changed, 410 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-2.c diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 2fd13074c81..2bdaadcf4f3 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -60,6 +60,7 @@ extern rtx standard_80387_constant_rtx (int); extern int standard_sse_constant_p (rtx, machine_mode); extern const char *standard_sse_constant_opcode (rtx_insn *, rtx *); extern bool ix86_standard_x87sse_constant_load_p (const rtx_insn *, rtx); +extern rtx ix86_strip_reg_or_notreg_operand (rtx); extern bool ix86_pre_reload_split (void); extern bool symbolic_reference_mentioned_p (rtx); extern bool extended_reg_mentioned_p (rtx); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 46844fab08f..a69225ccc81 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -5236,6 +5236,14 @@ ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst) return true; } +/* Return the inner register of OP if OP is a unary expression + such as (not reg); otherwise return OP itself. */ +rtx +ix86_strip_reg_or_notreg_operand (rtx op) +{ + return UNARY_P (op) ? 
XEXP (op, 0) : op; +} + /* Predicate for pre-reload splitters with associated instructions, which can match any time before the split1 pass (usually combine), then are unconditionally split in that pass and should not be @@ -20544,6 +20552,11 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, case UNSPEC: if (XINT (x, 1) == UNSPEC_TP) *total = 0; + else if (XINT (x, 1) == UNSPEC_VTERNLOG) + { + *total = cost->sse_op; + return true; + } return false; case VEC_SELECT: diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 9321f332ef9..df5acb425d4 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1044,6 +1044,13 @@ (define_predicate "reg_or_pm1_operand" (ior (match_test "op == const1_rtx") (match_test "op == constm1_rtx"))))) +;; True for registers, or (not: registers). Used to optimize 3-operand +;; bitwise operation. +(define_predicate "reg_or_notreg_operand" + (ior (match_operand 0 "register_operand") + (and (match_code "not") + (match_test "register_operand (XEXP (op, 0), mode)")))) + ;; True if OP is acceptable as operand of DImode shift expander. 
(define_predicate "shiftdi_operand" (if_then_else (match_test "TARGET_64BIT") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 13889687793..0acd749d21c 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -933,7 +933,9 @@ (define_mode_attr iptr ;; Mapping of vector modes to VPTERNLOG suffix (define_mode_attr ternlogsuffix [(V8DI "q") (V4DI "q") (V2DI "q") + (V8DF "q") (V4DF "q") (V2DF "q") (V16SI "d") (V8SI "d") (V4SI "d") + (V16SF "d") (V8SF "d") (V4SF "d") (V32HI "d") (V16HI "d") (V8HI "d") (V64QI "d") (V32QI "d") (V16QI "d")]) @@ -10041,6 +10043,238 @@ (define_insn "<avx512>_vternlog<mode><sd_maskz_name>" (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*<avx512>_vternlog<mode>_all" + [(set (match_operand:V 0 "register_operand" "=v") + (unspec:V + [(match_operand:V 1 "register_operand" "0") + (match_operand:V 2 "register_operand" "v") + (match_operand:V 3 "nonimmediate_operand" "vm") + (match_operand:SI 4 "const_0_to_255_operand")] + UNSPEC_VTERNLOG))] + "TARGET_AVX512F && (<MODE_SIZE> == 64 || TARGET_AVX512VL)" + "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}" + [(set_attr "type" "sselog") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +;; There must be lots of other combinations like +;; +;; (any_logic:V +;; (any_logic:V op1 op2) +;; (any_logic:V op1 op3)) +;; +;; (any_logic:V +;; (any_logic:V +;; (any_logic:V op1, op2) +;; op3) +;; op1) +;; +;; and so on. 
+ +(define_code_iterator any_logic1 [and ior xor]) +(define_code_iterator any_logic2 [and ior xor]) +(define_code_attr logic_op [(and "&") (ior "|") (xor "^")]) + +(define_insn_and_split "*<avx512>_vpternlog<mode>_1" + [(set (match_operand:V 0 "register_operand") + (any_logic:V + (any_logic1:V + (match_operand:V 1 "reg_or_notreg_operand") + (match_operand:V 2 "reg_or_notreg_operand")) + (any_logic2:V + (match_operand:V 3 "reg_or_notreg_operand") + (match_operand:V 4 "reg_or_notreg_operand"))))] + "(<MODE_SIZE> == 64 || TARGET_AVX512VL) + && ix86_pre_reload_split () + && (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]), + ix86_strip_reg_or_notreg_operand (operands[4])) + || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]), + ix86_strip_reg_or_notreg_operand (operands[4])) + || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]), + ix86_strip_reg_or_notreg_operand (operands[3])) + || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]), + ix86_strip_reg_or_notreg_operand (operands[3])))" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:V + [(match_dup 6) + (match_dup 2) + (match_dup 1) + (match_dup 5)] + UNSPEC_VTERNLOG))] +{ + /* VPTERNLOGD reg6, reg2, reg1, imm8. 
*/ + int reg6 = 0xF0; + int reg2 = 0xCC; + int reg1 = 0xAA; + int reg3 = 0; + int reg4 = 0; + int reg_mask, tmp1, tmp2; + if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]), + ix86_strip_reg_or_notreg_operand (operands[4]))) + { + reg4 = reg1; + reg3 = reg6; + operands[6] = operands[3]; + } + else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]), + ix86_strip_reg_or_notreg_operand (operands[4]))) + { + reg4 = reg2; + reg3 = reg6; + operands[6] = operands[3]; + } + else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]), + ix86_strip_reg_or_notreg_operand (operands[3]))) + { + reg4 = reg6; + reg3 = reg1; + operands[6] = operands[4]; + } + else + { + reg4 = reg6; + reg3 = reg2; + operands[6] = operands[4]; + } + + reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1; + reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2; + reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3; + reg4 = UNARY_P (operands[4]) ? ~reg4 : reg4; + + tmp1 = reg1 <any_logic1:logic_op> reg2; + tmp2 = reg3 <any_logic2:logic_op> reg4; + reg_mask = tmp1 <any_logic:logic_op> tmp2; + reg_mask &= 0xFF; + + operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]); + operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]); + operands[6] = ix86_strip_reg_or_notreg_operand (operands[6]); + operands[5] = GEN_INT (reg_mask); +}) + +(define_insn_and_split "*<avx512>_vpternlog<mode>_2" + [(set (match_operand:V 0 "register_operand") + (any_logic:V + (any_logic1:V + (any_logic2:V + (match_operand:V 1 "reg_or_notreg_operand") + (match_operand:V 2 "reg_or_notreg_operand")) + (match_operand:V 3 "reg_or_notreg_operand")) + (match_operand:V 4 "reg_or_notreg_operand")))] + "(<MODE_SIZE> == 64 || TARGET_AVX512VL) + && ix86_pre_reload_split () + && (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]), + ix86_strip_reg_or_notreg_operand (operands[4])) + || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]), + ix86_strip_reg_or_notreg_operand (operands[4])) + || rtx_equal_p 
(ix86_strip_reg_or_notreg_operand (operands[1]), + ix86_strip_reg_or_notreg_operand (operands[3])) + || rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]), + ix86_strip_reg_or_notreg_operand (operands[3])))" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:V + [(match_dup 6) + (match_dup 2) + (match_dup 1) + (match_dup 5)] + UNSPEC_VTERNLOG))] +{ + /* VPTERNLOGD reg6, reg2, reg1, imm8. */ + int reg6 = 0xF0; + int reg2 = 0xCC; + int reg1 = 0xAA; + int reg3 = 0; + int reg4 = 0; + int reg_mask, tmp1, tmp2; + if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]), + ix86_strip_reg_or_notreg_operand (operands[4]))) + { + reg4 = reg1; + reg3 = reg6; + operands[6] = operands[3]; + } + else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[2]), + ix86_strip_reg_or_notreg_operand (operands[4]))) + { + reg4 = reg2; + reg3 = reg6; + operands[6] = operands[3]; + } + else if (rtx_equal_p (ix86_strip_reg_or_notreg_operand (operands[1]), + ix86_strip_reg_or_notreg_operand (operands[3]))) + { + reg4 = reg6; + reg3 = reg1; + operands[6] = operands[4]; + } + else + { + reg4 = reg6; + reg3 = reg2; + operands[6] = operands[4]; + } + + reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1; + reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2; + reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3; + reg4 = UNARY_P (operands[4]) ? 
~reg4 : reg4; + + tmp1 = reg1 <any_logic2:logic_op> reg2; + tmp2 = tmp1 <any_logic1:logic_op> reg3; + reg_mask = tmp2 <any_logic:logic_op> reg4; + reg_mask &= 0xFF; + + operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]); + operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]); + operands[6] = ix86_strip_reg_or_notreg_operand (operands[6]); + operands[5] = GEN_INT (reg_mask); +}) + +(define_insn_and_split "*<avx512>_vpternlog<mode>_3" + [(set (match_operand:V 0 "register_operand") + (any_logic:V + (any_logic1:V + (match_operand:V 1 "reg_or_notreg_operand") + (match_operand:V 2 "reg_or_notreg_operand")) + (match_operand:V 3 "reg_or_notreg_operand")))] + "(<MODE_SIZE> == 64 || TARGET_AVX512VL) + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:V + [(match_dup 3) + (match_dup 2) + (match_dup 1) + (match_dup 4)] + UNSPEC_VTERNLOG))] +{ + /* VPTERNLOGD reg3, reg2, reg1, imm8. */ + int reg3 = 0xF0; + int reg2 = 0xCC; + int reg1 = 0xAA; + int reg_mask, tmp1; + + reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1; + reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2; + reg3 = UNARY_P (operands[3]) ? 
~reg3 : reg3; + + tmp1 = reg1 <any_logic1:logic_op> reg2; + reg_mask = tmp1 <any_logic:logic_op> reg3; + reg_mask &= 0xFF; + + operands[1] = ix86_strip_reg_or_notreg_operand (operands[1]); + operands[2] = ix86_strip_reg_or_notreg_operand (operands[2]); + operands[3] = ix86_strip_reg_or_notreg_operand (operands[3]); + operands[4] = GEN_INT (reg_mask); +}) + + (define_insn "<avx512>_vternlog<mode>_mask" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") (vec_merge:VI48_AVX512VL diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c index 78bf5d33689..fbc3de08119 100644 --- a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c @@ -1,7 +1,8 @@ /* PR target/95524 */ /* { dg-do compile } */ /* { dg-options "-O2 -mavx512bw" } */ -/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 3 } } */ +/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 2 } } */ +/* { dg-final { scan-assembler-times "vpternlogd\[^\n\]*%zmm" 1 } } */ typedef char v64qi __attribute__ ((vector_size (64))); typedef unsigned char v64uqi __attribute__ ((vector_size (64))); @@ -11,7 +12,6 @@ foo_ashiftrt_512 (v64qi a) return a >> 2; } /* { dg-final { scan-assembler-times "vpsraw\[^\n\]*%zmm" 1 } } */ -/* { dg-final { scan-assembler-times "vpxor\[^\n\]*%zmm" 1 } } */ /* { dg-final { scan-assembler-times "vpsubb\[^\n\]*%zmm" 1 } } */ __attribute__((noipa)) v64qi diff --git a/gcc/testsuite/gcc.target/i386/pr101989-1.c b/gcc/testsuite/gcc.target/i386/pr101989-1.c new file mode 100644 index 00000000000..594093ecdde --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101989-1.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-final { scan-assembler-times "vpternlog" 6 } } */ +/* { dg-final { scan-assembler-not "vpxor" } } */ +/* { dg-final { scan-assembler-not "vpor" } } */ +/* { dg-final { 
scan-assembler-not "vpand" } } */ + +#include<immintrin.h> +__m256d +__attribute__((noipa, target("avx512vl"))) +copysign2_pd(__m256d from, __m256d to) { + __m256i a = _mm256_castpd_si256(from); + __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63)); + /* (avx_signbit & from) | (~avx_signbit & to) */ + return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to)); +} + +__m256i +__attribute__((noipa, target("avx512vl"))) +foo (__m256i src1, __m256i src2, __m256i src3) +{ + return (src2 & ~src1) | (src3 & src1); +} + +__m256i +__attribute__ ((noipa, target("avx512vl"))) +foo1 (__m256i src1, __m256i src2, __m256i src3) +{ + return (src2 & src1) | (src3 & ~src1); +} + +__m256i +__attribute__ ((noipa, target("avx512vl"))) +foo2 (__m256i src1, __m256i src2, __m256i src3) +{ + return (src2 & src1) | (~src3 & src1); +} + +__m256i +__attribute__ ((noipa, target("avx512vl"))) +foo3 (__m256i src1, __m256i src2, __m256i src3) +{ + return (~src2 & src1) | (src3 & src1); +} + +__m256i +__attribute__ ((noipa, target("avx512vl"))) +foo4 (__m256i src1, __m256i src2, __m256i src3) +{ + return src3 & src2 ^ src1; +} diff --git a/gcc/testsuite/gcc.target/i386/pr101989-2.c b/gcc/testsuite/gcc.target/i386/pr101989-2.c new file mode 100644 index 00000000000..9d9759a8e1d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101989-2.c @@ -0,0 +1,102 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */ +/* { dg-require-effective-target avx512vl } */ + +#define AVX512VL + +#include "avx512f-helper.h" + +#include "pr101989-1.c" +__m256d +avx2_copysign2_pd (__m256d from, __m256d to) { + __m256i a = _mm256_castpd_si256(from); + __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63)); + /* (avx_signbit & from) | (~avx_signbit & to) */ + return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to)); +} + +__m256i +avx2_foo (__m256i src1, __m256i src2, 
__m256i src3) +{ + return (src2 & ~src1) | (src3 & src1); +} + +__m256i +avx2_foo1 (__m256i src1, __m256i src2, __m256i src3) +{ + return (src2 & src1) | (src3 & ~src1); +} + +__m256i +avx2_foo2 (__m256i src1, __m256i src2, __m256i src3) +{ + return (src2 & src1) | (~src3 & src1); +} + +__m256i +avx2_foo3 (__m256i src1, __m256i src2, __m256i src3) +{ + return (~src2 & src1) | (src3 & src1); +} + +__m256i +avx2_foo4 (__m256i src1, __m256i src2, __m256i src3) +{ + return src3 & src2 ^ src1; +} + + +void +test_256 (void) +{ + union256i_q q1, q2, q3, res2, exp2; + union256d d1, d2, res1, exp1; + int i, sign = 1; + + for (i = 0; i < 4; i++) + { + d1.a[i] = 12.34 * (i + 2000) * sign; + d2.a[i] = 56.78 * (i - 30) * sign; + q1.a[i] = 12 * (i + 2000) * sign; + q2.a[i] = 56 * (i - 30) * sign; + q3.a[i] = 90 * (i + 40) * sign; + res1.a[i] = DEFAULT_VALUE; + exp1.a[i] = DEFAULT_VALUE; + res2.a[i] = exp2.a[i] = -1; + sign = -sign; + } + + exp1.x = avx2_copysign2_pd (d1.x, d2.x); + res1.x = copysign2_pd (d1.x, d2.x); + if (UNION_CHECK (256, d) (res1, exp1.a)) + abort (); + + exp2.x = avx2_foo1 (q1.x, q2.x, q3.x); + res2.x = foo1 (q1.x, q2.x, q3.x); + if (UNION_CHECK (256, i_q) (res2, exp2.a)) + abort (); + + exp2.x = avx2_foo2 (q1.x, q2.x, q3.x); + res2.x = foo2 (q1.x, q2.x, q3.x); + if (UNION_CHECK (256, i_q) (res2, exp2.a)) + abort (); + + exp2.x = avx2_foo3 (q1.x, q2.x, q3.x); + res2.x = foo3 (q1.x, q2.x, q3.x); + if (UNION_CHECK (256, i_q) (res2, exp2.a)) + abort (); + + exp2.x = avx2_foo4 (q1.x, q2.x, q3.x); + res2.x = foo4 (q1.x, q2.x, q3.x); + if (UNION_CHECK (256, i_q) (res2, exp2.a)) + abort (); + + exp2.x = avx2_foo (q1.x, q2.x, q3.x); + res2.x = foo (q1.x, q2.x, q3.x); + if (UNION_CHECK (256, i_q) (res2, exp2.a)) + abort (); +} + +static void +test_128 () +{} -- 2.18.1