On Fri, Aug 4, 2023 at 1:30 AM Alexander Monakov <amona...@ispras.ru> wrote: > > > On Thu, 27 Jul 2023, Liu, Hongtao via Gcc-patches wrote: > > > > +;; If the first and the second operands of ternlog are invariant and ;; > > > +the third operand is memory ;; then we should add load third operand > > > +from memory to register and ;; replace first and second operands with > > > +this register (define_split > > > + [(set (match_operand:V 0 "register_operand") > > > + (unspec:V > > > + [(match_operand:V 1 "register_operand") > > > + (match_operand:V 2 "register_operand") > > > + (match_operand:V 3 "memory_operand") > > > + (match_operand:SI 4 "const_0_to_255_operand")] > > > + UNSPEC_VTERNLOG))] > > > + "ternlog_invariant_operand_mask (operands) == 3 && !reload_completed" > > Maybe better with "!reload_completed && ternlog_invariant_operand_mask > > (operands) == 3" > > I made this change (in both places), plus some style TLC. Ok to apply? Ok. > > From d24304a9efd049e8db6df5ac78de8ca2d941a3c7 Mon Sep 17 00:00:00 2001 > From: Yan Simonaytes <simonaytes....@ispras.ru> > Date: Tue, 25 Jul 2023 20:43:19 +0300 > Subject: [PATCH] Eliminate irrelevant operands of VPTERNLOG > > As mentioned in PR 110202, GCC may be presented with input where control > word of the VPTERNLOG intrinsic implies that some of its operands do not > affect the result. In that case, we can eliminate irrelevant operands > of the instruction by substituting any other operand in their place. > This removes false dependencies. > > For instance, instead of (252 = 0xfc = _MM_TERNLOG_A | _MM_TERNLOG_B) > > vpternlogq $252, %zmm2, %zmm1, %zmm0 > > emit > > vpternlogq $252, %zmm0, %zmm1, %zmm0 > > When VPTERNLOG is invariant w.r.t first and second operands, and the > third operand is memory, load memory into the output operand first, i.e. > instead of (85 = 0x55 = ~_MM_TERNLOG_C) > > vpternlogq $85, (%rdi), %zmm1, %zmm0 > > emit > > vmovdqa64 (%rdi), %zmm0 > vpternlogq $85, %zmm0, %zmm0, %zmm0 > > gcc/ChangeLog: > > * config/i386/i386-protos.h (vpternlog_irrelevant_operand_mask): > Declare. > (substitute_vpternlog_operands): Declare. > * config/i386/i386.cc (vpternlog_irrelevant_operand_mask): New > helper. > (substitute_vpternlog_operands): New function. Use them... > * config/i386/sse.md: ... here in new VPTERNLOG define_splits. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/invariant-ternlog-1.c: New test. > * gcc.target/i386/invariant-ternlog-2.c: New test. > --- > gcc/config/i386/i386-protos.h | 3 ++ > gcc/config/i386/i386.cc | 43 +++++++++++++++++++ > gcc/config/i386/sse.md | 42 ++++++++++++++++++ > .../gcc.target/i386/invariant-ternlog-1.c | 21 +++++++++ > .../gcc.target/i386/invariant-ternlog-2.c | 12 ++++++ > 5 files changed, 121 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/invariant-ternlog-2.c > > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > index 27fe73ca65..12e6ff0ebc 100644 > --- a/gcc/config/i386/i386-protos.h > +++ b/gcc/config/i386/i386-protos.h > @@ -70,6 +70,9 @@ extern machine_mode ix86_cc_mode (enum rtx_code, rtx, rtx); > extern int avx_vpermilp_parallel (rtx par, machine_mode mode); > extern int avx_vperm2f128_parallel (rtx par, machine_mode mode); > > +extern int vpternlog_irrelevant_operand_mask (rtx[]); > +extern void substitute_vpternlog_operands (rtx[]); > + > extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx); > extern bool ix86_expand_set_or_cpymem (rtx, rtx, rtx, rtx, rtx, rtx, > rtx, rtx, rtx, rtx, bool); > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > index 32851a514a..9a7c1135a0 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -19420,6 +19420,49 @@ avx_vperm2f128_parallel (rtx par, machine_mode mode) > return mask + 1; > } > > +/* Return a mask of VPTERNLOG operands that do not affect output. */ > + > +int > +vpternlog_irrelevant_operand_mask (rtx *operands) > +{ > + int mask = 0; > + int imm8 = XINT (operands[4], 0); > + > + if (((imm8 >> 4) & 0x0F) == (imm8 & 0x0F)) > + mask |= 1; > + if (((imm8 >> 2) & 0x33) == (imm8 & 0x33)) > + mask |= 2; > + if (((imm8 >> 1) & 0x55) == (imm8 & 0x55)) > + mask |= 4; > + > + return mask; > +} > + > +/* Eliminate false dependencies on operands that do not affect output > + by substituting other operands of a VPTERNLOG. */ > + > +void > +substitute_vpternlog_operands (rtx *operands) > +{ > + int mask = vpternlog_irrelevant_operand_mask (operands); > + > + if (mask & 1) /* The first operand is irrelevant. */ > + operands[1] = operands[2]; > + > + if (mask & 2) /* The second operand is irrelevant. */ > + operands[2] = operands[1]; > + > + if (mask & 4) /* The third operand is irrelevant. */ > + operands[3] = operands[1]; > + else if (REG_P (operands[3])) > + { > + if (mask & 1) > + operands[1] = operands[3]; > + if (mask & 2) > + operands[2] = operands[3]; > + } > +} > + > /* Return a register priority for hard reg REGNO. */ > static int > ix86_register_priority (int hard_regno) > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index f793258b6c..1e2ec4bedc 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -12627,6 +12627,48 @@ (define_insn "*<avx512>_vternlog<mode>_all" > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > (const_string "*")))]) > > +;; When VPTERNLOG happens to be invariant w.r.t first and second operands, > +;; and the third operand is memory, eliminate false dependencies by loading > +;; memory into the output operand first. > +(define_split > + [(set (match_operand:V 0 "register_operand") > + (unspec:V > + [(match_operand:V 1 "register_operand") > + (match_operand:V 2 "register_operand") > + (match_operand:V 3 "memory_operand") > + (match_operand:SI 4 "const_0_to_255_operand")] > + UNSPEC_VTERNLOG))] > + "!reload_completed && vpternlog_irrelevant_operand_mask (operands) == 3" > + [(set (match_dup 0) > + (match_dup 3)) > + (set (match_dup 0) > + (unspec:V > + [(match_dup 0) > + (match_dup 0) > + (match_dup 0) > + (match_dup 4)] > + UNSPEC_VTERNLOG))]) > + > +;; Eliminate false dependencies when VPTERNLOG is invariant w.r.t any > +;; of input operands (except the case handled in the above split). > +(define_split > + [(set (match_operand:V 0 "register_operand") > + (unspec:V > + [(match_operand:V 1 "register_operand") > + (match_operand:V 2 "register_operand") > + (match_operand:V 3 "nonimmediate_operand") > + (match_operand:SI 4 "const_0_to_255_operand")] > + UNSPEC_VTERNLOG))] > + "!reload_completed && vpternlog_irrelevant_operand_mask (operands) != 0" > + [(set (match_dup 0) > + (unspec:V > + [(match_dup 1) > + (match_dup 2) > + (match_dup 3) > + (match_dup 4)] > + UNSPEC_VTERNLOG))] > + "substitute_vpternlog_operands (operands);") > + > ;; There must be lots of other combinations like > ;; > ;; (any_logic:V > diff --git a/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c > b/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c > new file mode 100644 > index 0000000000..21051c6bba > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c > @@ -0,0 +1,21 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512f -O2" } */ > +/* { dg-final { scan-assembler-times "vmovdqa" 4 } } */ > +/* { dg-final { scan-assembler-times {vpternlog[^\n\r]*\(%rdx\)} 2 } } */ > + > +#include <immintrin.h> > + > +__m512i f(__m512i* a, __m512i* b, __m512i* c) > +{ > + return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], ~_MM_TERNLOG_B | > ~_MM_TERNLOG_C); > +} > + > +__m512i g(__m512i* a, __m512i* b, __m512i* c) > +{ > + return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], ~_MM_TERNLOG_A | > ~_MM_TERNLOG_C); > +} > + > +__m512i h(__m512i* a, __m512i* b, __m512i* c) > +{ > + return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], ~_MM_TERNLOG_A | > ~_MM_TERNLOG_B); > +} > diff --git a/gcc/testsuite/gcc.target/i386/invariant-ternlog-2.c > b/gcc/testsuite/gcc.target/i386/invariant-ternlog-2.c > new file mode 100644 > index 0000000000..d70bbb0239 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/invariant-ternlog-2.c > @@ -0,0 +1,12 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512f -O2" } */ > +/* { dg-final { scan-assembler-times "vmovdqa" 1 } } */ > +/* { dg-final { scan-assembler "vpternlog.*zmm0.*zmm0.*zmm0" } } */ > + > +#include <immintrin.h> > + > +__m512i f(__m512i* a, __m512i* b, __m512i* c) > +{ > + return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], ~_MM_TERNLOG_C); > +} > + > -- > 2.39.2 >
-- BR, Hongtao