On Fri, Aug 4, 2023 at 1:30 AM Alexander Monakov <amona...@ispras.ru> wrote:
>
>
> On Thu, 27 Jul 2023, Liu, Hongtao via Gcc-patches wrote:
>
> > > +;; If the first and the second operands of ternlog are invariant and
> > > +;; the third operand is memory
> > > +;; then we should add load third operand from memory to register and
> > > +;; replace first and second operands with this register
> > > +(define_split
> > > +  [(set (match_operand:V 0 "register_operand")
> > > +   (unspec:V
> > > +     [(match_operand:V 1 "register_operand")
> > > +      (match_operand:V 2 "register_operand")
> > > +      (match_operand:V 3 "memory_operand")
> > > +      (match_operand:SI 4 "const_0_to_255_operand")]
> > > +     UNSPEC_VTERNLOG))]
> > > +  "ternlog_invariant_operand_mask (operands) == 3 && !reload_completed"
> > Maybe better with
> > "!reload_completed && ternlog_invariant_operand_mask (operands) == 3"
>
> I made this change (in both places), plus some style TLC. Ok to apply?
Ok.
>
> From d24304a9efd049e8db6df5ac78de8ca2d941a3c7 Mon Sep 17 00:00:00 2001
> From: Yan Simonaytes <simonaytes....@ispras.ru>
> Date: Tue, 25 Jul 2023 20:43:19 +0300
> Subject: [PATCH] Eliminate irrelevant operands of VPTERNLOG
>
> As mentioned in PR 110202, GCC may be presented with input where control
> word of the VPTERNLOG intrinsic implies that some of its operands do not
> affect the result.  In that case, we can eliminate irrelevant operands
> of the instruction by substituting any other operand in their place.
> This removes false dependencies.
>
> For instance, instead of (252 = 0xfc = _MM_TERNLOG_A | _MM_TERNLOG_B)
>
>         vpternlogq      $252, %zmm2, %zmm1, %zmm0
>
> emit
>
>         vpternlogq      $252, %zmm0, %zmm1, %zmm0
>
> When VPTERNLOG is invariant w.r.t first and second operands, and the
> third operand is memory, load memory into the output operand first, i.e.
> instead of (85 = 0x55 = ~_MM_TERNLOG_C)
>
>         vpternlogq      $85, (%rdi), %zmm1, %zmm0
>
> emit
>
>         vmovdqa64       (%rdi), %zmm0
>         vpternlogq      $85, %zmm0, %zmm0, %zmm0
>
> gcc/ChangeLog:
>
>         * config/i386/i386-protos.h (vpternlog_irrelevant_operand_mask):
>         Declare.
>         (substitute_vpternlog_operands): Declare.
>         * config/i386/i386.cc (vpternlog_irrelevant_operand_mask): New
>         helper.
>         (substitute_vpternlog_operands): New function.  Use them...
>         * config/i386/sse.md: ... here in new VPTERNLOG define_splits.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/invariant-ternlog-1.c: New test.
>         * gcc.target/i386/invariant-ternlog-2.c: New test.
> ---
>  gcc/config/i386/i386-protos.h                 |  3 ++
>  gcc/config/i386/i386.cc                       | 43 +++++++++++++++++++
>  gcc/config/i386/sse.md                        | 42 ++++++++++++++++++
>  .../gcc.target/i386/invariant-ternlog-1.c     | 21 +++++++++
>  .../gcc.target/i386/invariant-ternlog-2.c     | 12 ++++++
>  5 files changed, 121 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/invariant-ternlog-2.c
>
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index 27fe73ca65..12e6ff0ebc 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -70,6 +70,9 @@ extern machine_mode ix86_cc_mode (enum rtx_code, rtx, rtx);
>  extern int avx_vpermilp_parallel (rtx par, machine_mode mode);
>  extern int avx_vperm2f128_parallel (rtx par, machine_mode mode);
>
> +extern int vpternlog_irrelevant_operand_mask (rtx[]);
> +extern void substitute_vpternlog_operands (rtx[]);
> +
>  extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx);
>  extern bool ix86_expand_set_or_cpymem (rtx, rtx, rtx, rtx, rtx, rtx,
>                                        rtx, rtx, rtx, rtx, bool);
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 32851a514a..9a7c1135a0 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -19420,6 +19420,49 @@ avx_vperm2f128_parallel (rtx par, machine_mode mode)
>    return mask + 1;
>  }
>
> +/* Return a mask of VPTERNLOG operands that do not affect output.  */
> +
> +int
> +vpternlog_irrelevant_operand_mask (rtx *operands)
> +{
> +  int mask = 0;
> +  int imm8 = XINT (operands[4], 0);
> +
> +  if (((imm8 >> 4) & 0x0F) == (imm8 & 0x0F))
> +    mask |= 1;
> +  if (((imm8 >> 2) & 0x33) == (imm8 & 0x33))
> +    mask |= 2;
> +  if (((imm8 >> 1) & 0x55) == (imm8 & 0x55))
> +    mask |= 4;
> +
> +  return mask;
> +}
> +
> +/* Eliminate false dependencies on operands that do not affect output
> +   by substituting other operands of a VPTERNLOG.  */
> +
> +void
> +substitute_vpternlog_operands (rtx *operands)
> +{
> +  int mask = vpternlog_irrelevant_operand_mask (operands);
> +
> +  if (mask & 1) /* The first operand is irrelevant.  */
> +    operands[1] = operands[2];
> +
> +  if (mask & 2) /* The second operand is irrelevant.  */
> +    operands[2] = operands[1];
> +
> +  if (mask & 4) /* The third operand is irrelevant.  */
> +    operands[3] = operands[1];
> +  else if (REG_P (operands[3]))
> +    {
> +      if (mask & 1)
> +       operands[1] = operands[3];
> +      if (mask & 2)
> +       operands[2] = operands[3];
> +    }
> +}
> +
>  /* Return a register priority for hard reg REGNO.  */
>  static int
>  ix86_register_priority (int hard_regno)
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index f793258b6c..1e2ec4bedc 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -12627,6 +12627,48 @@ (define_insn "*<avx512>_vternlog<mode>_all"
>                       (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
>                       (const_string "*")))])
>
> +;; When VPTERNLOG happens to be invariant w.r.t first and second operands,
> +;; and the third operand is memory, eliminate false dependencies by loading
> +;; memory into the output operand first.
> +(define_split
> +  [(set (match_operand:V 0 "register_operand")
> +       (unspec:V
> +         [(match_operand:V 1 "register_operand")
> +          (match_operand:V 2 "register_operand")
> +          (match_operand:V 3 "memory_operand")
> +          (match_operand:SI 4 "const_0_to_255_operand")]
> +         UNSPEC_VTERNLOG))]
> +  "!reload_completed && vpternlog_irrelevant_operand_mask (operands) == 3"
> +  [(set (match_dup 0)
> +       (match_dup 3))
> +   (set (match_dup 0)
> +       (unspec:V
> +         [(match_dup 0)
> +          (match_dup 0)
> +          (match_dup 0)
> +          (match_dup 4)]
> +         UNSPEC_VTERNLOG))])
> +
> +;; Eliminate false dependencies when VPTERNLOG is invariant w.r.t any
> +;; of input operands (except the case handled in the above split).
> +(define_split
> +  [(set (match_operand:V 0 "register_operand")
> +       (unspec:V
> +         [(match_operand:V 1 "register_operand")
> +          (match_operand:V 2 "register_operand")
> +          (match_operand:V 3 "nonimmediate_operand")
> +          (match_operand:SI 4 "const_0_to_255_operand")]
> +         UNSPEC_VTERNLOG))]
> +  "!reload_completed && vpternlog_irrelevant_operand_mask (operands) != 0"
> +  [(set (match_dup 0)
> +       (unspec:V
> +         [(match_dup 1)
> +          (match_dup 2)
> +          (match_dup 3)
> +          (match_dup 4)]
> +         UNSPEC_VTERNLOG))]
> +  "substitute_vpternlog_operands (operands);")
> +
>  ;; There must be lots of other combinations like
>  ;;
>  ;; (any_logic:V
> diff --git a/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c b/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
> new file mode 100644
> index 0000000000..21051c6bba
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times "vmovdqa" 4 } } */
> +/* { dg-final { scan-assembler-times {vpternlog[^\n\r]*\(%rdx\)} 2 } } */
> +
> +#include <immintrin.h>
> +
> +__m512i f(__m512i* a, __m512i* b, __m512i* c)
> +{
> +       return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], ~_MM_TERNLOG_B | ~_MM_TERNLOG_C);
> +}
> +
> +__m512i g(__m512i* a, __m512i* b, __m512i* c)
> +{
> +       return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], ~_MM_TERNLOG_A | ~_MM_TERNLOG_C);
> +}
> +
> +__m512i h(__m512i* a, __m512i* b, __m512i* c)
> +{
> +       return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], ~_MM_TERNLOG_A | ~_MM_TERNLOG_B);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/invariant-ternlog-2.c b/gcc/testsuite/gcc.target/i386/invariant-ternlog-2.c
> new file mode 100644
> index 0000000000..d70bbb0239
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/invariant-ternlog-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times "vmovdqa" 1 } } */
> +/* { dg-final { scan-assembler "vpternlog.*zmm0.*zmm0.*zmm0" } } */
> +
> +#include <immintrin.h>
> +
> +__m512i f(__m512i* a, __m512i* b, __m512i* c)
> +{
> +       return _mm512_ternarylogic_epi64 (a[0], b[0], c[0], ~_MM_TERNLOG_C);
> +}
> +
> --
> 2.39.2
>


-- 
BR,
Hongtao

Reply via email to