On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > All combinations of and, ior, xor, and not involving two operands can be > expressed that way in a single insn. > > gcc/ > > PR target/93768 > * config/i386/i386.cc (ix86_rtx_costs): Further special-case > bitwise vector operations. > * config/i386/sse.md (*iornot<mode>3): New insn. > (*xnor<mode>3): Likewise. > (*<nlogic><mode>3): Likewise. > (andor): New code iterator. > (nlogic): New code attribute. > (ternlog_nlogic): Likewise. > > gcc/testsuite/ > > PR target/93768 > gcc.target/i386/avx512-binop-not-1.h: New. > gcc.target/i386/avx512-binop-not-2.h: New. > gcc.target/i386/avx512f-orn-si-zmm-1.c: New test. > gcc.target/i386/avx512f-orn-si-zmm-2.c: New test. > --- > The use of VI matches that in e.g. one_cmpl<mode>2 / > <mask_codefor>one_cmpl<mode>2<mask_name> and *andnot<mode>3, despite > (here and there) > - V64QI and V32HI being needlessly excluded when AVX512BW isn't enabled, > - V<n>TI not being covered, > - vector modes more narrow than 16 bytes not being covered. > > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -21178,6 +21178,32 @@ ix86_rtx_costs (rtx x, machine_mode mode > return false; > > case IOR: > + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) > + { > + /* (ior (not ...) ...) can be a single insn in AVX512. */ > + if (GET_CODE (XEXP (x, 0)) == NOT && TARGET_AVX512F > + && (GET_MODE_SIZE (mode) == 64 > + || (TARGET_AVX512VL > + && (GET_MODE_SIZE (mode) == 32 > + || GET_MODE_SIZE (mode) == 16)))) > + { > + rtx right = GET_CODE (XEXP (x, 1)) != NOT > + ? XEXP (x, 1) : XEXP (XEXP (x, 1), 0); > + > + *total = ix86_vec_cost (mode, cost->sse_op) > + + rtx_cost (XEXP (XEXP (x, 0), 0), mode, > + outer_code, opno, speed) > + + rtx_cost (right, mode, outer_code, opno, speed); > + return true; > + } > + *total = ix86_vec_cost (mode, cost->sse_op); > + } > + else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) > + *total = cost->add * 2; > + else > + *total = cost->add; > + return false; > + > case XOR: > if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) > *total = ix86_vec_cost (mode, cost->sse_op); > @@ -21198,11 +21224,20 @@ ix86_rtx_costs (rtx x, machine_mode mode > /* pandn is a single instruction. */ > if (GET_CODE (XEXP (x, 0)) == NOT) > { > + rtx right = XEXP (x, 1); > + > + /* (and (not ...) (not ...)) can be a single insn in AVX512. */ > + if (GET_CODE (right) == NOT && TARGET_AVX512F > + && (GET_MODE_SIZE (mode) == 64 > + || (TARGET_AVX512VL > + && (GET_MODE_SIZE (mode) == 32 > + || GET_MODE_SIZE (mode) == 16)))) > + right = XEXP (right, 0); > + > *total = ix86_vec_cost (mode, cost->sse_op) > + rtx_cost (XEXP (XEXP (x, 0), 0), mode, > outer_code, opno, speed) > - + rtx_cost (XEXP (x, 1), mode, > - outer_code, opno, speed); > + + rtx_cost (right, mode, outer_code, opno, speed); > return true; > } > else if (GET_CODE (XEXP (x, 1)) == NOT) > @@ -21260,8 +21295,25 @@ ix86_rtx_costs (rtx x, machine_mode mode > > case NOT: > if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) > - // vnot is pxor -1. > - *total = ix86_vec_cost (mode, cost->sse_op) + 1; > + { > + /* (not (xor ...)) can be a single insn in AVX512. */ > + if (GET_CODE (XEXP (x, 0)) == XOR && TARGET_AVX512F > + && (GET_MODE_SIZE (mode) == 64 > + || (TARGET_AVX512VL > + && (GET_MODE_SIZE (mode) == 32 > + || GET_MODE_SIZE (mode) == 16)))) > + { > + *total = ix86_vec_cost (mode, cost->sse_op) > + + rtx_cost (XEXP (XEXP (x, 0), 0), mode, > + outer_code, opno, speed) > + + rtx_cost (XEXP (XEXP (x, 0), 1), mode, > + outer_code, opno, speed); > + return true; > + } > + > + // vnot is pxor -1. > + *total = ix86_vec_cost (mode, cost->sse_op) + 1; > + } > else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) > *total = cost->add * 2; > else > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -17616,6 +17616,98 @@ > operands[2] = force_reg (V1TImode, CONSTM1_RTX (V1TImode)); > }) > > +(define_insn "*iornot<mode>3" > + [(set (match_operand:VI 0 "register_operand" "=v,v,v,v") > + (ior:VI > + (not:VI > + (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m")) > + (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))] > + "(<MODE_SIZE> == 64 || TARGET_AVX512VL > + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > + && (register_operand (operands[1], <MODE>mode) > + || register_operand (operands[2], <MODE>mode))" > +{ > + if (!register_operand (operands[1], <MODE>mode)) > + { > + if (TARGET_AVX512VL) > + return "vpternlog<ternlogsuffix>\t{$0xdd, %1, %2, %0|%0, %2, %1, > 0xdd}"; > + return "vpternlog<ternlogsuffix>\t{$0xdd, %g1, %g2, %g0|%g0, %g2, %g1, > 0xdd}"; > + } > + if (TARGET_AVX512VL) > + return "vpternlog<ternlogsuffix>\t{$0xbb, %2, %1, %0|%0, %1, %2, 0xbb}"; > + return "vpternlog<ternlogsuffix>\t{$0xbb, %g2, %g1, %g0|%g0, %g1, %g2, > 0xbb}"; > +} > + [(set_attr "type" "sselog") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set (attr "mode") > + (if_then_else (match_test "TARGET_AVX512VL") > + (const_string "<sseinsnmode>") > + (const_string "XI"))) > + (set (attr "enabled") > + (if_then_else (eq_attr "alternative" "2,3") > + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > + (const_string "*")))]) > + > +(define_insn "*xnor<mode>3" > + [(set (match_operand:VI 0 "register_operand" "=v,v") > + (not:VI > + (xor:VI > + (match_operand:VI 1 "bcst_vector_operand" "%v,v") > + (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] > + "(<MODE_SIZE> == 64 || TARGET_AVX512VL > + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > + && (register_operand (operands[1], <MODE>mode) > + || register_operand (operands[2], <MODE>mode))" > +{ > + if (TARGET_AVX512VL) > + return "vpternlog<ternlogsuffix>\t{$0x99, %2, %1, %0|%0, %1, %2, 0x99}"; > + else > + return "vpternlog<ternlogsuffix>\t{$0x99, %g2, %g1, %g0|%g0, %g1, %g2, > 0x99}"; > +} > + [(set_attr "type" "sselog") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set (attr "mode") > + (if_then_else (match_test "TARGET_AVX512VL") > + (const_string "<sseinsnmode>") > + (const_string "XI"))) > + (set (attr "enabled") > + (if_then_else (eq_attr "alternative" "1") > + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > + (const_string "*")))]) > + > +(define_code_iterator andor [and ior]) > +(define_code_attr nlogic [(and "nor") (ior "nand")]) > +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")]) > + > +(define_insn "*<nlogic><mode>3" > + [(set (match_operand:VI 0 "register_operand" "=v,v") > + (andor:VI > + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) > + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] I'm thinking of doing it in simplify_rtx or gimple match.pd to transform (and (not op1)) (not op2)) -> (not: (ior: op1 op2)) (ior (not op1) (not op2)) -> (not : (and op1 op2))
Even w/o avx512f, the transformation should also benefit since it takes less logic operations 3 -> 2.(or 2 -> 2 for pandn). The other 2 patterns: *xnor<mode>3 and iornot<mode>3 LGTM. > + "(<MODE_SIZE> == 64 || TARGET_AVX512VL > + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) > + && (register_operand (operands[1], <MODE>mode) > + || register_operand (operands[2], <MODE>mode))" > +{ > + if (TARGET_AVX512VL) > + return "vpternlog<ternlogsuffix>\t{$<ternlog_nlogic>, %2, %1, %0|%0, %1, > %2, <ternlog_nlogic>}"; > + else > + return "vpternlog<ternlogsuffix>\t{$<ternlog_nlogic>, %g2, %g1, %g0|%g0, > %g1, %g2, <ternlog_nlogic>}"; > +} > + [(set_attr "type" "sselog") > + (set_attr "length_immediate" "1") > + (set_attr "prefix" "evex") > + (set (attr "mode") > + (if_then_else (match_test "TARGET_AVX512VL") > + (const_string "<sseinsnmode>") > + (const_string "XI"))) > + (set (attr "enabled") > + (if_then_else (eq_attr "alternative" "1") > + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL") > + (const_string "*")))]) > + > (define_mode_iterator AVX512ZEXTMASK > [(DI "TARGET_AVX512BW") (SI "TARGET_AVX512BW") HI]) > > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512-binop-not-1.h > @@ -0,0 +1,13 @@ > +#include <immintrin.h> > + > +#define PASTER2(x,y) x##y > +#define PASTER3(x,y,z) _mm##x##_##y##_##z > +#define OP(vec, op, suffix) PASTER3 (vec, op, suffix) > +#define DUP(vec, suffix, val) PASTER3 (vec, set1, suffix) (val) > + > +type > +foo (type x, SCALAR *f) > +{ > + return OP (vec, op, suffix) (x, OP (vec, xor, suffix) (DUP (vec, suffix, > *f), > + DUP (vec, suffix, > ~0))); > +} > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512-binop-not-2.h > @@ -0,0 +1,13 @@ > +#include <immintrin.h> > + > +#define PASTER2(x,y) x##y > +#define PASTER3(x,y,z) _mm##x##_##y##_##z > +#define OP(vec, op, suffix) PASTER3 (vec, op, suffix) > +#define DUP(vec, suffix, val) PASTER3 (vec, set1, suffix) (val) > + > +type > +foo (type x, SCALAR *f) > +{ > + return OP (vec, op, suffix) (OP (vec, xor, suffix) (x, DUP (vec, suffix, > ~0)), > + DUP (vec, suffix, *f)); > +} > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512f-orn-si-zmm-1.c > @@ -0,0 +1,12 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */ > +/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]+\\\$0xdd, > \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */ > +/* { dg-final { scan-assembler-not "vpbroadcast" } } */ > + > +#define type __m512i > +#define vec 512 > +#define op or > +#define suffix epi32 > +#define SCALAR int > + > +#include "avx512-binop-not-1.h" > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512f-orn-si-zmm-2.c > @@ -0,0 +1,12 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */ > +/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]+\\\$0xbb, > \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */ > +/* { dg-final { scan-assembler-not "vpbroadcast" } } */ > + > +#define type __m512i > +#define vec 512 > +#define op or > +#define suffix epi32 > +#define SCALAR int > + > +#include "avx512-binop-not-2.h" > -- BR, Hongtao