On Fri, Jun 14, 2024 at 9:35 AM Levy Hsu <ad...@levyhsu.com> wrote: > > This patch updates the GCC x86 backend to efficiently handle > odd, incrementally increasing permutations of BF16 vectors > using the cvtne2ps2bf16 instruction. > It modifies ix86_vectorize_vec_perm_const to support these operations > and adds a specific predicate to ensure proper sequence handling. > > Bootstrapped and tested on x86_64-linux-gnu, OK for trunk? Ok. > > gcc/ChangeLog: > > * config/i386/i386-expand.cc > (ix86_vectorize_vec_perm_const): Convert BF to HI using subreg. > * config/i386/predicates.md > (vcvtne2ps2bf_parallel): New predicate matching odd > increasing perm. > * config/i386/sse.md > (vpermt2_sepcial_bf16_shuffle_<mode>): New define_insn_and_split. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/vpermt2-special-bf16-shufflue.c: New test. > --- > gcc/config/i386/i386-expand.cc | 4 +-- > gcc/config/i386/predicates.md | 11 ++++++ > gcc/config/i386/sse.md | 35 +++++++++++++++++++ > .../i386/vpermt2-special-bf16-shufflue.c | 27 ++++++++++++++ > 4 files changed, 75 insertions(+), 2 deletions(-) > create mode 100755 > gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > index 312329e550b..3d599c0651a 100644 > --- a/gcc/config/i386/i386-expand.cc > +++ b/gcc/config/i386/i386-expand.cc > @@ -23657,8 +23657,8 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, > machine_mode op_mode, > if (GET_MODE_SIZE (vmode) == 64 && !TARGET_EVEX512) > return false; > > - /* For HF mode vector, convert it to HI using subreg. */ > - if (GET_MODE_INNER (vmode) == HFmode) > + /* For HF and BF mode vector, convert it to HI using subreg. 
*/ > + if (GET_MODE_INNER (vmode) == HFmode || GET_MODE_INNER (vmode) == BFmode) > { > machine_mode orig_mode = vmode; > vmode = mode_for_vector (HImode, > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md > index 7afe3100cb7..1676c50de71 100644 > --- a/gcc/config/i386/predicates.md > +++ b/gcc/config/i386/predicates.md > @@ -2322,3 +2322,14 @@ > > return true; > }) > + > +;; Check that each element is odd and incrementally increasing from 1 > +(define_predicate "vcvtne2ps2bf_parallel" > + (and (match_code "const_vector") > + (match_code "const_int" "a")) > +{ > + for (int i = 0; i < XVECLEN (op, 0); ++i) > + if (INTVAL (XVECEXP (op, 0, i)) != (2 * i + 1)) > + return false; > + return true; > +}) > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index 680a46a0b08..5ddd1c0a778 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -30698,3 +30698,38 @@ > "TARGET_AVXVNNIINT16" > "vpdp<vpdpwprodtype>\t{%3, %2, %0|%0, %2, %3}" > [(set_attr "prefix" "vex")]) > + > +(define_mode_attr hi_cvt_bf > + [(V8HI "v8bf") (V16HI "v16bf") (V32HI "v32bf")]) > + > +(define_mode_attr HI_CVT_BF > + [(V8HI "V8BF") (V16HI "V16BF") (V32HI "V32BF")]) > + > +(define_insn_and_split "vpermt2_sepcial_bf16_shuffle_<mode>" > + [(set (match_operand:VI2_AVX512F 0 "register_operand") > + (unspec:VI2_AVX512F > + [(match_operand:VI2_AVX512F 1 "vcvtne2ps2bf_parallel") > + (match_operand:VI2_AVX512F 2 "register_operand") > + (match_operand:VI2_AVX512F 3 "nonimmediate_operand")] > + UNSPEC_VPERMT2))] > + "TARGET_AVX512VL && TARGET_AVX512BF16 && ix86_pre_reload_split ()" > + "#" > + "&& 1" > + [(const_int 0)] > +{ > + rtx op0 = gen_reg_rtx (<HI_CVT_BF>mode); > + operands[2] = lowpart_subreg (<ssePSmode>mode, > + force_reg (<MODE>mode, operands[2]), > + <MODE>mode); > + operands[3] = lowpart_subreg (<ssePSmode>mode, > + force_reg (<MODE>mode, operands[3]), > + <MODE>mode); > + > + emit_insn (gen_avx512f_cvtne2ps2bf16_<hi_cvt_bf>(op0, > + 
operands[3], > + operands[2])); > + emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, > + <HI_CVT_BF>mode)); > + DONE; > +} > +[(set_attr "mode" "<sseinsnmode>")]) > diff --git a/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c > b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c > new file mode 100755 > index 00000000000..5c65f2a9884 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c > @@ -0,0 +1,27 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx512bf16 -mavx512vl" } */ > +/* { dg-final { scan-assembler-not "vpermi2b" } } */ > +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16" 3 } } */ > + > +typedef __bf16 v8bf __attribute__((vector_size(16))); > +typedef __bf16 v16bf __attribute__((vector_size(32))); > +typedef __bf16 v32bf __attribute__((vector_size(64))); > + > +v8bf foo0(v8bf a, v8bf b) > +{ > + return __builtin_shufflevector(a, b, 1, 3, 5, 7, 9, 11, 13, 15); > +} > + > +v16bf foo1(v16bf a, v16bf b) > +{ > + return __builtin_shufflevector(a, b, 1, 3, 5, 7, 9, 11, 13, 15, > + 17, 19, 21, 23, 25, 27, 29, 31); > +} > + > +v32bf foo2(v32bf a, v32bf b) > +{ > + return __builtin_shufflevector(a, b, 1, 3, 5, 7, 9, 11, 13, 15, > + 17, 19, 21, 23, 25, 27, 29, 31, > + 33, 35, 37, 39, 41, 43, 45, 47, > + 49, 51, 53, 55, 57, 59, 61, 63); > +} > -- > 2.31.1 >
-- BR, Hongtao