On Sat, Jul 13, 2024 at 3:44 PM Hongyu Wang <hongyu.w...@intel.com> wrote: > > Hi, > > According to the instruction spec of AVX512BF16, the conversion from float > to BF16 is not a simple truncation. It has special handling for > denormal/NaN, and even for a normal float it will add an extra bias according > to the least significant bit of the bf16 number. This means we cannot use > vcvtne2ps2bf16 for any bf16 vector shuffle. > The optimization introduced in r15-1368 adds a specific split to convert > HImode permutation with this instruction, so remove it and treat the > BFmode permutation the same as HFmode. > > Bootstrapped & regtested on x86_64-pc-linux-gnu. OK for trunk? Could you just git revert 6d0b7b69d143025f271d0041cfa29cf26e6c343b? > > gcc/ChangeLog: > > PR target/115889 > * config/i386/predicates.md (vcvtne2ps2bf_parallel): Remove. > * config/i386/sse.md (hi_cvt_bf): Remove. > (HI_CVT_BF): Likewise. > (vpermt2_sepcial_bf16_shuffle_<mode>): Likewise. > > gcc/testsuite/ChangeLog: > > PR target/115889 > * gcc.target/i386/vpermt2-special-bf16-shufflue.c: Adjust option > and output scan. 
> --- > gcc/config/i386/predicates.md | 11 ------ > gcc/config/i386/sse.md | 35 ------------------- > .../i386/vpermt2-special-bf16-shufflue.c | 5 ++- > 3 files changed, 2 insertions(+), 49 deletions(-) > > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md > index a894847adaf..5d0bb1e0f54 100644 > --- a/gcc/config/i386/predicates.md > +++ b/gcc/config/i386/predicates.md > @@ -2327,14 +2327,3 @@ (define_predicate "apx_ndd_add_memory_operand" > > return true; > }) > - > -;; Check that each element is odd and incrementally increasing from 1 > -(define_predicate "vcvtne2ps2bf_parallel" > - (and (match_code "const_vector") > - (match_code "const_int" "a")) > -{ > - for (int i = 0; i < XVECLEN (op, 0); ++i) > - if (INTVAL (XVECEXP (op, 0, i)) != (2 * i + 1)) > - return false; > - return true; > -}) > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index b3b4697924b..c134494cd20 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -31460,38 +31460,3 @@ (define_insn "vpdp<vpdpwprodtype>_<mode>" > "TARGET_AVXVNNIINT16" > "vpdp<vpdpwprodtype>\t{%3, %2, %0|%0, %2, %3}" > [(set_attr "prefix" "vex")]) > - > -(define_mode_attr hi_cvt_bf > - [(V8HI "v8bf") (V16HI "v16bf") (V32HI "v32bf")]) > - > -(define_mode_attr HI_CVT_BF > - [(V8HI "V8BF") (V16HI "V16BF") (V32HI "V32BF")]) > - > -(define_insn_and_split "vpermt2_sepcial_bf16_shuffle_<mode>" > - [(set (match_operand:VI2_AVX512F 0 "register_operand") > - (unspec:VI2_AVX512F > - [(match_operand:VI2_AVX512F 1 "vcvtne2ps2bf_parallel") > - (match_operand:VI2_AVX512F 2 "register_operand") > - (match_operand:VI2_AVX512F 3 "nonimmediate_operand")] > - UNSPEC_VPERMT2))] > - "TARGET_AVX512VL && TARGET_AVX512BF16 && ix86_pre_reload_split ()" > - "#" > - "&& 1" > - [(const_int 0)] > -{ > - rtx op0 = gen_reg_rtx (<HI_CVT_BF>mode); > - operands[2] = lowpart_subreg (<ssePSmode>mode, > - force_reg (<MODE>mode, operands[2]), > - <MODE>mode); > - operands[3] = lowpart_subreg 
(<ssePSmode>mode, > - force_reg (<MODE>mode, operands[3]), > - <MODE>mode); > - > - emit_insn (gen_avx512f_cvtne2ps2bf16_<hi_cvt_bf>(op0, > - operands[3], > - operands[2])); > - emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, > - <HI_CVT_BF>mode)); > - DONE; > -} > -[(set_attr "mode" "<sseinsnmode>")]) > diff --git a/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c > b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c > index 5c65f2a9884..4cbc85735de 100755 > --- a/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c > +++ b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c > @@ -1,7 +1,6 @@ > /* { dg-do compile } */ > -/* { dg-options "-O2 -mavx512bf16 -mavx512vl" } */ > -/* { dg-final { scan-assembler-not "vpermi2b" } } */ > -/* { dg-final { scan-assembler-times "vcvtne2ps2bf16" 3 } } */ > +/* { dg-options "-O2 -mavx512vbmi -mavx512vl" } */ > +/* { dg-final { scan-assembler-times "vpermi2w" 3 } } */ > > typedef __bf16 v8bf __attribute__((vector_size(16))); > typedef __bf16 v16bf __attribute__((vector_size(32))); > -- > 2.34.1 >
-- BR, Hongtao