LGTM
On Wed, Jun 28, 2023 at 4:28 PM Juzhe-Zhong <juzhe.zh...@rivai.ai> wrote: > > This patch adds combine pattern as follows: > > 1. (set (reg) (fma (float_extend:reg)(float_extend:reg)(reg))) > This pattern allows combine: vfwcvt + vfwcvt + vfmacc ==> vwfmacc. > > 2. (set (reg) (fma (float_extend:reg)(reg)(reg))) > This pattern is the intermediate IR that enhances the combine > optimizations. > Since for the complicate situation, combine pass can not combine both > operands > of multiplication at the first time, it will try to first combine at the > first > stage: (set (reg) (fma (float_extend:reg)(reg)(reg))). Then combine another > extension of the other operand at the second stage. > > This can enhance combine optimization for the following case: > #define TEST_TYPE(TYPE1, TYPE2) > \ > __attribute__ ((noipa)) void vwadd_##TYPE1_##TYPE2 ( > \ > TYPE1 *__restrict dst, TYPE1 *__restrict dst2, TYPE1 *__restrict dst3, > \ > TYPE1 *__restrict dst4, TYPE2 *__restrict a, TYPE2 *__restrict b, > \ > TYPE2 *__restrict a2, TYPE2 *__restrict b2, int n) > \ > { > \ > for (int i = 0; i < n; i++) > \ > { > \ > dst[i] += (TYPE1) a[i] * (TYPE1) b[i]; > \ > dst2[i] += (TYPE1) a2[i] * (TYPE1) b[i]; > \ > dst3[i] += (TYPE1) a2[i] * (TYPE1) a[i]; > \ > dst4[i] += (TYPE1) a[i] * (TYPE1) b2[i]; > \ > } > \ > } > > #define TEST_ALL() > \ > TEST_TYPE (int16_t, int8_t) > \ > TEST_TYPE (uint16_t, uint8_t) > \ > TEST_TYPE (int32_t, int16_t) > \ > TEST_TYPE (uint32_t, uint16_t) > \ > TEST_TYPE (int64_t, int32_t) > \ > TEST_TYPE (uint64_t, uint32_t) > \ > TEST_TYPE (float, _Float16) > \ > TEST_TYPE (double, float) > > TEST_ALL () > > gcc/ChangeLog: > > * config/riscv/autovec-opt.md (*double_widen_fma<mode>): New pattern. > (*single_widen_fma<mode>): Ditto. > > gcc/testsuite/ChangeLog: > > * gcc.target/riscv/rvv/autovec/widen/widen-8.c: Add floating-point. > * gcc.target/riscv/rvv/autovec/widen/widen-complicate-5.c: Ditto. > * gcc.target/riscv/rvv/autovec/widen/widen_run-8.c: Ditto. 
> * gcc.target/riscv/rvv/autovec/widen/widen_run_zvfh-8.c: New test. > > --- > gcc/config/riscv/autovec-opt.md | 58 +++++++++++++++++++ > .../riscv/rvv/autovec/widen/widen-8.c | 7 ++- > .../rvv/autovec/widen/widen-complicate-5.c | 7 ++- > .../riscv/rvv/autovec/widen/widen_run-8.c | 5 +- > .../rvv/autovec/widen/widen_run_zvfh-8.c | 32 ++++++++++ > 5 files changed, 103 insertions(+), 6 deletions(-) > create mode 100644 > gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run_zvfh-8.c > > diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md > index 1fcd55ac2a0..1a1cef0eaa5 100644 > --- a/gcc/config/riscv/autovec-opt.md > +++ b/gcc/config/riscv/autovec-opt.md > @@ -444,3 +444,61 @@ > } > [(set_attr "type" "vfwmul") > (set_attr "mode" "<MODE>")]) > + > +;; ------------------------------------------------------------------------- > +;; ---- [FP] VFWMACC > +;; ------------------------------------------------------------------------- > +;; Includes: > +;; - vfwmacc.vv > +;; ------------------------------------------------------------------------- > + > +;; Combine ext + ext + fma ===> widen fma. 
> +;; In most circumstances, LoopVectorizer will generate the following IR: > +;; vect__8.176_40 = (vector([2,2]) double) vect__7.175_41; > +;; vect__11.180_35 = (vector([2,2]) double) vect__10.179_36; > +;; vect__13.182_33 = .FMA (vect__11.180_35, vect__8.176_40, vect__4.172_45); > +(define_insn_and_split "*double_widen_fma<mode>" > + [(set (match_operand:VWEXTF 0 "register_operand") > + (fma:VWEXTF > + (float_extend:VWEXTF > + (match_operand:<V_DOUBLE_TRUNC> 2 "register_operand")) > + (float_extend:VWEXTF > + (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand")) > + (match_operand:VWEXTF 1 "register_operand")))] > + "TARGET_VECTOR && can_create_pseudo_p ()" > + "#" > + "&& 1" > + [(const_int 0)] > + { > + riscv_vector::emit_vlmax_fp_ternary_insn (code_for_pred_widen_mul (PLUS, > <MODE>mode), > + riscv_vector::RVV_WIDEN_TERNOP, > operands); > + DONE; > + } > + [(set_attr "type" "vfwmuladd") > + (set_attr "mode" "<V_DOUBLE_TRUNC>")]) > + > +;; This helps to match ext + fma. > +(define_insn_and_split "*single_widen_fma<mode>" > + [(set (match_operand:VWEXTF 0 "register_operand") > + (fma:VWEXTF > + (float_extend:VWEXTF > + (match_operand:<V_DOUBLE_TRUNC> 2 "register_operand")) > + (match_operand:VWEXTF 3 "register_operand") > + (match_operand:VWEXTF 1 "register_operand")))] > + "TARGET_VECTOR && can_create_pseudo_p ()" > + "#" > + "&& 1" > + [(const_int 0)] > + { > + insn_code icode = code_for_pred_extend (<MODE>mode); > + rtx tmp = gen_reg_rtx (<MODE>mode); > + rtx ext_ops[] = {tmp, operands[2]}; > + riscv_vector::emit_vlmax_insn (icode, riscv_vector::RVV_UNOP, ext_ops); > + > + rtx dst = expand_ternary_op (<MODE>mode, fma_optab, tmp, operands[3], > + operands[1], operands[0], 0); > + emit_move_insn (operands[0], dst); > + DONE; > + } > + [(set_attr "type" "vfwmuladd") > + (set_attr "mode" "<V_DOUBLE_TRUNC>")]) > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-8.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-8.c > index 
f3ca07c02e0..8f41bdfdec2 100644 > --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-8.c > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-8.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d > --param=riscv-autovec-preference=scalable" } */ > +/* { dg-additional-options "-march=rv32gcv_zvfh -mabi=ilp32d > --param=riscv-autovec-preference=scalable -ffast-math" } */ > > #include <stdint-gcc.h> > > @@ -19,9 +19,12 @@ > TEST_TYPE (int32_t, int16_t) > \ > TEST_TYPE (uint32_t, uint16_t) > \ > TEST_TYPE (int64_t, int32_t) > \ > - TEST_TYPE (uint64_t, uint32_t) > + TEST_TYPE (uint64_t, uint32_t) > \ > + TEST_TYPE (float, _Float16) > \ > + TEST_TYPE (double, float) > > TEST_ALL () > > /* { dg-final { scan-assembler-times {\tvwmacc\.vv} 3 } } */ > /* { dg-final { scan-assembler-times {\tvwmaccu\.vv} 3 } } */ > +/* { dg-final { scan-assembler-times {\tvfwmacc\.vv} 2 } } */ > diff --git > a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-complicate-5.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-complicate-5.c > index 187b6db21fd..3ff8483cde4 100644 > --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-complicate-5.c > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-complicate-5.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d > --param=riscv-autovec-preference=scalable" } */ > +/* { dg-additional-options "-march=rv32gcv_zvfh -mabi=ilp32d > --param=riscv-autovec-preference=scalable -ffast-math" } */ > > #include <stdint-gcc.h> > > @@ -24,9 +24,12 @@ > TEST_TYPE (int32_t, int16_t) > \ > TEST_TYPE (uint32_t, uint16_t) > \ > TEST_TYPE (int64_t, int32_t) > \ > - TEST_TYPE (uint64_t, uint32_t) > + TEST_TYPE (uint64_t, uint32_t) > \ > + TEST_TYPE (float, _Float16) > \ > + TEST_TYPE (double, float) > > TEST_ALL () > > /* { dg-final { scan-assembler-times {\tvwmacc\.vv} 12 } } */ > /* { dg-final { 
scan-assembler-times {\tvwmaccu\.vv} 12 } } */ > +/* { dg-final { scan-assembler-times {\tvfwmacc\.vv} 8 } } */ > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run-8.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run-8.c > index f4840d30dc2..15095002154 100644 > --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run-8.c > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run-8.c > @@ -1,5 +1,5 @@ > /* { dg-do run { target { riscv_vector } } } */ > -/* { dg-additional-options "--param=riscv-autovec-preference=scalable" } */ > +/* { dg-additional-options "--param=riscv-autovec-preference=scalable > -ffast-math" } */ > > #include <assert.h> > #include "widen-8.c" > @@ -29,7 +29,8 @@ > RUN (int32_t, int16_t, -32768) > \ > RUN (uint32_t, uint16_t, 65535) > \ > RUN (int64_t, int32_t, -2147483648) > \ > - RUN (uint64_t, uint32_t, 4294967295) > + RUN (uint64_t, uint32_t, 4294967295) > \ > + RUN (double, float, -2147483648) > > int > main () > diff --git > a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run_zvfh-8.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run_zvfh-8.c > new file mode 100644 > index 00000000000..63563b86e7c > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run_zvfh-8.c > @@ -0,0 +1,32 @@ > +/* { dg-do run { target { riscv_vector && riscv_zvfh_hw } } } */ > +/* { dg-additional-options "--param=riscv-autovec-preference=scalable > -ffast-math" } */ > + > +#include <assert.h> > +#include "widen-8.c" > + > +#define SZ 512 > + > +#define RUN(TYPE1, TYPE2, LIMIT) > \ > + TYPE2 a##TYPE2[SZ]; > \ > + TYPE2 b##TYPE2[SZ]; > \ > + TYPE1 dst##TYPE1[SZ]; > \ > + TYPE1 dst2##TYPE1[SZ]; > \ > + for (int i = 0; i < SZ; i++) > \ > + { > \ > + a##TYPE2[i] = LIMIT + i % 8723; > \ > + b##TYPE2[i] = LIMIT + i & 1964; > \ > + dst##TYPE1[i] = LIMIT + i & 628; > \ > + dst2##TYPE1[i] = LIMIT + i & 628; > \ > + } > \ > + vwmacc_##TYPE1_##TYPE2 (dst##TYPE1, a##TYPE2, 
b##TYPE2, SZ); > \ > + for (int i = 0; i < SZ; i++) > \ > + assert (dst##TYPE1[i] > \ > + == ((TYPE1) a##TYPE2[i] * (TYPE1) b##TYPE2[i]) + dst2##TYPE1[i]); > + > +#define RUN_ALL() RUN (float, _Float16, -32768) > + > +int > +main () > +{ > + RUN_ALL () > +} > -- > 2.36.1 >