Richard Sandiford <richard.sandif...@arm.com> writes: > Lehua Ding <lehua.d...@rivai.ai> writes: >> Hi, >> >> This patch adds support that tries to fold `MIN (poly, poly)` to >> a constant. Consider the following C Code: >> >> ``` >> void foo2 (int* restrict a, int* restrict b, int n) >> { >> for (int i = 0; i < 3; i += 1) >> a[i] += b[i]; >> } >> ``` >> >> Before this patch: >> >> ``` >> void foo2 (int * restrict a, int * restrict b, int n) >> { >> vector([4,4]) int vect__7.27; >> vector([4,4]) int vect__6.26; >> vector([4,4]) int vect__4.23; >> unsigned long _32; >> >> <bb 2> [local count: 268435456]: >> _32 = MIN_EXPR <3, POLY_INT_CST [4, 4]>; >> vect__4.23_20 = .MASK_LEN_LOAD (a_11(D), 32B, { -1, ... }, _32, 0); >> vect__6.26_15 = .MASK_LEN_LOAD (b_12(D), 32B, { -1, ... }, _32, 0); >> vect__7.27_9 = vect__6.26_15 + vect__4.23_20; >> .MASK_LEN_STORE (a_11(D), 32B, { -1, ... }, _32, 0, vect__7.27_9); [tail >> call] >> return; >> >> } >> ``` >> >> After this patch: >> >> ``` >> void foo2 (int * restrict a, int * restrict b, int n) >> { >> vector([4,4]) int vect__7.27; >> vector([4,4]) int vect__6.26; >> vector([4,4]) int vect__4.23; >> >> <bb 2> [local count: 268435456]: >> vect__4.23_20 = .MASK_LEN_LOAD (a_11(D), 32B, { -1, ... }, 3, 0); >> vect__6.26_15 = .MASK_LEN_LOAD (b_12(D), 32B, { -1, ... }, 3, 0); >> vect__7.27_9 = vect__6.26_15 + vect__4.23_20; >> .MASK_LEN_STORE (a_11(D), 32B, { -1, ... }, 3, 0, vect__7.27_9); [tail >> call] >> return; >> >> } >> ``` >> >> For RISC-V RVV, csrr and branch instructions can be reduced: >> >> Before this patch: >> >> ``` >> foo2: >> csrr a4,vlenb >> srli a4,a4,2 >> li a5,3 >> bleu a5,a4,.L5 >> mv a5,a4 >> .L5: >> vsetvli zero,a5,e32,m1,ta,ma >> ... >> ``` >> >> After this patch. >> >> ``` >> foo2: >> vsetivli zero,3,e32,m1,ta,ma >> ... >> ``` >> >> Best, >> Lehua >> >> gcc/ChangeLog: >> >> * fold-const.cc (can_min_p): New function. >> (poly_int_binop): Try fold MIN_EXPR. > > OK, thanks.
Sorry, just realised that the poly_int_tree_p tests are redundant. The caller has already checked that. Richard > Richard > >> gcc/testsuite/ChangeLog: >> >> * gcc.target/riscv/rvv/autovec/vls/div-1.c: Adjust. >> * gcc.target/riscv/rvv/autovec/vls/shift-3.c: Adjust. >> * gcc.target/riscv/rvv/autovec/fold-min-poly.c: New test. >> >> --- >> gcc/fold-const.cc | 27 +++++++++++++++++++ >> .../riscv/rvv/autovec/fold-min-poly.c | 24 +++++++++++++++++ >> .../gcc.target/riscv/rvv/autovec/vls/div-1.c | 2 +- >> .../riscv/rvv/autovec/vls/shift-3.c | 2 +- >> 4 files changed, 53 insertions(+), 2 deletions(-) >> create mode 100644 >> gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c >> >> diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc >> index 1da498a3152..ba4b6f3f3a3 100644 >> --- a/gcc/fold-const.cc >> +++ b/gcc/fold-const.cc >> @@ -1213,6 +1213,28 @@ wide_int_binop (wide_int &res, >> return true; >> } >> >> +/* Returns true if we know who is smaller or equal, ARG1 or ARG2, and set >> the >> + min value to RES. */ >> +bool >> +can_min_p (const_tree arg1, const_tree arg2, poly_wide_int &res) >> +{ >> + if (!poly_int_tree_p (arg1) || !poly_int_tree_p (arg2)) >> + return false; >> + >> + if (known_le (wi::to_poly_widest (arg1), wi::to_poly_widest (arg2))) >> + { >> + res = wi::to_poly_wide (arg1); >> + return true; >> + } >> + else if (known_le (wi::to_poly_widest (arg2), wi::to_poly_widest (arg1))) >> + { >> + res = wi::to_poly_wide (arg2); >> + return true; >> + } >> + >> + return false; >> +} >> + >> /* Combine two poly int's ARG1 and ARG2 under operation CODE to >> produce a new constant in RES. Return FALSE if we don't know how >> to evaluate CODE at compile-time. */ >> @@ -1261,6 +1283,11 @@ poly_int_binop (poly_wide_int &res, enum tree_code >> code, >> return false; >> break; >> >> + case MIN_EXPR: >> + if (!can_min_p (arg1, arg2, res)) >> + return false; >> + break; >> + >> default: >> return false; >> } >> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c >> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c >> new file mode 100644 >> index 00000000000..de4c472c76e >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c >> @@ -0,0 +1,24 @@ >> +/* { dg-do compile } */ >> +/* { dg-options " -march=rv64gcv_zvl128b -mabi=lp64d -O3 --param >> riscv-autovec-preference=scalable --param riscv-autovec-lmul=m1 >> -fno-vect-cost-model" } */ >> + >> +void foo1 (int* restrict a, int* restrict b, int n) >> +{ >> + for (int i = 0; i < 4; i += 1) >> + a[i] += b[i]; >> +} >> + >> +void foo2 (int* restrict a, int* restrict b, int n) >> +{ >> + for (int i = 0; i < 3; i += 1) >> + a[i] += b[i]; >> +} >> + >> +void foo3 (int* restrict a, int* restrict b, int n) >> +{ >> + for (int i = 0; i < 5; i += 1) >> + a[i] += b[i]; >> +} >> + >> +/* { dg-final { scan-assembler-not {\tcsrr\t} } } */ >> +/* { dg-final { scan-assembler {\tvsetivli\tzero,4,e32,m1,t[au],m[au]} } } >> */ >> +/* { dg-final { scan-assembler {\tvsetivli\tzero,3,e32,m1,t[au],m[au]} } } >> */ >> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/div-1.c >> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/div-1.c >> index f3388a86e38..40224c69458 100644 >> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/div-1.c >> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/div-1.c >> @@ -55,4 +55,4 @@ DEF_OP_VV (div, 512, int64_t, /) >> >> /* { dg-final { scan-assembler-times >> {vdivu?\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 42 } } */ >> /* TODO: Ideally, we should make sure there is no "csrr vlenb". However, we >> still have 'csrr vlenb' for some cases since we don't support VLS mode >> conversion which are needed by division. */ >> -/* { dg-final { scan-assembler-times {csrr} 19 } } */ >> +/* { dg-final { scan-assembler-not {csrr} } } */ >> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-3.c >> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-3.c >> index 98822b15657..b34a349949b 100644 >> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-3.c >> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-3.c >> @@ -55,4 +55,4 @@ DEF_OP_VV (shift, 512, int64_t, <<) >> >> /* { dg-final { scan-assembler-times >> {vsll\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 41 } } */ >> /* TODO: Ideally, we should make sure there is no "csrr vlenb". However, we >> still have 'csrr vlenb' for some cases since we don't support VLS mode >> conversion which are needed by division. */ >> -/* { dg-final { scan-assembler-times {csrr} 18 } } */ >> +/* { dg-final { scan-assembler-not {csrr} } } */