Richard Sandiford <richard.sandif...@arm.com> writes:
> Lehua Ding <lehua.d...@rivai.ai> writes:
>> Hi,
>>
>> This patch adds support for folding `MIN (poly, poly)` to
>> a constant when the result is known at compile time.  Consider the following C Code:
>>
>> ```
>> void foo2 (int* restrict a, int* restrict b, int n)
>> {
>>     for (int i = 0; i < 3; i += 1)
>>       a[i] += b[i];
>> }
>> ```
>>
>> Before this patch:
>>
>> ```
>> void foo2 (int * restrict a, int * restrict b, int n)
>> {
>>   vector([4,4]) int vect__7.27;
>>   vector([4,4]) int vect__6.26;
>>   vector([4,4]) int vect__4.23;
>>   unsigned long _32;
>>
>>   <bb 2> [local count: 268435456]:
>>   _32 = MIN_EXPR <3, POLY_INT_CST [4, 4]>;
>>   vect__4.23_20 = .MASK_LEN_LOAD (a_11(D), 32B, { -1, ... }, _32, 0);
>>   vect__6.26_15 = .MASK_LEN_LOAD (b_12(D), 32B, { -1, ... }, _32, 0);
>>   vect__7.27_9 = vect__6.26_15 + vect__4.23_20;
>>   .MASK_LEN_STORE (a_11(D), 32B, { -1, ... }, _32, 0, vect__7.27_9); [tail 
>> call]
>>   return;
>>
>> }
>> ```
>>
>> After this patch:
>>
>> ```
>> void foo2 (int * restrict a, int * restrict b, int n)
>> {
>>   vector([4,4]) int vect__7.27;
>>   vector([4,4]) int vect__6.26;
>>   vector([4,4]) int vect__4.23;
>>
>>   <bb 2> [local count: 268435456]:
>>   vect__4.23_20 = .MASK_LEN_LOAD (a_11(D), 32B, { -1, ... }, 3, 0);
>>   vect__6.26_15 = .MASK_LEN_LOAD (b_12(D), 32B, { -1, ... }, 3, 0);
>>   vect__7.27_9 = vect__6.26_15 + vect__4.23_20;
>>   .MASK_LEN_STORE (a_11(D), 32B, { -1, ... }, 3, 0, vect__7.27_9); [tail 
>> call]
>>   return;
>>
>> }
>> ```
>>
>> For RISC-V RVV, csrr and branch instructions can be reduced:
>>
>> Before this patch:
>>
>> ```
>> foo2:
>>         csrr    a4,vlenb
>>         srli    a4,a4,2
>>         li      a5,3
>>         bleu    a5,a4,.L5
>>         mv      a5,a4
>> .L5:
>>         vsetvli zero,a5,e32,m1,ta,ma
>>         ...
>> ```
>>
>> After this patch:
>>
>> ```
>> foo2:
>>      vsetivli        zero,3,e32,m1,ta,ma
>>         ...
>> ```
>>
>> Best,
>> Lehua
>>
>> gcc/ChangeLog:
>>
>>      * fold-const.cc (can_min_p): New function.
>>      (poly_int_binop): Try fold MIN_EXPR.
>
> OK, thanks.

Sorry, just realised that the poly_int_tree_p tests are redundant.
The caller has already checked that.

Richard

> Richard
>
>> gcc/testsuite/ChangeLog:
>>
>>      * gcc.target/riscv/rvv/autovec/vls/div-1.c: Adjust.
>>      * gcc.target/riscv/rvv/autovec/vls/shift-3.c: Adjust.
>>      * gcc.target/riscv/rvv/autovec/fold-min-poly.c: New test.
>>
>> ---
>>  gcc/fold-const.cc                             | 27 +++++++++++++++++++
>>  .../riscv/rvv/autovec/fold-min-poly.c         | 24 +++++++++++++++++
>>  .../gcc.target/riscv/rvv/autovec/vls/div-1.c  |  2 +-
>>  .../riscv/rvv/autovec/vls/shift-3.c           |  2 +-
>>  4 files changed, 53 insertions(+), 2 deletions(-)
>>  create mode 100644 
>> gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c
>>
>> diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
>> index 1da498a3152..ba4b6f3f3a3 100644
>> --- a/gcc/fold-const.cc
>> +++ b/gcc/fold-const.cc
>> @@ -1213,6 +1213,28 @@ wide_int_binop (wide_int &res,
>>    return true;
>>  }
>>  
>> +/* Returns true if we know which of ARG1 and ARG2 is smaller or equal, and
>> +   sets the min value to RES.  */
>> +bool
>> +can_min_p (const_tree arg1, const_tree arg2, poly_wide_int &res)
>> +{
>> +  if (!poly_int_tree_p (arg1) || !poly_int_tree_p (arg2))
>> +    return false;
>> +
>> +  if (known_le (wi::to_poly_widest (arg1), wi::to_poly_widest (arg2)))
>> +    {
>> +      res = wi::to_poly_wide (arg1);
>> +      return true;
>> +    }
>> +  else if (known_le (wi::to_poly_widest (arg2), wi::to_poly_widest (arg1)))
>> +    {
>> +      res = wi::to_poly_wide (arg2);
>> +      return true;
>> +    }
>> +
>> +  return false;
>> +}
>> +
>>  /* Combine two poly int's ARG1 and ARG2 under operation CODE to
>>     produce a new constant in RES.  Return FALSE if we don't know how
>>     to evaluate CODE at compile-time.  */
>> @@ -1261,6 +1283,11 @@ poly_int_binop (poly_wide_int &res, enum tree_code 
>> code,
>>      return false;
>>        break;
>>  
>> +    case MIN_EXPR:
>> +      if (!can_min_p (arg1, arg2, res))
>> +    return false;
>> +      break;
>> +
>>      default:
>>        return false;
>>      }
>> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c 
>> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c
>> new file mode 100644
>> index 00000000000..de4c472c76e
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c
>> @@ -0,0 +1,24 @@
>> +/* { dg-do compile } */
>> +/* { dg-options " -march=rv64gcv_zvl128b -mabi=lp64d -O3 --param 
>> riscv-autovec-preference=scalable --param riscv-autovec-lmul=m1 
>> -fno-vect-cost-model" } */
>> +
>> +void foo1 (int* restrict a, int* restrict b, int n)
>> +{
>> +    for (int i = 0; i < 4; i += 1)
>> +      a[i] += b[i];
>> +}
>> +
>> +void foo2 (int* restrict a, int* restrict b, int n)
>> +{
>> +    for (int i = 0; i < 3; i += 1)
>> +      a[i] += b[i];
>> +}
>> +
>> +void foo3 (int* restrict a, int* restrict b, int n)
>> +{
>> +    for (int i = 0; i < 5; i += 1)
>> +      a[i] += b[i];
>> +}
>> +
>> +/* { dg-final { scan-assembler-not {\tcsrr\t} } } */
>> +/* { dg-final { scan-assembler {\tvsetivli\tzero,4,e32,m1,t[au],m[au]} } } 
>> */
>> +/* { dg-final { scan-assembler {\tvsetivli\tzero,3,e32,m1,t[au],m[au]} } } 
>> */
>> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/div-1.c 
>> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/div-1.c
>> index f3388a86e38..40224c69458 100644
>> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/div-1.c
>> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/div-1.c
>> @@ -55,4 +55,4 @@ DEF_OP_VV (div, 512, int64_t, /)
>>  
>>  /* { dg-final { scan-assembler-times 
>> {vdivu?\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 42 } } */
>>  /* TODO: Ideally, we should make sure there is no "csrr vlenb". However, we 
>> still have 'csrr vlenb' for some cases since we don't support VLS mode 
>> conversion which are needed by division.  */
>> -/* { dg-final { scan-assembler-times {csrr} 19 } } */
>> +/* { dg-final { scan-assembler-not {csrr} } } */
>> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-3.c 
>> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-3.c
>> index 98822b15657..b34a349949b 100644
>> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-3.c
>> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-3.c
>> @@ -55,4 +55,4 @@ DEF_OP_VV (shift, 512, int64_t, <<)
>>  
>>  /* { dg-final { scan-assembler-times 
>> {vsll\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 41 } } */
>>  /* TODO: Ideally, we should make sure there is no "csrr vlenb". However, we 
>> still have 'csrr vlenb' for some cases since we don't support VLS mode 
>> conversion which are needed by division.  */
>> -/* { dg-final { scan-assembler-times {csrr} 18 } } */
>> +/* { dg-final { scan-assembler-not {csrr} } } */

Reply via email to