I think we can let it vectorize as long as we only use VLS modes.
The following patch looks reasonable:

diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 58ec0b9b503..4e351bc066c 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -42,6 +42,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "backend.h"
 #include "tree-data-ref.h"
 #include "tree-ssa-loop-niter.h"
+#include "emit-rtl.h"

 /* This file should be included last.  */
 #include "riscv-vector-costs.h"
@@ -1055,6 +1056,42 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
   return vector_costs::better_main_loop_than_p (other);
 }

+/* Adjust vectorization cost after calling
+   targetm.vectorize.builtin_vectorization_cost.  For some statements, we
+   would like to further fine-tune the cost on top of the
+   targetm.vectorize.builtin_vectorization_cost handling, which doesn't have
+   any information on statement operation codes etc.  */
+
+static unsigned
+adjust_stmt_cost (enum vect_cost_for_stmt kind,
+                 struct _stmt_vec_info *stmt_info, int count, int stmt_cost)
+{
+  gimple *stmt = stmt_info->stmt;
+  switch (kind)
+    {
+      case scalar_to_vec: {
+       stmt_cost *= count;
+       /* Adjust cost by counting the scalar value initialization.  */
+       for (unsigned int i = 1; i < gimple_num_ops (stmt); i++)
+         {
+           tree op = gimple_op (stmt, i);
+           if (poly_int_tree_p (op))
+             {
+               poly_int64 value = tree_to_poly_int64 (op);
+               /* We don't need to count scalar costs if it
+                  is in range of [-16, 15] since we can use
+                  vmv.v.i.  */
+               stmt_cost += riscv_const_insns (gen_int_mode (value, Pmode));
+             }
+         }
+       return stmt_cost;
+      }
+    default:
+      break;
+    }
+  return count * stmt_cost;
+}
+
 unsigned
 costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
                      stmt_vec_info stmt_info, slp_tree, tree vectype,
@@ -1082,9 +1119,12 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
         as one iteration of the VLA loop.  */
       if (where == vect_body && m_unrolled_vls_niters)
        m_unrolled_vls_stmts += count * m_unrolled_vls_niters;
+
+      if (vectype)
+       stmt_cost = adjust_stmt_cost (kind, stmt_info, count, stmt_cost);
     }

-  return record_stmt_cost (stmt_info, where, count * stmt_cost);
+  return record_stmt_cost (stmt_info, where, stmt_cost);
 }

 void
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index df9799d9c5e..a14fb36817a 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -366,7 +366,7 @@ static const common_vector_cost rvv_vls_vector_cost = {
   1, /* gather_load_cost  */
   1, /* scatter_store_cost  */
   1, /* vec_to_scalar_cost  */
-  1, /* scalar_to_vec_cost  */
+  2, /* scalar_to_vec_cost  */
   1, /* permute_cost  */
   1, /* align_load_cost  */
   1, /* align_store_cost  */
@@ -382,7 +382,7 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
     1, /* gather_load_cost  */
     1, /* scatter_store_cost  */
     1, /* vec_to_scalar_cost  */
-    1, /* scalar_to_vec_cost  */
+    2, /* scalar_to_vec_cost  */
     1, /* permute_cost  */
     1, /* align_load_cost  */
     1, /* align_store_cost  */

cost.c:3:5: note: vectorized 1 loops in function.
***** Choosing vector mode V16QI
int main ()
{
  vector(16) short int vect_patt_111.16;
  vector(16) unsigned short vect_patt_109.15;
  vector(16) unsigned short vect_patt_108.14;
  vector(16) unsigned short vect_patt_106.13;
  vector(16) unsigned char vect_vec_iv_.12;
  unsigned char tmp.11;
  unsigned char tmp.10;
  unsigned char D.2767;
  unsigned char a_lsm.8;
  short int b;
  unsigned char _4;
  int _12;
  _Bool _14;
  int _15;
  unsigned char _19(D);
  unsigned char ivtmp_20;
  int _22;
  unsigned char a.4_25;
  unsigned char ivtmp_30;
  unsigned char a.4_256;
  unsigned char ivtmp_258;
  int _259;
  int _260;
  unsigned char _262;
  unsigned char ivtmp_263;
  unsigned char a.4_266;
  unsigned char ivtmp_267;
  vector(16) unsigned char _270;
  short int _278;
  unsigned char ivtmp_279;
  unsigned char ivtmp_280;

  <bb 2> [local count: 118111601]:

  <bb 8> [local count: 118111602]:
  # a.4_25 = PHI <1(2), _4(11)>
  # ivtmp_30 = PHI <18(2), ivtmp_20(11)>
  # vect_vec_iv_.12_269 = PHI <{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }(2), _270(11)>
  # ivtmp_279 = PHI <0(2), ivtmp_280(11)>
  _270 = vect_vec_iv_.12_269 + { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 };
  vect_patt_106.13_271 = (vector(16) unsigned short) vect_vec_iv_.12_269;
  _22 = (int) a.4_25;
  vect_patt_108.14_273 = MIN_EXPR <vect_patt_106.13_271, { 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15 }>;
  vect_patt_109.15_275 = { 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872 } >> vect_patt_108.14_273;
  _12 = 32872 >> _22;
  vect_patt_111.16_276 = VIEW_CONVERT_EXPR<vector(16) short int>(vect_patt_109.15_275);
  b_7 = (short int) _12;
  _4 = a.4_25 + 1;
  ivtmp_20 = ivtmp_30 - 1;
  ivtmp_280 = ivtmp_279 + 1;
  if (ivtmp_280 < 1)
    goto <bb 11>; [0.00%]
  else
    goto <bb 18>; [100.00%]

  <bb 16> [local count: 118111600]:
  # b_5 = PHI <b_261(19)>
  a = 19;
  _14 = b_5 != 0;
  _15 = (int) _14;
  return _15;

  <bb 11> [local count: 4]:
  goto <bb 8>; [100.00%]

  <bb 18> [local count: 118111601]:
  # a.4_266 = PHI <_4(8)>
  # ivtmp_267 = PHI <ivtmp_20(8)>
  # vect_patt_111.16_277 = PHI <vect_patt_111.16_276(8)>
  _278 = BIT_FIELD_REF <vect_patt_111.16_277, 16, 240>;
  b_264 = _278;

  <bb 19> [local count: 477576208]:
  # a.4_256 = PHI <17(18), _262(20)>
  # ivtmp_258 = PHI <2(18), ivtmp_263(20)>
  _259 = (int) a.4_256;
  _260 = 32872 >> _259;
  b_261 = (short int) _260;
  _262 = a.4_256 + 1;
  ivtmp_263 = ivtmp_258 - 1;
  if (ivtmp_263 != 0)
    goto <bb 20>; [75.27%]
  else
    goto <bb 16>; [24.73%]

  <bb 20> [local count: 359464610]:
  goto <bb 19>; [100.00%]

}

Final ASM:

main:
lui a5,%hi(a)
li a4,19
sb a4,%lo(a)(a5)
li a0,0
ret


juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2024-01-11 20:56
To: juzhe.zh...@rivai.ai; Richard Biener
CC: rdapp.gcc; gcc-patches; kito.cheng; Kito.cheng; jeffreyalaw
Subject: Re: [PATCH] RISC-V: Increase scalar_to_vec_cost from 1 to 3
> 32872 needs 2 scalar instructions plus 1 scalar_to_vec cost:
> 
> li a4,-32768
> addiw a4,a4,104
> vmv.v.x v16,a4
> 
> It seems reasonable, but it only fixes the test with -march=rv64gcv_zvl256b
> and still fails with -march=rv64gcv_zvl4096b.
The scalar version also needs both instructions:
 
li a0,32768
addiw a0,a0,104
 
Therefore I don't think we should just add them to the
vectorization costs.  That would only be necessary if we needed
to synthesize a different constant (e.g. if a scalar constant
cannot be used directly in a vector setting).
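 
For reference, both sequences above materialize the same 16-bit
pattern and only differ in sign extension:
 
   32768 + 104 =  32872 = 0x8068
  -32768 + 104 = -32664, whose low 16 bits are likewise 0x8068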
 
Currently, scalar_outside_cost = 0 so we don't model it on the
scalar side either.
 
With scalar_to_vec = 2 we first try RVVMF2QI, vf = 8 at zvl256b:
 
a.4_25 = PHI <1(2), _4(11)> 1 times vector_stmt costs 1 in body
a.4_25 = PHI <1(2), _4(11)> 2 times scalar_to_vec costs 4 in prologue
(unsigned short) a.4_25 1 times vector_stmt costs 1 in body
MIN_EXPR <patt_28, 15> 1 times scalar_to_vec costs 2 in prologue
MIN_EXPR <patt_28, 15> 1 times vector_stmt costs 1 in body
32872 >> patt_26 1 times scalar_to_vec costs 2 in prologue
32872 >> patt_26 1 times vector_stmt costs 1 in body
<unknown> 1 times scalar_stmt costs 1 in prologue
<unknown> 1 times scalar_stmt costs 1 in body
 
  Vector inside of loop cost: 5
  Scalar iteration cost: 1 (shouldn't that be 2? but anyway)
 
So one vector iteration costs 5 right now regardless of
scalar_to_vec because there are 5 vector operations (phi,
promote, min, shift, vsetvl/len adjustment).
The scalar_to_vec costs are added to the prologue because it is
assumed that broadcasts are hoisted out of the loop.
 
Then vectorization is supposed to be profitable if
#iterations = 18 > (body_cost * min_iters)
   + vector_outside_cost - scalar_outside_cost + 1 = 15.
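 
Plugging in the numbers from the dump above (assuming
vector_outside_cost is just the sum of the prologue entries,
4 + 2 + 2 + 1 = 9, and the min_iters factor is 1 here; the exact
computation in vect_estimate_min_profitable_iters has a few more
terms):
 
  body_cost * min_iters + vector_outside_cost - scalar_outside_cost + 1
    = 5 * 1 + 9 - 0 + 1
    = 15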
 
If we really don't want to vectorize, then we can either further
increase the prologue cost or the cost of the body itself.  The
body statements are all vector_stmts, though.  For the prologue
we would need a good argument for increasing scalar_to_vec
beyond, say, 2.
 
> Is it reasonable?  IMHO, a scalar move (vmv.v.x or vfmv.v.f) should be
> more costly than a normal vadd.vv since it transfers data between
> different pipelines/register classes.
 
We want it to be more expensive, yes.  In one of the last
messages I explained how I would model it using either
register_move_cost or using (tune-specific) costs directly.
I would start with scalar_to_vec = 1 and add 1 or 2 depending
on the uarch/tune-specific reg-move costs.
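 
Just to illustrate the idea (a rough sketch, not actual code; the
helper and the tune value it would read are hypothetical), on top of
the adjust_stmt_cost hook from the patch above it could look roughly
like this:
 
/* Hypothetical per-uarch cost of moving a scalar operand from a GPR
   into a vector register, i.e. what a vmv.v.x/vfmv.v.f pays on this
   core.  Would be read from the tune parameters.  */
static int
riscv_gr2vr_move_cost (void)
{
  return 1;  /* placeholder value  */
}
 
/* ...and in adjust_stmt_cost:  */
      case scalar_to_vec:
        /* Base broadcast cost of 1 plus the uarch-specific
           register-move penalty, instead of hard-coding 2 or 3.  */
        stmt_cost = count * (1 + riscv_gr2vr_move_cost ());
        break;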
 
Regards
Robin
 
 
