I think we can let it vectorize as long as we purely use VLS modes. The following patch looks reasonable:
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 58ec0b9b503..4e351bc066c 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -42,6 +42,7 @@ along with GCC; see the file COPYING3. If not see #include "backend.h" #include "tree-data-ref.h" #include "tree-ssa-loop-niter.h" +#include "emit-rtl.h" /* This file should be included last. */ #include "riscv-vector-costs.h" @@ -1055,6 +1056,42 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const return vector_costs::better_main_loop_than_p (other); } +/* Adjust vectorization cost after calling + targetm.vectorize.builtin_vectorization_cost. For some statement, we would + like to further fine-grain tweak the cost on top of + targetm.vectorize.builtin_vectorization_cost handling which doesn't have any + information on statement operation codes etc. */ + +static unsigned +adjust_stmt_cost (enum vect_cost_for_stmt kind, + struct _stmt_vec_info *stmt_info, int count, int stmt_cost) +{ + gimple *stmt = stmt_info->stmt; + switch (kind) + { + case scalar_to_vec: { + stmt_cost *= count; + /* Adjust cost by counting the scalar value initialization. */ + for (unsigned int i = 1; i < gimple_num_ops (stmt); i++) + { + tree op = gimple_op (stmt, i); + if (poly_int_tree_p (op)) + { + poly_int64 value = tree_to_poly_int64 (op); + /* We don't need to count scalar costs if it + is in range of [-16, 15] since we can use + vmv.v.i. */ + stmt_cost += riscv_const_insns (gen_int_mode (value, Pmode)); + } + } + return stmt_cost; + } + default: + break; + } + return count * stmt_cost; +} + unsigned costs::add_stmt_cost (int count, vect_cost_for_stmt kind, stmt_vec_info stmt_info, slp_tree, tree vectype, @@ -1082,9 +1119,12 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind, as one iteration of the VLA loop. 
*/ if (where == vect_body && m_unrolled_vls_niters) m_unrolled_vls_stmts += count * m_unrolled_vls_niters; + + if (vectype) + stmt_cost = adjust_stmt_cost (kind, stmt_info, count, stmt_cost); } - return record_stmt_cost (stmt_info, where, count * stmt_cost); + return record_stmt_cost (stmt_info, where, stmt_cost); } void diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index df9799d9c5e..a14fb36817a 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -366,7 +366,7 @@ static const common_vector_cost rvv_vls_vector_cost = { 1, /* gather_load_cost */ 1, /* scatter_store_cost */ 1, /* vec_to_scalar_cost */ - 1, /* scalar_to_vec_cost */ + 2, /* scalar_to_vec_cost */ 1, /* permute_cost */ 1, /* align_load_cost */ 1, /* align_store_cost */ @@ -382,7 +382,7 @@ static const scalable_vector_cost rvv_vla_vector_cost = { 1, /* gather_load_cost */ 1, /* scatter_store_cost */ 1, /* vec_to_scalar_cost */ - 1, /* scalar_to_vec_cost */ + 2, /* scalar_to_vec_cost */ 1, /* permute_cost */ 1, /* align_load_cost */ 1, /* align_store_cost */ cost.c:3:5: note: vectorized 1 loops in function. 
***** Choosing vector mode V16QI int main () { vector(16) short int vect_patt_111.16; vector(16) unsigned short vect_patt_109.15; vector(16) unsigned short vect_patt_108.14; vector(16) unsigned short vect_patt_106.13; vector(16) unsigned char vect_vec_iv_.12; unsigned char tmp.11; unsigned char tmp.10; unsigned char D.2767; unsigned char a_lsm.8; short int b; unsigned char _4; int _12; _Bool _14; int _15; unsigned char _19(D); unsigned char ivtmp_20; int _22; unsigned char a.4_25; unsigned char ivtmp_30; unsigned char a.4_256; unsigned char ivtmp_258; int _259; int _260; unsigned char _262; unsigned char ivtmp_263; unsigned char a.4_266; unsigned char ivtmp_267; vector(16) unsigned char _270; short int _278; unsigned char ivtmp_279; unsigned char ivtmp_280; <bb 2> [local count: 118111601]: <bb 8> [local count: 118111602]: # a.4_25 = PHI <1(2), _4(11)> # ivtmp_30 = PHI <18(2), ivtmp_20(11)> # vect_vec_iv_.12_269 = PHI <{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }(2), _270(11)> # ivtmp_279 = PHI <0(2), ivtmp_280(11)> _270 = vect_vec_iv_.12_269 + { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 }; vect_patt_106.13_271 = (vector(16) unsigned short) vect_vec_iv_.12_269; _22 = (int) a.4_25; vect_patt_108.14_273 = MIN_EXPR <vect_patt_106.13_271, { 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15 }>; vect_patt_109.15_275 = { 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872, 32872 } >> vect_patt_108.14_273; _12 = 32872 >> _22; vect_patt_111.16_276 = VIEW_CONVERT_EXPR<vector(16) short int>(vect_patt_109.15_275); b_7 = (short int) _12; _4 = a.4_25 + 1; ivtmp_20 = ivtmp_30 - 1; ivtmp_280 = ivtmp_279 + 1; if (ivtmp_280 < 1) goto <bb 11>; [0.00%] else goto <bb 18>; [100.00%] <bb 16> [local count: 118111600]: # b_5 = PHI <b_261(19)> a = 19; _14 = b_5 != 0; _15 = (int) _14; return _15; <bb 11> [local count: 4]: goto <bb 8>; [100.00%] <bb 18> [local count: 118111601]: # a.4_266 = PHI 
<_4(8)> # ivtmp_267 = PHI <ivtmp_20(8)> # vect_patt_111.16_277 = PHI <vect_patt_111.16_276(8)> _278 = BIT_FIELD_REF <vect_patt_111.16_277, 16, 240>; b_264 = _278; <bb 19> [local count: 477576208]: # a.4_256 = PHI <17(18), _262(20)> # ivtmp_258 = PHI <2(18), ivtmp_263(20)> _259 = (int) a.4_256; _260 = 32872 >> _259; b_261 = (short int) _260; _262 = a.4_256 + 1; ivtmp_263 = ivtmp_258 - 1; if (ivtmp_263 != 0) goto <bb 20>; [75.27%] else goto <bb 16>; [24.73%] <bb 20> [local count: 359464610]: goto <bb 19>; [100.00%] } Final ASM: main: lui a5,%hi(a) li a4,19 sb a4,%lo(a)(a5) li a0,0 ret juzhe.zh...@rivai.ai From: Robin Dapp Date: 2024-01-11 20:56 To: juzhe.zh...@rivai.ai; Richard Biener CC: rdapp.gcc; gcc-patches; kito.cheng; Kito.cheng; jeffreyalaw Subject: Re: [PATCH] RISC-V: Increase scalar_to_vec_cost from 1 to 3 > 32872 spends 2 scalar instructions + 1 scalar_to_vec cost: > > li a4,-32768 > addiw a4,a4,104 > vmv.v.x v16,a4 > > It seems reasonable but only can fix test with -march=rv64gcv_zvl256b but > failed on -march=rv64gcv_zvl4096b. The scalar version also needs both instructions: li a0,32768 addiw a0,a0,104 Therefore I don't think we should just add them to the vectorization costs. That would only be necessary if we needed to synthesize a different constant (e.g. if a scalar constant cannot be used directly in a vector setting). Currently, scalar_outside_cost = 0 so we don't model it on the scalar side either. 
With scalar_to_vec = 2 we first try RVVMF2QI, vf = 8 at zvl256b: a.4_25 = PHI <1(2), _4(11)> 1 times vector_stmt costs 1 in body a.4_25 = PHI <1(2), _4(11)> 2 times scalar_to_vec costs 4 in prologue (unsigned short) a.4_25 1 times vector_stmt costs 1 in body MIN_EXPR <patt_28, 15> 1 times scalar_to_vec costs 2 in prologue MIN_EXPR <patt_28, 15> 1 times vector_stmt costs 1 in body 32872 >> patt_26 1 times scalar_to_vec costs 2 in prologue 32872 >> patt_26 1 times vector_stmt costs 1 in body <unknown> 1 times scalar_stmt costs 1 in prologue <unknown> 1 times scalar_stmt costs 1 in body Vector inside of loop cost: 5 Scalar iteration cost: 1 (shouldn't that be 2? but anyway) So one vector iteration costs 5 right now regardless of scalar_to_vec because there are 5 vector operations (phi, promote, min, shift, vsetvl/len adjustment). The scalar_to_vec costs are added to the prologue because it is assumed that broadcasts are hoisted out of the loop. Then vectorization is supposed to be profitable if #iterations = 18 > (body_cost * min_iters) + vector_outside_cost - scalar_outside_cost + 1 = 15. If we really don't want to vectorize, then we can either further increase the prologue cost or the body itself. The body statements are all vector_stmts, though. For the prologue we need a good argument why to increase scalar_to_vec to beyond, say 2. > Is it reasonable ? IMHO, scalar move (vmv.v.x or vfmv.v.f) should be > more costly than normal vadd.vv since it is transferring data between > different pipeline/register class. We want it to be more expensive, yes. In one of the last messages I explained how I would model it using either register_move_cost or using (tune-specific) costs directly. I would start with scalar_to_vec = 1 and add 1 or 2 depending on the uarch/tune-specific reg-move costs. Regards Robin