LGTM Juzhe-Zhong <juzhe.zh...@rivai.ai> 於 2023年12月14日 週四 11:24 寫道:
> This patch fixes PR111153: > > ble a1,zero,.L8 > addiw a5,a1,-1 > li a4,4 > addi sp,sp,-16 > mv a2,a0 > sext.w a3,a1 > bleu a5,a4,.L9 > srliw a4,a3,2 > slli a4,a4,4 > mv a5,a0 > add a4,a4,a0 > vsetivli zero,4,e32,m1,ta,ma > vmv.v.i v1,0 > vse32.v v1,0(sp) > .L4: > vle32.v v1,0(a5) ---> This loop always processes 4 elements which > is ok for VLEN = 128bits, but wastes a huge amount of computation units when > VLEN > 128bits > vle32.v v2,0(sp) > addi a5,a5,16 > vadd.vv v1,v2,v1 > vse32.v v1,0(sp) > bne a4,a5,.L4 > ld a5,0(sp) > lw a4,0(sp) > andi a1,a1,-4 > srai a5,a5,32 > addw a5,a4,a5 > lw a4,8(sp) > addw a5,a5,a4 > ld a4,8(sp) > srai a4,a4,32 > addw a0,a5,a4 > beq a3,a1,.L15 > .L3: > subw a3,a3,a1 > slli a5,a1,32 > slli a3,a3,32 > srli a3,a3,32 > srli a5,a5,30 > add a2,a2,a5 > vsetvli a5,a3,e8,mf4,tu,mu > vsetvli a4,zero,e32,m1,ta,ma > sub a1,a3,a5 > vmv.v.i v1,0 > vsetvli zero,a3,e32,m1,tu,ma > vle32.v v2,0(a2) > vmv.v.v v1,v2 > bne a3,a5,.L21 > .L7: > vsetvli a4,zero,e32,m1,ta,ma > vmv.s.x v2,zero > vredsum.vs v1,v1,v2 > vmv.x.s a5,v1 > addw a0,a0,a5 > .L15: > addi sp,sp,16 > jr ra > .L21: > slli a5,a5,2 > add a2,a2,a5 > vsetvli zero,a1,e32,m1,tu,ma > vle32.v v2,0(a2) > vadd.vv v1,v1,v2 > j .L7 > .L8: > li a0,0 > ret > .L9: > li a1,0 > li a0,0 > j .L3 > > The root cause is that we were missing an RVV builtin vectorization cost model. > > After this patch: > > ble a1,zero,.L4 > vsetvli a5,zero,e32,m1,ta,ma > vmv.v.i v1,0 > .L3: > vsetvli a5,a1,e32,m1,tu,ma > vle32.v v2,0(a0) > slli a4,a5,2 > sub a1,a1,a5 > add a0,a0,a4 > vadd.vv v1,v2,v1 > bne a1,zero,.L3 > li a5,0 > vsetivli zero,1,e32,m1,ta,ma > vmv.s.x v2,a5 > vsetvli a5,zero,e32,m1,ta,ma > vredsum.vs v1,v1,v2 > vmv.x.s a0,v1 > ret > .L4: > li a0,0 > ret > > PR target/111153 > > gcc/ChangeLog: > > * config/riscv/riscv-protos.h (struct common_vector_cost): New > struct. > (struct scalable_vector_cost): Ditto. > (struct cpu_vector_cost): Ditto. 
> * config/riscv/riscv-vector-costs.cc (costs::add_stmt_cost): Add > RVV builtin vectorization cost > * config/riscv/riscv.cc (struct riscv_tune_param): Ditto. > (get_common_costs): New function. > (riscv_builtin_vectorization_cost): Ditto. > (TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST): New targethook. > > gcc/testsuite/ChangeLog: > > * gcc.dg/vect/costmodel/riscv/rvv/pr111153.c: New test. > > --- > gcc/config/riscv/riscv-protos.h | 76 ++++++++++ > gcc/config/riscv/riscv-vector-costs.cc | 5 +- > gcc/config/riscv/riscv.cc | 143 ++++++++++++++++++ > .../vect/costmodel/riscv/rvv/pr111153.c | 18 +++ > 4 files changed, 239 insertions(+), 3 deletions(-) > create mode 100644 > gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111153.c > > diff --git a/gcc/config/riscv/riscv-protos.h > b/gcc/config/riscv/riscv-protos.h > index 85ab1db2088..7de0b031001 100644 > --- a/gcc/config/riscv/riscv-protos.h > +++ b/gcc/config/riscv/riscv-protos.h > @@ -200,6 +200,82 @@ struct riscv_cpu_info { > > extern const riscv_cpu_info *riscv_find_cpu (const char *); > > +/* Common vector costs in any kind of vectorization (e.g VLA and VLS). */ > +struct common_vector_cost > +{ > + /* Cost of any integer vector operation, excluding the ones handled > + specially below. */ > + const int int_stmt_cost; > + > + /* Cost of any fp vector operation, excluding the ones handled > + specially below. */ > + const int fp_stmt_cost; > + > + /* Gather/scatter vectorization cost. */ > + const int gather_load_cost; > + const int scatter_store_cost; > + > + /* Cost of a vector-to-scalar operation. */ > + const int vec_to_scalar_cost; > + > + /* Cost of a scalar-to-vector operation. */ > + const int scalar_to_vec_cost; > + > + /* Cost of a permute operation. */ > + const int permute_cost; > + > + /* Cost of an aligned vector load. */ > + const int align_load_cost; > + > + /* Cost of an aligned vector store. */ > + const int align_store_cost; > + > + /* Cost of an unaligned vector load. 
*/ > + const int unalign_load_cost; > + > + /* Cost of an unaligned vector store. */ > + const int unalign_store_cost; > +}; > + > +/* scalable vectorization (VLA) specific cost. */ > +struct scalable_vector_cost : common_vector_cost > +{ > + CONSTEXPR scalable_vector_cost (const common_vector_cost &base) > + : common_vector_cost (base) > + {} > + > + /* TODO: We will need more other kinds of vector cost for VLA. > + E.g. fold_left reduction cost, lanes load/store cost, ..., etc. */ > +}; > + > +/* Cost for vector insn classes. */ > +struct cpu_vector_cost > +{ > + /* Cost of any integer scalar operation, excluding load and store. */ > + const int scalar_int_stmt_cost; > + > + /* Cost of any fp scalar operation, excluding load and store. */ > + const int scalar_fp_stmt_cost; > + > + /* Cost of a scalar load. */ > + const int scalar_load_cost; > + > + /* Cost of a scalar store. */ > + const int scalar_store_cost; > + > + /* Cost of a taken branch. */ > + const int cond_taken_branch_cost; > + > + /* Cost of a not-taken branch. */ > + const int cond_not_taken_branch_cost; > + > + /* Cost of an VLS modes operations. */ > + const common_vector_cost *vls; > + > + /* Cost of an VLA modes operations. */ > + const scalable_vector_cost *vla; > +}; > + > /* Routines implemented in riscv-selftests.cc. */ > #if CHECKING_P > namespace selftest { > diff --git a/gcc/config/riscv/riscv-vector-costs.cc > b/gcc/config/riscv/riscv-vector-costs.cc > index 7888cef58fe..e7bc9ed5233 100644 > --- a/gcc/config/riscv/riscv-vector-costs.cc > +++ b/gcc/config/riscv/riscv-vector-costs.cc > @@ -750,9 +750,8 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt > kind, > stmt_vec_info stmt_info, slp_tree, tree vectype, > int misalign, vect_cost_model_location where) > { > - /* TODO: Use default STMT cost model. > - We will support more accurate STMT cost model later. 
*/ > - int stmt_cost = default_builtin_vectorization_cost (kind, vectype, > misalign); > + int stmt_cost > + = targetm.vectorize.builtin_vectorization_cost (kind, vectype, > misalign); > > /* Do one-time initialization based on the vinfo. */ > loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo); > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc > index 69a8a503f30..2dc44244309 100644 > --- a/gcc/config/riscv/riscv.cc > +++ b/gcc/config/riscv/riscv.cc > @@ -281,6 +281,7 @@ struct riscv_tune_param > bool slow_unaligned_access; > bool use_divmod_expansion; > unsigned int fusible_ops; > + const struct cpu_vector_cost *vec_costs; > }; > > > @@ -348,6 +349,50 @@ const enum reg_class > riscv_regno_to_class[FIRST_PSEUDO_REGISTER] = { > VD_REGS, VD_REGS, VD_REGS, VD_REGS, > }; > > +/* Generic costs for VLS vector operations. */ > +static const common_vector_cost generic_vls_vector_cost = { > + 1, /* int_stmt_cost */ > + 1, /* fp_stmt_cost */ > + 1, /* gather_load_cost */ > + 1, /* scatter_store_cost */ > + 1, /* vec_to_scalar_cost */ > + 1, /* scalar_to_vec_cost */ > + 1, /* permute_cost */ > + 3, /* align_load_cost */ > + 3, /* align_store_cost */ > + 3, /* unalign_load_cost */ > + 3, /* unalign_store_cost */ > +}; > + > +/* Generic costs for VLA vector operations. */ > +static const scalable_vector_cost generic_vla_vector_cost = { > + { > + 1, /* int_stmt_cost */ > + 1, /* fp_stmt_cost */ > + 1, /* gather_load_cost */ > + 1, /* scatter_store_cost */ > + 1, /* vec_to_scalar_cost */ > + 1, /* scalar_to_vec_cost */ > + 1, /* permute_cost */ > + 3, /* align_load_cost */ > + 3, /* align_store_cost */ > + 3, /* unalign_load_cost */ > + 3, /* unalign_store_cost */ > + }, > +}; > + > +/* Generic costs for vector insn classes. 
*/ > +static const struct cpu_vector_cost generic_vector_cost = { > + 1, /* scalar_int_stmt_cost */ > + 1, /* scalar_fp_stmt_cost */ > + 1, /* scalar_load_cost */ > + 1, /* scalar_store_cost */ > + 3, /* cond_taken_branch_cost */ > + 1, /* cond_not_taken_branch_cost */ > + &generic_vls_vector_cost, /* vls */ > + &generic_vla_vector_cost, /* vla */ > +}; > + > /* Costs to use when optimizing for rocket. */ > static const struct riscv_tune_param rocket_tune_info = { > {COSTS_N_INSNS (4), COSTS_N_INSNS (5)}, /* fp_add */ > @@ -362,6 +407,7 @@ static const struct riscv_tune_param rocket_tune_info > = { > true, /* > slow_unaligned_access */ > false, /* use_divmod_expansion */ > RISCV_FUSE_NOTHING, /* fusible_ops */ > + NULL, /* vector cost */ > }; > > /* Costs to use when optimizing for Sifive 7 Series. */ > @@ -378,6 +424,7 @@ static const struct riscv_tune_param > sifive_7_tune_info = { > true, /* > slow_unaligned_access */ > false, /* use_divmod_expansion */ > RISCV_FUSE_NOTHING, /* fusible_ops */ > + NULL, /* vector cost */ > }; > > /* Costs to use when optimizing for T-HEAD c906. */ > @@ -394,6 +441,7 @@ static const struct riscv_tune_param > thead_c906_tune_info = { > false, /* slow_unaligned_access */ > false, /* use_divmod_expansion */ > RISCV_FUSE_NOTHING, /* fusible_ops */ > + NULL, /* vector cost */ > }; > > /* Costs to use when optimizing for a generic ooo profile. */ > @@ -410,6 +458,7 @@ static const struct riscv_tune_param > generic_ooo_tune_info = { > false, /* slow_unaligned_access */ > false, /* use_divmod_expansion */ > RISCV_FUSE_NOTHING, /* fusible_ops */ > + &generic_vector_cost, /* vector cost */ > }; > > /* Costs to use when optimizing for size. 
*/ > @@ -426,6 +475,7 @@ static const struct riscv_tune_param > optimize_size_tune_info = { > false, /* slow_unaligned_access */ > false, /* use_divmod_expansion */ > RISCV_FUSE_NOTHING, /* fusible_ops */ > + NULL, /* vector cost */ > }; > > static bool riscv_avoid_shrink_wrapping_separate (); > @@ -10192,6 +10242,95 @@ riscv_frame_pointer_required (void) > return riscv_save_frame_pointer && !crtl->is_leaf; > } > > +/* Return the appropriate common costs for vectors of type VECTYPE. */ > +static const common_vector_cost * > +get_common_costs (tree vectype) > +{ > + const cpu_vector_cost *costs = tune_param->vec_costs; > + gcc_assert (costs); > + > + if (vectype && riscv_v_ext_vls_mode_p (TYPE_MODE (vectype))) > + return costs->vls; > + return costs->vla; > +} > + > +/* Implement targetm.vectorize.builtin_vectorization_cost. */ > + > +static int > +riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, > + tree vectype, int misalign > ATTRIBUTE_UNUSED) > +{ > + unsigned elements; > + const cpu_vector_cost *costs = tune_param->vec_costs; > + bool fp = false; > + > + if (vectype != NULL) > + fp = FLOAT_TYPE_P (vectype); > + > + if (costs != NULL) > + { > + const common_vector_cost *common_costs = get_common_costs (vectype); > + gcc_assert (common_costs != NULL); > + switch (type_of_cost) > + { > + case scalar_stmt: > + return fp ? costs->scalar_fp_stmt_cost : > costs->scalar_int_stmt_cost; > + > + case scalar_load: > + return costs->scalar_load_cost; > + > + case scalar_store: > + return costs->scalar_store_cost; > + > + case vector_stmt: > + return fp ? 
common_costs->fp_stmt_cost : > common_costs->int_stmt_cost; > + > + case vector_load: > + return common_costs->align_load_cost; > + > + case vector_store: > + return common_costs->align_store_cost; > + > + case vec_to_scalar: > + return common_costs->vec_to_scalar_cost; > + > + case scalar_to_vec: > + return common_costs->scalar_to_vec_cost; > + > + case unaligned_load: > + return common_costs->unalign_load_cost; > + case vector_gather_load: > + return common_costs->gather_load_cost; > + > + case unaligned_store: > + return common_costs->unalign_store_cost; > + case vector_scatter_store: > + return common_costs->scatter_store_cost; > + > + case cond_branch_taken: > + return costs->cond_taken_branch_cost; > + > + case cond_branch_not_taken: > + return costs->cond_not_taken_branch_cost; > + > + case vec_perm: > + return common_costs->permute_cost; > + > + case vec_promote_demote: > + return fp ? common_costs->fp_stmt_cost : > common_costs->int_stmt_cost; > + > + case vec_construct: > + elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)); > + return elements / 2 + 1; > + > + default: > + gcc_unreachable (); > + } > + } > + > + return default_builtin_vectorization_cost (type_of_cost, vectype, > misalign); > +} > + > /* Implement targetm.vectorize.create_costs. 
*/ > > static vector_costs * > @@ -10582,6 +10721,10 @@ extract_base_offset_in_addr (rtx mem, rtx *base, > rtx *offset) > #undef TARGET_FRAME_POINTER_REQUIRED > #define TARGET_FRAME_POINTER_REQUIRED riscv_frame_pointer_required > > +#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST > +#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \ > + riscv_builtin_vectorization_cost > + > #undef TARGET_VECTORIZE_CREATE_COSTS > #define TARGET_VECTORIZE_CREATE_COSTS riscv_vectorize_create_costs > > diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111153.c > b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111153.c > new file mode 100644 > index 00000000000..06e08ec5f2e > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111153.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize > -mtune=generic-ooo" } */ > + > +#define DEF_REDUC_PLUS(TYPE) > \ > + TYPE __attribute__ ((noinline, noclone)) > \ > + reduc_plus_##TYPE (TYPE *__restrict a, int n) > \ > + { > \ > + TYPE r = 0; > \ > + for (int i = 0; i < n; ++i) > \ > + r += a[i]; > \ > + return r; > \ > + } > + > +#define TEST_PLUS(T) T (int) > + > +TEST_PLUS (DEF_REDUC_PLUS) > + > +/* { dg-final { scan-assembler-not {vsetivli\s+zero,\s*4} } } */ > -- > 2.36.3 > >