Tamar Christina <tamar.christ...@arm.com> writes: > Hi All, > > When determining issue rates we currently discount non-constant MLA > accumulators > for Advanced SIMD but don't do it for the latency. > > This means the costs for Advanced SIMD with a constant accumulator are wrong > and > results in us costing SVE and Advanced SIMD the same. This can cauze us to > vectorize with Advanced SIMD instead of SVE in some cases. > > This patch adds the same discount for SVE and Scalar as we do for issue rate. > > My assumption was that on issue rate we reject all scalar constants early > because we take into account the extra instruction to create the constant? > Though I'd have expected this to be in prologue costs. For this reason I > added > an extra parameter to allow me to force the check to at least look for the > multiplication.
I'm not sure that was it. I wish I'd added a comment to say what it was though :( I suspect different parts of this function were written at different times, hence the inconsistency. > This gives a 5% improvement in fotonik3d_r in SPECCPU 2017 on large > Neoverse cores. > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > Ok for master? > > Thanks, > Tamar > > gcc/ChangeLog: > > * config/aarch64/aarch64.cc (aarch64_multiply_add_p): Add param > allow_constants. > (aarch64_adjust_stmt_cost): Use it. > (aarch64_vector_costs::count_ops): Likewise. > (aarch64_vector_costs::add_stmt_cost): Pass vinfo to > aarch64_adjust_stmt_cost. > > --- inline copy of patch -- > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index > 560e5431636ef46c41d56faa0c4e95be78f64b50..76b74b77b3f122a3c972557e2f83b63ba365fea9 > 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -16398,10 +16398,11 @@ aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt > kind, > or multiply-subtract sequence that might be suitable for fusing into a > single instruction. If VEC_FLAGS is zero, analyze the operation as > a scalar one, otherwise analyze it as an operation on vectors with those > - VEC_* flags. */ > + VEC_* flags. When ALLOW_CONSTANTS we'll recognize all accumulators > including > + constant ones. */ > static bool > aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info, > - unsigned int vec_flags) > + unsigned int vec_flags, bool allow_constants) > { > gassign *assign = dyn_cast<gassign *> (stmt_info->stmt); > if (!assign) > @@ -16410,8 +16411,9 @@ aarch64_multiply_add_p (vec_info *vinfo, > stmt_vec_info stmt_info, > if (code != PLUS_EXPR && code != MINUS_EXPR) > return false; > > - if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign)) > - || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign))) > + if (!allow_constants > + && (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign)) > + || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))) > return false; > > for (int i = 1; i < 3; ++i) > @@ -16429,7 +16431,7 @@ aarch64_multiply_add_p (vec_info *vinfo, > stmt_vec_info stmt_info, > if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR) > continue; > > - if (vec_flags & VEC_ADVSIMD) > + if (!allow_constants && (vec_flags & VEC_ADVSIMD)) > { > /* Scalar and SVE code can tie the result to any FMLA input (or none, > although that requires a MOVPRFX for SVE). However, Advanced SIMD > @@ -16441,7 +16443,8 @@ aarch64_multiply_add_p (vec_info *vinfo, > stmt_vec_info stmt_info, > return false; > def_stmt_info = vinfo->lookup_def (rhs); > if (!def_stmt_info > - || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def) > + || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def > + || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def) Do you see vect_constant_defs in practice, or is this just for completeness? I would expect any constants to appear as direct operands. I don't mind keeping it if it's just a belt-and-braces thing though. But rather than add the allow_constants parameter, I think we should just try removing: if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign)) || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign))) return false; so that the detection is the same for throughput and latency. I think: if (vec_flags & VEC_ADVSIMD) { /* Scalar and SVE code can tie the result to any FMLA input (or none, although that requires a MOVPRFX for SVE). However, Advanced SIMD only supports MLA forms, so will require a move if the result cannot be tied to the accumulator. The most important case in which this is true is when the accumulator input is invariant. */ rhs = gimple_op (assign, 3 - i); if (TREE_CODE (rhs) != SSA_NAME) return false; def_stmt_info = vinfo->lookup_def (rhs); if (!def_stmt_info || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def) return false; } will then do the right thing, because the first return false will reject constant accumulators for Advanced SIMD. Thanks, Richard > @@ -16721,8 +16724,9 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, > vect_cost_for_stmt kind, > and which when vectorized would operate on vector type VECTYPE. Add the > cost of any embedded operations. */ > static fractional_cost > -aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, > - tree vectype, fractional_cost stmt_cost) > +aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind, > + stmt_vec_info stmt_info, tree vectype, > + unsigned vec_flags, fractional_cost stmt_cost) > { > if (vectype) > { > @@ -16745,6 +16749,15 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, > stmt_vec_info stmt_info, > break; > } > > + gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info)); > + if (assign && !vect_is_reduction (stmt_info)) > + { > + bool simd_p = vec_flags & VEC_ADVSIMD; > + /* For MLA we need to reduce the cost since MLA is 1 instruction. */ > + if (aarch64_multiply_add_p (vinfo, stmt_info, vec_flags, !simd_p)) > + return 0; > + } > + > if (kind == vector_stmt || kind == vec_to_scalar) > if (tree cmp_type = vect_embedded_comparison_type (stmt_info)) > { > @@ -16795,7 +16808,8 @@ aarch64_vector_costs::count_ops (unsigned int count, > vect_cost_for_stmt kind, > } > > /* Assume that multiply-adds will become a single operation. */ > - if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags)) > + if (stmt_info > + && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags, false)) > return; > > /* Count the basic operation cost associated with KIND. */ > @@ -17041,8 +17055,8 @@ aarch64_vector_costs::add_stmt_cost (int count, > vect_cost_for_stmt kind, > { > /* Account for any extra "embedded" costs that apply additively > to the base cost calculated above. */ > - stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype, > - stmt_cost); > + stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info, > + vectype, m_vec_flags, stmt_cost); > > /* If we're recording a nonzero vector loop body cost for the > innermost loop, also estimate the operations that would need