> -----Original Message----- > From: Tamar Christina <tamar.christ...@arm.com> > Sent: Thursday, August 1, 2024 9:51 AM > To: Richard Sandiford <richard.sandif...@arm.com> > Cc: Kyrylo Tkachov <ktkac...@nvidia.com>; gcc-patches@gcc.gnu.org; nd > <n...@arm.com>; Richard Earnshaw <richard.earns...@arm.com>; Marcus > Shawcroft <marcus.shawcr...@arm.com>; ktkac...@gcc.gnu.org > Subject: RE: [PATCH 8/8]AArch64: take gather/scatter decode overhead into > account > > > -----Original Message----- > > From: Richard Sandiford <richard.sandif...@arm.com> > > Sent: Wednesday, July 31, 2024 7:17 PM > > To: Tamar Christina <tamar.christ...@arm.com> > > Cc: Kyrylo Tkachov <ktkac...@nvidia.com>; gcc-patches@gcc.gnu.org; nd > > <n...@arm.com>; Richard Earnshaw <richard.earns...@arm.com>; Marcus > > Shawcroft <marcus.shawcr...@arm.com>; ktkac...@gcc.gnu.org > > Subject: Re: [PATCH 8/8]AArch64: take gather/scatter decode overhead into > > account > > > > Tamar Christina <tamar.christ...@arm.com> writes: > > > @@ -289,6 +293,12 @@ struct sve_vec_cost : simd_vec_cost > > > const int gather_load_x32_cost; > > > const int gather_load_x64_cost; > > > > > > + /* Additional loop initialization cost of using a gather load > > > instruction. The > x32 > > > > Sorry for the trivia, but: long line. > > Yeah, noticed it after I sent out the patch 😊 > > > > > > + value is for loads of 32-bit elements and the x64 value is for > > > loads of > > > + 64-bit elements. */ > > > + const int gather_load_x32_init_cost; > > > + const int gather_load_x64_init_cost; > > > + > > > /* The per-element cost of a scatter store. */ > > > const int scatter_store_elt_cost; > > > }; > > > [...] > > > @@ -17291,6 +17295,20 @@ aarch64_vector_costs::add_stmt_cost (int > count, > > vect_cost_for_stmt kind, > > > stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind, > > > stmt_info, vectype, > > > where, stmt_cost); > > > + > > > + /* Check if we've seen an SVE gather/scatter operation and which > > > size. */ > > > + if (kind == scalar_load > > > + && aarch64_sve_mode_p (TYPE_MODE (vectype)) > > > + && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == > > VMAT_GATHER_SCATTER) > > > + { > > > + const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve; > > > > I think we need to check whether this is nonnull, since not all tuning > > targets provide SVE costs. > > Will do, but I thought since this was in a block that checked > aarch64_use_new_vector_costs_p () > that the SVE costs were required. What does that predicate mean then?
Ah nevermind, just realized it just means to apply the new vector cost routines, I hadn't realized that we supported new costing for non-SVE models as well. Will add the check. Thanks, Tamar > > Cheers, > Tamar > > > > > + if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64) > > > + m_sve_gather_scatter_init_cost > > > + += sve_costs->gather_load_x64_init_cost; > > > + else > > > + m_sve_gather_scatter_init_cost > > > + += sve_costs->gather_load_x32_init_cost; > > > + } > > > } > > > > > > /* Do any SVE-specific adjustments to the cost. */ > > > @@ -17676,6 +17694,12 @@ aarch64_vector_costs::finish_cost (const > > vector_costs *uncast_scalar_costs) > > > m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs, > > > m_costs[vect_body]); > > > m_suggested_unroll_factor = determine_suggested_unroll_factor (); > > > + > > > + /* For gather and scatters there's an additional overhead for the > > > first > > > + iteration. For low count loops they're not beneficial so model the > > > + overhead as loop prologue costs. */ > > > + if (m_sve_gather_scatter_init_cost) > > > + m_costs[vect_prologue] += m_sve_gather_scatter_init_cost; > > > > Might as well make this unconditional now. > > > > LGTM with those changes, but please wait for Kyrill's review too. > > > > Thanks, > > Richard > > > > > } > > > > > > /* Apply the heuristic described above m_stp_sequence_cost. Prefer > > > diff --git a/gcc/config/aarch64/tuning_models/a64fx.h > > b/gcc/config/aarch64/tuning_models/a64fx.h > > > index > > > 6091289d4c3c66f01d7e4dbf97a85c1f8c40bb0b..378a1b3889ee265859786c1 > > ff6525fce2305b615 100644 > > > --- a/gcc/config/aarch64/tuning_models/a64fx.h > > > +++ b/gcc/config/aarch64/tuning_models/a64fx.h > > > @@ -104,6 +104,8 @@ static const sve_vec_cost a64fx_sve_vector_cost = > > > 13, /* fadda_f64_cost */ > > > 64, /* gather_load_x32_cost */ > > > 32, /* gather_load_x64_cost */ > > > + 0, /* gather_load_x32_init_cost */ > > > + 0, /* gather_load_x64_init_cost */ > > > 1 /* scatter_store_elt_cost */ > > > }; > > > > > > diff --git a/gcc/config/aarch64/tuning_models/cortexx925.h > > b/gcc/config/aarch64/tuning_models/cortexx925.h > > > index > > > 6cae5b7de5ca7ffad8a0f683e1285039bb55d159..b509cae758419a415d9067ec > > 751ef1e6528eb09a 100644 > > > --- a/gcc/config/aarch64/tuning_models/cortexx925.h > > > +++ b/gcc/config/aarch64/tuning_models/cortexx925.h > > > @@ -135,6 +135,8 @@ static const sve_vec_cost cortexx925_sve_vector_cost > = > > > operation more than a 64-bit gather. */ > > > 14, /* gather_load_x32_cost */ > > > 12, /* gather_load_x64_cost */ > > > + 42, /* gather_load_x32_init_cost */ > > > + 24, /* gather_load_x64_init_cost */ > > > 1 /* scatter_store_elt_cost */ > > > }; > > > > > > diff --git a/gcc/config/aarch64/tuning_models/generic.h > > b/gcc/config/aarch64/tuning_models/generic.h > > > index > > > 2b1f68b3052117814161a32f426422736ad6462b..101969bdbb9ccf7eafbd9a1c > > d6e25f0b584fb261 100644 > > > --- a/gcc/config/aarch64/tuning_models/generic.h > > > +++ b/gcc/config/aarch64/tuning_models/generic.h > > > @@ -105,6 +105,8 @@ static const sve_vec_cost generic_sve_vector_cost = > > > 2, /* fadda_f64_cost */ > > > 4, /* gather_load_x32_cost */ > > > 2, /* gather_load_x64_cost */ > > > + 12, /* gather_load_x32_init_cost */ > > > + 4, /* gather_load_x64_init_cost */ > > > 1 /* scatter_store_elt_cost */ > > > }; > > > > > > diff --git a/gcc/config/aarch64/tuning_models/generic_armv8_a.h > > b/gcc/config/aarch64/tuning_models/generic_armv8_a.h > > > index > > > b38b9a8c5cad7d12aa38afdb610a14a25e755010..b5088afe068aa4be7f9dd614 > > cfdd2a51fa96e524 100644 > > > --- a/gcc/config/aarch64/tuning_models/generic_armv8_a.h > > > +++ b/gcc/config/aarch64/tuning_models/generic_armv8_a.h > > > @@ -106,6 +106,8 @@ static const sve_vec_cost > > generic_armv8_a_sve_vector_cost = > > > 2, /* fadda_f64_cost */ > > > 4, /* gather_load_x32_cost */ > > > 2, /* gather_load_x64_cost */ > > > + 12, /* gather_load_x32_init_cost */ > > > + 4, /* gather_load_x64_init_cost */ > > > 1 /* scatter_store_elt_cost */ > > > }; > > > > > > diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h > > b/gcc/config/aarch64/tuning_models/generic_armv9_a.h > > > index > > > 7156dbe5787e831bc4343deb7d7b88e9823fc1bc..999985ed40f694f2681779d > > 940bdb282f289b8e3 100644 > > > --- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h > > > +++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h > > > @@ -136,6 +136,8 @@ static const sve_vec_cost > > generic_armv9_a_sve_vector_cost = > > > operation more than a 64-bit gather. */ > > > 14, /* gather_load_x32_cost */ > > > 12, /* gather_load_x64_cost */ > > > + 42, /* gather_load_x32_init_cost */ > > > + 24, /* gather_load_x64_init_cost */ > > > 3 /* scatter_store_elt_cost */ > > > }; > > > > > > diff --git a/gcc/config/aarch64/tuning_models/neoverse512tvb.h > > b/gcc/config/aarch64/tuning_models/neoverse512tvb.h > > > index > > > 825c6a64990b72cda3641737957dc94d75db1509..d2a0b647791de8fca6d7684 > > 849d2ab1e9104b045 100644 > > > --- a/gcc/config/aarch64/tuning_models/neoverse512tvb.h > > > +++ b/gcc/config/aarch64/tuning_models/neoverse512tvb.h > > > @@ -79,6 +79,8 @@ static const sve_vec_cost > > neoverse512tvb_sve_vector_cost = > > > operation more than a 64-bit gather. */ > > > 14, /* gather_load_x32_cost */ > > > 12, /* gather_load_x64_cost */ > > > + 42, /* gather_load_x32_init_cost */ > > > + 24, /* gather_load_x64_init_cost */ > > > 3 /* scatter_store_elt_cost */ > > > }; > > > > > > diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h > > b/gcc/config/aarch64/tuning_models/neoversen2.h > > > index > > > d41e714aa045266ecae62a36ed02dfbfb7597c3a..1a5b66901b5c3fb78f87fee40 > > 236957139644585 100644 > > > --- a/gcc/config/aarch64/tuning_models/neoversen2.h > > > +++ b/gcc/config/aarch64/tuning_models/neoversen2.h > > > @@ -135,6 +135,8 @@ static const sve_vec_cost neoversen2_sve_vector_cost > = > > > operation more than a 64-bit gather. */ > > > 14, /* gather_load_x32_cost */ > > > 12, /* gather_load_x64_cost */ > > > + 42, /* gather_load_x32_init_cost */ > > > + 24, /* gather_load_x64_init_cost */ > > > 3 /* scatter_store_elt_cost */ > > > }; > > > > > > diff --git a/gcc/config/aarch64/tuning_models/neoversen3.h > > b/gcc/config/aarch64/tuning_models/neoversen3.h > > > index > > > 36f770c0a14fc127c75a60cd37048d46c3b069c7..cfd5060e6b64a0433de41b03c > > de886da119d9a1c 100644 > > > --- a/gcc/config/aarch64/tuning_models/neoversen3.h > > > +++ b/gcc/config/aarch64/tuning_models/neoversen3.h > > > @@ -135,6 +135,8 @@ static const sve_vec_cost neoversen3_sve_vector_cost > = > > > operation more than a 64-bit gather. */ > > > 14, /* gather_load_x32_cost */ > > > 12, /* gather_load_x64_cost */ > > > + 42, /* gather_load_x32_init_cost */ > > > + 24, /* gather_load_x64_init_cost */ > > > 1 /* scatter_store_elt_cost */ > > > }; > > > > > > diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h > > b/gcc/config/aarch64/tuning_models/neoversev1.h > > > index > > > 0fc41ce6a41b3135fa06d2bda1f517fdf4f8dbcf..705ed025730f6683109a4796c6 > > eefa55b437cec9 100644 > > > --- a/gcc/config/aarch64/tuning_models/neoversev1.h > > > +++ b/gcc/config/aarch64/tuning_models/neoversev1.h > > > @@ -126,6 +126,8 @@ static const sve_vec_cost neoversev1_sve_vector_cost > = > > > 8, /* fadda_f64_cost */ > > > 32, /* gather_load_x32_cost */ > > > 16, /* gather_load_x64_cost */ > > > + 96, /* gather_load_x32_init_cost */ > > > + 32, /* gather_load_x64_init_cost */ > > > 3 /* scatter_store_elt_cost */ > > > }; > > > > > > diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h > > b/gcc/config/aarch64/tuning_models/neoversev2.h > > > index > > > c9c3019dd01a98bc20a76e8455fb59ff24a9ff6c..47908636b0f4c3eadd5848b59 > > 0fd079c1c04aa10 100644 > > > --- a/gcc/config/aarch64/tuning_models/neoversev2.h > > > +++ b/gcc/config/aarch64/tuning_models/neoversev2.h > > > @@ -135,6 +135,8 @@ static const sve_vec_cost neoversev2_sve_vector_cost > = > > > operation more than a 64-bit gather. */ > > > 14, /* gather_load_x32_cost */ > > > 12, /* gather_load_x64_cost */ > > > + 42, /* gather_load_x32_init_cost */ > > > + 24, /* gather_load_x64_init_cost */ > > > 3 /* scatter_store_elt_cost */ > > > }; > > > > > > diff --git a/gcc/config/aarch64/tuning_models/neoversev3.h > > b/gcc/config/aarch64/tuning_models/neoversev3.h > > > index > > > c602d067c7116cf6b081caeae8d36f9969e06d8d..c91e8c829532f9236de01027 > > 70e5c6b94e83da9a 100644 > > > --- a/gcc/config/aarch64/tuning_models/neoversev3.h > > > +++ b/gcc/config/aarch64/tuning_models/neoversev3.h > > > @@ -135,6 +135,8 @@ static const sve_vec_cost neoversev3_sve_vector_cost > = > > > operation more than a 64-bit gather. */ > > > 14, /* gather_load_x32_cost */ > > > 12, /* gather_load_x64_cost */ > > > + 42, /* gather_load_x32_init_cost */ > > > + 24, /* gather_load_x64_init_cost */ > > > 1 /* scatter_store_elt_cost */ > > > }; > > > > > > diff --git a/gcc/config/aarch64/tuning_models/neoversev3ae.h > > b/gcc/config/aarch64/tuning_models/neoversev3ae.h > > > index > > > 96d7ccf03cd96056d09676d908c63a25e3da6765..61e439326eb6f983abf8574e > > 657cfbb0c2f9bb33 100644 > > > --- a/gcc/config/aarch64/tuning_models/neoversev3ae.h > > > +++ b/gcc/config/aarch64/tuning_models/neoversev3ae.h > > > @@ -135,6 +135,8 @@ static const sve_vec_cost > > neoversev3ae_sve_vector_cost = > > > operation more than a 64-bit gather. */ > > > 14, /* gather_load_x32_cost */ > > > 12, /* gather_load_x64_cost */ > > > + 42, /* gather_load_x32_init_cost */ > > > + 24, /* gather_load_x64_init_cost */ > > > 1 /* scatter_store_elt_cost */ > > > };