Hi Kyrill,

> >   /* True if the vector body contains a store to a decl and if the
> >      function is known to have a vld1 from the same decl.
> >
> > @@ -17291,6 +17297,17 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
> >        stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
> >                                                        stmt_info, vectype,
> >                                                        where, stmt_cost);
> > +
> > +      /* Check if we've seen an SVE gather/scatter operation and which size.  */
> > +      if (kind == scalar_load
> > +         && aarch64_sve_mode_p (TYPE_MODE (vectype))
> > +         && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
> > +       {
> > +         if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
> > +           m_sve_gather_scatter_x64 = true;
> > +         else
> > +           m_sve_gather_scatter_x32 = true;
> 
> This is a bit academic at this stage but SVE2.1 adds quadword gather loads.
> I know we’re not vectorizing for those yet, but maybe it’s worth explicitly
> checking for 32-bit size and gcc_unreachable () otherwise?

To be honest I'm not quite sure how to detect it.  Is it just
GET_MODE_UNIT_BITSIZE () == 128?
But do we want an assert in the cost model?  Happy to do so, though maybe a
debug print is more appropriate, i.e. report it as a missed optimization?
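
If we did go the assert route, an untested sketch of what I'd have in mind
(written against the additive version further below, so it reuses
m_sve_gather_scatter_init_cost and sve_costs from that hunk; the bits local is
just for illustration) would be:

      unsigned int bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
      if (bits == 64)
        m_sve_gather_scatter_init_cost
          += sve_costs->gather_load_x64_init_cost;
      else if (bits == 32)
        m_sve_gather_scatter_init_cost
          += sve_costs->gather_load_x32_init_cost;
      else
        /* 128-bit (SVE2.1 quadword) gather elements aren't costed yet.  */
        gcc_unreachable ();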

> 
> 
> > +       }
> >     }
> >
> >   /* Do any SVE-specific adjustments to the cost.  */
> > @@ -17676,6 +17693,18 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
> >       m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
> >                                             m_costs[vect_body]);
> >       m_suggested_unroll_factor = determine_suggested_unroll_factor ();
> > +
> > +      /* For gather and scatters there's an additional overhead for the first
> > +        iteration.  For low count loops they're not beneficial so model the
> > +        overhead as loop prologue costs.  */
> > +      if (m_sve_gather_scatter_x32 || m_sve_gather_scatter_x64)
> > +       {
> > +         const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve;
> > +         if (m_sve_gather_scatter_x32)
> > +           m_costs[vect_prologue] += sve_costs->gather_load_x32_init_cost;
> > +         else
> > +           m_costs[vect_prologue] += sve_costs->gather_load_x64_init_cost;
> 
> Shouldn’t this not be an else but rather:
> if (m_sve_gather_scatter_x64)
>    m_costs[vect_prologue] += sve_costs->gather_load_x64_init_cost;
> 
> In case the loop has both 32-bit and 64-bit gather/scatter?
> 

This was an interesting comment.  After some discussion and more benchmarking
we've changed it to an additive cost, accumulated per statement, so a loop with
both 32-bit and 64-bit gathers now pays both costs (see the updated
add_stmt_cost hunk below).

> 
> > +       }
> >     }
> >
> >   /* Apply the heuristic described above m_stp_sequence_cost.  Prefer
> > diff --git a/gcc/config/aarch64/tuning_models/a64fx.h b/gcc/config/aarch64/tuning_models/a64fx.h
> > index 6091289d4c3c66f01d7e4dbf97a85c1f8c40bb0b..378a1b3889ee265859786c1ff6525fce2305b615 100644
> > --- a/gcc/config/aarch64/tuning_models/a64fx.h
> > +++ b/gcc/config/aarch64/tuning_models/a64fx.h
> > @@ -104,6 +104,8 @@ static const sve_vec_cost a64fx_sve_vector_cost =
> >   13, /* fadda_f64_cost  */
> >   64, /* gather_load_x32_cost  */
> >   32, /* gather_load_x64_cost  */
> > +  0, /* gather_load_x32_init_cost  */
> > +  0, /* gather_load_x64_init_cost  */
> >   1 /* scatter_store_elt_cost  */
> > };
> >
> > diff --git a/gcc/config/aarch64/tuning_models/cortexx925.h b/gcc/config/aarch64/tuning_models/cortexx925.h
> > index fb95e87526985b02410d54a5a3ec8539c1b0ba6d..c4206018a3ff707f89ff3300700ec7dc2a5bc6b0 100644
> > --- a/gcc/config/aarch64/tuning_models/cortexx925.h
> > +++ b/gcc/config/aarch64/tuning_models/cortexx925.h
> > @@ -135,6 +135,8 @@ static const sve_vec_cost cortexx925_sve_vector_cost =
> >      operation more than a 64-bit gather.  */
> >   14, /* gather_load_x32_cost  */
> >   12, /* gather_load_x64_cost  */
> > +  42, /* gather_load_x32_init_cost  */
> > +  24, /* gather_load_x64_init_cost  */
> 
> 
> Can you comment on how these numbers are derived?

They were derived essentially from benchmarking.  I did a bunch of runs over
various cores to determine at which iteration count gathers/scatters become
profitable.  From that, as you can probably tell, the init costs are a multiple
of the per-operation cost for the specific core.

This is because that cost already takes into account things like VL differences.
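
Concretely, for most of the tunings below the x32 init cost is 3x the
per-element x32 gather cost (e.g. 42 = 3 * 14) and the x64 init cost is 2x the
x64 gather cost (24 = 2 * 12); the same factors give 96/32 for neoversev1 and
12/4 for generic.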

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * config/aarch64/aarch64-protos.h (struct sve_vec_cost): Add
        gather_load_x32_init_cost and gather_load_x64_init_cost.
        * config/aarch64/aarch64.cc (aarch64_vector_costs): Add
        m_sve_gather_scatter_init_cost.
        (aarch64_vector_costs::add_stmt_cost): Use them.
        (aarch64_vector_costs::finish_cost): Likewise.
        * config/aarch64/tuning_models/a64fx.h: Update.
        * config/aarch64/tuning_models/cortexx925.h: Update.
        * config/aarch64/tuning_models/generic.h: Update.
        * config/aarch64/tuning_models/generic_armv8_a.h: Update.
        * config/aarch64/tuning_models/generic_armv9_a.h: Update.
        * config/aarch64/tuning_models/neoverse512tvb.h: Update.
        * config/aarch64/tuning_models/neoversen2.h: Update.
        * config/aarch64/tuning_models/neoversen3.h: Update.
        * config/aarch64/tuning_models/neoversev1.h: Update.
        * config/aarch64/tuning_models/neoversev2.h: Update.
        * config/aarch64/tuning_models/neoversev3.h: Update.
        * config/aarch64/tuning_models/neoversev3ae.h: Update.

-- inline copy of patch --

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 42639e9efcf1e0f9362f759ae63a31b8eeb0d581..16eb8edab4d9fdfc6e3672c56ef5c9f6962d0c0b 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -262,6 +262,8 @@ struct sve_vec_cost : simd_vec_cost
                          unsigned int fadda_f64_cost,
                          unsigned int gather_load_x32_cost,
                          unsigned int gather_load_x64_cost,
+                         unsigned int gather_load_x32_init_cost,
+                         unsigned int gather_load_x64_init_cost,
                          unsigned int scatter_store_elt_cost)
     : simd_vec_cost (base),
       clast_cost (clast_cost),
@@ -270,6 +272,8 @@ struct sve_vec_cost : simd_vec_cost
       fadda_f64_cost (fadda_f64_cost),
       gather_load_x32_cost (gather_load_x32_cost),
       gather_load_x64_cost (gather_load_x64_cost),
+      gather_load_x32_init_cost (gather_load_x32_init_cost),
+      gather_load_x64_init_cost (gather_load_x64_init_cost),
       scatter_store_elt_cost (scatter_store_elt_cost)
   {}
 
@@ -289,6 +293,12 @@ struct sve_vec_cost : simd_vec_cost
   const int gather_load_x32_cost;
   const int gather_load_x64_cost;
 
+  /* Additional loop initialization cost of using a gather load instruction.  The x32
+     value is for loads of 32-bit elements and the x64 value is for loads of
+     64-bit elements.  */
+  const int gather_load_x32_init_cost;
+  const int gather_load_x64_init_cost;
+
   /* The per-element cost of a scatter store.  */
   const int scatter_store_elt_cost;
 };
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index eafa377cb095f49408d8a926fb49ce13e2155ba2..da2feb54ddad9b39db92e0a9ec7c4e40cfa3e4e2 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16227,6 +16227,10 @@ private:
      supported by Advanced SIMD and SVE2.  */
   bool m_has_avg = false;
 
+  /* Additional initialization costs for using gather or scatter operation in
+     the current loop.  */
+  unsigned int m_sve_gather_scatter_init_cost = 0;
+
   /* True if the vector body contains a store to a decl and if the
      function is known to have a vld1 from the same decl.
 
@@ -17291,6 +17295,20 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
        stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
                                                        stmt_info, vectype,
                                                        where, stmt_cost);
+
+      /* Check if we've seen an SVE gather/scatter operation and which size.  */
+      if (kind == scalar_load
+         && aarch64_sve_mode_p (TYPE_MODE (vectype))
+         && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
+       {
+         const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve;
+         if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
+           m_sve_gather_scatter_init_cost
+             += sve_costs->gather_load_x64_init_cost;
+         else
+           m_sve_gather_scatter_init_cost
+             += sve_costs->gather_load_x32_init_cost;
+       }
     }
 
   /* Do any SVE-specific adjustments to the cost.  */
@@ -17676,6 +17694,12 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
       m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
                                             m_costs[vect_body]);
       m_suggested_unroll_factor = determine_suggested_unroll_factor ();
+
+      /* For gather and scatters there's an additional overhead for the first
+        iteration.  For low count loops they're not beneficial so model the
+        overhead as loop prologue costs.  */
+      if (m_sve_gather_scatter_init_cost)
+       m_costs[vect_prologue] += m_sve_gather_scatter_init_cost;
     }
 
   /* Apply the heuristic described above m_stp_sequence_cost.  Prefer
diff --git a/gcc/config/aarch64/tuning_models/a64fx.h b/gcc/config/aarch64/tuning_models/a64fx.h
index 6091289d4c3c66f01d7e4dbf97a85c1f8c40bb0b..378a1b3889ee265859786c1ff6525fce2305b615 100644
--- a/gcc/config/aarch64/tuning_models/a64fx.h
+++ b/gcc/config/aarch64/tuning_models/a64fx.h
@@ -104,6 +104,8 @@ static const sve_vec_cost a64fx_sve_vector_cost =
   13, /* fadda_f64_cost  */
   64, /* gather_load_x32_cost  */
   32, /* gather_load_x64_cost  */
+  0, /* gather_load_x32_init_cost  */
+  0, /* gather_load_x64_init_cost  */
   1 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/cortexx925.h b/gcc/config/aarch64/tuning_models/cortexx925.h
index 6cae5b7de5ca7ffad8a0f683e1285039bb55d159..b509cae758419a415d9067ec751ef1e6528eb09a 100644
--- a/gcc/config/aarch64/tuning_models/cortexx925.h
+++ b/gcc/config/aarch64/tuning_models/cortexx925.h
@@ -135,6 +135,8 @@ static const sve_vec_cost cortexx925_sve_vector_cost =
      operation more than a 64-bit gather.  */
   14, /* gather_load_x32_cost  */
   12, /* gather_load_x64_cost  */
+  42, /* gather_load_x32_init_cost  */
+  24, /* gather_load_x64_init_cost  */
   1 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/generic.h b/gcc/config/aarch64/tuning_models/generic.h
index 2b1f68b3052117814161a32f426422736ad6462b..101969bdbb9ccf7eafbd9a1cd6e25f0b584fb261 100644
--- a/gcc/config/aarch64/tuning_models/generic.h
+++ b/gcc/config/aarch64/tuning_models/generic.h
@@ -105,6 +105,8 @@ static const sve_vec_cost generic_sve_vector_cost =
   2, /* fadda_f64_cost  */
   4, /* gather_load_x32_cost  */
   2, /* gather_load_x64_cost  */
+  12, /* gather_load_x32_init_cost  */
+  4, /* gather_load_x64_init_cost  */
   1 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/generic_armv8_a.h b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
index b38b9a8c5cad7d12aa38afdb610a14a25e755010..b5088afe068aa4be7f9dd614cfdd2a51fa96e524 100644
--- a/gcc/config/aarch64/tuning_models/generic_armv8_a.h
+++ b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
@@ -106,6 +106,8 @@ static const sve_vec_cost generic_armv8_a_sve_vector_cost =
   2, /* fadda_f64_cost  */
   4, /* gather_load_x32_cost  */
   2, /* gather_load_x64_cost  */
+  12, /* gather_load_x32_init_cost  */
+  4, /* gather_load_x64_init_cost  */
   1 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
index 7156dbe5787e831bc4343deb7d7b88e9823fc1bc..999985ed40f694f2681779d940bdb282f289b8e3 100644
--- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h
+++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
@@ -136,6 +136,8 @@ static const sve_vec_cost generic_armv9_a_sve_vector_cost =
      operation more than a 64-bit gather.  */
   14, /* gather_load_x32_cost  */
   12, /* gather_load_x64_cost  */
+  42, /* gather_load_x32_init_cost  */
+  24, /* gather_load_x64_init_cost  */
   3 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/neoverse512tvb.h b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
index 825c6a64990b72cda3641737957dc94d75db1509..d2a0b647791de8fca6d7684849d2ab1e9104b045 100644
--- a/gcc/config/aarch64/tuning_models/neoverse512tvb.h
+++ b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
@@ -79,6 +79,8 @@ static const sve_vec_cost neoverse512tvb_sve_vector_cost =
      operation more than a 64-bit gather.  */
   14, /* gather_load_x32_cost  */
   12, /* gather_load_x64_cost  */
+  42, /* gather_load_x32_init_cost  */
+  24, /* gather_load_x64_init_cost  */
   3 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h
index d41e714aa045266ecae62a36ed02dfbfb7597c3a..1a5b66901b5c3fb78f87fee40236957139644585 100644
--- a/gcc/config/aarch64/tuning_models/neoversen2.h
+++ b/gcc/config/aarch64/tuning_models/neoversen2.h
@@ -135,6 +135,8 @@ static const sve_vec_cost neoversen2_sve_vector_cost =
      operation more than a 64-bit gather.  */
   14, /* gather_load_x32_cost  */
   12, /* gather_load_x64_cost  */
+  42, /* gather_load_x32_init_cost  */
+  24, /* gather_load_x64_init_cost  */
   3 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/neoversen3.h b/gcc/config/aarch64/tuning_models/neoversen3.h
index 36f770c0a14fc127c75a60cd37048d46c3b069c7..cfd5060e6b64a0433de41b03cde886da119d9a1c 100644
--- a/gcc/config/aarch64/tuning_models/neoversen3.h
+++ b/gcc/config/aarch64/tuning_models/neoversen3.h
@@ -135,6 +135,8 @@ static const sve_vec_cost neoversen3_sve_vector_cost =
      operation more than a 64-bit gather.  */
   14, /* gather_load_x32_cost  */
   12, /* gather_load_x64_cost  */
+  42, /* gather_load_x32_init_cost  */
+  24, /* gather_load_x64_init_cost  */
   1 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h
index 0fc41ce6a41b3135fa06d2bda1f517fdf4f8dbcf..705ed025730f6683109a4796c6eefa55b437cec9 100644
--- a/gcc/config/aarch64/tuning_models/neoversev1.h
+++ b/gcc/config/aarch64/tuning_models/neoversev1.h
@@ -126,6 +126,8 @@ static const sve_vec_cost neoversev1_sve_vector_cost =
   8, /* fadda_f64_cost  */
   32, /* gather_load_x32_cost  */
   16, /* gather_load_x64_cost  */
+  96, /* gather_load_x32_init_cost  */
+  32, /* gather_load_x64_init_cost  */
   3 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
index c9c3019dd01a98bc20a76e8455fb59ff24a9ff6c..47908636b0f4c3eadd5848b590fd079c1c04aa10 100644
--- a/gcc/config/aarch64/tuning_models/neoversev2.h
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -135,6 +135,8 @@ static const sve_vec_cost neoversev2_sve_vector_cost =
      operation more than a 64-bit gather.  */
   14, /* gather_load_x32_cost  */
   12, /* gather_load_x64_cost  */
+  42, /* gather_load_x32_init_cost  */
+  24, /* gather_load_x64_init_cost  */
   3 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/neoversev3.h b/gcc/config/aarch64/tuning_models/neoversev3.h
index c602d067c7116cf6b081caeae8d36f9969e06d8d..c91e8c829532f9236de0102770e5c6b94e83da9a 100644
--- a/gcc/config/aarch64/tuning_models/neoversev3.h
+++ b/gcc/config/aarch64/tuning_models/neoversev3.h
@@ -135,6 +135,8 @@ static const sve_vec_cost neoversev3_sve_vector_cost =
      operation more than a 64-bit gather.  */
   14, /* gather_load_x32_cost  */
   12, /* gather_load_x64_cost  */
+  42, /* gather_load_x32_init_cost  */
+  24, /* gather_load_x64_init_cost  */
   1 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/neoversev3ae.h b/gcc/config/aarch64/tuning_models/neoversev3ae.h
index 96d7ccf03cd96056d09676d908c63a25e3da6765..61e439326eb6f983abf8574e657cfbb0c2f9bb33 100644
--- a/gcc/config/aarch64/tuning_models/neoversev3ae.h
+++ b/gcc/config/aarch64/tuning_models/neoversev3ae.h
@@ -135,6 +135,8 @@ static const sve_vec_cost neoversev3ae_sve_vector_cost =
      operation more than a 64-bit gather.  */
   14, /* gather_load_x32_cost  */
   12, /* gather_load_x64_cost  */
+  42, /* gather_load_x32_init_cost  */
+  24, /* gather_load_x64_init_cost  */
   1 /* scatter_store_elt_cost  */
 };
