scatter decode overhead into account

Richard Sandiford Wed, 31 Jul 2024 11:18:08 -0700

Tamar Christina <tamar.christ...@arm.com> writes:
> @@ -289,6 +293,12 @@ struct sve_vec_cost : simd_vec_cost
>    const int gather_load_x32_cost;
>    const int gather_load_x64_cost;
>  
> +  /* Additional loop initialization cost of using a gather load instruction. 
>  The x32


Sorry for the trivia, but: long line.

> +     value is for loads of 32-bit elements and the x64 value is for loads of
> +     64-bit elements.  */
> +  const int gather_load_x32_init_cost;
> +  const int gather_load_x64_init_cost;
> +
>    /* The per-element cost of a scatter store.  */
>    const int scatter_store_elt_cost;
>  };
> [...]
> @@ -17291,6 +17295,20 @@ aarch64_vector_costs::add_stmt_cost (int count, 
> vect_cost_for_stmt kind,
>       stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
>                                                       stmt_info, vectype,
>                                                       where, stmt_cost);
> +
> +      /* Check if we've seen an SVE gather/scatter operation and which size. 
>  */
> +      if (kind == scalar_load
> +       && aarch64_sve_mode_p (TYPE_MODE (vectype))
> +       && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
> +     {
> +       const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve;

I think we need to check whether this is nonnull, since not all tuning
targets provide SVE costs.

> +       if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
> +         m_sve_gather_scatter_init_cost
> +           += sve_costs->gather_load_x64_init_cost;
> +       else
> +         m_sve_gather_scatter_init_cost
> +           += sve_costs->gather_load_x32_init_cost;
> +     }
>      }
>  
>    /* Do any SVE-specific adjustments to the cost.  */
> @@ -17676,6 +17694,12 @@ aarch64_vector_costs::finish_cost (const 
> vector_costs *uncast_scalar_costs)
>        m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
>                                            m_costs[vect_body]);
>        m_suggested_unroll_factor = determine_suggested_unroll_factor ();
> +
> +      /* For gather and scatters there's an additional overhead for the first
> +      iteration.  For low count loops they're not beneficial so model the
> +      overhead as loop prologue costs.  */
> +      if (m_sve_gather_scatter_init_cost)
> +     m_costs[vect_prologue] += m_sve_gather_scatter_init_cost;

Might as well make this unconditional now.

LGTM with those changes, but please wait for Kyrill's review too.

Thanks,
Richard

>      }
>  
>    /* Apply the heuristic described above m_stp_sequence_cost.  Prefer
> diff --git a/gcc/config/aarch64/tuning_models/a64fx.h 
> b/gcc/config/aarch64/tuning_models/a64fx.h
> index 
> 6091289d4c3c66f01d7e4dbf97a85c1f8c40bb0b..378a1b3889ee265859786c1ff6525fce2305b615
>  100644
> --- a/gcc/config/aarch64/tuning_models/a64fx.h
> +++ b/gcc/config/aarch64/tuning_models/a64fx.h
> @@ -104,6 +104,8 @@ static const sve_vec_cost a64fx_sve_vector_cost =
>    13, /* fadda_f64_cost  */
>    64, /* gather_load_x32_cost  */
>    32, /* gather_load_x64_cost  */
> +  0, /* gather_load_x32_init_cost  */
> +  0, /* gather_load_x64_init_cost  */
>    1 /* scatter_store_elt_cost  */
>  };
>  
> diff --git a/gcc/config/aarch64/tuning_models/cortexx925.h 
> b/gcc/config/aarch64/tuning_models/cortexx925.h
> index 
> 6cae5b7de5ca7ffad8a0f683e1285039bb55d159..b509cae758419a415d9067ec751ef1e6528eb09a
>  100644
> --- a/gcc/config/aarch64/tuning_models/cortexx925.h
> +++ b/gcc/config/aarch64/tuning_models/cortexx925.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost cortexx925_sve_vector_cost =
>       operation more than a 64-bit gather.  */
>    14, /* gather_load_x32_cost  */
>    12, /* gather_load_x64_cost  */
> +  42, /* gather_load_x32_init_cost  */
> +  24, /* gather_load_x64_init_cost  */
>    1 /* scatter_store_elt_cost  */
>  };
>  
> diff --git a/gcc/config/aarch64/tuning_models/generic.h 
> b/gcc/config/aarch64/tuning_models/generic.h
> index 
> 2b1f68b3052117814161a32f426422736ad6462b..101969bdbb9ccf7eafbd9a1cd6e25f0b584fb261
>  100644
> --- a/gcc/config/aarch64/tuning_models/generic.h
> +++ b/gcc/config/aarch64/tuning_models/generic.h
> @@ -105,6 +105,8 @@ static const sve_vec_cost generic_sve_vector_cost =
>    2, /* fadda_f64_cost  */
>    4, /* gather_load_x32_cost  */
>    2, /* gather_load_x64_cost  */
> +  12, /* gather_load_x32_init_cost  */
> +  4, /* gather_load_x64_init_cost  */
>    1 /* scatter_store_elt_cost  */
>  };
>  
> diff --git a/gcc/config/aarch64/tuning_models/generic_armv8_a.h 
> b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
> index 
> b38b9a8c5cad7d12aa38afdb610a14a25e755010..b5088afe068aa4be7f9dd614cfdd2a51fa96e524
>  100644
> --- a/gcc/config/aarch64/tuning_models/generic_armv8_a.h
> +++ b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
> @@ -106,6 +106,8 @@ static const sve_vec_cost generic_armv8_a_sve_vector_cost 
> =
>    2, /* fadda_f64_cost  */
>    4, /* gather_load_x32_cost  */
>    2, /* gather_load_x64_cost  */
> +  12, /* gather_load_x32_init_cost  */
> +  4, /* gather_load_x64_init_cost  */
>    1 /* scatter_store_elt_cost  */
>  };
>  
> diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h 
> b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> index 
> 7156dbe5787e831bc4343deb7d7b88e9823fc1bc..999985ed40f694f2681779d940bdb282f289b8e3
>  100644
> --- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> +++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> @@ -136,6 +136,8 @@ static const sve_vec_cost generic_armv9_a_sve_vector_cost 
> =
>       operation more than a 64-bit gather.  */
>    14, /* gather_load_x32_cost  */
>    12, /* gather_load_x64_cost  */
> +  42, /* gather_load_x32_init_cost  */
> +  24, /* gather_load_x64_init_cost  */
>    3 /* scatter_store_elt_cost  */
>  };
>  
> diff --git a/gcc/config/aarch64/tuning_models/neoverse512tvb.h 
> b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
> index 
> 825c6a64990b72cda3641737957dc94d75db1509..d2a0b647791de8fca6d7684849d2ab1e9104b045
>  100644
> --- a/gcc/config/aarch64/tuning_models/neoverse512tvb.h
> +++ b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
> @@ -79,6 +79,8 @@ static const sve_vec_cost neoverse512tvb_sve_vector_cost =
>       operation more than a 64-bit gather.  */
>    14, /* gather_load_x32_cost  */
>    12, /* gather_load_x64_cost  */
> +  42, /* gather_load_x32_init_cost  */
> +  24, /* gather_load_x64_init_cost  */
>    3 /* scatter_store_elt_cost  */
>  };
>  
> diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h 
> b/gcc/config/aarch64/tuning_models/neoversen2.h
> index 
> d41e714aa045266ecae62a36ed02dfbfb7597c3a..1a5b66901b5c3fb78f87fee40236957139644585
>  100644
> --- a/gcc/config/aarch64/tuning_models/neoversen2.h
> +++ b/gcc/config/aarch64/tuning_models/neoversen2.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost neoversen2_sve_vector_cost =
>       operation more than a 64-bit gather.  */
>    14, /* gather_load_x32_cost  */
>    12, /* gather_load_x64_cost  */
> +  42, /* gather_load_x32_init_cost  */
> +  24, /* gather_load_x64_init_cost  */
>    3 /* scatter_store_elt_cost  */
>  };
>  
> diff --git a/gcc/config/aarch64/tuning_models/neoversen3.h 
> b/gcc/config/aarch64/tuning_models/neoversen3.h
> index 
> 36f770c0a14fc127c75a60cd37048d46c3b069c7..cfd5060e6b64a0433de41b03cde886da119d9a1c
>  100644
> --- a/gcc/config/aarch64/tuning_models/neoversen3.h
> +++ b/gcc/config/aarch64/tuning_models/neoversen3.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost neoversen3_sve_vector_cost =
>       operation more than a 64-bit gather.  */
>    14, /* gather_load_x32_cost  */
>    12, /* gather_load_x64_cost  */
> +  42, /* gather_load_x32_init_cost  */
> +  24, /* gather_load_x64_init_cost  */
>    1 /* scatter_store_elt_cost  */
>  };
>  
> diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h 
> b/gcc/config/aarch64/tuning_models/neoversev1.h
> index 
> 0fc41ce6a41b3135fa06d2bda1f517fdf4f8dbcf..705ed025730f6683109a4796c6eefa55b437cec9
>  100644
> --- a/gcc/config/aarch64/tuning_models/neoversev1.h
> +++ b/gcc/config/aarch64/tuning_models/neoversev1.h
> @@ -126,6 +126,8 @@ static const sve_vec_cost neoversev1_sve_vector_cost =
>    8, /* fadda_f64_cost  */
>    32, /* gather_load_x32_cost  */
>    16, /* gather_load_x64_cost  */
> +  96, /* gather_load_x32_init_cost  */
> +  32, /* gather_load_x64_init_cost  */
>    3 /* scatter_store_elt_cost  */
>  };
>  
> diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h 
> b/gcc/config/aarch64/tuning_models/neoversev2.h
> index 
> c9c3019dd01a98bc20a76e8455fb59ff24a9ff6c..47908636b0f4c3eadd5848b590fd079c1c04aa10
>  100644
> --- a/gcc/config/aarch64/tuning_models/neoversev2.h
> +++ b/gcc/config/aarch64/tuning_models/neoversev2.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost neoversev2_sve_vector_cost =
>       operation more than a 64-bit gather.  */
>    14, /* gather_load_x32_cost  */
>    12, /* gather_load_x64_cost  */
> +  42, /* gather_load_x32_init_cost  */
> +  24, /* gather_load_x64_init_cost  */
>    3 /* scatter_store_elt_cost  */
>  };
>  
> diff --git a/gcc/config/aarch64/tuning_models/neoversev3.h 
> b/gcc/config/aarch64/tuning_models/neoversev3.h
> index 
> c602d067c7116cf6b081caeae8d36f9969e06d8d..c91e8c829532f9236de0102770e5c6b94e83da9a
>  100644
> --- a/gcc/config/aarch64/tuning_models/neoversev3.h
> +++ b/gcc/config/aarch64/tuning_models/neoversev3.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost neoversev3_sve_vector_cost =
>       operation more than a 64-bit gather.  */
>    14, /* gather_load_x32_cost  */
>    12, /* gather_load_x64_cost  */
> +  42, /* gather_load_x32_init_cost  */
> +  24, /* gather_load_x64_init_cost  */
>    1 /* scatter_store_elt_cost  */
>  };
>  
> diff --git a/gcc/config/aarch64/tuning_models/neoversev3ae.h 
> b/gcc/config/aarch64/tuning_models/neoversev3ae.h
> index 
> 96d7ccf03cd96056d09676d908c63a25e3da6765..61e439326eb6f983abf8574e657cfbb0c2f9bb33
>  100644
> --- a/gcc/config/aarch64/tuning_models/neoversev3ae.h
> +++ b/gcc/config/aarch64/tuning_models/neoversev3ae.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost neoversev3ae_sve_vector_cost =
>       operation more than a 64-bit gather.  */
>    14, /* gather_load_x32_cost  */
>    12, /* gather_load_x64_cost  */
> +  42, /* gather_load_x32_init_cost  */
> +  24, /* gather_load_x64_init_cost  */
>    1 /* scatter_store_elt_cost  */
>  };

Re: [PATCH 8/8]AArch64: take gather/scatter decode overhead into account

Reply via email to