On Wed, Jan 20, 2016 at 03:22:11PM +0000, James Greenhalgh wrote:
> 
> Hi,
> 
> In a number of cases where we try to create vectors we end up spilling to the
> stack and then filling. This is one example distilled from a couple of
> micro-benchmrks where the issue shows up. The reason for the extra cost
> in this case is the unnecessary use of the stack. The patch attempts to
> finesse this by using lane loads or vector inserts to produce the right
> results.
> 
> This patch is mostly Ramana's work, I've just cleaned it up a little.
> 
> This has been in a number of our trees lately, and we haven't seen any
> regressions. I've also bootstrapped and tested it, and run a set of
> benchmarks to show no regressions on Cortex-A57 or Cortex-A53.
> 
> The patch fixes some regressions caused by the more agressive vectorization
> in GCC6, so I'd like to propose it to go in even though we are in Stage 4.
> 
> OK?

*Ping*

I just ran in to this investigating another performance regression. It would
be nice to get fixed.

Thanks,
James


> 
> Thanks,
> James
> 
> ---
> gcc/
> 
> 2016-01-20  James Greenhalgh  <james.greenha...@arm.com>
>           Ramana Radhakrishnan  <ramana.radhakrish...@arm.com>
> 
>       * config/aarch64/aarch64.c (aarch64_expand_vector_init): Refactor,
>       always use lane loads to construct non-constant vectors.
> 
> gcc/testsuite/
> 
> 2016-01-20  James Greenhalgh  <james.greenha...@arm.com>
>           Ramana Radhakrishnan  <ramana.radhakrish...@arm.com>
> 
>       * gcc.target/aarch64/vector_initialization_nostack.c: New.
> 

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 03bc1b9..3787b38 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -10985,28 +10985,37 @@ aarch64_simd_make_constant (rtx vals)
>      return NULL_RTX;
>  }
>  
> +/* Expand a vector initialisation sequence, such that TARGET is
> +   initialised to contain VALS.  */
> +
>  void
>  aarch64_expand_vector_init (rtx target, rtx vals)
>  {
>    machine_mode mode = GET_MODE (target);
>    machine_mode inner_mode = GET_MODE_INNER (mode);
> +  /* The number of vector elements.  */
>    int n_elts = GET_MODE_NUNITS (mode);
> +  /* The number of vector elements which are not constant.  */
>    int n_var = 0;
>    rtx any_const = NULL_RTX;
> +  /* The first element of vals.  */
> +  rtx v0 = XVECEXP (vals, 0, 0);
>    bool all_same = true;
>  
> +  /* Count the number of variable elements to initialise.  */
>    for (int i = 0; i < n_elts; ++i)
>      {
>        rtx x = XVECEXP (vals, 0, i);
> -      if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
> +      if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
>       ++n_var;
>        else
>       any_const = x;
>  
> -      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
> -     all_same = false;
> +      all_same &= rtx_equal_p (x, v0);
>      }
>  
> +  /* No variable elements, hand off to aarch64_simd_make_constant which knows
> +     how best to handle this.  */
>    if (n_var == 0)
>      {
>        rtx constant = aarch64_simd_make_constant (vals);
> @@ -11020,14 +11029,15 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>    /* Splat a single non-constant element if we can.  */
>    if (all_same)
>      {
> -      rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
> +      rtx x = copy_to_mode_reg (inner_mode, v0);
>        aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
>        return;
>      }
>  
> -  /* Half the fields (or less) are non-constant.  Load constant then 
> overwrite
> -     varying fields.  Hope that this is more efficient than using the stack. 
>  */
> -  if (n_var <= n_elts/2)
> +  /* Initialise a vector which is part-variable.  We want to first try
> +     to build those lanes which are constant in the most efficient way we
> +     can.  */
> +  if (n_var != n_elts)
>      {
>        rtx copy = copy_rtx (vals);
>  
> @@ -11054,31 +11064,21 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>         XVECEXP (copy, 0, i) = subst;
>       }
>        aarch64_expand_vector_init (target, copy);
> +    }
>  
> -      /* Insert variables.  */
> -      enum insn_code icode = optab_handler (vec_set_optab, mode);
> -      gcc_assert (icode != CODE_FOR_nothing);
> +  /* Insert the variable lanes directly.  */
>  
> -      for (int i = 0; i < n_elts; i++)
> -     {
> -       rtx x = XVECEXP (vals, 0, i);
> -       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
> -         continue;
> -       x = copy_to_mode_reg (inner_mode, x);
> -       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
> -     }
> -      return;
> -    }
> +  enum insn_code icode = optab_handler (vec_set_optab, mode);
> +  gcc_assert (icode != CODE_FOR_nothing);
>  
> -  /* Construct the vector in memory one field at a time
> -     and load the whole vector.  */
> -  rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
>    for (int i = 0; i < n_elts; i++)
> -    emit_move_insn (adjust_address_nv (mem, inner_mode,
> -                                 i * GET_MODE_SIZE (inner_mode)),
> -                 XVECEXP (vals, 0, i));
> -  emit_move_insn (target, mem);
> -
> +    {
> +      rtx x = XVECEXP (vals, 0, i);
> +      if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
> +     continue;
> +      x = copy_to_mode_reg (inner_mode, x);
> +      emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
> +    }
>  }
>  
>  static unsigned HOST_WIDE_INT
> diff --git a/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c 
> b/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c
> new file mode 100644
> index 0000000..bbad04d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c
> @@ -0,0 +1,53 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ftree-vectorize -fno-vect-cost-model" } */
> +float arr_f[100][100];
> +float
> +f9 (void)
> +{
> +
> +  int i;
> +  float sum = 0;
> +  for (i = 0; i < 100; i++)
> +    sum += arr_f[i][0] * arr_f[0][i];
> +  return sum;
> +
> +}
> +
> +
> +int arr[100][100];
> +int
> +f10 (void)
> +{
> +
> +  int i;
> +  int sum = 0;
> +  for (i = 0; i < 100; i++)
> +    sum += arr[i][0] * arr[0][i];
> +  return sum;
> +
> +}
> +
> +double arr_d[100][100];
> +double
> +f11 (void)
> +{
> +  int i;
> +  double sum = 0;
> +  for (i = 0; i < 100; i++)
> +    sum += arr_d[i][0] * arr_d[0][i];
> +  return sum;
> +}
> +
> +char arr_c[100][100];
> +char
> +f12 (void)
> +{
> +  int i;
> +  char sum = 0;
> +  for (i = 0; i < 100; i++)
> +    sum += arr_c[i][0] * arr_c[0][i];
> +  return sum;
> +}
> +
> +
> +/* { dg-final { scan-assembler-not "sp" } } */

Reply via email to