On Wed, Jan 20, 2016 at 03:22:11PM +0000, James Greenhalgh wrote: > > Hi, > > In a number of cases where we try to create vectors we end up spilling to the > stack and then filling. This is one example distilled from a couple of > micro-benchmrks where the issue shows up. The reason for the extra cost > in this case is the unnecessary use of the stack. The patch attempts to > finesse this by using lane loads or vector inserts to produce the right > results. > > This patch is mostly Ramana's work, I've just cleaned it up a little. > > This has been in a number of our trees lately, and we haven't seen any > regressions. I've also bootstrapped and tested it, and run a set of > benchmarks to show no regressions on Cortex-A57 or Cortex-A53. > > The patch fixes some regressions caused by the more agressive vectorization > in GCC6, so I'd like to propose it to go in even though we are in Stage 4. > > OK?
*Ping* I just ran in to this investigating another performance regression. It would be nice to get fixed. Thanks, James > > Thanks, > James > > --- > gcc/ > > 2016-01-20 James Greenhalgh <james.greenha...@arm.com> > Ramana Radhakrishnan <ramana.radhakrish...@arm.com> > > * config/aarch64/aarch64.c (aarch64_expand_vector_init): Refactor, > always use lane loads to construct non-constant vectors. > > gcc/testsuite/ > > 2016-01-20 James Greenhalgh <james.greenha...@arm.com> > Ramana Radhakrishnan <ramana.radhakrish...@arm.com> > > * gcc.target/aarch64/vector_initialization_nostack.c: New. > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index 03bc1b9..3787b38 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -10985,28 +10985,37 @@ aarch64_simd_make_constant (rtx vals) > return NULL_RTX; > } > > +/* Expand a vector initialisation sequence, such that TARGET is > + initialised to contain VALS. */ > + > void > aarch64_expand_vector_init (rtx target, rtx vals) > { > machine_mode mode = GET_MODE (target); > machine_mode inner_mode = GET_MODE_INNER (mode); > + /* The number of vector elements. */ > int n_elts = GET_MODE_NUNITS (mode); > + /* The number of vector elements which are not constant. */ > int n_var = 0; > rtx any_const = NULL_RTX; > + /* The first element of vals. */ > + rtx v0 = XVECEXP (vals, 0, 0); > bool all_same = true; > > + /* Count the number of variable elements to initialise. */ > for (int i = 0; i < n_elts; ++i) > { > rtx x = XVECEXP (vals, 0, i); > - if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x)) > + if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x))) > ++n_var; > else > any_const = x; > > - if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0))) > - all_same = false; > + all_same &= rtx_equal_p (x, v0); > } > > + /* No variable elements, hand off to aarch64_simd_make_constant which knows > + how best to handle this. */ > if (n_var == 0) > { > rtx constant = aarch64_simd_make_constant (vals); > @@ -11020,14 +11029,15 @@ aarch64_expand_vector_init (rtx target, rtx vals) > /* Splat a single non-constant element if we can. */ > if (all_same) > { > - rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0)); > + rtx x = copy_to_mode_reg (inner_mode, v0); > aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x)); > return; > } > > - /* Half the fields (or less) are non-constant. Load constant then > overwrite > - varying fields. Hope that this is more efficient than using the stack. > */ > - if (n_var <= n_elts/2) > + /* Initialise a vector which is part-variable. We want to first try > + to build those lanes which are constant in the most efficient way we > + can. */ > + if (n_var != n_elts) > { > rtx copy = copy_rtx (vals); > > @@ -11054,31 +11064,21 @@ aarch64_expand_vector_init (rtx target, rtx vals) > XVECEXP (copy, 0, i) = subst; > } > aarch64_expand_vector_init (target, copy); > + } > > - /* Insert variables. */ > - enum insn_code icode = optab_handler (vec_set_optab, mode); > - gcc_assert (icode != CODE_FOR_nothing); > + /* Insert the variable lanes directly. */ > > - for (int i = 0; i < n_elts; i++) > - { > - rtx x = XVECEXP (vals, 0, i); > - if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) > - continue; > - x = copy_to_mode_reg (inner_mode, x); > - emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i))); > - } > - return; > - } > + enum insn_code icode = optab_handler (vec_set_optab, mode); > + gcc_assert (icode != CODE_FOR_nothing); > > - /* Construct the vector in memory one field at a time > - and load the whole vector. */ > - rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); > for (int i = 0; i < n_elts; i++) > - emit_move_insn (adjust_address_nv (mem, inner_mode, > - i * GET_MODE_SIZE (inner_mode)), > - XVECEXP (vals, 0, i)); > - emit_move_insn (target, mem); > - > + { > + rtx x = XVECEXP (vals, 0, i); > + if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) > + continue; > + x = copy_to_mode_reg (inner_mode, x); > + emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i))); > + } > } > > static unsigned HOST_WIDE_INT > diff --git a/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c > b/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c > new file mode 100644 > index 0000000..bbad04d > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c > @@ -0,0 +1,53 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -ftree-vectorize -fno-vect-cost-model" } */ > +float arr_f[100][100]; > +float > +f9 (void) > +{ > + > + int i; > + float sum = 0; > + for (i = 0; i < 100; i++) > + sum += arr_f[i][0] * arr_f[0][i]; > + return sum; > + > +} > + > + > +int arr[100][100]; > +int > +f10 (void) > +{ > + > + int i; > + int sum = 0; > + for (i = 0; i < 100; i++) > + sum += arr[i][0] * arr[0][i]; > + return sum; > + > +} > + > +double arr_d[100][100]; > +double > +f11 (void) > +{ > + int i; > + double sum = 0; > + for (i = 0; i < 100; i++) > + sum += arr_d[i][0] * arr_d[0][i]; > + return sum; > +} > + > +char arr_c[100][100]; > +char > +f12 (void) > +{ > + int i; > + char sum = 0; > + for (i = 0; i < 100; i++) > + sum += arr_c[i][0] * arr_c[0][i]; > + return sum; > +} > + > + > +/* { dg-final { scan-assembler-not "sp" } } */