On Fri, Feb 18, 2022 at 10:01 PM Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > This uses the now passed SLP node to the vectorizer costing hook > to adjust vector construction costs for the cost of moving an > integer component from a GPR to a vector register when that's > required for building a vector from components. A crucial difference > here is whether the component is loaded from memory or extracted > from a vector register as in those cases no intermediate GPR is involved. > > The pr99881.c testcase can be Un-XFAILed with this patch, the > pr91446.c testcase now produces scalar code which looks superior > to me so I've adjusted it as well. > > I'm currently re-bootstrapping and testing on x86_64-unknown-linux-gnu > after adding the BIT_FIELD_REF vector extracting special casing. Does the patch handle PR101929? > > I suppose we can let autotesters look for SPEC performance fallout. > > OK if testing succeeds? > > Thanks, > Richard. > > 2022-02-18 Richard Biener <rguent...@suse.de> > > PR tree-optimization/104582 > PR target/99881 > * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): > Cost GPR to vector register moves for integer vector construction. > > * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-1.c: New. > * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c: Likewise. > * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-3.c: Likewise. > * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-4.c: Likewise. > * gcc.target/i386/pr99881.c: Un-XFAIL. > * gcc.target/i386/pr91446.c: Adjust to not expect vectorization. 
> --- > gcc/config/i386/i386.cc | 45 ++++++++++++++++++- > .../costmodel/x86_64/costmodel-pr104582-1.c | 15 +++++++ > .../costmodel/x86_64/costmodel-pr104582-2.c | 13 ++++++ > .../costmodel/x86_64/costmodel-pr104582-3.c | 13 ++++++ > .../costmodel/x86_64/costmodel-pr104582-4.c | 15 +++++++ > gcc/testsuite/gcc.target/i386/pr91446.c | 2 +- > gcc/testsuite/gcc.target/i386/pr99881.c | 2 +- > 7 files changed, 102 insertions(+), 3 deletions(-) > create mode 100644 > gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-1.c > create mode 100644 > gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c > create mode 100644 > gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-3.c > create mode 100644 > gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-4.c > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > index 0830dbd7dca..b2bf90576d5 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -22997,7 +22997,7 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool > costing_for_scalar) > > unsigned > ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, > - stmt_vec_info stmt_info, slp_tree, > + stmt_vec_info stmt_info, slp_tree node, > tree vectype, int misalign, > vect_cost_model_location where) > { > @@ -23160,6 +23160,49 @@ ix86_vector_costs::add_stmt_cost (int count, > vect_cost_for_stmt kind, > stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); > stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1); > } > + else if (kind == vec_construct > + && node > + && SLP_TREE_DEF_TYPE (node) == vect_external_def > + && INTEGRAL_TYPE_P (TREE_TYPE (vectype))) > + { > + stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); > + unsigned i; > + tree op; > + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) > + if (TREE_CODE (op) == SSA_NAME) > + TREE_VISITED (op) = 0; > + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) > + { > + if (TREE_CODE (op) != 
SSA_NAME > + || TREE_VISITED (op)) > + continue; > + TREE_VISITED (op) = 1; > + gimple *def = SSA_NAME_DEF_STMT (op); > + tree tem; > + if (is_gimple_assign (def) > + && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)) > + && ((tem = gimple_assign_rhs1 (def)), true) > + && TREE_CODE (tem) == SSA_NAME > + /* A sign-change expands to nothing. */ > + && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (def)), > + TREE_TYPE (tem))) > + def = SSA_NAME_DEF_STMT (tem); > + /* When the component is loaded from memory we can directly > + move it to a vector register, otherwise we have to go > + via a GPR or via vpinsr which involves similar cost. > + Likewise with a BIT_FIELD_REF extracting from a vector > + register we can hope to avoid using a GPR. */ > + if (!is_gimple_assign (def) > + || (!gimple_assign_load_p (def) > + && (gimple_assign_rhs_code (def) != BIT_FIELD_REF > + || !VECTOR_TYPE_P (TREE_TYPE > + (TREE_OPERAND (gimple_assign_rhs1 (def), > 0)))))) > + stmt_cost += ix86_cost->sse_to_integer; > + } > + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) > + if (TREE_CODE (op) == SSA_NAME) > + TREE_VISITED (op) = 0; > + } > if (stmt_cost == -1) > stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); > > diff --git > a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-1.c > b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-1.c > new file mode 100644 > index 00000000000..992a845ad7a > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-1.c > @@ -0,0 +1,15 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-msse -fdump-tree-slp2-details" } */ > + > +struct S { unsigned long a, b; } s; > + > +void > +foo (unsigned long *a, unsigned long *b) > +{ > + unsigned long a_ = *a; > + unsigned long b_ = *b; > + s.a = a_; > + s.b = b_; > +} > + > +/* { dg-final { scan-tree-dump "basic block part vectorized" "slp2" } } */ > diff --git > 
a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c > b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c > new file mode 100644 > index 00000000000..7637cdb4a97 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c > @@ -0,0 +1,13 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-msse -fdump-tree-slp2-details" } */ > + > +struct S { unsigned long a, b; } s; > + > +void > +foo (unsigned long a, unsigned long b) > +{ > + s.a = a; > + s.b = b; > +} > + > +/* { dg-final { scan-tree-dump-not "basic block part vectorized" "slp2" } } > */ > diff --git > a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-3.c > b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-3.c > new file mode 100644 > index 00000000000..999c4905708 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-3.c > @@ -0,0 +1,13 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-msse -fdump-tree-slp2-details" } */ > + > +struct S { double a, b; } s; > + > +void > +foo (double a, double b) > +{ > + s.a = a; > + s.b = b; > +} > + > +/* { dg-final { scan-tree-dump "basic block part vectorized" "slp2" } } */ > diff --git > a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-4.c > b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-4.c > new file mode 100644 > index 00000000000..cc471e1ed73 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-4.c > @@ -0,0 +1,15 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-msse -fdump-tree-slp2-details" } */ > + > +struct S { unsigned long a, b; } s; > + > +void > +foo (signed long *a, unsigned long *b) > +{ > + unsigned long a_ = *a; > + unsigned long b_ = *b; > + s.a = a_; > + s.b = b_; > +} > + > +/* { dg-final { scan-tree-dump "basic block part vectorized" "slp2" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr91446.c > 
b/gcc/testsuite/gcc.target/i386/pr91446.c > index 0243ca3ea68..067bf43f698 100644 > --- a/gcc/testsuite/gcc.target/i386/pr91446.c > +++ b/gcc/testsuite/gcc.target/i386/pr91446.c > @@ -21,4 +21,4 @@ foo (unsigned long long width, unsigned long long height, > bar (&t); > } > > -/* { dg-final { scan-assembler-times "vmovdqa\[^\n\r\]*xmm\[0-9\]" 2 } } */ > +/* { dg-final { scan-assembler-times "xmm\[0-9\]" 0 } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c > b/gcc/testsuite/gcc.target/i386/pr99881.c > index 3e087eb2ed7..a1ec1d1ba8a 100644 > --- a/gcc/testsuite/gcc.target/i386/pr99881.c > +++ b/gcc/testsuite/gcc.target/i386/pr99881.c > @@ -1,7 +1,7 @@ > /* PR target/99881. */ > /* { dg-do compile { target { ! ia32 } } } */ > /* { dg-options "-Ofast -march=skylake" } */ > -/* { dg-final { scan-assembler-not "xmm\[0-9\]" { xfail *-*-* } } } */ > +/* { dg-final { scan-assembler-not "xmm\[0-9\]" } } */ > > void > foo (int* __restrict a, int n, int c) > -- > 2.34.1
-- BR, Hongtao