On Wed, Jul 12, 2023 at 6:37 AM Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > The PRs ask for optimizing of > > _1 = BIT_FIELD_REF <b_3(D), 64, 64>; > result_4 = BIT_INSERT_EXPR <a_2(D), _1, 64>; > > to a vector permutation. The following implements this as > match.pd pattern, improving code generation on x86_64.
This is more general case of PR 93080 really where we had: _1 = BIT_FIELD_REF <_2, 64, 64>; result_4 = BIT_INSERT_EXPR <_2, _1, 64>; Thanks, Andrew Pinski > > On the RTL level we face the issue that backend patterns inconsistently > use vec_merge and vec_select of vec_concat to represent permutes. > > I think using a (supported) permute is almost always better > than an extract plus insert, maybe excluding the case we extract > element zero and that's aliased to a register that can be used > directly for insertion (not sure how to query that). > > But this regresses for example gcc.target/i386/pr54855-8.c because PRE > now realizes that > > _1 = BIT_FIELD_REF <x_3(D), 64, 0>; > if (_1 > a_4(D)) > goto <bb 3>; [50.00%] > else > goto <bb 4>; [50.00%] > > <bb 3> [local count: 536870913]: > > <bb 4> [local count: 1073741824]: > # iftmp.0_2 = PHI <_1(3), a_4(D)(2)> > x_5 = BIT_INSERT_EXPR <x_3(D), iftmp.0_2, 0>; > > is equal to > > <bb 2> [local count: 1073741824]: > _1 = BIT_FIELD_REF <x_3(D), 64, 0>; > if (_1 > a_4(D)) > goto <bb 4>; [50.00%] > else > goto <bb 3>; [50.00%] > > <bb 3> [local count: 536870912]: > _7 = BIT_INSERT_EXPR <x_3(D), a_4(D), 0>; > > <bb 4> [local count: 1073741824]: > # prephitmp_8 = PHI <x_3(D)(2), _7(3)> > > and that no longer produces the desired maxsd operation at the RTL > level (we fail to match .FMAX at the GIMPLE level earlier). > > Bootstrapped and tested on x86_64-unknown-linux-gnu with regressions: > > FAIL: gcc.target/i386/pr54855-13.c scan-assembler-times vmaxsh[ \\\\t] 1 > FAIL: gcc.target/i386/pr54855-13.c scan-assembler-not vcomish[ \\\\t] > FAIL: gcc.target/i386/pr54855-8.c scan-assembler-times maxsd 1 > FAIL: gcc.target/i386/pr54855-8.c scan-assembler-not movsd > FAIL: gcc.target/i386/pr54855-9.c scan-assembler-times minss 1 > FAIL: gcc.target/i386/pr54855-9.c scan-assembler-not movss > > I think this is also PR88540 (the lack of min/max detection, not > sure if the SSE min/max are suitable here) > > PR tree-optimization/94864 > PR tree-optimization/94865 > * match.pd (bit_insert @0 (BIT_FIELD_REF @1 ..) ..): New pattern > for vector insertion from vector extraction. > > * gcc.target/i386/pr94864.c: New testcase. > * gcc.target/i386/pr94865.c: Likewise. > --- > gcc/match.pd | 25 +++++++++++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr94864.c | 13 +++++++++++++ > gcc/testsuite/gcc.target/i386/pr94865.c | 13 +++++++++++++ > 3 files changed, 51 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/i386/pr94864.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94865.c > > diff --git a/gcc/match.pd b/gcc/match.pd > index 8543f777a28..8cc106049c4 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -7770,6 +7770,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > wi::to_wide (@ipos) + isize)) > (BIT_FIELD_REF @0 @rsize @rpos))))) > > +/* Simplify vector inserts of other vector extracts to a permute. */ > +(simplify > + (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos) > + (if (VECTOR_TYPE_P (type) > + && types_match (@0, @1) > + && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2)) > + && TYPE_VECTOR_SUBPARTS (type).is_constant ()) > + (with > + { > + unsigned HOST_WIDE_INT elsz > + = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1)))); > + poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz); > + poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz); > + unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant (); > + vec_perm_builder builder; > + builder.new_vector (nunits, nunits, 1); > + for (unsigned i = 0; i < nunits; ++i) > + builder.quick_push (known_eq (ielt, i) ? nunits + relt : i); > + vec_perm_indices sel (builder, 2, nunits); > + } > + (if (!VECTOR_MODE_P (TYPE_MODE (type)) > + || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, > false)) > + (vec_perm @0 @1 { vec_perm_indices_to_tree > + (build_vector_type (ssizetype, nunits), sel); }))))) > + > (if (canonicalize_math_after_vectorization_p ()) > (for fmas (FMA) > (simplify > diff --git a/gcc/testsuite/gcc.target/i386/pr94864.c > b/gcc/testsuite/gcc.target/i386/pr94864.c > new file mode 100644 > index 00000000000..69cb481fcfe > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr94864.c > @@ -0,0 +1,13 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse2 -mno-avx" } */ > + > +typedef double v2df __attribute__((vector_size(16))); > + > +v2df move_sd(v2df a, v2df b) > +{ > + v2df result = a; > + result[0] = b[1]; > + return result; > +} > + > +/* { dg-final { scan-assembler "unpckhpd\[\\t \]%xmm0, %xmm1" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr94865.c > b/gcc/testsuite/gcc.target/i386/pr94865.c > new file mode 100644 > index 00000000000..84065ac2467 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr94865.c > @@ -0,0 +1,13 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse2 -mno-avx" } */ > + > +typedef double v2df __attribute__((vector_size(16))); > + > +v2df move_sd(v2df a, v2df b) > +{ > + v2df result = a; > + result[1] = b[1]; > + return result; > +} > + > +/* { dg-final { scan-assembler "shufpd\[\\t \]*.2, %xmm1, %xmm0" } } */ > -- > 2.35.3