On Thu, Jul 13, 2023 at 10:47 AM Hongtao Liu <crazy...@gmail.com> wrote: > > On Wed, Jul 12, 2023 at 9:37 PM Richard Biener via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: > > > > The PRs ask for optimizing of > > > > _1 = BIT_FIELD_REF <b_3(D), 64, 64>; > > result_4 = BIT_INSERT_EXPR <a_2(D), _1, 64>; > > > > to a vector permutation. The following implements this as > > match.pd pattern, improving code generation on x86_64. > > > > On the RTL level we face the issue that backend patterns inconsistently > > use vec_merge and vec_select of vec_concat to represent permutes. > > > > I think using a (supported) permute is almost always better > > than an extract plus insert, maybe excluding the case we extract > > element zero and that's aliased to a register that can be used > > directly for insertion (not sure how to query that). > > > > But this regresses for example gcc.target/i386/pr54855-8.c because PRE > > now realizes that > > > > _1 = BIT_FIELD_REF <x_3(D), 64, 0>; > > if (_1 > a_4(D)) > > goto <bb 3>; [50.00%] > > else > > goto <bb 4>; [50.00%] > > > > <bb 3> [local count: 536870913]: > > > > <bb 4> [local count: 1073741824]: > > # iftmp.0_2 = PHI <_1(3), a_4(D)(2)> > > x_5 = BIT_INSERT_EXPR <x_3(D), iftmp.0_2, 0>; > > > > is equal to > > > > <bb 2> [local count: 1073741824]: > > _1 = BIT_FIELD_REF <x_3(D), 64, 0>; > > if (_1 > a_4(D)) > > goto <bb 4>; [50.00%] > > else > > goto <bb 3>; [50.00%] > > > > <bb 3> [local count: 536870912]: > > _7 = BIT_INSERT_EXPR <x_3(D), a_4(D), 0>; > > > > <bb 4> [local count: 1073741824]: > > # prephitmp_8 = PHI <x_3(D)(2), _7(3)> > > > > and that no longer produces the desired maxsd operation at the RTL > The comparison is scalar mode, but operations in then_bb is > vector_mode, if_convert can't eliminate the condition any more(and > won't go into backend ix86_expand_sse_fp_minmax). 
> I think for ordered comparisons like _1 > a_4, it doesn't match > fmin/fmax, but matches SSE MINSS/MAXSS since it always returns the second > operand (not the other operand) when there are NaNs. > > level (we fail to match .FMAX at the GIMPLE level earlier). > > > > Bootstrapped and tested on x86_64-unknown-linux-gnu with regressions: > > > > FAIL: gcc.target/i386/pr54855-13.c scan-assembler-times vmaxsh[ \\\\t] 1 > > FAIL: gcc.target/i386/pr54855-13.c scan-assembler-not vcomish[ \\\\t] > > FAIL: gcc.target/i386/pr54855-8.c scan-assembler-times maxsd 1 > > FAIL: gcc.target/i386/pr54855-8.c scan-assembler-not movsd > > FAIL: gcc.target/i386/pr54855-9.c scan-assembler-times minss 1 > > FAIL: gcc.target/i386/pr54855-9.c scan-assembler-not movss > > > > I think this is also PR88540 (the lack of min/max detection, not > > sure if the SSE min/max are suitable here) > > > > PR tree-optimization/94864 > > PR tree-optimization/94865 > > * match.pd (bit_insert @0 (BIT_FIELD_REF @1 ..) ..): New pattern > > for vector insertion from vector extraction. > > > > * gcc.target/i386/pr94864.c: New testcase. > > * gcc.target/i386/pr94865.c: Likewise. > > --- > > gcc/match.pd | 25 +++++++++++++++++++++++++ > > gcc/testsuite/gcc.target/i386/pr94864.c | 13 +++++++++++++ > > gcc/testsuite/gcc.target/i386/pr94865.c | 13 +++++++++++++ > > 3 files changed, 51 insertions(+) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr94864.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr94865.c > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > index 8543f777a28..8cc106049c4 100644 > > --- a/gcc/match.pd > > +++ b/gcc/match.pd > > @@ -7770,6 +7770,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > wi::to_wide (@ipos) + isize)) > > (BIT_FIELD_REF @0 @rsize @rpos))))) > > > > +/* Simplify vector inserts of other vector extracts to a permute. 
*/ > > +(simplify > > + (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos) > > + (if (VECTOR_TYPE_P (type) > > + && types_match (@0, @1) > > + && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2)) > > + && TYPE_VECTOR_SUBPARTS (type).is_constant ()) > > + (with > > + { > > + unsigned HOST_WIDE_INT elsz > > + = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1)))); > > + poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz); > > + poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz); > > + unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant (); > > + vec_perm_builder builder; > > + builder.new_vector (nunits, nunits, 1); > > + for (unsigned i = 0; i < nunits; ++i) > > + builder.quick_push (known_eq (ielt, i) ? nunits + relt : i); > > + vec_perm_indices sel (builder, 2, nunits); > > + } > > + (if (!VECTOR_MODE_P (TYPE_MODE (type)) > > + || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, > > false)) > > + (vec_perm @0 @1 { vec_perm_indices_to_tree > > + (build_vector_type (ssizetype, nunits), sel); > > }))))) > > + > > (if (canonicalize_math_after_vectorization_p ()) > > (for fmas (FMA) > > (simplify > > diff --git a/gcc/testsuite/gcc.target/i386/pr94864.c > > b/gcc/testsuite/gcc.target/i386/pr94864.c > > new file mode 100644 > > index 00000000000..69cb481fcfe > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr94864.c > > @@ -0,0 +1,13 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -msse2 -mno-avx" } */ > > + > > +typedef double v2df __attribute__((vector_size(16))); > > + > > +v2df move_sd(v2df a, v2df b) > > +{ > > + v2df result = a; > > + result[0] = b[1]; > > + return result; > > +} > > + > > +/* { dg-final { scan-assembler "unpckhpd\[\\t \]%xmm0, %xmm1" } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr94865.c > > b/gcc/testsuite/gcc.target/i386/pr94865.c > > new file mode 100644 > > index 00000000000..84065ac2467 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr94865.c > 
> @@ -0,0 +1,13 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -msse2 -mno-avx" } */ > > + > > +typedef double v2df __attribute__((vector_size(16))); > > + > > +v2df move_sd(v2df a, v2df b) > > +{ > > + v2df result = a; > > + result[1] = b[1]; > > + return result; > > +} > > + > > +/* { dg-final { scan-assembler "shufpd\[\\t \]*.2, %xmm1, %xmm0" } } */ > > -- > > 2.35.3 > > > > -- > BR, > Hongtao
-- BR, Hongtao