Re: [PATCH v2] forwprop: Support vec perm fed by CTOR and CTOR/CST [PR99398]
on 2021/5/27 下午8:55, Richard Sandiford wrote: > Sorry for the slow reponse. > > "Kewen.Lin" writes: >> diff --git a/gcc/vec-perm-indices.c b/gcc/vec-perm-indices.c >> index ede590dc5c9..57dd11d723c 100644 >> --- a/gcc/vec-perm-indices.c >> +++ b/gcc/vec-perm-indices.c >> @@ -101,6 +101,70 @@ vec_perm_indices::new_expanded_vector (const >> vec_perm_indices &orig, >>m_encoding.finalize (); >> } >> >> +/* Check whether we can switch to a new permutation vector that >> + selects the same input elements as ORIG, but with each element >> + built up from FACTOR pieces. Return true if yes, otherwise >> + return false. Every FACTOR permutation indexes should be >> + continuous separately and the first one of each batch should >> + be able to exactly modulo FACTOR. For example, if ORIG is >> + { 2, 3, 4, 5, 0, 1, 6, 7 } and FACTOR is 2, the new permutation >> + is { 1, 2, 0, 3 }. */ >> + >> +bool >> +vec_perm_indices::new_shrunk_vector (const vec_perm_indices &orig, >> + unsigned int factor) >> +{ >> + gcc_assert (factor > 0); >> + >> + if (maybe_lt (orig.m_nelts_per_input, factor)) >> +return false; >> + >> + poly_uint64 nelts; >> + /* Invalid if vector units number isn't multiple of factor. */ >> + if (!multiple_p (orig.m_nelts_per_input, factor, &nelts)) >> +return false; >> + >> + /* Only handle the case that npatterns is multiple of factor. >> + FIXME: Try to see whether we can reshape it by factor npatterns. */ >> + if (orig.m_encoding.npatterns () % factor != 0) >> +return false; >> + >> + unsigned int encoded_nelts = orig.m_encoding.encoded_nelts (); >> + auto_vec encodings (encoded_nelts); > > auto_vec would avoid memory allocations in the > same cases that m_encoding can. “encoding” might be better than > “encodings” since there's only really one encoding here. 
> >> + /* Separate all encoded elements into batches by size factor, >> + then ensure the first element of each batch is multiple of >> + factor and all elements in each batch is consecutive from >> + the first one. */ >> + for (unsigned int i = 0; i < encoded_nelts; i += factor) >> +{ >> + element_type first = orig.m_encoding[i]; >> + element_type new_index; >> + if (!multiple_p (first, factor, &new_index)) >> +return false; >> + for (unsigned int j = 1; j < factor; ++j) >> +{ >> + if (maybe_ne (first + j, orig.m_encoding[i + j])) >> +return false; >> +} > > Formatting nit: unnecessary braces around if. > >> + encodings.quick_push (new_index); >> +} >> + >> + m_ninputs = orig.m_ninputs; >> + m_nelts_per_input = nelts; >> + poly_uint64 full_nelts = exact_div (orig.m_encoding.full_nelts (), >> factor); >> + unsigned int npatterns = orig.m_encoding.npatterns () / factor; >> + >> + m_encoding.new_vector (full_nelts, npatterns, >> + orig.m_encoding.nelts_per_pattern ()); >> + >> + for (unsigned int i = 0; i < encodings.length (); i++) >> +m_encoding.quick_push (encodings[i]); > > I think this can be: > >m_encoding.splice (encodings); > > OK with those changes, thanks. Thanks also for doing it in a > variable-length-friendly way. > Thanks for the comments, Richard! The patch was updated as them, re-tested and committed in r12-1103. BR, Kewen
Re: [PATCH v2] forwprop: Support vec perm fed by CTOR and CTOR/CST [PR99398]
Sorry for the slow response. "Kewen.Lin" writes: > diff --git a/gcc/vec-perm-indices.c b/gcc/vec-perm-indices.c > index ede590dc5c9..57dd11d723c 100644 > --- a/gcc/vec-perm-indices.c > +++ b/gcc/vec-perm-indices.c > @@ -101,6 +101,70 @@ vec_perm_indices::new_expanded_vector (const > vec_perm_indices &orig, >m_encoding.finalize (); > } > > +/* Check whether we can switch to a new permutation vector that > + selects the same input elements as ORIG, but with each element > + built up from FACTOR pieces. Return true if yes, otherwise > + return false. Every FACTOR permutation indexes should be > + continuous separately and the first one of each batch should > + be able to exactly modulo FACTOR. For example, if ORIG is > + { 2, 3, 4, 5, 0, 1, 6, 7 } and FACTOR is 2, the new permutation > + is { 1, 2, 0, 3 }. */ > + > +bool > +vec_perm_indices::new_shrunk_vector (const vec_perm_indices &orig, > + unsigned int factor) > +{ > + gcc_assert (factor > 0); > + > + if (maybe_lt (orig.m_nelts_per_input, factor)) > +return false; > + > + poly_uint64 nelts; > + /* Invalid if vector units number isn't multiple of factor. */ > + if (!multiple_p (orig.m_nelts_per_input, factor, &nelts)) > +return false; > + > + /* Only handle the case that npatterns is multiple of factor. > + FIXME: Try to see whether we can reshape it by factor npatterns. */ > + if (orig.m_encoding.npatterns () % factor != 0) > +return false; > + > + unsigned int encoded_nelts = orig.m_encoding.encoded_nelts (); > + auto_vec encodings (encoded_nelts); auto_vec would avoid memory allocations in the same cases that m_encoding can. “encoding” might be better than “encodings” since there's only really one encoding here. > + /* Separate all encoded elements into batches by size factor, > + then ensure the first element of each batch is multiple of > + factor and all elements in each batch is consecutive from > + the first one. 
*/ > + for (unsigned int i = 0; i < encoded_nelts; i += factor) > +{ > + element_type first = orig.m_encoding[i]; > + element_type new_index; > + if (!multiple_p (first, factor, &new_index)) > + return false; > + for (unsigned int j = 1; j < factor; ++j) > + { > + if (maybe_ne (first + j, orig.m_encoding[i + j])) > + return false; > + } Formatting nit: unnecessary braces around if. > + encodings.quick_push (new_index); > +} > + > + m_ninputs = orig.m_ninputs; > + m_nelts_per_input = nelts; > + poly_uint64 full_nelts = exact_div (orig.m_encoding.full_nelts (), factor); > + unsigned int npatterns = orig.m_encoding.npatterns () / factor; > + > + m_encoding.new_vector (full_nelts, npatterns, > + orig.m_encoding.nelts_per_pattern ()); > + > + for (unsigned int i = 0; i < encodings.length (); i++) > +m_encoding.quick_push (encodings[i]); I think this can be: m_encoding.splice (encodings); OK with those changes, thanks. Thanks also for doing it in a variable-length-friendly way. Richard > + > + m_encoding.finalize (); > + > + return true; > +} > + > /* Rotate the inputs of the permutation right by DELTA inputs. This changes > the values of the permutation vector but it doesn't change the way that > the elements are encoded. */ > diff --git a/gcc/vec-perm-indices.h b/gcc/vec-perm-indices.h > index bc70ecd8a1d..98d27f0ec42 100644 > --- a/gcc/vec-perm-indices.h > +++ b/gcc/vec-perm-indices.h > @@ -57,6 +57,7 @@ public: > >void new_vector (const vec_perm_builder &, unsigned int, poly_uint64); >void new_expanded_vector (const vec_perm_indices &, unsigned int); > + bool new_shrunk_vector (const vec_perm_indices &, unsigned int); >void rotate_inputs (int delta); > >/* Return the underlying vector encoding. */
Re: [PATCH v2] forwprop: Support vec perm fed by CTOR and CTOR/CST [PR99398]
>> The attached patch v2 use the structure by considering the above >> advice and the (code == CONSTRUCTOR || code == VECTOR_CST) part >> can be shared with VIEW_CONVERT_EXPR handlings as below: >> >> op0 gathering (leave V_C_E in code if it's met) >> >> else if (code == CONSTRUCTOR || code == VECTOR_CST || VIEW_CONVERT_EXPR) >> { >>op1 gathering (leave V_C_E in code2) >> >>if (code == VIEW_CONVERT_EXPR || code2 == VIEW_CONVERT_EXPR) >> do the tricks on arg0/arg1/op2 >> >>the previous handlings on CONSTRUCTOR/VECTOR_CST >> } >> >> Also updated "shrinked" to "shrunk" as Segher pointed out. :-) >> >> Does it look better now? > > Yes. The forwprop changes are OK - I'd still like Richard to > review the vec-perm-indices change. > Thanks Richi! Hi Richard, Gentle ping for the vec-perm-indices change in case this thread escaped from your radar. https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570240.html BR, Kewen
Re: [PATCH v2] forwprop: Support vec perm fed by CTOR and CTOR/CST [PR99398]
On Thu, 13 May 2021, Kewen.Lin wrote: > Hi Richi, > > Thanks for the review! > > on 2021/5/11 下午9:26, Richard Biener wrote: > > On Fri, 7 May 2021, Kewen.Lin wrote: > > > >> Hi, > >> > >> This patch is to teach forwprop to optimize some cases where the > >> permutated operands of vector permutation are from two same type > >> CTOR and CTOR or one CTOR and one VECTOR CST. It aggressively > >> takes VIEW_CONVERT_EXPR as trivial copies and transform the vector > >> permutation into vector CTOR. > >> > >> Bootstrapped/regtested on powerpc64le-linux-gnu P9, powerpc64-linux-gnu P8, > >> x86_64-redhat-linux and aarch64-linux-gnu. > >> > >> Is it ok for trunk? > > > > Can you please avoid the changes to get_prop_source_stmt and > > can_propagate_from? > > Sure! Do you mean that we need to keep those functions as pure as > possible? I meant to reuse the single use check in the call. Yeah, I'd like to get rid of them eventually ... > > It should work to add a single match > > of a V_C_E after the get_prop_source_stmt call. Ideally > > we'd have > > > > /* Shuffle of a constructor. */ > > else if (code == CONSTRUCTOR || code == VECTOR_CST) > > { > > ... > > } > > else if (code == VIEW_CONVERT_EXPR) > > { > >op1 must also be a V_C_E or VECTOR_CST here > > } > > > > but then I fear we have no canonicalization of the VECTOR_CST > > to 2nd VEC_PERM operand. But then moving the op1 gathering > > out of the if (code == CONSTRUCTOR || code == VECTOR_CST) > > case (doesn't need an else) might still make such refactoring > > possible as first matching > > > > if (code == VIEW_CONVERT_EXPR || code2 == VIEW_CONVERT_EXPR) > >{ > > ... > > } > > else if (code == CONSTRUCTOR || code == VECTOR_CST) > > ... 
> > > > > The attached patch v2 use the structure by considering the above > advice and the (code == CONSTRUCTOR || code == VECTOR_CST) part > can be shared with VIEW_CONVERT_EXPR handlings as below: > > op0 gathering (leave V_C_E in code if it's met) > > else if (code == CONSTRUCTOR || code == VECTOR_CST || VIEW_CONVERT_EXPR) > { >op1 gathering (leave V_C_E in code2) > >if (code == VIEW_CONVERT_EXPR || code2 == VIEW_CONVERT_EXPR) > do the tricks on arg0/arg1/op2 > >the previous handlings on CONSTRUCTOR/VECTOR_CST > } > > Also updated "shrinked" to "shrunk" as Segher pointed out. :-) > > Does it look better now? Yes. The forwprop changes are OK - I'd still like Richard to review the vec-perm-indices change. Thanks, and sorry for the delay, Richard. > Bootstrapped/regtested on powerpc64le-linux-gnu P9 btw. > > BR, > Kewen > - > gcc/ChangeLog: > > PR tree-optimization/99398 > * tree-ssa-forwprop.c (simplify_permutation): Optimize some cases > where the fed operands are CTOR/CST and propagated through > VIEW_CONVERT_EXPR. Call vec_perm_indices::new_shrunk_vector. > * vec-perm-indices.c (vec_perm_indices::new_shrunk_vector): New > function. > * vec-perm-indices.h (vec_perm_indices::new_shrunk_vector): New > declare. > > gcc/testsuite/ChangeLog: > > PR tree-optimization/99398 > * gcc.target/powerpc/vec-perm-ctor-run.c: New test. > * gcc.target/powerpc/vec-perm-ctor.c: New test. > * gcc.target/powerpc/vec-perm-ctor.h: New test. > -- Richard Biener SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg, Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)
[PATCH v2] forwprop: Support vec perm fed by CTOR and CTOR/CST [PR99398]
Hi Richi, Thanks for the review! on 2021/5/11 下午9:26, Richard Biener wrote: > On Fri, 7 May 2021, Kewen.Lin wrote: > >> Hi, >> >> This patch is to teach forwprop to optimize some cases where the >> permutated operands of vector permutation are from two same type >> CTOR and CTOR or one CTOR and one VECTOR CST. It aggressively >> takes VIEW_CONVERT_EXPR as trivial copies and transform the vector >> permutation into vector CTOR. >> >> Bootstrapped/regtested on powerpc64le-linux-gnu P9, powerpc64-linux-gnu P8, >> x86_64-redhat-linux and aarch64-linux-gnu. >> >> Is it ok for trunk? > > Can you please avoid the changes to get_prop_source_stmt and > can_propagate_from? Sure! Do you mean that we need to keep those functions as pure as possible? I meant to reuse the single use check in the call. > It should work to add a single match > of a V_C_E after the get_prop_source_stmt call. Ideally > we'd have > > /* Shuffle of a constructor. */ > else if (code == CONSTRUCTOR || code == VECTOR_CST) > { > ... > } > else if (code == VIEW_CONVERT_EXPR) > { >op1 must also be a V_C_E or VECTOR_CST here > } > > but then I fear we have no canonicalization of the VECTOR_CST > to 2nd VEC_PERM operand. But then moving the op1 gathering > out of the if (code == CONSTRUCTOR || code == VECTOR_CST) > case (doesn't need an else) might still make such refactoring > possible as first matching > > if (code == VIEW_CONVERT_EXPR || code2 == VIEW_CONVERT_EXPR) >{ > ... > } > else if (code == CONSTRUCTOR || code == VECTOR_CST) > ... 
> The attached patch v2 use the structure by considering the above advice and the (code == CONSTRUCTOR || code == VECTOR_CST) part can be shared with VIEW_CONVERT_EXPR handlings as below: op0 gathering (leave V_C_E in code if it's met) else if (code == CONSTRUCTOR || code == VECTOR_CST || VIEW_CONVERT_EXPR) { op1 gathering (leave V_C_E in code2) if (code == VIEW_CONVERT_EXPR || code2 == VIEW_CONVERT_EXPR) do the tricks on arg0/arg1/op2 the previous handlings on CONSTRUCTOR/VECTOR_CST } Also updated "shrinked" to "shrunk" as Segher pointed out. :-) Does it look better now? Bootstrapped/regtested on powerpc64le-linux-gnu P9 btw. BR, Kewen - gcc/ChangeLog: PR tree-optimization/99398 * tree-ssa-forwprop.c (simplify_permutation): Optimize some cases where the fed operands are CTOR/CST and propagated through VIEW_CONVERT_EXPR. Call vec_perm_indices::new_shrunk_vector. * vec-perm-indices.c (vec_perm_indices::new_shrunk_vector): New function. * vec-perm-indices.h (vec_perm_indices::new_shrunk_vector): New declare. gcc/testsuite/ChangeLog: PR tree-optimization/99398 * gcc.target/powerpc/vec-perm-ctor-run.c: New test. * gcc.target/powerpc/vec-perm-ctor.c: New test. * gcc.target/powerpc/vec-perm-ctor.h: New test. 
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-perm-ctor-run.c b/gcc/testsuite/gcc.target/powerpc/vec-perm-ctor-run.c new file mode 100644 index 000..987d6db999c --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vec-perm-ctor-run.c @@ -0,0 +1,124 @@ +/* { dg-do run } */ +/* { dg-require-effective-target vsx_hw } */ +/* { dg-options "-O2 -mvsx" } */ + +#include "vec-perm-ctor.h" + +#include + +int +main () +{ + du a_du = 100ULL; + du b_du = 200ULL; + + di a_di = -100; + di b_di = 200; + + df a_df = 10.0; + df b_df = 20.0; + + si a_si = 12; + si b_si = -25; + si c_si = -37; + si d_si = 50; + + sf a_sf = 30.0f; + sf b_sf = 40.0f; + sf c_sf = 50.0f; + sf d_sf = 60.0f; + + hu a_hu = 10; + hu b_hu = 20; + hu c_hu = 30; + hu d_hu = 40; + hu e_hu = 50; + hu f_hu = 60; + hu g_hu = 70; + hu h_hu = 80; + + qi a_qi = 10; + qi b_qi = 20; + qi c_qi = -30; + qi d_qi = 40; + qi e_qi = -50; + qi f_qi = 60; + qi g_qi = 70; + qi h_qi = -80; + + v2du res1 = test_ctor_ctor_same_du (a_du, b_du); + if (res1[0] != a_du || res1[1] != b_du) +abort (); + + v2df res2 = test_ctor_ctor_same_df (a_df, b_df); + if (res2[0] != a_df || res2[1] != b_df) +abort (); + + v4si res3 = test_ctor_ctor_same_si (a_si, b_si, c_si, d_si); + if (res3[0] != a_si || res3[1] != b_si || res3[2] != c_si || res3[3] != d_si) +abort (); + + v4sf res4 = test_ctor_ctor_same_sf (a_sf, b_sf, c_sf, d_sf); + if (res4[0] != a_sf || res4[1] != b_sf || res4[2] != c_sf || res4[3] != d_sf) +abort (); + + v8hu res5 += test_ctor_ctor_same_hu (a_hu, b_hu, c_hu, d_hu, e_hu, f_hu, g_hu, h_hu); + + if (res5[0] != a_hu || res5[1] != b_hu || res5[2] != c_hu || res5[3] != d_hu + || res5[4] != e_hu || res5[5] != f_hu || res5[6] != g_hu + || res5[7] != h_hu) +abort (); + + v16qi res6 += test_ctor_ctor_same_qi (a_qi, b_qi, c_qi, d_qi, e_qi, f_qi, g_qi, h_qi); + + if (res6[0] != a_qi || res6[1] != b_qi || res6[2] != c_qi || res6[3] != d_qi + || res6[4] != a_qi || res6[5] != b_qi || res6[6] != c_qi +