Re: [PATCH v2] forwprop: Support vec perm fed by CTOR and CTOR/CST [PR99398]

2021-05-27 Thread Kewen.Lin via Gcc-patches
on 2021/5/27 下午8:55, Richard Sandiford wrote:
> Sorry for the slow reponse.
> 
> "Kewen.Lin"  writes:
>> diff --git a/gcc/vec-perm-indices.c b/gcc/vec-perm-indices.c
>> index ede590dc5c9..57dd11d723c 100644
>> --- a/gcc/vec-perm-indices.c
>> +++ b/gcc/vec-perm-indices.c
>> @@ -101,6 +101,70 @@ vec_perm_indices::new_expanded_vector (const 
>> vec_perm_indices &orig,
>>m_encoding.finalize ();
>>  }
>>  
>> +/* Check whether we can switch to a new permutation vector that
>> +   selects the same input elements as ORIG, but with each element
>> +   built up from FACTOR pieces.  Return true if yes, otherwise
>> +   return false.  Every FACTOR permutation indexes should be
>> +   continuous separately and the first one of each batch should
>> +   be able to exactly modulo FACTOR.  For example, if ORIG is
>> +   { 2, 3, 4, 5, 0, 1, 6, 7 } and FACTOR is 2, the new permutation
>> +   is { 1, 2, 0, 3 }.  */
>> +
>> +bool
>> +vec_perm_indices::new_shrunk_vector (const vec_perm_indices &orig,
>> + unsigned int factor)
>> +{
>> +  gcc_assert (factor > 0);
>> +
>> +  if (maybe_lt (orig.m_nelts_per_input, factor))
>> +return false;
>> +
>> +  poly_uint64 nelts;
>> +  /* Invalid if vector units number isn't multiple of factor.  */
>> +  if (!multiple_p (orig.m_nelts_per_input, factor, &nelts))
>> +return false;
>> +
>> +  /* Only handle the case that npatterns is multiple of factor.
>> + FIXME: Try to see whether we can reshape it by factor npatterns.  */
>> +  if (orig.m_encoding.npatterns () % factor != 0)
>> +return false;
>> +
>> +  unsigned int encoded_nelts = orig.m_encoding.encoded_nelts ();
>> +  auto_vec encodings (encoded_nelts);
> 
> auto_vec would avoid memory allocations in the
> same cases that m_encoding can.  “encoding” might be better than
> “encodings” since there's only really one encoding here.
> 
>> +  /* Separate all encoded elements into batches by size factor,
>> + then ensure the first element of each batch is multiple of
>> + factor and all elements in each batch is consecutive from
>> + the first one.  */
>> +  for (unsigned int i = 0; i < encoded_nelts; i += factor)
>> +{
>> +  element_type first = orig.m_encoding[i];
>> +  element_type new_index;
>> +  if (!multiple_p (first, factor, &new_index))
>> +return false;
>> +  for (unsigned int j = 1; j < factor; ++j)
>> +{
>> +  if (maybe_ne (first + j, orig.m_encoding[i + j]))
>> +return false;
>> +}
> 
> Formatting nit: unnecessary braces around if.
> 
>> +  encodings.quick_push (new_index);
>> +}
>> +
>> +  m_ninputs = orig.m_ninputs;
>> +  m_nelts_per_input = nelts;
>> +  poly_uint64 full_nelts = exact_div (orig.m_encoding.full_nelts (), 
>> factor);
>> +  unsigned int npatterns = orig.m_encoding.npatterns () / factor;
>> +
>> +  m_encoding.new_vector (full_nelts, npatterns,
>> + orig.m_encoding.nelts_per_pattern ());
>> +
>> +  for (unsigned int i = 0; i < encodings.length (); i++)
>> +m_encoding.quick_push (encodings[i]);
> 
> I think this can be:
> 
>m_encoding.splice (encodings);
> 
> OK with those changes, thanks.  Thanks also for doing it in a
> variable-length-friendly way.
> 


Thanks for the comments, Richard!  The patch was updated as them,
re-tested and committed in r12-1103.

BR,
Kewen


Re: [PATCH v2] forwprop: Support vec perm fed by CTOR and CTOR/CST [PR99398]

2021-05-27 Thread Richard Sandiford via Gcc-patches
Sorry for the slow reponse.

"Kewen.Lin"  writes:
> diff --git a/gcc/vec-perm-indices.c b/gcc/vec-perm-indices.c
> index ede590dc5c9..57dd11d723c 100644
> --- a/gcc/vec-perm-indices.c
> +++ b/gcc/vec-perm-indices.c
> @@ -101,6 +101,70 @@ vec_perm_indices::new_expanded_vector (const 
> vec_perm_indices &orig,
>m_encoding.finalize ();
>  }
>  
> +/* Check whether we can switch to a new permutation vector that
> +   selects the same input elements as ORIG, but with each element
> +   built up from FACTOR pieces.  Return true if yes, otherwise
> +   return false.  Every FACTOR permutation indexes should be
> +   continuous separately and the first one of each batch should
> +   be able to exactly modulo FACTOR.  For example, if ORIG is
> +   { 2, 3, 4, 5, 0, 1, 6, 7 } and FACTOR is 2, the new permutation
> +   is { 1, 2, 0, 3 }.  */
> +
> +bool
> +vec_perm_indices::new_shrunk_vector (const vec_perm_indices &orig,
> +  unsigned int factor)
> +{
> +  gcc_assert (factor > 0);
> +
> +  if (maybe_lt (orig.m_nelts_per_input, factor))
> +return false;
> +
> +  poly_uint64 nelts;
> +  /* Invalid if vector units number isn't multiple of factor.  */
> +  if (!multiple_p (orig.m_nelts_per_input, factor, &nelts))
> +return false;
> +
> +  /* Only handle the case that npatterns is multiple of factor.
> + FIXME: Try to see whether we can reshape it by factor npatterns.  */
> +  if (orig.m_encoding.npatterns () % factor != 0)
> +return false;
> +
> +  unsigned int encoded_nelts = orig.m_encoding.encoded_nelts ();
> +  auto_vec encodings (encoded_nelts);

auto_vec would avoid memory allocations in the
same cases that m_encoding can.  “encoding” might be better than
“encodings” since there's only really one encoding here.

> +  /* Separate all encoded elements into batches by size factor,
> + then ensure the first element of each batch is multiple of
> + factor and all elements in each batch is consecutive from
> + the first one.  */
> +  for (unsigned int i = 0; i < encoded_nelts; i += factor)
> +{
> +  element_type first = orig.m_encoding[i];
> +  element_type new_index;
> +  if (!multiple_p (first, factor, &new_index))
> + return false;
> +  for (unsigned int j = 1; j < factor; ++j)
> + {
> +   if (maybe_ne (first + j, orig.m_encoding[i + j]))
> + return false;
> + }

Formatting nit: unnecessary braces around if.

> +  encodings.quick_push (new_index);
> +}
> +
> +  m_ninputs = orig.m_ninputs;
> +  m_nelts_per_input = nelts;
> +  poly_uint64 full_nelts = exact_div (orig.m_encoding.full_nelts (), factor);
> +  unsigned int npatterns = orig.m_encoding.npatterns () / factor;
> +
> +  m_encoding.new_vector (full_nelts, npatterns,
> +  orig.m_encoding.nelts_per_pattern ());
> +
> +  for (unsigned int i = 0; i < encodings.length (); i++)
> +m_encoding.quick_push (encodings[i]);

I think this can be:

   m_encoding.splice (encodings);

OK with those changes, thanks.  Thanks also for doing it in a
variable-length-friendly way.

Richard

> +
> +  m_encoding.finalize ();
> +
> +  return true;
> +}
> +
>  /* Rotate the inputs of the permutation right by DELTA inputs.  This changes
> the values of the permutation vector but it doesn't change the way that
> the elements are encoded.  */
> diff --git a/gcc/vec-perm-indices.h b/gcc/vec-perm-indices.h
> index bc70ecd8a1d..98d27f0ec42 100644
> --- a/gcc/vec-perm-indices.h
> +++ b/gcc/vec-perm-indices.h
> @@ -57,6 +57,7 @@ public:
>  
>void new_vector (const vec_perm_builder &, unsigned int, poly_uint64);
>void new_expanded_vector (const vec_perm_indices &, unsigned int);
> +  bool new_shrunk_vector (const vec_perm_indices &, unsigned int);
>void rotate_inputs (int delta);
>  
>/* Return the underlying vector encoding.  */


Re: [PATCH v2] forwprop: Support vec perm fed by CTOR and CTOR/CST [PR99398]

2021-05-25 Thread Kewen.Lin via Gcc-patches
>> The attached patch v2 use the structure by considering the above
>> advice and the (code == CONSTRUCTOR || code == VECTOR_CST) part
>> can be shared with VIEW_CONVERT_EXPR handlings as below:
>>
>>   op0 gathering (leave V_C_E in code if it's met)  
>>
>>   else if (code == CONSTRUCTOR || code == VECTOR_CST || VIEW_CONVERT_EXPR) 
>> {
>>op1 gathering (leave V_C_E in code2)
>>
>>if (code == VIEW_CONVERT_EXPR || code2 == VIEW_CONVERT_EXPR)
>>  do the tricks on arg0/arg1/op2
>>
>>the previous handlings on CONSTRUCTOR/VECTOR_CST
>> }
>>
>> Also updated "shrinked" to "shrunk" as Segher pointed out.  :-)
>>
>> Does it look better now?
> 
> Yes.  The forwprop changes are OK - I'd still like Richard to
> review the vec-perm-indices change.
> 

Thanks Richi!



Hi Richard,

Gentle ping for the vec-perm-indices change in case this thread
escaped from your radar.

https://gcc.gnu.org/pipermail/gcc-patches/2021-May/570240.html

BR,
Kewen


Re: [PATCH v2] forwprop: Support vec perm fed by CTOR and CTOR/CST [PR99398]

2021-05-20 Thread Richard Biener
On Thu, 13 May 2021, Kewen.Lin wrote:

> Hi Richi,
> 
> Thanks for the review!
> 
> on 2021/5/11 下午9:26, Richard Biener wrote:
> > On Fri, 7 May 2021, Kewen.Lin wrote:
> > 
> >> Hi, 
> >>
> >> This patch is to teach forwprop to optimize some cases where the
> >> permutated operands of vector permutation are from two same type
> >> CTOR and CTOR or one CTOR and one VECTOR CST.  It aggressively
> >> takes VIEW_CONVERT_EXPR as trivial copies and transform the vector
> >> permutation into vector CTOR.
> >>
> >> Bootstrapped/regtested on powerpc64le-linux-gnu P9, powerpc64-linux-gnu P8,
> >> x86_64-redhat-linux and aarch64-linux-gnu.
> >>
> >> Is it ok for trunk?
> > 
> > Can you please avoid the changes to get_prop_source_stmt and
> > can_propagate_from?  
> 
> Sure!  Do you mean that we need to keep those functions as pure as
> possible?  I meant to reuse the single use check in the call.

Yeah, I'd like to get rid of them eventually ...

> > It should work to add a single match
> > of a V_C_E after the get_prop_source_stmt call.  Ideally
> > we'd have
> > 
> >   /* Shuffle of a constructor.  */
> >   else if (code == CONSTRUCTOR || code == VECTOR_CST)
> > {
> > ...
> > }
> >   else if (code == VIEW_CONVERT_EXPR)
> > {
> >op1 must also be a V_C_E or VECTOR_CST here
> > }
> > 
> > but then I fear we have no canonicalization of the VECTOR_CST
> > to 2nd VEC_PERM operand.  But then moving the op1 gathering
> > out of the if (code == CONSTRUCTOR || code == VECTOR_CST)
> > case (doesn't need an else) might still make such refactoring
> > possible as first matching
> > 
> >   if (code == VIEW_CONVERT_EXPR || code2 == VIEW_CONVERT_EXPR)
> >{
> > ...
> >   }
> >   else if (code == CONSTRUCTOR || code == VECTOR_CST)
> > ...
> > 
> 
> 
> The attached patch v2 use the structure by considering the above
> advice and the (code == CONSTRUCTOR || code == VECTOR_CST) part
> can be shared with VIEW_CONVERT_EXPR handlings as below:
> 
>   op0 gathering (leave V_C_E in code if it's met)  
> 
>   else if (code == CONSTRUCTOR || code == VECTOR_CST || VIEW_CONVERT_EXPR) 
> {
>op1 gathering (leave V_C_E in code2)
>
>if (code == VIEW_CONVERT_EXPR || code2 == VIEW_CONVERT_EXPR)
>  do the tricks on arg0/arg1/op2
> 
>the previous handlings on CONSTRUCTOR/VECTOR_CST
> }
> 
> Also updated "shrinked" to "shrunk" as Segher pointed out.  :-)
> 
> Does it look better now?

Yes.  The forwprop changes are OK - I'd still like Richard to
review the vec-perm-indices change.

Thanks, and sorry for the delay,
Richard.

> Bootstrapped/regtested on powerpc64le-linux-gnu P9 btw.
> 
> BR,
> Kewen
> -
> gcc/ChangeLog:
> 
>   PR tree-optimization/99398
>   * tree-ssa-forwprop.c (simplify_permutation): Optimize some cases
>   where the fed operands are CTOR/CST and propagated through
>   VIEW_CONVERT_EXPR.  Call vec_perm_indices::new_shrunk_vector.
>   * vec-perm-indices.c (vec_perm_indices::new_shrunk_vector): New
>   function.
>   * vec-perm-indices.h (vec_perm_indices::new_shrunk_vector): New
>   declare.
> 
> gcc/testsuite/ChangeLog:
> 
>   PR tree-optimization/99398
>   * gcc.target/powerpc/vec-perm-ctor-run.c: New test.
>   * gcc.target/powerpc/vec-perm-ctor.c: New test.
>   * gcc.target/powerpc/vec-perm-ctor.h: New test.
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)


[PATCH v2] forwprop: Support vec perm fed by CTOR and CTOR/CST [PR99398]

2021-05-12 Thread Kewen.Lin via Gcc-patches
Hi Richi,

Thanks for the review!

on 2021/5/11 下午9:26, Richard Biener wrote:
> On Fri, 7 May 2021, Kewen.Lin wrote:
> 
>> Hi, 
>>
>> This patch is to teach forwprop to optimize some cases where the
>> permutated operands of vector permutation are from two same type
>> CTOR and CTOR or one CTOR and one VECTOR CST.  It aggressively
>> takes VIEW_CONVERT_EXPR as trivial copies and transform the vector
>> permutation into vector CTOR.
>>
>> Bootstrapped/regtested on powerpc64le-linux-gnu P9, powerpc64-linux-gnu P8,
>> x86_64-redhat-linux and aarch64-linux-gnu.
>>
>> Is it ok for trunk?
> 
> Can you please avoid the changes to get_prop_source_stmt and
> can_propagate_from?  

Sure!  Do you mean that we need to keep those functions as pure as
possible?  I meant to reuse the single use check in the call.


> It should work to add a single match
> of a V_C_E after the get_prop_source_stmt call.  Ideally
> we'd have
> 
>   /* Shuffle of a constructor.  */
>   else if (code == CONSTRUCTOR || code == VECTOR_CST)
> {
> ...
> }
>   else if (code == VIEW_CONVERT_EXPR)
> {
>op1 must also be a V_C_E or VECTOR_CST here
> }
> 
> but then I fear we have no canonicalization of the VECTOR_CST
> to 2nd VEC_PERM operand.  But then moving the op1 gathering
> out of the if (code == CONSTRUCTOR || code == VECTOR_CST)
> case (doesn't need an else) might still make such refactoring
> possible as first matching
> 
>   if (code == VIEW_CONVERT_EXPR || code2 == VIEW_CONVERT_EXPR)
>{
> ...
>   }
>   else if (code == CONSTRUCTOR || code == VECTOR_CST)
> ...
> 


The attached patch v2 use the structure by considering the above
advice and the (code == CONSTRUCTOR || code == VECTOR_CST) part
can be shared with VIEW_CONVERT_EXPR handlings as below:

  op0 gathering (leave V_C_E in code if it's met)  

  else if (code == CONSTRUCTOR || code == VECTOR_CST || VIEW_CONVERT_EXPR) 
{
   op1 gathering (leave V_C_E in code2)
   
   if (code == VIEW_CONVERT_EXPR || code2 == VIEW_CONVERT_EXPR)
 do the tricks on arg0/arg1/op2

   the previous handlings on CONSTRUCTOR/VECTOR_CST
}

Also updated "shrinked" to "shrunk" as Segher pointed out.  :-)

Does it look better now?

Bootstrapped/regtested on powerpc64le-linux-gnu P9 btw.

BR,
Kewen
-
gcc/ChangeLog:

PR tree-optimization/99398
* tree-ssa-forwprop.c (simplify_permutation): Optimize some cases
where the fed operands are CTOR/CST and propagated through
VIEW_CONVERT_EXPR.  Call vec_perm_indices::new_shrunk_vector.
* vec-perm-indices.c (vec_perm_indices::new_shrunk_vector): New
function.
* vec-perm-indices.h (vec_perm_indices::new_shrunk_vector): New
declare.

gcc/testsuite/ChangeLog:

PR tree-optimization/99398
* gcc.target/powerpc/vec-perm-ctor-run.c: New test.
* gcc.target/powerpc/vec-perm-ctor.c: New test.
* gcc.target/powerpc/vec-perm-ctor.h: New test.
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-perm-ctor-run.c 
b/gcc/testsuite/gcc.target/powerpc/vec-perm-ctor-run.c
new file mode 100644
index 000..987d6db999c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vec-perm-ctor-run.c
@@ -0,0 +1,124 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vsx_hw } */
+/* { dg-options "-O2 -mvsx" } */
+
+#include "vec-perm-ctor.h"
+
+#include 
+
+int
+main ()
+{
+  du a_du = 100ULL;
+  du b_du = 200ULL;
+
+  di a_di = -100;
+  di b_di = 200;
+
+  df a_df = 10.0;
+  df b_df = 20.0;
+
+  si a_si = 12;
+  si b_si = -25;
+  si c_si = -37;
+  si d_si = 50;
+
+  sf a_sf = 30.0f;
+  sf b_sf = 40.0f;
+  sf c_sf = 50.0f;
+  sf d_sf = 60.0f;
+
+  hu a_hu = 10;
+  hu b_hu = 20;
+  hu c_hu = 30;
+  hu d_hu = 40;
+  hu e_hu = 50;
+  hu f_hu = 60;
+  hu g_hu = 70;
+  hu h_hu = 80;
+
+  qi a_qi = 10;
+  qi b_qi = 20;
+  qi c_qi = -30;
+  qi d_qi = 40;
+  qi e_qi = -50;
+  qi f_qi = 60;
+  qi g_qi = 70;
+  qi h_qi = -80;
+
+  v2du res1 = test_ctor_ctor_same_du (a_du, b_du);
+  if (res1[0] != a_du || res1[1] != b_du)
+abort ();
+
+  v2df res2 = test_ctor_ctor_same_df (a_df, b_df);
+  if (res2[0] != a_df || res2[1] != b_df)
+abort ();
+
+  v4si res3 = test_ctor_ctor_same_si (a_si, b_si, c_si, d_si);
+  if (res3[0] != a_si || res3[1] != b_si || res3[2] != c_si || res3[3] != d_si)
+abort ();
+
+  v4sf res4 = test_ctor_ctor_same_sf (a_sf, b_sf, c_sf, d_sf);
+  if (res4[0] != a_sf || res4[1] != b_sf || res4[2] != c_sf || res4[3] != d_sf)
+abort ();
+
+  v8hu res5
+= test_ctor_ctor_same_hu (a_hu, b_hu, c_hu, d_hu, e_hu, f_hu, g_hu, h_hu);
+
+  if (res5[0] != a_hu || res5[1] != b_hu || res5[2] != c_hu || res5[3] != d_hu
+  || res5[4] != e_hu || res5[5] != f_hu || res5[6] != g_hu
+  || res5[7] != h_hu)
+abort ();
+
+  v16qi res6
+= test_ctor_ctor_same_qi (a_qi, b_qi, c_qi, d_qi, e_qi, f_qi, g_qi, h_qi);
+
+  if (res6[0] != a_qi || res6[1] != b_qi || res6[2] != c_qi || res6[3] != d_qi
+  || res6[4] != a_qi || res6[5] != b_qi || res6[6] != c_qi
+