Hi Richi,

Thanks for the review!

on 2021/5/11 下午9:26, Richard Biener wrote:
> On Fri, 7 May 2021, Kewen.Lin wrote:
> 
>> Hi, 
>>
>> This patch is to teach forwprop to optimize some cases where the
>> permutated operands of vector permutation are from two same type
>> CTOR and CTOR or one CTOR and one VECTOR CST.  It aggressively
>> takes VIEW_CONVERT_EXPR as trivial copies and transform the vector
>> permutation into vector CTOR.
>>
>> Bootstrapped/regtested on powerpc64le-linux-gnu P9, powerpc64-linux-gnu P8,
>> x86_64-redhat-linux and aarch64-linux-gnu.
>>
>> Is it ok for trunk?
> 
> Can you please avoid the changes to get_prop_source_stmt and
> can_propagate_from?  

Sure!  Do you mean that we need to keep those functions as pure as
possible?  I meant to reuse the single use check in the call.


> It should work to add a single match
> of a V_C_E after the get_prop_source_stmt call.  Ideally
> we'd have
> 
>   /* Shuffle of a constructor.  */
>   else if (code == CONSTRUCTOR || code == VECTOR_CST)
>     {
> ...
>     }
>   else if (code == VIEW_CONVERT_EXPR)
>     {
>        op1 must also be a V_C_E or VECTOR_CST here
>     }
> 
> but then I fear we have no canonicalization of the VECTOR_CST
> to 2nd VEC_PERM operand.  But then moving the op1 gathering
> out of the if (code == CONSTRUCTOR || code == VECTOR_CST)
> case (doesn't need an else) might still make such refactoring
> possible as first matching
> 
>   if (code == VIEW_CONVERT_EXPR || code2 == VIEW_CONVERT_EXPR)
>    {
> ...
>   }
>   else if (code == CONSTRUCTOR || code == VECTOR_CST)
> ...
> 


The attached patch v2 use the structure by considering the above
advice and the (code == CONSTRUCTOR || code == VECTOR_CST) part
can be shared with VIEW_CONVERT_EXPR handlings as below:

  op0 gathering (leave V_C_E in code if it's met)  

  else if (code == CONSTRUCTOR || code == VECTOR_CST || VIEW_CONVERT_EXPR) 
{
   op1 gathering (leave V_C_E in code2)
   
   if (code == VIEW_CONVERT_EXPR || code2 == VIEW_CONVERT_EXPR)
     do the tricks on arg0/arg1/op2

   the previous handlings on CONSTRUCTOR/VECTOR_CST
}

Also updated "shrinked" to "shrunk" as Segher pointed out.  :-)

Does it look better now?

Bootstrapped/regtested on powerpc64le-linux-gnu P9 btw.

BR,
Kewen
-----
gcc/ChangeLog:

        PR tree-optimization/99398
        * tree-ssa-forwprop.c (simplify_permutation): Optimize some cases
        where the fed operands are CTOR/CST and propagated through
        VIEW_CONVERT_EXPR.  Call vec_perm_indices::new_shrunk_vector.
        * vec-perm-indices.c (vec_perm_indices::new_shrunk_vector): New
        function.
        * vec-perm-indices.h (vec_perm_indices::new_shrunk_vector): New
        declare.

gcc/testsuite/ChangeLog:

        PR tree-optimization/99398
        * gcc.target/powerpc/vec-perm-ctor-run.c: New test.
        * gcc.target/powerpc/vec-perm-ctor.c: New test.
        * gcc.target/powerpc/vec-perm-ctor.h: New test.
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-perm-ctor-run.c 
b/gcc/testsuite/gcc.target/powerpc/vec-perm-ctor-run.c
new file mode 100644
index 00000000000..987d6db999c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vec-perm-ctor-run.c
@@ -0,0 +1,124 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vsx_hw } */
+/* { dg-options "-O2 -mvsx" } */
+
+#include "vec-perm-ctor.h"
+
+#include <stdlib.h>
+
+int
+main ()
+{
+  du a_du = 100ULL;
+  du b_du = 200ULL;
+
+  di a_di = -100;
+  di b_di = 200;
+
+  df a_df = 10.0;
+  df b_df = 20.0;
+
+  si a_si = 12;
+  si b_si = -25;
+  si c_si = -37;
+  si d_si = 50;
+
+  sf a_sf = 30.0f;
+  sf b_sf = 40.0f;
+  sf c_sf = 50.0f;
+  sf d_sf = 60.0f;
+
+  hu a_hu = 10;
+  hu b_hu = 20;
+  hu c_hu = 30;
+  hu d_hu = 40;
+  hu e_hu = 50;
+  hu f_hu = 60;
+  hu g_hu = 70;
+  hu h_hu = 80;
+
+  qi a_qi = 10;
+  qi b_qi = 20;
+  qi c_qi = -30;
+  qi d_qi = 40;
+  qi e_qi = -50;
+  qi f_qi = 60;
+  qi g_qi = 70;
+  qi h_qi = -80;
+
+  v2du res1 = test_ctor_ctor_same_du (a_du, b_du);
+  if (res1[0] != a_du || res1[1] != b_du)
+    abort ();
+
+  v2df res2 = test_ctor_ctor_same_df (a_df, b_df);
+  if (res2[0] != a_df || res2[1] != b_df)
+    abort ();
+
+  v4si res3 = test_ctor_ctor_same_si (a_si, b_si, c_si, d_si);
+  if (res3[0] != a_si || res3[1] != b_si || res3[2] != c_si || res3[3] != d_si)
+    abort ();
+
+  v4sf res4 = test_ctor_ctor_same_sf (a_sf, b_sf, c_sf, d_sf);
+  if (res4[0] != a_sf || res4[1] != b_sf || res4[2] != c_sf || res4[3] != d_sf)
+    abort ();
+
+  v8hu res5
+    = test_ctor_ctor_same_hu (a_hu, b_hu, c_hu, d_hu, e_hu, f_hu, g_hu, h_hu);
+
+  if (res5[0] != a_hu || res5[1] != b_hu || res5[2] != c_hu || res5[3] != d_hu
+      || res5[4] != e_hu || res5[5] != f_hu || res5[6] != g_hu
+      || res5[7] != h_hu)
+    abort ();
+
+  v16qi res6
+    = test_ctor_ctor_same_qi (a_qi, b_qi, c_qi, d_qi, e_qi, f_qi, g_qi, h_qi);
+
+  if (res6[0] != a_qi || res6[1] != b_qi || res6[2] != c_qi || res6[3] != d_qi
+      || res6[4] != a_qi || res6[5] != b_qi || res6[6] != c_qi
+      || res6[7] != d_qi || res6[8] != e_qi || res6[9] != f_qi
+      || res6[10] != g_qi || res6[11] != h_qi || res6[12] != e_qi
+      || res6[13] != f_qi || res6[14] != g_qi || res6[15] != h_qi)
+    abort ();
+
+  v2du res7 = test_ctor_cst_same_du (a_du, b_du);
+  if (res7[0] != a_du || res7[1] != 100)
+    abort ();
+
+  v4sf res8 = test_ctor_cst_same_sf (a_sf, b_sf);
+  if (res8[0] != a_sf || res8[1] != 2.0f || res8[2] != b_sf || res8[3] != 4.0f)
+    abort ();
+
+  v2df res9 = test_ctor_cst_same_df (a_df, b_df);
+  if (res9[0] != b_df || res9[1] != 200.0)
+    abort ();
+
+  v4si res10 = test_cst_ctor_same_si (a_si, b_si);
+  if (res10[0] != 1 || res10[1] != 3 || res10[2] != a_si || res10[3] != b_si)
+    abort ();
+
+  v2di res11 = test_ctor_cst_diff_di_si (a_di, b_di);
+  /* Need to take care of the endianness since the function converts vector
+     const to one different vector type (element size), the endianness
+     determines the reinterpreted layout.  Same reason for res12 below.  */
+  if (res11[0] != -100 ||
+#ifdef __LITTLE_ENDIAN__
+      res11[1] != 3
+#else
+      res11[1] != 0x300000000LL
+#endif
+  )
+    abort ();
+
+  v2du res12 = test_cst_ctor_diff_sf_du (a_du, b_du);
+  if (
+#ifdef __LITTLE_ENDIAN__
+    res12[0] != 0x400000003f800000ULL
+#else
+    res12[0] != 0x3f80000040000000ULL
+#endif
+    || res12[1] != 100)
+    abort ();
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-perm-ctor.c 
b/gcc/testsuite/gcc.target/powerpc/vec-perm-ctor.c
new file mode 100644
index 00000000000..cc59e60035f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vec-perm-ctor.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx -fdump-tree-optimized" } */
+
+/* To test all permutations fed by CTOR and CST can be optimized away.  */
+
+#include "vec-perm-ctor.h"
+
+/* { dg-final { scan-tree-dump-not "VIEW_CONVERT_EXPR" "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-perm-ctor.h 
b/gcc/testsuite/gcc.target/powerpc/vec-perm-ctor.h
new file mode 100644
index 00000000000..18782701e51
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vec-perm-ctor.h
@@ -0,0 +1,163 @@
+#include "altivec.h"
+
+typedef vector unsigned long long v2du;
+typedef vector signed long long v2di;
+typedef vector unsigned int v4su;
+typedef vector signed int v4si;
+typedef vector unsigned short v8hu;
+typedef vector signed short v8hi;
+typedef vector unsigned char v16qu;
+typedef vector signed char v16qi;
+typedef vector double v2df;
+typedef vector float v4sf;
+
+typedef unsigned long long du;
+typedef signed long long di;
+typedef unsigned int su;
+typedef signed int si;
+typedef unsigned short hu;
+typedef signed short hi;
+typedef unsigned char qu;
+typedef signed char qi;
+typedef double df;
+typedef float sf;
+
+/* To test whether we can optimize vector permutation away when
+   the two inputs are same type CTOR or one input is CTOR and the
+   other is CST.  */
+
+/* CTOR + CTOR part (only same type supported).  */
+
+/* Test both operands are same type CTOR (type unsigned long long).  */
+__attribute__ ((noipa)) v2du
+test_ctor_ctor_same_du (du a, du b)
+{
+  v2du v1 = {a, 0};
+  v2du v2 = {b, 0};
+  v16qu vc = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+  v2du vres = (v2du) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+  return vres;
+}
+
+/* Test both operands are same type CTOR (type double).  */
+__attribute__ ((noipa)) v2df
+test_ctor_ctor_same_df (df a, df b)
+{
+  v2df v1 = {0.0, a};
+  v2df v2 = {0.0, b};
+  v16qu vc = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+  v2df vres = (v2df) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+  return vres;
+}
+
+/* Test both operands are same type CTOR (type signed int).  */
+__attribute__ ((noipa)) v4si
+test_ctor_ctor_same_si (si a, si b, si c, si d)
+{
+  v4si v1 = {0, a, 0, c};
+  v4si v2 = {0, b, 0, d};
+  v16qu vc = {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31};
+  v4si vres = (v4si) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+  return vres;
+}
+
+/* Test both operands are same type CTOR (type float).  */
+__attribute__ ((noipa)) v4sf
+test_ctor_ctor_same_sf (sf a, sf b, sf c, sf d)
+{
+  v4sf v1 = {c, 0.0f, d, 0.0f};
+  v4sf v2 = {a, 0.0f, b, 0.0f};
+  v16qu vc = {16, 17, 18, 19, 24, 25, 26, 27, 0, 1, 2, 3, 8, 9, 10, 11};
+  v4sf vres = (v4sf) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+  return vres;
+}
+
+/* Test both operands are same type CTOR (type unsigned short).  */
+__attribute__ ((noipa)) v8hu
+test_ctor_ctor_same_hu (hu a, hu b, hu c, hu d, hu e, hu f, hu g, hu h)
+{
+  v8hu v1 = {0, a, 0, b, 0, c, 0, d};
+  v8hu v2 = {0, e, 0, f, 0, g, 0, h};
+  v16qu vc = {2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
+  v8hu vres = (v8hu) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+  return vres;
+}
+
+/* Test both operands are same type CTOR (type signed char).  */
+__attribute__ ((noipa)) v16qi
+test_ctor_ctor_same_qi (qi a, qi b, qi c, qi d, qi e, qi f, qi g, qi h)
+{
+  v16qi v1 = {0, a, 0, b, 0, c, 0, d, 0, a, 0, b, 0, c, 0, d};
+  v16qi v2 = {0, e, 0, f, 0, g, 0, h, 0, e, 0, f, 0, g, 0, h};
+  v16qu vc = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31};
+  v16qi vres = (v16qi) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+  return vres;
+}
+
+/* CTOR + CST part (same type).  */
+
+__attribute__ ((noipa)) v2du
+test_ctor_cst_same_du (du a, du b)
+{
+  v2du v1 = {a, b};
+  v2du v2 = {100, 200};
+  v16qu vc = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+  v2du vres = (v2du) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+  return vres;
+}
+
+__attribute__ ((noipa)) v4sf
+test_ctor_cst_same_sf (sf a, sf b)
+{
+  v4sf v1 = {0.0f, a, 0.0f, b};
+  v4sf v2 = {1.0f, 2.0f, 3.0f, 4.0f};
+  v16qu vc = {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31};
+  v4sf vres = (v4sf) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+  return vres;
+}
+
+/* CST + CTOR part (same type).  */
+
+__attribute__ ((noipa)) v2df
+test_ctor_cst_same_df (df a, df b)
+{
+  v2df v1 = {a, b};
+  v2df v2 = {100.0, 200.0};
+  v16qu vc = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+  v2df vres = (v2df) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+  return vres;
+}
+
+__attribute__ ((noipa)) v4si
+test_cst_ctor_same_si (si a, si b)
+{
+  v4si v1 = {a, 0, b, 0};
+  v4si v2 = {1, 2, 3, 4};
+  v16qu vc = {16, 17, 18, 19, 24, 25, 26, 27, 0, 1, 2, 3, 8, 9, 10, 11};
+  v4si vres = (v4si) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+  return vres;
+}
+
+/* CTOR + CST part (different types).  */
+
+__attribute__ ((noipa)) v2di
+test_ctor_cst_diff_di_si (di a, di b)
+{
+  v2di v1 = {a, b};
+  v4si v2 = {3, 0, 4, 0};
+  v16qu vc = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+  v2di vres = (v2di) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+  return vres;
+}
+
+/* CST + CTOR part (different types).  */
+
+__attribute__ ((noipa)) v2du
+test_cst_ctor_diff_sf_du (du a, du b)
+{
+  v4sf v1 = {1.0f, 2.0f, 3.0f, 4.0f};
+  v2du v2 = {a, b};
+  v16qu vc = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+  v2du vres = (v2du) vec_perm ((v16qu) v1, (v16qu) v2, vc);
+  return vres;
+}
diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index 0706fd862de..beb2702f3b6 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -2120,9 +2120,9 @@ static int
 simplify_permutation (gimple_stmt_iterator *gsi)
 {
   gimple *stmt = gsi_stmt (*gsi);
-  gimple *def_stmt;
+  gimple *def_stmt = NULL;
   tree op0, op1, op2, op3, arg0, arg1;
-  enum tree_code code;
+  enum tree_code code, code2 = ERROR_MARK;
   bool single_use_op0 = false;
 
   gcc_checking_assert (gimple_assign_rhs_code (stmt) == VEC_PERM_EXPR);
@@ -2142,10 +2142,28 @@ simplify_permutation (gimple_stmt_iterator *gsi)
   else if (TREE_CODE (op0) == SSA_NAME)
     {
       def_stmt = get_prop_source_stmt (op0, false, &single_use_op0);
-      if (!def_stmt || !can_propagate_from (def_stmt))
+      if (!def_stmt)
        return 0;
-
       code = gimple_assign_rhs_code (def_stmt);
+      if (code == VIEW_CONVERT_EXPR)
+       {
+         tree rhs = gimple_assign_rhs1 (def_stmt);
+         tree name = TREE_OPERAND (rhs, 0);
+         if (TREE_CODE (name) != SSA_NAME)
+           return 0;
+         if (!has_single_use (name))
+           single_use_op0 = false;
+         /* Here we update the def_stmt through this VIEW_CONVERT_EXPR,
+            but still keep the code to indicate it comes from
+            VIEW_CONVERT_EXPR.  */
+         def_stmt = SSA_NAME_DEF_STMT (name);
+         if (!def_stmt || !is_gimple_assign (def_stmt))
+           return 0;
+         if (gimple_assign_rhs_code (def_stmt) != CONSTRUCTOR)
+           return 0;
+       }
+      if (!can_propagate_from (def_stmt))
+       return 0;
       arg0 = gimple_assign_rhs1 (def_stmt);
     }
   else
@@ -2173,12 +2191,10 @@ simplify_permutation (gimple_stmt_iterator *gsi)
       update_stmt (stmt);
       return remove_prop_source_from_use (op0) ? 2 : 1;
     }
-
-  /* Shuffle of a constructor.  */
-  else if (code == CONSTRUCTOR || code == VECTOR_CST)
+  else if (code == CONSTRUCTOR
+          || code == VECTOR_CST
+          || code == VIEW_CONVERT_EXPR)
     {
-      tree opt;
-      bool ret = false;
       if (op0 != op1)
        {
          if (TREE_CODE (op0) == SSA_NAME && !single_use_op0)
@@ -2188,14 +2204,27 @@ simplify_permutation (gimple_stmt_iterator *gsi)
            arg1 = op1;
          else if (TREE_CODE (op1) == SSA_NAME)
            {
-             enum tree_code code2;
-
              gimple *def_stmt2 = get_prop_source_stmt (op1, true, NULL);
-             if (!def_stmt2 || !can_propagate_from (def_stmt2))
+             if (!def_stmt2)
                return 0;
-
              code2 = gimple_assign_rhs_code (def_stmt2);
-             if (code2 != CONSTRUCTOR && code2 != VECTOR_CST)
+             if (code2 == VIEW_CONVERT_EXPR)
+               {
+                 tree rhs = gimple_assign_rhs1 (def_stmt2);
+                 tree name = TREE_OPERAND (rhs, 0);
+                 if (TREE_CODE (name) != SSA_NAME)
+                   return 0;
+                 if (!has_single_use (name))
+                   return 0;
+                 def_stmt2 = SSA_NAME_DEF_STMT (name);
+                 if (!def_stmt2 || !is_gimple_assign (def_stmt2))
+                   return 0;
+                 if (gimple_assign_rhs_code (def_stmt2) != CONSTRUCTOR)
+                   return 0;
+               }
+             else if (code2 != CONSTRUCTOR && code2 != VECTOR_CST)
+               return 0;
+             if (!can_propagate_from (def_stmt2))
                return 0;
              arg1 = gimple_assign_rhs1 (def_stmt2);
            }
@@ -2209,10 +2238,92 @@ simplify_permutation (gimple_stmt_iterator *gsi)
            return 0;
          arg1 = arg0;
        }
-      opt = fold_ternary (VEC_PERM_EXPR, TREE_TYPE (op0), arg0, arg1, op2);
+
+      /* If there are any VIEW_CONVERT_EXPRs found when finding permutation
+        operands source, check whether it's valid to transform and prepare
+        the required new operands.  */
+      if (code == VIEW_CONVERT_EXPR || code2 == VIEW_CONVERT_EXPR)
+       {
+         /* Figure out the target vector type to which operands should be
+            converted.  If both are CONSTRUCTOR, the types should be the
+            same, otherwise, use the one of CONSTRUCTOR.  */
+         tree tgt_type = NULL_TREE;
+         if (code == VIEW_CONVERT_EXPR)
+           {
+             gcc_assert (gimple_assign_rhs_code (def_stmt) == CONSTRUCTOR);
+             code = CONSTRUCTOR;
+             tgt_type = TREE_TYPE (arg0);
+           }
+         if (code2 == VIEW_CONVERT_EXPR)
+           {
+             tree arg1_type = TREE_TYPE (arg1);
+             if (tgt_type == NULL_TREE)
+               tgt_type = arg1_type;
+             else if (tgt_type != arg1_type)
+               return 0;
+           }
+
+         if (!VECTOR_TYPE_P (tgt_type))
+           return 0;
+         tree op2_type = TREE_TYPE (op2);
+         /* Should have folded this before.  */
+         gcc_assert (op2_type != tgt_type);
+
+         /* Figure out the shrunk factor.  */
+         poly_uint64 tgt_units = TYPE_VECTOR_SUBPARTS (tgt_type);
+         poly_uint64 op2_units = TYPE_VECTOR_SUBPARTS (op2_type);
+         if (maybe_gt (tgt_units, op2_units))
+           return 0;
+         unsigned int factor;
+         if (!constant_multiple_p (op2_units, tgt_units, &factor))
+           return 0;
+
+         /* Build the new permutation control vector as target vector.  */
+         vec_perm_builder builder;
+         if (!tree_to_vec_perm_builder (&builder, op2))
+           return 0;
+         vec_perm_indices indices (builder, 2, op2_units);
+         vec_perm_indices new_indices;
+         if (new_indices.new_shrunk_vector (indices, factor))
+           {
+             tree mask_type = tgt_type;
+             if (!VECTOR_INTEGER_TYPE_P (mask_type))
+               {
+                 tree elem_type = TREE_TYPE (mask_type);
+                 unsigned elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
+                 tree int_type = build_nonstandard_integer_type (elem_size, 0);
+                 mask_type = build_vector_type (int_type, tgt_units);
+               }
+             op2 = vec_perm_indices_to_tree (mask_type, new_indices);
+           }
+         else
+           return 0;
+
+         /* Convert the VECTOR_CST to the appropriate vector type.  */
+         if (tgt_type != TREE_TYPE (arg0))
+           arg0 = fold_build1 (VIEW_CONVERT_EXPR, tgt_type, arg0);
+         else if (tgt_type != TREE_TYPE (arg1))
+           arg1 = fold_build1 (VIEW_CONVERT_EXPR, tgt_type, arg1);
+       }
+
+      /* VIEW_CONVERT_EXPR should be updated to CONSTRUCTOR before.  */
+      gcc_assert (code == CONSTRUCTOR || code == VECTOR_CST);
+
+      /* Shuffle of a constructor.  */
+      bool ret = false;
+      tree res_type = TREE_TYPE (arg0);
+      tree opt = fold_ternary (VEC_PERM_EXPR, res_type, arg0, arg1, op2);
       if (!opt
          || (TREE_CODE (opt) != CONSTRUCTOR && TREE_CODE (opt) != VECTOR_CST))
        return 0;
+      /* Found VIEW_CONVERT_EXPR before, need one explicit conversion.  */
+      if (res_type != TREE_TYPE (op0))
+       {
+         tree name = make_ssa_name (TREE_TYPE (opt));
+         gimple *ass_stmt = gimple_build_assign (name, opt);
+         gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
+         opt = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (op0), name);
+       }
       gimple_assign_set_rhs_from_tree (gsi, opt);
       update_stmt (gsi_stmt (*gsi));
       if (TREE_CODE (op0) == SSA_NAME)
diff --git a/gcc/vec-perm-indices.c b/gcc/vec-perm-indices.c
index ede590dc5c9..57dd11d723c 100644
--- a/gcc/vec-perm-indices.c
+++ b/gcc/vec-perm-indices.c
@@ -101,6 +101,70 @@ vec_perm_indices::new_expanded_vector (const 
vec_perm_indices &orig,
   m_encoding.finalize ();
 }
 
+/* Check whether we can switch to a new permutation vector that
+   selects the same input elements as ORIG, but with each element
+   built up from FACTOR pieces.  Return true if yes, otherwise
+   return false.  Every FACTOR permutation indexes should be
+   continuous separately and the first one of each batch should
+   be able to exactly modulo FACTOR.  For example, if ORIG is
+   { 2, 3, 4, 5, 0, 1, 6, 7 } and FACTOR is 2, the new permutation
+   is { 1, 2, 0, 3 }.  */
+
+bool
+vec_perm_indices::new_shrunk_vector (const vec_perm_indices &orig,
+                                    unsigned int factor)
+{
+  gcc_assert (factor > 0);
+
+  if (maybe_lt (orig.m_nelts_per_input, factor))
+    return false;
+
+  poly_uint64 nelts;
+  /* Invalid if vector units number isn't multiple of factor.  */
+  if (!multiple_p (orig.m_nelts_per_input, factor, &nelts))
+    return false;
+
+  /* Only handle the case that npatterns is multiple of factor.
+     FIXME: Try to see whether we can reshape it by factor npatterns.  */
+  if (orig.m_encoding.npatterns () % factor != 0)
+    return false;
+
+  unsigned int encoded_nelts = orig.m_encoding.encoded_nelts ();
+  auto_vec<element_type> encodings (encoded_nelts);
+  /* Separate all encoded elements into batches by size factor,
+     then ensure the first element of each batch is multiple of
+     factor and all elements in each batch is consecutive from
+     the first one.  */
+  for (unsigned int i = 0; i < encoded_nelts; i += factor)
+    {
+      element_type first = orig.m_encoding[i];
+      element_type new_index;
+      if (!multiple_p (first, factor, &new_index))
+       return false;
+      for (unsigned int j = 1; j < factor; ++j)
+       {
+         if (maybe_ne (first + j, orig.m_encoding[i + j]))
+           return false;
+       }
+      encodings.quick_push (new_index);
+    }
+
+  m_ninputs = orig.m_ninputs;
+  m_nelts_per_input = nelts;
+  poly_uint64 full_nelts = exact_div (orig.m_encoding.full_nelts (), factor);
+  unsigned int npatterns = orig.m_encoding.npatterns () / factor;
+
+  m_encoding.new_vector (full_nelts, npatterns,
+                        orig.m_encoding.nelts_per_pattern ());
+
+  for (unsigned int i = 0; i < encodings.length (); i++)
+    m_encoding.quick_push (encodings[i]);
+
+  m_encoding.finalize ();
+
+  return true;
+}
+
 /* Rotate the inputs of the permutation right by DELTA inputs.  This changes
    the values of the permutation vector but it doesn't change the way that
    the elements are encoded.  */
diff --git a/gcc/vec-perm-indices.h b/gcc/vec-perm-indices.h
index bc70ecd8a1d..98d27f0ec42 100644
--- a/gcc/vec-perm-indices.h
+++ b/gcc/vec-perm-indices.h
@@ -57,6 +57,7 @@ public:
 
   void new_vector (const vec_perm_builder &, unsigned int, poly_uint64);
   void new_expanded_vector (const vec_perm_indices &, unsigned int);
+  bool new_shrunk_vector (const vec_perm_indices &, unsigned int);
   void rotate_inputs (int delta);
 
   /* Return the underlying vector encoding.  */

Reply via email to