We failed to build the correct initialization vector.  For VLA
vectors and a non-uniform initialization vector this rejects
vectorization for now.

v2 adds VLA support for uniform initializers.  I've added a
testcase for the case I thought we might get wrong, we get it
correct.

Re-bootstrap and regtest running on x86_64-unknown-linux-gnu.

        PR tree-optimization/121256
        * tree-vect-loop.cc (vectorizable_recurr): Build a correct
        initialization vector for SLP_TREE_LANES > 1.

        * gcc.dg/vect/vect-recurr-pr121256.c: New testcase.
        * gcc.dg/vect/vect-recurr-pr121256-2.c: Likewise.
---
 .../gcc.dg/vect/vect-recurr-pr121256-2.c      | 49 +++++++++++++++
 .../gcc.dg/vect/vect-recurr-pr121256.c        | 54 +++++++++++++++++
 gcc/tree-vect-loop.cc                         | 60 ++++++++++++++++---
 3 files changed, 155 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-recurr-pr121256-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-recurr-pr121256.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-recurr-pr121256-2.c 
b/gcc/testsuite/gcc.dg/vect/vect-recurr-pr121256-2.c
new file mode 100644
index 00000000000..7350fd9feba
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-recurr-pr121256-2.c
@@ -0,0 +1,49 @@
+/* { dg-additional-options "-mavx2" { target avx2_runtime } } */
+
+#include "tree-vect.h"
+
+#define B 0
+#define G 1
+#define R 2
+
+int red = 153;
+int green = 66;
+int blue = 187;
+
+static void __attribute__((noipa))
+sub_left_prediction_bgr32(int *restrict dst, int *restrict src)
+{
+  for (int i = 0; i < 8; i++) {
+    int rt = src[i * 3 + R];
+    int gt = src[i * 3 + G];
+    int bt = src[i * 3 + B];
+
+    dst[i * 3 + R] = rt - red;
+    dst[i * 3 + G] = gt - green;
+    dst[i * 3 + B] = bt - blue;
+
+    red = rt;
+    green = gt;
+    blue = bt;
+  }
+}
+
+int main()
+{
+  int dst[8*3];
+  int src[8*3] = { 160, 73, 194, 17, 33, 99, 0, 12, 283, 87, 73, 11,
+                  9, 7, 1, 23, 19, 13, 77, 233, 97, 78, 2, 5 };
+  int dst2[8*3] = {-27, 7, 41, -143, -40, -95, -17, -21, 184, 87, 61,
+      -272, -78, -66, -10, 14, 12, 12, 54, 214, 84, 1, -231, -92};
+
+  check_vect ();
+
+  sub_left_prediction_bgr32(dst, src);
+
+#pragma GCC novector
+  for (int i = 0; i < 8*3; ++i)
+    if (dst[i] != dst2[i])
+      __builtin_abort();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-recurr-pr121256.c 
b/gcc/testsuite/gcc.dg/vect/vect-recurr-pr121256.c
new file mode 100644
index 00000000000..c895e94021d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-recurr-pr121256.c
@@ -0,0 +1,54 @@
+/* { dg-additional-options "-mavx2" { target avx2_runtime } } */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include "tree-vect.h"
+
+#define B 0
+#define G 1
+#define R 2
+#define A 3
+
+int red = 153;
+int green = 66;
+int blue = 187;
+int alpha = 255;
+
+static void __attribute__((noipa))
+sub_left_prediction_bgr32(uint8_t *restrict dst, uint8_t *restrict src, int w)
+{
+  for (int i = 0; i < 8; i++) {
+    int rt = src[i * 4 + R];
+    int gt = src[i * 4 + G];
+    int bt = src[i * 4 + B];
+    int at = src[i * 4 + A];
+
+    dst[i * 4 + R] = rt - red;
+    dst[i * 4 + G] = gt - green;
+    dst[i * 4 + B] = bt - blue;
+    dst[i * 4 + A] = at - alpha;
+
+    red = rt;
+    green = gt;
+    blue = bt;
+    alpha = at;
+  }
+}
+
+int main()
+{
+  check_vect ();
+
+  uint8_t *dst = calloc(36, sizeof(uint8_t));
+  uint8_t *src = calloc(36, sizeof(uint8_t));
+
+  src[R] = 160;
+  src[G] = 73;
+  src[B] = 194;
+  src[A] = 255;
+
+  sub_left_prediction_bgr32(dst, src, 33);
+  if (dst[R] != 7 || dst[B] != 7 || dst[A] != 0)
+    __builtin_abort();
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 15f23640292..9be1c79cd47 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8971,6 +8971,33 @@ vectorizable_recurr (loop_vec_info loop_vinfo, 
stmt_vec_info stmt_info,
       return false;
     }
 
+  /* We need to be able to build a { ..., a, b } init vector with
+     dist number of distinct trailing values.  Always possible
+     when dist == 1 or when nunits is constant or when the initializations
+     are uniform.  */
+  tree uniform_initval = NULL_TREE;
+  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+  for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
+    {
+      gphi *phi = as_a <gphi *> (s->stmt);
+      if (! uniform_initval)
+       uniform_initval = PHI_ARG_DEF_FROM_EDGE (phi, pe);
+      else if (! operand_equal_p (uniform_initval,
+                                 PHI_ARG_DEF_FROM_EDGE (phi, pe)))
+       {
+         uniform_initval = NULL_TREE;
+         break;
+       }
+    }
+  if (!uniform_initval && !nunits.is_constant ())
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "cannot build initialization vector for "
+                        "first order recurrence\n");
+      return false;
+    }
+
   /* First-order recurrence autovectorization needs to handle permutation
      with indices = [nunits-1, nunits, nunits+1, ...].  */
   vec_perm_builder sel (nunits, 1, 3);
@@ -9021,21 +9048,38 @@ vectorizable_recurr (loop_vec_info loop_vinfo, 
stmt_vec_info stmt_info,
       return true;
     }
 
-  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
-  basic_block bb = gimple_bb (phi);
-  tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
-  if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
+  tree vec_init;
+  if (! uniform_initval)
     {
-      gimple_seq stmts = NULL;
-      preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
-      gsi_insert_seq_on_edge_immediate (pe, stmts);
+      vec<constructor_elt, va_gc> *v = NULL;
+      vec_alloc (v, nunits.to_constant ());
+      for (unsigned i = 0; i < nunits.to_constant () - dist; ++i)
+       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+                               build_zero_cst (TREE_TYPE (vectype)));
+      for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
+       {
+         gphi *phi = as_a <gphi *> (s->stmt);
+         tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
+         if (!useless_type_conversion_p (TREE_TYPE (vectype),
+                                         TREE_TYPE (preheader)))
+           {
+             gimple_seq stmts = NULL;
+             preheader = gimple_convert (&stmts,
+                                         TREE_TYPE (vectype), preheader);
+             gsi_insert_seq_on_edge_immediate (pe, stmts);
+           }
+         CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, preheader);
+       }
+      vec_init = build_constructor (vectype, v);
     }
-  tree vec_init = build_vector_from_val (vectype, preheader);
+  else
+    vec_init = uniform_initval;
   vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
 
   /* Create the vectorized first-order PHI node.  */
   tree vec_dest = vect_get_new_vect_var (vectype,
                                         vect_simple_var, "vec_recur_");
+  basic_block bb = gimple_bb (phi);
   gphi *new_phi = create_phi_node (vec_dest, bb);
   add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
 
-- 
2.43.0

Reply via email to