We failed to build the correct initialization vector.

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

Note this restricts n-lanes to 1 for VLA vectors since I don't
know how to build a proper initializer there.  I think that
RVV at least could do this by shifting the individual lane
PHI latch defs into a zero vector from the right (or from the
left and then possibly reversing the vector).  Not sure how that
goes for SVE or how to express this as a CTOR.  I could possibly
build a fixed-size vector CTOR and repeat that - but I guess
that would only work for a power-of-two number of lanes
(the permute scheme likely doesn't work for 3 lanes and I think
we are missing some additional checks here).

        PR tree-optimization/121256
        * tree-vect-loop.cc (vectorizable_recurr): Build a correct
        initialization vector for SLP_TREE_LANES > 1.

        * gcc.dg/vect/vect-recurr-pr121256.c: New testcase.
---
 .../gcc.dg/vect/vect-recurr-pr121256.c        | 54 +++++++++++++++++++
 gcc/tree-vect-loop.cc                         | 46 +++++++++++++---
 2 files changed, 92 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-recurr-pr121256.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-recurr-pr121256.c b/gcc/testsuite/gcc.dg/vect/vect-recurr-pr121256.c
new file mode 100644
index 00000000000..c895e94021d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-recurr-pr121256.c
@@ -0,0 +1,54 @@
+/* { dg-additional-options "-mavx2" { target avx2_runtime } } */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include "tree-vect.h"
+
+#define B 0
+#define G 1
+#define R 2
+#define A 3
+
+int red = 153;
+int green = 66;
+int blue = 187;
+int alpha = 255;
+
+static void __attribute__((noipa))
+sub_left_prediction_bgr32(uint8_t *restrict dst, uint8_t *restrict src, int w)  /* W is unused.  */
+{
+  for (int i = 0; i < 8; i++) {  /* Fixed 8 iterations, 4 bytes per iteration.  */
+    int rt = src[i * 4 + R];  /* Load the four channel bytes of element I.  */
+    int gt = src[i * 4 + G];
+    int bt = src[i * 4 + B];
+    int at = src[i * 4 + A];
+
+    dst[i * 4 + R] = rt - red;  /* Store current value minus the carried previous one.  */
+    dst[i * 4 + G] = gt - green;
+    dst[i * 4 + B] = bt - blue;
+    dst[i * 4 + A] = at - alpha;
+
+    red = rt;  /* Carry this element's values into the next iteration via the globals.  */
+    green = gt;
+    blue = bt;
+    alpha = at;
+  }
+}
+
+int main()
+{
+  check_vect ();
+
+  uint8_t *dst = calloc(36, sizeof(uint8_t));  /* Zero-filled; the callee writes indices 0..31.  */
+  uint8_t *src = calloc(36, sizeof(uint8_t));  /* Zero-filled; the callee reads indices 0..31.  */
+
+  src[R] = 160;  /* Only element 0 gets non-zero input.  */
+  src[G] = 73;
+  src[B] = 194;
+  src[A] = 255;
+
+  sub_left_prediction_bgr32(dst, src, 33);  /* The third argument is ignored by the callee.  */
+  if (dst[R] != 7 || dst[B] != 7 || dst[A] != 0)  /* 160-153, 194-187, 255-255; dst[G] is not checked.  */
+    __builtin_abort();
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 93607cbf247..56d6228572b 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8967,6 +8967,18 @@ vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
       return false;
     }
 
+  /* We need to be able to build a { ..., a, b } init vector with
+     dist number of distinct trailing values.  Always possible
+     when dist == 1 or when nunits is constant.  */
+  if (dist != 1 && !nunits.is_constant ())
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "cannot build initialization vector for "
+                        "first order recurrence\n");
+      return false;
+    }
+
   /* First-order recurrence autovectorization needs to handle permutation
      with indices = [nunits-1, nunits, nunits+1, ...].  */
   vec_perm_builder sel (nunits, 1, 3);
@@ -9018,20 +9030,38 @@ vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
     }
 
   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
-  basic_block bb = gimple_bb (phi);
-  tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
-  if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
-    {
-      gimple_seq stmts = NULL;
-      preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
-      gsi_insert_seq_on_edge_immediate (pe, stmts);
+  tree vec_init;
+  if (dist > 1)
+    {
+      vec<constructor_elt, va_gc> *v = NULL;
+      vec_alloc (v, nunits.to_constant ());
+      for (unsigned i = 0; i < nunits.to_constant () - dist; ++i)
+       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+                               build_zero_cst (TREE_TYPE (vectype)));
+      for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
+       {
+         gphi *phi = as_a <gphi *> (s->stmt);
+         tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
+         if (!useless_type_conversion_p (TREE_TYPE (vectype),
+                                         TREE_TYPE (preheader)))
+           {
+             gimple_seq stmts = NULL;
+             preheader = gimple_convert (&stmts,
+                                         TREE_TYPE (vectype), preheader);
+             gsi_insert_seq_on_edge_immediate (pe, stmts);
+           }
+         CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, preheader);
+       }
+      vec_init = build_constructor (vectype, v);
     }
-  tree vec_init = build_vector_from_val (vectype, preheader);
+  else
+    vec_init = PHI_ARG_DEF_FROM_EDGE (phi, pe);
   vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
 
   /* Create the vectorized first-order PHI node.  */
   tree vec_dest = vect_get_new_vect_var (vectype,
                                         vect_simple_var, "vec_recur_");
+  basic_block bb = gimple_bb (phi);
   gphi *new_phi = create_phi_node (vec_dest, bb);
   add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
 
-- 
2.43.0

Reply via email to