The SLP unrolling factor is calculated by finding the smallest
scalar type for each SLP statement and taking the number of required
lanes from the vector versions of those scalar types.  E.g. for an
int32->int64 conversion, it's the vector of int32s rather than the
vector of int64s that determines the unroll factor.

We rely on tree-vect-patterns.c to replace boolean operations like:

   bool a, b, c;
   a = b & c;

with integer operations of whatever the best size is in context.
E.g. if b and c are fed by comparisons of ints, a, b and c will become
the appropriate size for an int comparison.  For most targets this means
that a, b and c will end up as int-sized themselves, but on targets like
SVE and AVX512 with packed vector booleans, they'll instead become a
small bitfield like :1, padded to a byte for memory purposes.
The SLP code would then take these scalar types and try to calculate
the vector type for them, causing the unroll factor to be much higher
than necessary.

This patch makes SLP use the cached vector boolean type if that's
appropriate.  Tested on aarch64-linux-gnu (with and without SVE),
aarch64_be-none-elf and x86_64-linux-gnu.  OK to install?

Richard


2018-05-09  Richard Sandiford  <richard.sandif...@linaro.org>

gcc/
        * tree-vect-slp.c (get_vectype_for_smallest_scalar_type): New function.
        (vect_build_slp_tree_1): Use it when calculating the unroll factor.

gcc/testsuite/
        * gcc.target/aarch64/sve/vcond_10.c: New test.
        * gcc.target/aarch64/sve/vcond_10_run.c: Likewise.
        * gcc.target/aarch64/sve/vcond_11.c: Likewise.
        * gcc.target/aarch64/sve/vcond_11_run.c: Likewise.

Index: gcc/tree-vect-slp.c
===================================================================
--- gcc/tree-vect-slp.c 2018-05-08 09:42:03.526648115 +0100
+++ gcc/tree-vect-slp.c 2018-05-09 11:30:41.061096063 +0100
@@ -608,6 +608,41 @@ vect_record_max_nunits (vec_info *vinfo,
   return true;
 }
 
+/* Return the vector type associated with the smallest scalar type in STMT.  */
+
+static tree
+get_vectype_for_smallest_scalar_type (gimple *stmt)
+{
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  if (vectype != NULL_TREE
+      && VECTOR_BOOLEAN_TYPE_P (vectype))
+    {
+      /* The result of a vector boolean operation has the smallest scalar
+        type unless the statement is extending an even narrower boolean.  */
+      if (!gimple_assign_cast_p (stmt))
+       return vectype;
+
+      tree src = gimple_assign_rhs1 (stmt);
+      gimple *def_stmt;
+      enum vect_def_type dt;
+      tree src_vectype = NULL_TREE;
+      if (vect_is_simple_use (src, stmt_info->vinfo, &def_stmt, &dt,
+                             &src_vectype)
+         && src_vectype
+         && VECTOR_BOOLEAN_TYPE_P (src_vectype))
+       {
+         if (TYPE_PRECISION (TREE_TYPE (src_vectype))
+             < TYPE_PRECISION (TREE_TYPE (vectype)))
+           return src_vectype;
+         return vectype;
+       }
+    }
+  HOST_WIDE_INT dummy;
+  tree scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
+  return get_vectype_for_scalar_type (scalar_type);
+}
+
 /* Verify if the scalar stmts STMTS are isomorphic, require data
    permutation or are of unsupported types of operation.  Return
    true if they are, otherwise return false and indicate in *MATCHES
@@ -636,12 +671,11 @@ vect_build_slp_tree_1 (vec_info *vinfo,
   enum tree_code first_cond_code = ERROR_MARK;
   tree lhs;
   bool need_same_oprnds = false;
-  tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
+  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
   optab optab;
   int icode;
   machine_mode optab_op2_mode;
   machine_mode vec_mode;
-  HOST_WIDE_INT dummy;
   gimple *first_load = NULL, *prev_first_load = NULL;
 
   /* For every stmt in NODE find its def stmt/s.  */
@@ -685,15 +719,14 @@ vect_build_slp_tree_1 (vec_info *vinfo,
          return false;
        }
 
-      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
-      vectype = get_vectype_for_scalar_type (scalar_type);
+      vectype = get_vectype_for_smallest_scalar_type (stmt);
       if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
                                   max_nunits))
        {
          /* Fatal mismatch.  */
          matches[0] = false;
-          return false;
-        }
+         return false;
+       }
 
       if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
Index: gcc/testsuite/gcc.target/aarch64/sve/vcond_10.c
===================================================================
--- /dev/null   2018-04-20 16:19:46.369131350 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/vcond_10.c     2018-05-09 
11:30:41.057096221 +0100
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE)                                                 \
+  void __attribute__ ((noinline, noclone))                             \
+  test_##TYPE (TYPE *a, TYPE a1, TYPE a2, TYPE a3, TYPE a4, int n)     \
+  {                                                                    \
+    for (int i = 0; i < n; i += 2)                                     \
+      {                                                                        
\
+       a[i] = a[i] >= 1 && a[i] != 3 ? a1 : a2;                        \
+       a[i + 1] = a[i + 1] >= 1 && a[i + 1] != 3 ? a3 : a4;            \
+      }                                                                        
\
+  }
+
+#define FOR_EACH_TYPE(T) \
+  T (int8_t) \
+  T (uint8_t) \
+  T (int16_t) \
+  T (uint16_t) \
+  T (int32_t) \
+  T (uint32_t) \
+  T (int64_t) \
+  T (uint64_t) \
+  T (_Float16) \
+  T (float) \
+  T (double)
+
+FOR_EACH_TYPE (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1b\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\t} 3 } } */
+/* { dg-final { scan-assembler-times {\tsel\tz[0-9]} 11 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/vcond_10_run.c
===================================================================
--- /dev/null   2018-04-20 16:19:46.369131350 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/vcond_10_run.c 2018-05-09 
11:30:41.057096221 +0100
@@ -0,0 +1,24 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve" } */
+
+#include "vcond_10.c"
+
+#define N 133
+
+#define TEST_LOOP(TYPE)                                                        
\
+  {                                                                    \
+    TYPE a[N];                                                         \
+    for (int i = 0; i < N; ++i)                                                
\
+      a[i] = i % 7;                                                    \
+    test_##TYPE (a, 10, 11, 12, 13, N);                                        
\
+    for (int i = 0; i < N; ++i)                                                
\
+      if (a[i] != 10 + (i & 1) * 2 + (i % 7 == 0 || i % 7 == 3))       \
+       __builtin_abort ();                                             \
+  }
+
+int
+main (void)
+{
+  FOR_EACH_TYPE (TEST_LOOP);
+  return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/sve/vcond_11.c
===================================================================
--- /dev/null   2018-04-20 16:19:46.369131350 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/vcond_11.c     2018-05-09 
11:30:41.057096221 +0100
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve" } */
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE)                                                 \
+  void __attribute__ ((noinline, noclone))                             \
+  test_##TYPE (int *restrict a, TYPE *restrict b, int a1, int a2,      \
+              int a3, int a4, int n)                                   \
+  {                                                                    \
+    for (int i = 0; i < n; i += 2)                                     \
+      {                                                                        
\
+       a[i] = a[i] >= 1 & b[i] != 3 ? a1 : a2;                         \
+       a[i + 1] = a[i + 1] >= 1 & b[i + 1] != 3 ? a3 : a4;             \
+      }                                                                        
\
+  }
+
+#define FOR_EACH_TYPE(T) \
+  T (int8_t) \
+  T (uint8_t) \
+  T (int16_t) \
+  T (uint16_t) \
+  T (int64_t) \
+  T (uint64_t) \
+  T (double)
+
+FOR_EACH_TYPE (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1b\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 2 } } */
+/* 4 for each 8-bit function, 2 for each 16-bit function, 1 for
+   each 64-bit function.  */
+/* { dg-final { scan-assembler-times {\tld1w\t} 15 } } */
+/* 3 64-bit functions * 2 64-bit vectors per 32-bit vector.  */
+/* { dg-final { scan-assembler-times {\tld1d\t} 6 } } */
+/* { dg-final { scan-assembler-times {\tsel\tz[0-9]} 15 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/vcond_11_run.c
===================================================================
--- /dev/null   2018-04-20 16:19:46.369131350 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/vcond_11_run.c 2018-05-09 
11:30:41.059096142 +0100
@@ -0,0 +1,28 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve" } */
+
+#include "vcond_11.c"
+
+#define N 133
+
+#define TEST_LOOP(TYPE)                                                        
\
+  {                                                                    \
+    int a[N];                                                          \
+    TYPE b[N];                                                         \
+    for (int i = 0; i < N; ++i)                                                
\
+      {                                                                        
\
+       a[i] = i % 5;                                                   \
+       b[i] = i % 7;                                                   \
+      }                                                                        
\
+    test_##TYPE (a, b, 10, 11, 12, 13, N);                             \
+    for (int i = 0; i < N; ++i)                                                
\
+      if (a[i] != 10 + (i & 1) * 2 + (i % 5 == 0 || i % 7 == 3))       \
+       __builtin_abort ();                                             \
+  }
+
+int
+main (void)
+{
+  FOR_EACH_TYPE (TEST_LOOP);
+  return 0;
+}

Reply via email to