Hi,

This patch adds a fallback when the backend does not provide a popcount
implementation.  The algorithm is the same one libgcc uses, as well as
match.pd for recognizing a popcount idiom.  __builtin_ctz and __builtin_ffs
can also rely on popcount so I used the fallback for them as well.

Bootstrapped and regtested on x86, aarch64 and power10.  Unfortunately
I don't have access to any architecture other than riscv that vectorizes
but does not have a vectorized popcount.  I added all vect_int targets
to the selector where a cursory grep "expand.*popcount" would yield no
result. 

Regards
 Robin

gcc/ChangeLog:

        * tree-vect-patterns.cc (vect_have_popcount_fallback): New
        function.
        (vect_generate_popcount_fallback): New function to emit
        vectorized popcount fallback.
        (vect_recog_ctz_ffs_pattern): Use fallback.
        (vect_recog_popcount_clz_ctz_ffs_pattern): Ditto.

gcc/testsuite/ChangeLog:

        * gcc.dg/vect/vect-popcount-fallback.c: New test.
---
 .../gcc.dg/vect/vect-popcount-fallback.c      | 106 +++++++++++
 gcc/tree-vect-patterns.cc                     | 172 ++++++++++++++++--
 2 files changed, 267 insertions(+), 11 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c b/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
new file mode 100644
index 00000000000..f6300f4ab35
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
@@ -0,0 +1,106 @@
+/* Check if we vectorize popcount when no expander is available.  */
+/* { dg-do run { target { amdgcn-*-* sparc*-*-* alpha*-*-* ia64-*-* mips*-*-* riscv*-*-* } } } */
+/* { dg-additional-options { -O2 -fdump-tree-vect-details -fno-vect-cost-model } }  */
+/* { dg-require-effective-target vect_int } */
+
+#include <stdlib.h>
+#include <assert.h>
+#include <stdint-gcc.h>
+
+__attribute__ ((noipa))
+void popc32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
+{
+  for (int i = 0; i < n; i++)
+    dst[i] = __builtin_popcount (a[i]);
+}
+
+__attribute__ ((noipa))
+void ctz32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
+{
+  for (int i = 0; i < n; i++)
+    dst[i] = __builtin_ctz (a[i]);
+}
+
+__attribute__ ((noipa))
+void ffs32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
+{
+  for (int i = 0; i < n; i++)
+    dst[i] = __builtin_ffs (a[i]);
+}
+
+__attribute__ ((noipa))
+void popc64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
+{
+  for (int i = 0; i < n; i++)
+    dst[i] = __builtin_popcountll (a[i]);
+}
+
+__attribute__ ((noipa))
+void ctz64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
+{
+  for (int i = 0; i < n; i++)
+    dst[i] = __builtin_ctzll (a[i]);
+}
+
+__attribute__ ((noipa))
+void ffs64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
+{
+  for (int i = 0; i < n; i++)
+    dst[i] = __builtin_ffsll (a[i]);
+}
+
+#define SZ 512
+
+__attribute__ ((optimize ("0")))
+int main ()
+{
+  uint32_t *a32pc = malloc (SZ * sizeof (*a32pc));
+  uint32_t *b32pc = malloc (SZ * sizeof (*b32pc));
+  uint32_t *a32ctz = malloc (SZ * sizeof (*a32ctz));
+  uint32_t *b32ctz = malloc (SZ * sizeof (*b32ctz));
+  uint32_t *a32ffs = malloc (SZ * sizeof (*a32ffs));
+  uint32_t *b32ffs = malloc (SZ * sizeof (*b32ffs));
+
+  uint64_t *a64pc = malloc (SZ * sizeof (*a64pc));
+  uint64_t *b64pc = malloc (SZ * sizeof (*b64pc));
+  uint64_t *a64ctz = malloc (SZ * sizeof (*a64ctz));
+  uint64_t *b64ctz = malloc (SZ * sizeof (*b64ctz));
+  uint64_t *a64ffs = malloc (SZ * sizeof (*a64ffs));
+  uint64_t *b64ffs = malloc (SZ * sizeof (*b64ffs));
+
+  for (int i = 0; i < SZ; i++)
+    {
+      int ia = i + 1;
+      a32pc[i] = ia * 1234567;
+      b32pc[i] = 0;
+      a32ctz[i] = ia * 1234567;
+      b32ctz[i] = 0;
+      a32ffs[i] = ia * 1234567;
+      b32ffs[i] = 0;
+      a64pc[i] = ia * 123456789ull;
+      b64pc[i] = 0;
+      a64ctz[i] = ia * 123456789ull;
+      b64ctz[i] = 0;
+      a64ffs[i] = ia * 123456789ull;
+      b64ffs[i] = 0;
+    }
+
+  popc32 (b32pc, a32pc, SZ);
+  ctz32 (b32ctz, a32ctz, SZ);
+  ffs32 (b32ffs, a32ffs, SZ);
+  popc64 (b64pc, a64pc, SZ);
+  ctz64 (b64ctz, a64ctz, SZ);
+  ffs64 (b64ffs, a64ffs, SZ);
+
+  for (int i = 0; i < SZ; i++)
+    {
+      assert (b32pc[i] == __builtin_popcount (a32pc[i]));
+      assert (b32ctz[i] == __builtin_ctz (a32ctz[i]));
+      assert (b32ffs[i] == __builtin_ffs (a32ffs[i]));
+      assert (b64pc[i] == __builtin_popcountll (a64pc[i]));
+      assert (b64ctz[i] == __builtin_ctzll (a64ctz[i]));
+      assert (b64ffs[i] == __builtin_ffsll (a64ffs[i]));
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 6 "vect" target { amdgcn-*-* sparc*-*-* alpha*-*-* ia64-*-* mips*-*-* riscv*-*-* } } }  */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index ef806e2346e..b812354b986 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -1782,6 +1782,122 @@ vect_recog_widen_abd_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
   return widen_abd_stmt;
 }
 
+/* Return TRUE if we have the necessary operations to create a vectorized
+   popcount for type VEC_TYPE.  */
+
+static bool
+vect_have_popcount_fallback (tree vec_type)
+{
+  return (optab_for_tree_code (RSHIFT_EXPR, vec_type, optab_scalar)
+         && optab_for_tree_code (PLUS_EXPR, vec_type, optab_default)
+         && optab_for_tree_code (MINUS_EXPR, vec_type, optab_default)
+         && optab_for_tree_code (BIT_AND_EXPR, vec_type, optab_default)
+         && optab_for_tree_code (MULT_EXPR, vec_type, optab_default));
+}
+
+/* This generates a Wilkes-Wheeler-Gill popcount similar to what libgcc
+   does (and match.pd recognizes).  There are only 32-bit and 64-bit
+   variants.
+   It returns the generated gimple sequence of vector instructions with
+   type VEC_TYPE which is being attached to STMT_VINFO.
+   RHS is the unpromoted input value and LHS_TYPE is the output type.
+   RET_VAR is the address of an SSA variable that holds the result
+   of the last operation.  It needs to be created before calling
+   this function and must have LHS_TYPE.
+
+   int popcount64c (uint64_t x)
+   {
+     x -= (x >> 1) & 0x5555555555555555ULL;
+     x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
+     x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
+     return (x * 0x0101010101010101ULL) >> 56;
+   }
+
+   int popcount32c (uint32_t x)
+   {
+     x -= (x >> 1) & 0x55555555;
+     x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+     x = (x + (x >> 4)) & 0x0f0f0f0f;
+     return (x * 0x01010101) >> 24;
+   }
+   */
+
+static gimple*
+vect_generate_popcount_fallback (vec_info *vinfo, stmt_vec_info stmt_vinfo,
+                                vect_unpromoted_value rhs, tree lhs_type,
+                                tree vec_type, tree *ret_var)
+{
+  int bitsize = GET_MODE_BITSIZE (TYPE_MODE (lhs_type)).to_constant ();
+  bool is64 = bitsize == 64;
+
+  tree nuop1 = vect_convert_input (vinfo, stmt_vinfo,
+                                  lhs_type, &rhs, vec_type);
+
+  tree one_cst = build_one_cst (lhs_type);
+  tree two_cst = build_int_cst (lhs_type, 2);
+  tree four_cst = build_int_cst (lhs_type, 4);
+  tree lcst = build_int_cst (lhs_type, bitsize - CHAR_BIT);
+
+  tree c1 = build_int_cst (lhs_type,
+                          is64 ? 0x5555555555555555ull : 0x55555555);
+  tree c2 = build_int_cst (lhs_type,
+                          is64 ? 0x3333333333333333ull : 0x33333333);
+  tree c4 = build_int_cst (lhs_type,
+                          is64 ? 0x0F0F0F0F0F0F0F0Full : 0x0F0F0F0F);
+  tree cm = build_int_cst (lhs_type,
+                          is64 ? 0x0101010101010101ull : 0x01010101);
+
+  gassign *g;
+
+  tree shift1 = vect_recog_temp_ssa_var (lhs_type, NULL);
+  g = gimple_build_assign (shift1, RSHIFT_EXPR, nuop1, one_cst);
+  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+  tree and1 = vect_recog_temp_ssa_var (lhs_type, NULL);
+  g = gimple_build_assign (and1, BIT_AND_EXPR, shift1, c1);
+  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+  tree x1 = vect_recog_temp_ssa_var (lhs_type, NULL);
+  g = gimple_build_assign (x1, MINUS_EXPR, nuop1, and1);
+  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+  tree shift2 = vect_recog_temp_ssa_var (lhs_type, NULL);
+  g = gimple_build_assign (shift2, RSHIFT_EXPR, x1, two_cst);
+  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+  tree and2 = vect_recog_temp_ssa_var (lhs_type, NULL);
+  g = gimple_build_assign (and2, BIT_AND_EXPR, shift2, c2);
+  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+  tree and22 = vect_recog_temp_ssa_var (lhs_type, NULL);
+  g = gimple_build_assign (and22, BIT_AND_EXPR, x1, c2);
+  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+  tree x2 = vect_recog_temp_ssa_var (lhs_type, NULL);
+  g = gimple_build_assign (x2, PLUS_EXPR, and2, and22);
+  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+  tree shift3 = vect_recog_temp_ssa_var (lhs_type, NULL);
+  g = gimple_build_assign (shift3, RSHIFT_EXPR, x2, four_cst);
+  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+  tree plus3 = vect_recog_temp_ssa_var (lhs_type, NULL);
+  g = gimple_build_assign (plus3, PLUS_EXPR, shift3, x2);
+  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+  tree x3 = vect_recog_temp_ssa_var (lhs_type, NULL);
+  g = gimple_build_assign (x3, BIT_AND_EXPR, plus3, c4);
+  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+  tree x4 = vect_recog_temp_ssa_var (lhs_type, NULL);
+  g = gimple_build_assign (x4, MULT_EXPR, x3, cm);
+  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+  g = gimple_build_assign (*ret_var, RSHIFT_EXPR, x4, lcst);
+
+  return g;
+}
+
 /* Function vect_recog_ctz_ffs_pattern
 
    Try to find the following pattern:
@@ -1894,8 +2010,9 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
     }
   if ((ifnnew == IFN_LAST
        || (defined_at_zero && !defined_at_zero_new))
-      && direct_internal_fn_supported_p (IFN_POPCOUNT, vec_rhs_type,
-                                        OPTIMIZE_FOR_SPEED))
+      && (direct_internal_fn_supported_p (IFN_POPCOUNT, vec_rhs_type,
+                                        OPTIMIZE_FOR_SPEED)
+         || vect_have_popcount_fallback (vec_rhs_type)))
     {
       ifnnew = IFN_POPCOUNT;
       defined_at_zero_new = true;
@@ -1996,9 +2113,22 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
 
   /* Create B = .IFNNEW (A).  */
   new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
-  pattern_stmt = gimple_build_call_internal (ifnnew, 1, rhs_oprnd);
-  gimple_call_set_lhs (pattern_stmt, new_var);
-  gimple_set_location (pattern_stmt, loc);
+  if (ifnnew == IFN_POPCOUNT
+      && !direct_internal_fn_supported_p (ifn, vec_type, OPTIMIZE_FOR_SPEED))
+    {
+      gcc_assert (vect_have_popcount_fallback (vec_type));
+      vect_unpromoted_value un_rhs;
+      vect_look_through_possible_promotion (vinfo, rhs_oprnd, &un_rhs);
+      pattern_stmt = vect_generate_popcount_fallback (vinfo, stmt_vinfo, un_rhs,
+                                                     lhs_type, vec_type,
+                                                     &new_var);
+    }
+  else
+    {
+      pattern_stmt = gimple_build_call_internal (ifnnew, 1, rhs_oprnd);
+      gimple_call_set_lhs (pattern_stmt, new_var);
+      gimple_set_location (pattern_stmt, loc);
+    }
   *type_out = vec_type;
 
   if (sub)
@@ -2042,6 +2172,7 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
   return pattern_stmt;
 }
 
+
 /* Function vect_recog_popcount_clz_ctz_ffs_pattern
 
    Try to find the following pattern:
@@ -2226,12 +2357,17 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
   if (!vec_type)
     return NULL;
 
+  bool popcount_fallback_p = false;
+
   bool supported
     = direct_internal_fn_supported_p (ifn, vec_type, OPTIMIZE_FOR_SPEED);
   if (!supported)
     switch (ifn)
       {
       case IFN_POPCOUNT:
+       if (vect_have_popcount_fallback (vec_type))
+         popcount_fallback_p = true;
+       break;
       case IFN_CLZ:
        return NULL;
       case IFN_FFS:
@@ -2247,7 +2383,8 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
                                            OPTIMIZE_FOR_SPEED))
          break;
        if (direct_internal_fn_supported_p (IFN_POPCOUNT, vec_type,
-                                           OPTIMIZE_FOR_SPEED))
+                                           OPTIMIZE_FOR_SPEED)
+           || vect_have_popcount_fallback (vec_type))
          break;
        return NULL;
       default:
@@ -2257,12 +2394,25 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
   vect_pattern_detected ("vec_recog_popcount_clz_ctz_ffs_pattern",
                         call_stmt);
 
-  /* Create B = .POPCOUNT (A).  */
   new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
-  pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
-  gimple_call_set_lhs (pattern_stmt, new_var);
-  gimple_set_location (pattern_stmt, gimple_location (last_stmt));
-  *type_out = vec_type;
+  if (!popcount_fallback_p)
+    {
+      /* Create B = .POPCOUNT (A).  */
+      pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
+      gimple_call_set_lhs (pattern_stmt, new_var);
+      gimple_set_location (pattern_stmt, gimple_location (last_stmt));
+      *type_out = vec_type;
+    }
+  else
+    {
+      pattern_stmt = vect_generate_popcount_fallback (vinfo, stmt_vinfo,
+                                                     unprom_diff,
+                                                     lhs_type, vec_type,
+                                                     &new_var);
+      *type_out = vec_type;
+      /* For popcount we're done here.  */
+      return pattern_stmt;
+    }
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
-- 
2.41.0

Reply via email to