On Mon, Aug 7, 2023 at 10:20 PM Robin Dapp via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
>
> This patch adds a fallback when the backend does not provide a popcount
> implementation.  The algorithm is the same one libgcc uses, as well as
> match.pd for recognizing a popcount idiom.  __builtin_ctz and __builtin_ffs
> can also rely on popcount so I used the fallback for them as well.
>
> Bootstrapped and regtested on x86, aarch64 and power10.  Unfortunately
> I don't have access to any architecture other than riscv that vectorizes
> but does not have a vectorized popcount.  I added all vect_int targets
> to the selector where a cursory grep "expand.*popcount" would yield no
> result.

Looks reasonable to me - I couldn't tell from the above whether you did
testing on riscv and thus verified the runtime correctness of the fallback.
If not, may I suggest forcing the pattern to match on a target you can
test, for this purpose?

One note below ...

Thanks,
Richard.

> Regards
>  Robin
>
> gcc/ChangeLog:
>
>         * tree-vect-patterns.cc (vect_have_popcount_fallback): New
>         function.
>         (vect_generate_popcount_fallback): New function to emit
>         vectorized popcount fallback.
>         (vect_recog_ctz_ffs_pattern): Use fallback.
>         (vect_recog_popcount_clz_ctz_ffs_pattern): Ditto.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/vect/vect-popcount-fallback.c: New test.
> ---
>  .../gcc.dg/vect/vect-popcount-fallback.c      | 106 +++++++++++
>  gcc/tree-vect-patterns.cc                     | 172 ++++++++++++++++--
>  2 files changed, 267 insertions(+), 11 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c 
> b/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
> new file mode 100644
> index 00000000000..f6300f4ab35
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
> @@ -0,0 +1,106 @@
> +/* Check if we vectorize popcount when no expander is available.  */
> +/* { dg-do run { target { amdgcn-*-* sparc*-*-* alpha*-*-* ia64-*-* 
> mips*-*-* riscv*-*-* } } } */
> +/* { dg-additional-options { -O2 -fdump-tree-vect-details 
> -fno-vect-cost-model } }  */
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdlib.h>
> +#include <assert.h>
> +#include <stdint-gcc.h>
> +
> +__attribute__ ((noipa))
> +void popc32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    dst[i] = __builtin_popcount (a[i]);
> +}
> +
> +__attribute__ ((noipa))
> +void ctz32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    dst[i] = __builtin_ctz (a[i]);
> +}
> +
> +__attribute__ ((noipa))
> +void ffs32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    dst[i] = __builtin_ffs (a[i]);
> +}
> +
> +__attribute__ ((noipa))
> +void popc64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    dst[i] = __builtin_popcountll (a[i]);
> +}
> +
> +__attribute__ ((noipa))
> +void ctz64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    dst[i] = __builtin_ctzll (a[i]);
> +}
> +
> +__attribute__ ((noipa))
> +void ffs64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    dst[i] = __builtin_ffsll (a[i]);
> +}
> +
> +#define SZ 512
> +
> +__attribute__ ((optimize ("0")))
> +int main ()
> +{
> +  uint32_t *a32pc = malloc (SZ * sizeof (*a32pc));
> +  uint32_t *b32pc = malloc (SZ * sizeof (*b32pc));
> +  uint32_t *a32ctz = malloc (SZ * sizeof (*a32ctz));
> +  uint32_t *b32ctz = malloc (SZ * sizeof (*b32ctz));
> +  uint32_t *a32ffs = malloc (SZ * sizeof (*a32ffs));
> +  uint32_t *b32ffs = malloc (SZ * sizeof (*b32ffs));
> +
> +  uint64_t *a64pc = malloc (SZ * sizeof (*a64pc));
> +  uint64_t *b64pc = malloc (SZ * sizeof (*b64pc));
> +  uint64_t *a64ctz = malloc (SZ * sizeof (*a64ctz));
> +  uint64_t *b64ctz = malloc (SZ * sizeof (*b64ctz));
> +  uint64_t *a64ffs = malloc (SZ * sizeof (*a64ffs));
> +  uint64_t *b64ffs = malloc (SZ * sizeof (*b64ffs));
> +
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      int ia = i + 1;
> +      a32pc[i] = ia * 1234567;
> +      b32pc[i] = 0;
> +      a32ctz[i] = ia * 1234567;
> +      b32ctz[i] = 0;
> +      a32ffs[i] = ia * 1234567;
> +      b32ffs[i] = 0;
> +      a64pc[i] = ia * 123456789ull;
> +      b64pc[i] = 0;
> +      a64ctz[i] = ia * 123456789ull;
> +      b64ctz[i] = 0;
> +      a64ffs[i] = ia * 123456789ull;
> +      b64ffs[i] = 0;
> +    }
> +
> +  popc32 (b32pc, a32pc, SZ);
> +  ctz32 (b32ctz, a32ctz, SZ);
> +  ffs32 (b32ffs, a32ffs, SZ);
> +  popc64 (b64pc, a64pc, SZ);
> +  ctz64 (b64ctz, a64ctz, SZ);
> +  ffs64 (b64ffs, a64ffs, SZ);
> +
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      assert (b32pc[i] == __builtin_popcount (a32pc[i]));
> +      assert (b32ctz[i] == __builtin_ctz (a32ctz[i]));
> +      assert (b32ffs[i] == __builtin_ffs (a32ffs[i]));
> +      assert (b64pc[i] == __builtin_popcountll (a64pc[i]));
> +      assert (b64ctz[i] == __builtin_ctzll (a64ctz[i]));
> +      assert (b64ffs[i] == __builtin_ffsll (a64ffs[i]));
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 6 "vect" target { 
> amdgcn-*-* sparc*-*-* alpha*-*-* ia64-*-* mips*-*-* riscv*-*-* } } }  */
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index ef806e2346e..b812354b986 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -1782,6 +1782,122 @@ vect_recog_widen_abd_pattern (vec_info *vinfo, 
> stmt_vec_info stmt_vinfo,
>    return widen_abd_stmt;
>  }
>
> +/* Return TRUE if we have the necessary operations to create a vectorized
> +   popcount for type VEC_TYPE.  */
> +
> +static bool
> +vect_have_popcount_fallback (tree vec_type)
> +{
> +  return (optab_for_tree_code (RSHIFT_EXPR, vec_type, optab_scalar)

optab_vector would also work (vectorizable_shift will try both)

> +         && optab_for_tree_code (PLUS_EXPR, vec_type, optab_default)
> +         && optab_for_tree_code (MINUS_EXPR, vec_type, optab_default)
> +         && optab_for_tree_code (BIT_AND_EXPR, vec_type, optab_default)
> +         && optab_for_tree_code (MULT_EXPR, vec_type, optab_default));

... note this doesn't actually check that the target can do these operations;
you'd have to check whether optab_handler (optab, TYPE_MODE (vec_type))
isn't CODE_FOR_nothing.  I see we don't do this consistently though,
and the alternative is a known unsupported popcount.

Did you check whether we try popcount with DImode before using the
fallback for SImode?  Or whether we try V2nSImode before falling
back to VnDImode?  Note that if the target has popcountqi or popcounthi
then we can end up pattern matching popcount for those modes; I'm not
sure whether targets usually support vectorized popcount for those modes.

> +}
> +
> +/* This generates a Wilkes-Wheeler-Gill popcount similar to what libgcc
> +   does (and match.pd recognizes).  There are only 32-bit and 64-bit
> +   variants.
> +   It returns the generated gimple sequence of vector instructions with
> +   type VEC_TYPE which is being attached to STMT_VINFO.
> +   RHS is the unpromoted input value and LHS_TYPE is the output type.
> +   RET_VAR is the address of an SSA variable that holds the result
> +   of the last operation.  It needs to be created before calling
> +   this function and must have LHS_TYPE.
> +
> +   int popcount64c (uint64_t x)
> +   {
> +     x -= (x >> 1) & 0x5555555555555555ULL;
> +     x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
> +     x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
> +     return (x * 0x0101010101010101ULL) >> 56;
> +   }
> +
> +   int popcount32c (uint32_t x)
> +   {
> +     x -= (x >> 1) & 0x55555555;
> +     x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
> +     x = (x + (x >> 4)) & 0x0f0f0f0f;
> +     return (x * 0x01010101) >> 24;
> +   }
> +   */
> +
> +static gimple*
> +vect_generate_popcount_fallback (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> +                                vect_unpromoted_value rhs, tree lhs_type,
> +                                tree vec_type, tree *ret_var)
> +{
> +  int bitsize = GET_MODE_BITSIZE (TYPE_MODE (lhs_type)).to_constant ();
> +  bool is64 = bitsize == 64;
> +
> +  tree nuop1 = vect_convert_input (vinfo, stmt_vinfo,
> +                                  lhs_type, &rhs, vec_type);
> +
> +  tree one_cst = build_one_cst (lhs_type);
> +  tree two_cst = build_int_cst (lhs_type, 2);
> +  tree four_cst = build_int_cst (lhs_type, 4);
> +  tree lcst = build_int_cst (lhs_type, bitsize - CHAR_BIT);
> +
> +  tree c1 = build_int_cst (lhs_type,
> +                          is64 ? 0x5555555555555555ull : 0x55555555);

Hmm, looks like we're missing a useful helper to produce an
integer constant with a repeated byte sequence?  A

unsigned char buf[8];
memset (buf, val, 8);
c1 = native_interpret (...);

would do the trick, but I guess we can do it more cheaply using wide-int
directly?  This must have come up before ...

> +  tree c2 = build_int_cst (lhs_type,
> +                          is64 ? 0x3333333333333333ull : 0x33333333);
> +  tree c4 = build_int_cst (lhs_type,
> +                          is64 ? 0x0F0F0F0F0F0F0F0Full : 0x0F0F0F0F);
> +  tree cm = build_int_cst (lhs_type,
> +                          is64 ? 0x0101010101010101ull : 0x01010101);
> +
> +  gassign *g;
> +
> +  tree shift1 = vect_recog_temp_ssa_var (lhs_type, NULL);
> +  g = gimple_build_assign (shift1, RSHIFT_EXPR, nuop1, one_cst);
> +  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> +  tree and1 = vect_recog_temp_ssa_var (lhs_type, NULL);
> +  g = gimple_build_assign (and1, BIT_AND_EXPR, shift1, c1);
> +  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> +  tree x1 = vect_recog_temp_ssa_var (lhs_type, NULL);
> +  g = gimple_build_assign (x1, MINUS_EXPR, nuop1, and1);
> +  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> +  tree shift2 = vect_recog_temp_ssa_var (lhs_type, NULL);
> +  g = gimple_build_assign (shift2, RSHIFT_EXPR, x1, two_cst);
> +  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> +  tree and2 = vect_recog_temp_ssa_var (lhs_type, NULL);
> +  g = gimple_build_assign (and2, BIT_AND_EXPR, shift2, c2);
> +  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> +  tree and22 = vect_recog_temp_ssa_var (lhs_type, NULL);
> +  g = gimple_build_assign (and22, BIT_AND_EXPR, x1, c2);
> +  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> +  tree x2 = vect_recog_temp_ssa_var (lhs_type, NULL);
> +  g = gimple_build_assign (x2, PLUS_EXPR, and2, and22);
> +  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> +  tree shift3 = vect_recog_temp_ssa_var (lhs_type, NULL);
> +  g = gimple_build_assign (shift3, RSHIFT_EXPR, x2, four_cst);
> +  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> +  tree plus3 = vect_recog_temp_ssa_var (lhs_type, NULL);
> +  g = gimple_build_assign (plus3, PLUS_EXPR, shift3, x2);
> +  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> +  tree x3 = vect_recog_temp_ssa_var (lhs_type, NULL);
> +  g = gimple_build_assign (x3, BIT_AND_EXPR, plus3, c4);
> +  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> +  tree x4 = vect_recog_temp_ssa_var (lhs_type, NULL);
> +  g = gimple_build_assign (x4, MULT_EXPR, x3, cm);
> +  append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
> +
> +  g = gimple_build_assign (*ret_var, RSHIFT_EXPR, x4, lcst);
> +
> +  return g;
> +}
> +
>  /* Function vect_recog_ctz_ffs_pattern
>
>     Try to find the following pattern:
> @@ -1894,8 +2010,9 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, 
> stmt_vec_info stmt_vinfo,
>      }
>    if ((ifnnew == IFN_LAST
>         || (defined_at_zero && !defined_at_zero_new))
> -      && direct_internal_fn_supported_p (IFN_POPCOUNT, vec_rhs_type,
> -                                        OPTIMIZE_FOR_SPEED))
> +      && (direct_internal_fn_supported_p (IFN_POPCOUNT, vec_rhs_type,
> +                                        OPTIMIZE_FOR_SPEED)
> +         || vect_have_popcount_fallback (vec_rhs_type)))
>      {
>        ifnnew = IFN_POPCOUNT;
>        defined_at_zero_new = true;
> @@ -1996,9 +2113,22 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, 
> stmt_vec_info stmt_vinfo,
>
>    /* Create B = .IFNNEW (A).  */
>    new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
> -  pattern_stmt = gimple_build_call_internal (ifnnew, 1, rhs_oprnd);
> -  gimple_call_set_lhs (pattern_stmt, new_var);
> -  gimple_set_location (pattern_stmt, loc);
> +  if (ifnnew == IFN_POPCOUNT
> +      && !direct_internal_fn_supported_p (ifn, vec_type, OPTIMIZE_FOR_SPEED))
> +    {
> +      gcc_assert (vect_have_popcount_fallback (vec_type));
> +      vect_unpromoted_value un_rhs;
> +      vect_look_through_possible_promotion (vinfo, rhs_oprnd, &un_rhs);
> +      pattern_stmt = vect_generate_popcount_fallback (vinfo, stmt_vinfo, 
> un_rhs,
> +                                                     lhs_type, vec_type,
> +                                                     &new_var);
> +    }
> +  else
> +    {
> +      pattern_stmt = gimple_build_call_internal (ifnnew, 1, rhs_oprnd);
> +      gimple_call_set_lhs (pattern_stmt, new_var);
> +      gimple_set_location (pattern_stmt, loc);
> +    }
>    *type_out = vec_type;
>
>    if (sub)
> @@ -2042,6 +2172,7 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, 
> stmt_vec_info stmt_vinfo,
>    return pattern_stmt;
>  }
>
> +
>  /* Function vect_recog_popcount_clz_ctz_ffs_pattern
>
>     Try to find the following pattern:
> @@ -2226,12 +2357,17 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info 
> *vinfo,
>    if (!vec_type)
>      return NULL;
>
> +  bool popcount_fallback_p = false;
> +
>    bool supported
>      = direct_internal_fn_supported_p (ifn, vec_type, OPTIMIZE_FOR_SPEED);
>    if (!supported)
>      switch (ifn)
>        {
>        case IFN_POPCOUNT:
> +       if (vect_have_popcount_fallback (vec_type))
> +         popcount_fallback_p = true;
> +       break;
>        case IFN_CLZ:
>         return NULL;
>        case IFN_FFS:
> @@ -2247,7 +2383,8 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info 
> *vinfo,
>                                             OPTIMIZE_FOR_SPEED))
>           break;
>         if (direct_internal_fn_supported_p (IFN_POPCOUNT, vec_type,
> -                                           OPTIMIZE_FOR_SPEED))
> +                                           OPTIMIZE_FOR_SPEED)
> +           || vect_have_popcount_fallback (vec_type))
>           break;
>         return NULL;
>        default:
> @@ -2257,12 +2394,25 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info 
> *vinfo,
>    vect_pattern_detected ("vec_recog_popcount_clz_ctz_ffs_pattern",
>                          call_stmt);
>
> -  /* Create B = .POPCOUNT (A).  */
>    new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
> -  pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
> -  gimple_call_set_lhs (pattern_stmt, new_var);
> -  gimple_set_location (pattern_stmt, gimple_location (last_stmt));
> -  *type_out = vec_type;
> +  if (!popcount_fallback_p)
> +    {
> +      /* Create B = .POPCOUNT (A).  */
> +      pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
> +      gimple_call_set_lhs (pattern_stmt, new_var);
> +      gimple_set_location (pattern_stmt, gimple_location (last_stmt));
> +      *type_out = vec_type;
> +    }
> +  else
> +    {
> +      pattern_stmt = vect_generate_popcount_fallback (vinfo, stmt_vinfo,
> +                                                     unprom_diff,
> +                                                     lhs_type, vec_type,
> +                                                     &new_var);
> +      *type_out = vec_type;
> +      /* For popcount we're done here.  */
> +      return pattern_stmt;
> +    }
>
>    if (dump_enabled_p ())
>      dump_printf_loc (MSG_NOTE, vect_location,
> --
> 2.41.0
>

Reply via email to