Hi, this patch adds a fallback for the case where the backend does not provide a popcount implementation. The algorithm is the same one libgcc uses, and the same one match.pd recognizes as a popcount idiom. Since __builtin_ctz and __builtin_ffs can also rely on popcount, I used the fallback for them as well.
Bootstrapped and regtested on x86, aarch64 and power10. Unfortunately I don't have access to any architecture other than riscv that vectorizes but does not have a vectorized popcount. I added all vect_int targets to the selector where a cursory grep "expand.*popcount" would yield no result. Regards Robin gcc/ChangeLog: * tree-vect-patterns.cc (vect_have_popcount_fallback): New function. (vect_generate_popcount_fallback): New function to emit vectorized popcount fallback. (vect_recog_ctz_ffs_pattern): Use fallback. (vect_recog_popcount_clz_ctz_ffs_pattern): Ditto. gcc/testsuite/ChangeLog: * gcc.dg/vect/vect-popcount-fallback.c: New test. --- .../gcc.dg/vect/vect-popcount-fallback.c | 106 +++++++++++ gcc/tree-vect-patterns.cc | 172 ++++++++++++++++-- 2 files changed, 267 insertions(+), 11 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c diff --git a/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c b/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c new file mode 100644 index 00000000000..f6300f4ab35 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c @@ -0,0 +1,106 @@ +/* Check if we vectorize popcount when no expander is available. 
*/ +/* { dg-do run { target { amdgcn-*-* sparc*-*-* alpha*-*-* ia64-*-* mips*-*-* riscv*-*-* } } } */ +/* { dg-additional-options { -O2 -fdump-tree-vect-details -fno-vect-cost-model } } */ +/* { dg-require-effective-target vect_int } */ + +#include <stdlib.h> +#include <assert.h> +#include <stdint-gcc.h> + +__attribute__ ((noipa)) +void popc32 (uint32_t *restrict dst, uint32_t *restrict a, int n) +{ + for (int i = 0; i < n; i++) + dst[i] = __builtin_popcount (a[i]); +} + +__attribute__ ((noipa)) +void ctz32 (uint32_t *restrict dst, uint32_t *restrict a, int n) +{ + for (int i = 0; i < n; i++) + dst[i] = __builtin_ctz (a[i]); +} + +__attribute__ ((noipa)) +void ffs32 (uint32_t *restrict dst, uint32_t *restrict a, int n) +{ + for (int i = 0; i < n; i++) + dst[i] = __builtin_ffs (a[i]); +} + +__attribute__ ((noipa)) +void popc64 (uint64_t *restrict dst, uint64_t *restrict a, int n) +{ + for (int i = 0; i < n; i++) + dst[i] = __builtin_popcountll (a[i]); +} + +__attribute__ ((noipa)) +void ctz64 (uint64_t *restrict dst, uint64_t *restrict a, int n) +{ + for (int i = 0; i < n; i++) + dst[i] = __builtin_ctzll (a[i]); +} + +__attribute__ ((noipa)) +void ffs64 (uint64_t *restrict dst, uint64_t *restrict a, int n) +{ + for (int i = 0; i < n; i++) + dst[i] = __builtin_ffsll (a[i]); +} + +#define SZ 512 + +__attribute__ ((optimize ("0"))) +int main () +{ + uint32_t *a32pc = malloc (SZ * sizeof (*a32pc)); + uint32_t *b32pc = malloc (SZ * sizeof (*b32pc)); + uint32_t *a32ctz = malloc (SZ * sizeof (*a32ctz)); + uint32_t *b32ctz = malloc (SZ * sizeof (*b32ctz)); + uint32_t *a32ffs = malloc (SZ * sizeof (*a32ffs)); + uint32_t *b32ffs = malloc (SZ * sizeof (*b32ffs)); + + uint64_t *a64pc = malloc (SZ * sizeof (*a64pc)); + uint64_t *b64pc = malloc (SZ * sizeof (*b64pc)); + uint64_t *a64ctz = malloc (SZ * sizeof (*a64ctz)); + uint64_t *b64ctz = malloc (SZ * sizeof (*b64ctz)); + uint64_t *a64ffs = malloc (SZ * sizeof (*a64ffs)); + uint64_t *b64ffs = malloc (SZ * sizeof (*b64ffs)); + + 
for (int i = 0; i < SZ; i++) + { + int ia = i + 1; + a32pc[i] = ia * 1234567; + b32pc[i] = 0; + a32ctz[i] = ia * 1234567; + b32ctz[i] = 0; + a32ffs[i] = ia * 1234567; + b32ffs[i] = 0; + a64pc[i] = ia * 123456789ull; + b64pc[i] = 0; + a64ctz[i] = ia * 123456789ull; + b64ctz[i] = 0; + a64ffs[i] = ia * 123456789ull; + b64ffs[i] = 0; + } + + popc32 (b32pc, a32pc, SZ); + ctz32 (b32ctz, a32ctz, SZ); + ffs32 (b32ffs, a32ffs, SZ); + popc64 (b64pc, a64pc, SZ); + ctz64 (b64ctz, a64ctz, SZ); + ffs64 (b64ffs, a64ffs, SZ); + + for (int i = 0; i < SZ; i++) + { + assert (b32pc[i] == __builtin_popcount (a32pc[i])); + assert (b32ctz[i] == __builtin_ctz (a32ctz[i])); + assert (b32ffs[i] == __builtin_ffs (a32ffs[i])); + assert (b64pc[i] == __builtin_popcountll (a64pc[i])); + assert (b64ctz[i] == __builtin_ctzll (a64ctz[i])); + assert (b64ffs[i] == __builtin_ffsll (a64ffs[i])); + } +} + +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 6 "vect" target { amdgcn-*-* sparc*-*-* alpha*-*-* ia64-*-* mips*-*-* riscv*-*-* } } } */ diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc index ef806e2346e..b812354b986 100644 --- a/gcc/tree-vect-patterns.cc +++ b/gcc/tree-vect-patterns.cc @@ -1782,6 +1782,122 @@ vect_recog_widen_abd_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo, return widen_abd_stmt; } +/* Return TRUE if we have the necessary operations to create a vectorized + popcount for type VEC_TYPE. */ + +static bool +vect_have_popcount_fallback (tree vec_type) +{ + return (optab_for_tree_code (RSHIFT_EXPR, vec_type, optab_scalar) + && optab_for_tree_code (PLUS_EXPR, vec_type, optab_default) + && optab_for_tree_code (MINUS_EXPR, vec_type, optab_default) + && optab_for_tree_code (BIT_AND_EXPR, vec_type, optab_default) + && optab_for_tree_code (MULT_EXPR, vec_type, optab_default)); +} + +/* This generates a Wilkes-Wheeler-Gill popcount similar to what libgcc + does (and match.pd recognizes). There are only 32-bit and 64-bit + variants. 
+ It returns the generated gimple sequence of vector instructions with + type VEC_TYPE which is being attached to STMT_VINFO. + RHS is the unpromoted input value and LHS_TYPE is the output type. + RET_VAR is the address of an SSA variable that holds the result + of the last operation. It needs to be created before calling + this function and must have LHS_TYPE. + + int popcount64c (uint64_t x) + { + x -= (x >> 1) & 0x5555555555555555ULL; + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; + return (x * 0x0101010101010101ULL) >> 56; + } + + int popcount32c (uint32_t x) + { + x -= (x >> 1) & 0x55555555; + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + x = (x + (x >> 4)) & 0x0f0f0f0f; + return (x * 0x01010101) >> 24; + } + */ + +static gimple* +vect_generate_popcount_fallback (vec_info *vinfo, stmt_vec_info stmt_vinfo, + vect_unpromoted_value rhs, tree lhs_type, + tree vec_type, tree *ret_var) +{ + int bitsize = GET_MODE_BITSIZE (TYPE_MODE (lhs_type)).to_constant (); + bool is64 = bitsize == 64; + + tree nuop1 = vect_convert_input (vinfo, stmt_vinfo, + lhs_type, &rhs, vec_type); + + tree one_cst = build_one_cst (lhs_type); + tree two_cst = build_int_cst (lhs_type, 2); + tree four_cst = build_int_cst (lhs_type, 4); + tree lcst = build_int_cst (lhs_type, bitsize - CHAR_BIT); + + tree c1 = build_int_cst (lhs_type, + is64 ? 0x5555555555555555ull : 0x55555555); + tree c2 = build_int_cst (lhs_type, + is64 ? 0x3333333333333333ull : 0x33333333); + tree c4 = build_int_cst (lhs_type, + is64 ? 0x0F0F0F0F0F0F0F0Full : 0x0F0F0F0F); + tree cm = build_int_cst (lhs_type, + is64 ? 
0x0101010101010101ull : 0x01010101); + + gassign *g; + + tree shift1 = vect_recog_temp_ssa_var (lhs_type, NULL); + g = gimple_build_assign (shift1, RSHIFT_EXPR, nuop1, one_cst); + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type); + + tree and1 = vect_recog_temp_ssa_var (lhs_type, NULL); + g = gimple_build_assign (and1, BIT_AND_EXPR, shift1, c1); + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type); + + tree x1 = vect_recog_temp_ssa_var (lhs_type, NULL); + g = gimple_build_assign (x1, MINUS_EXPR, nuop1, and1); + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type); + + tree shift2 = vect_recog_temp_ssa_var (lhs_type, NULL); + g = gimple_build_assign (shift2, RSHIFT_EXPR, x1, two_cst); + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type); + + tree and2 = vect_recog_temp_ssa_var (lhs_type, NULL); + g = gimple_build_assign (and2, BIT_AND_EXPR, shift2, c2); + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type); + + tree and22 = vect_recog_temp_ssa_var (lhs_type, NULL); + g = gimple_build_assign (and22, BIT_AND_EXPR, x1, c2); + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type); + + tree x2 = vect_recog_temp_ssa_var (lhs_type, NULL); + g = gimple_build_assign (x2, PLUS_EXPR, and2, and22); + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type); + + tree shift3 = vect_recog_temp_ssa_var (lhs_type, NULL); + g = gimple_build_assign (shift3, RSHIFT_EXPR, x2, four_cst); + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type); + + tree plus3 = vect_recog_temp_ssa_var (lhs_type, NULL); + g = gimple_build_assign (plus3, PLUS_EXPR, shift3, x2); + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type); + + tree x3 = vect_recog_temp_ssa_var (lhs_type, NULL); + g = gimple_build_assign (x3, BIT_AND_EXPR, plus3, c4); + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type); + + tree x4 = vect_recog_temp_ssa_var (lhs_type, NULL); + g = gimple_build_assign (x4, MULT_EXPR, x3, cm); + append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type); + + g = 
gimple_build_assign (*ret_var, RSHIFT_EXPR, x4, lcst); + + return g; +} + /* Function vect_recog_ctz_ffs_pattern Try to find the following pattern: @@ -1894,8 +2010,9 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo, } if ((ifnnew == IFN_LAST || (defined_at_zero && !defined_at_zero_new)) - && direct_internal_fn_supported_p (IFN_POPCOUNT, vec_rhs_type, - OPTIMIZE_FOR_SPEED)) + && (direct_internal_fn_supported_p (IFN_POPCOUNT, vec_rhs_type, + OPTIMIZE_FOR_SPEED) + || vect_have_popcount_fallback (vec_rhs_type))) { ifnnew = IFN_POPCOUNT; defined_at_zero_new = true; @@ -1996,9 +2113,22 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo, /* Create B = .IFNNEW (A). */ new_var = vect_recog_temp_ssa_var (lhs_type, NULL); - pattern_stmt = gimple_build_call_internal (ifnnew, 1, rhs_oprnd); - gimple_call_set_lhs (pattern_stmt, new_var); - gimple_set_location (pattern_stmt, loc); + if (ifnnew == IFN_POPCOUNT + && !direct_internal_fn_supported_p (ifn, vec_type, OPTIMIZE_FOR_SPEED)) + { + gcc_assert (vect_have_popcount_fallback (vec_type)); + vect_unpromoted_value un_rhs; + vect_look_through_possible_promotion (vinfo, rhs_oprnd, &un_rhs); + pattern_stmt = vect_generate_popcount_fallback (vinfo, stmt_vinfo, un_rhs, + lhs_type, vec_type, + &new_var); + } + else + { + pattern_stmt = gimple_build_call_internal (ifnnew, 1, rhs_oprnd); + gimple_call_set_lhs (pattern_stmt, new_var); + gimple_set_location (pattern_stmt, loc); + } *type_out = vec_type; if (sub) @@ -2042,6 +2172,7 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo, return pattern_stmt; } + /* Function vect_recog_popcount_clz_ctz_ffs_pattern Try to find the following pattern: @@ -2226,12 +2357,17 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo, if (!vec_type) return NULL; + bool popcount_fallback_p = false; + bool supported = direct_internal_fn_supported_p (ifn, vec_type, OPTIMIZE_FOR_SPEED); if (!supported) switch (ifn) { case IFN_POPCOUNT: + 
if (vect_have_popcount_fallback (vec_type)) + popcount_fallback_p = true; + break; case IFN_CLZ: return NULL; case IFN_FFS: @@ -2247,7 +2383,8 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo, OPTIMIZE_FOR_SPEED)) break; if (direct_internal_fn_supported_p (IFN_POPCOUNT, vec_type, - OPTIMIZE_FOR_SPEED)) + OPTIMIZE_FOR_SPEED) + || vect_have_popcount_fallback (vec_type)) break; return NULL; default: @@ -2257,12 +2394,25 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo, vect_pattern_detected ("vec_recog_popcount_clz_ctz_ffs_pattern", call_stmt); - /* Create B = .POPCOUNT (A). */ new_var = vect_recog_temp_ssa_var (lhs_type, NULL); - pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op); - gimple_call_set_lhs (pattern_stmt, new_var); - gimple_set_location (pattern_stmt, gimple_location (last_stmt)); - *type_out = vec_type; + if (!popcount_fallback_p) + { + /* Create B = .POPCOUNT (A). */ + pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op); + gimple_call_set_lhs (pattern_stmt, new_var); + gimple_set_location (pattern_stmt, gimple_location (last_stmt)); + *type_out = vec_type; + } + else + { + pattern_stmt = vect_generate_popcount_fallback (vinfo, stmt_vinfo, + unprom_diff, + lhs_type, vec_type, + &new_var); + *type_out = vec_type; + /* For popcount we're done here. */ + return pattern_stmt; + } if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, -- 2.41.0