https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97770
--- Comment #2 from Hongtao.liu <crazylht at gmail dot com> --- After adding the expanders below, the loop is successfully vectorized. --- diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index b153a87fb98..e8159997c40 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -22678,6 +22678,12 @@ (define_insn "avx5124vnniw_vp4dpwssds_maskz" (set_attr ("prefix") ("evex")) (set_attr ("mode") ("TI"))]) +(define_expand "popcount<mode>2" + [(set (match_operand:VI48_AVX512VL 0 "register_operand") + (popcount:VI48_AVX512VL + (match_operand:VI48_AVX512VL 1 "nonimmediate_operand")))] + "TARGET_AVX512VPOPCNTDQ") + (define_insn "vpopcount<mode><mask_name>" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") (popcount:VI48_AVX512VL @@ -22722,6 +22728,12 @@ (define_insn "*restore_multiple_leave_return<mode>" "TARGET_SSE && TARGET_64BIT" "jmp\t%P1") +(define_insn "popcount<mode>2" + [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v") + (popcount:VI12_AVX512VL + (match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "vm")))] + "TARGET_AVX512BITALG") + (define_insn "vpopcount<mode><mask_name>" [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v") (popcount:VI12_AVX512VL --- However, for vector byte/word/quadword elements, the vectorizer still uses vpopcntd rather than vpopcnt{b,w,q}. Is a corresponding ifn missing? 
void fooq(long long* __restrict dest, long long* src) { for (int i = 0; i != 4; i++) dest[i] = __builtin_popcount (src[i]); } void foow(short* __restrict dest, short* src) { for (int i = 0; i != 16; i++) dest[i] = __builtin_popcount (src[i]); } void foob(char* __restrict dest, char* src) { for (int i = 0; i != 32; i++) dest[i] = __builtin_popcount (src[i]); } dump of test.c.164.vect ;; Function foow (foow, funcdef_no=0, decl_uid=4228, cgraph_uid=1, symbol_order=0) Merging blocks 2 and 6 foow (short int * restrict dest, short int * src) { vector(8) short int * vectp_dest.10; vector(8) short int * vectp_dest.9; vector(8) short int vect__8.8; vector(4) int vect__6.7; vector(4) unsigned int vect__5.6; vector(8) short int vect__4.5; vector(8) short int * vectp_src.4; vector(8) short int * vectp_src.3; int i; long unsigned int _1; long unsigned int _2; short int * _3; short int _4; unsigned int _5; int _6; short int * _7; short int _8; unsigned int ivtmp_26; unsigned int ivtmp_28; unsigned int ivtmp_34; unsigned int ivtmp_35; <bb 2> [local count: 119292720]: <bb 3> [local count: 119292719]: # i_19 = PHI <i_15(5), 0(2)> # ivtmp_35 = PHI <ivtmp_34(5), 8(2)> # vectp_src.3_24 = PHI <vectp_src.3_23(5), src_12(D)(2)> # vectp_dest.9_9 = PHI <vectp_dest.9_29(5), dest_13(D)(2)> # ivtmp_26 = PHI <ivtmp_28(5), 0(2)> _1 = (long unsigned int) i_19; _2 = _1 * 2; _3 = src_12(D) + _2; vect__4.5_22 = MEM <vector(8) short int> [(short int *)vectp_src.3_24]; _4 = *_3; vect__5.6_21 = [vec_unpack_lo_expr] vect__4.5_22; vect__5.6_18 = [vec_unpack_hi_expr] vect__4.5_22; _5 = (unsigned int) _4; vect__6.7_17 = .POPCOUNT (vect__5.6_21); vect__6.7_16 = .POPCOUNT (vect__5.6_18); _6 = 0; _7 = dest_13(D) + _2; vect__8.8_10 = VEC_PACK_TRUNC_EXPR <vect__6.7_17, vect__6.7_16>; _8 = (short int) _6; MEM <vector(8) short int> [(short int *)vectp_dest.9_9] = vect__8.8_10; i_15 = i_19 + 1; ivtmp_34 = ivtmp_35 - 1; vectp_src.3_23 = vectp_src.3_24 + 16; vectp_dest.9_29 = vectp_dest.9_9 + 16; ivtmp_28 = 
ivtmp_26 + 1; if (ivtmp_28 < 1) goto <bb 5>; [0.00%] else goto <bb 4>; [100.00%] <bb 5> [local count: 0]: goto <bb 3>; [100.00%] <bb 4> [local count: 119292720]: return; }