> Sorry for the slow review. > > Pengxuan Zheng <quic_pzh...@quicinc.com> writes: > > This patch improves the Advanced SIMD popcount expansion by using SVE > > if available. > > > > For example, GCC currently generates the following code sequence for V2DI: > > cnt v31.16b, v31.16b > > uaddlp v31.8h, v31.16b > > uaddlp v31.4s, v31.8h > > uaddlp v31.2d, v31.4s > > > > However, by using SVE, we can generate the following sequence instead: > > ptrue p7.b, all > > cnt z31.d, p7/m, z31.d > > > > Similar improvements can be made for V4HI, V8HI, V2SI and V4SI too. > > > > The scalar popcount expansion can also be improved similarly by using > > SVE and those changes will be included in a separate patch. > > > > PR target/113860 > > > > gcc/ChangeLog: > > > > * config/aarch64/aarch64-simd.md (popcount<mode>2): Add > TARGET_SVE > > support. > > * config/aarch64/aarch64-sve.md > (@aarch64_pred_popcount<mode>): New > > insn. > > * config/aarch64/iterators.md (VPRED): Add V4HI, V8HI and V2SI. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/aarch64/popcnt-sve.c: New test. 
> > > > Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com> > > --- > > gcc/config/aarch64/aarch64-simd.md | 9 ++ > > gcc/config/aarch64/aarch64-sve.md | 12 +++ > > gcc/config/aarch64/iterators.md | 1 + > > gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 88 > > +++++++++++++++++++ > > 4 files changed, 110 insertions(+) > > create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c > > > > diff --git a/gcc/config/aarch64/aarch64-simd.md > > b/gcc/config/aarch64/aarch64-simd.md > > index bbeee221f37..895d6e5eab5 100644 > > --- a/gcc/config/aarch64/aarch64-simd.md > > +++ b/gcc/config/aarch64/aarch64-simd.md > > @@ -3508,6 +3508,15 @@ (define_expand "popcount<mode>2" > > (popcount:VDQHSD (match_operand:VDQHSD 1 > "register_operand")))] > > "TARGET_SIMD" > > { > > + if (TARGET_SVE) > > + { > > + rtx p = aarch64_ptrue_reg (<VPRED>mode); > > + emit_insn (gen_aarch64_pred_popcount<mode> (operands[0], > > + p, > > + operands[1])); > > + DONE; > > + } > > + > > /* Generate a byte popcount. */ > > machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode; > > rtx tmp = gen_reg_rtx (mode); > > diff --git a/gcc/config/aarch64/aarch64-sve.md > > b/gcc/config/aarch64/aarch64-sve.md > > index 5331e7121d5..b5021dd2da0 100644 > > --- a/gcc/config/aarch64/aarch64-sve.md > > +++ b/gcc/config/aarch64/aarch64-sve.md > > @@ -3168,6 +3168,18 @@ (define_insn "*cond_<optab><mode>_any" > > } > > ) > > > > +;; Popcount predicated with a PTRUE. 
> > +(define_insn "@aarch64_pred_popcount<mode>" > > + [(set (match_operand:VDQHSD 0 "register_operand" "=w") > > + (unspec:VDQHSD > > + [(match_operand:<VPRED> 1 "register_operand" "Upl") > > + (popcount:VDQHSD > > + (match_operand:VDQHSD 2 "register_operand" "0"))] > > + UNSPEC_PRED_X))] > > + "TARGET_SVE" > > + "cnt\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>" > > +) > > + > > Could you instead change: > > (define_insn "@aarch64_pred_<optab><mode>" > [(set (match_operand:SVE_I 0 "register_operand") > (unspec:SVE_I > [(match_operand:<VPRED> 1 "register_operand") > (SVE_INT_UNARY:SVE_I > (match_operand:SVE_I 2 "register_operand"))] > UNSPEC_PRED_X))] > "TARGET_SVE" > {@ [ cons: =0 , 1 , 2 ; attrs: movprfx ] > [ w , Upl , 0 ; * ] <sve_int_op>\t%0.<Vetype>, %1/m, > %2.<Vetype> > [ ?&w , Upl , w ; yes ] movprfx\t%0, > %2\;<sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype> > } > ) > > to use a new iterator SVE_VDQ_I, defined as: > > (define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I]) > > ? That will give the benefit of the movprfx handling and avoid code > duplication. It will define some patterns that are initially unused, but > that's > ok. I think the direction of travel would be to use some of the others > eventually. > > OK with that change if there are no other comments in 24 hours.
Thanks, Richard. Here's the patch updated according to your feedback. https://gcc.gnu.org/pipermail/gcc-patches/2024-August/658929.html I'll commit it if there are no other comments in 24 hours. Thanks, Pengxuan > > Thanks, > Richard > > > ;; > > ---------------------------------------------------------------------- > > --- ;; ---- [INT] General unary arithmetic corresponding to unspecs > > ;; > > ---------------------------------------------------------------------- > > --- diff --git a/gcc/config/aarch64/iterators.md > > b/gcc/config/aarch64/iterators.md index f527b2cfeb8..a06159b23ea > > 100644 > > --- a/gcc/config/aarch64/iterators.md > > +++ b/gcc/config/aarch64/iterators.md > > @@ -2278,6 +2278,7 @@ (define_mode_attr VPRED [(VNx16QI "VNx16BI") > (VNx8QI "VNx8BI") > > (VNx32BF "VNx8BI") > > (VNx16SI "VNx4BI") (VNx16SF "VNx4BI") > > (VNx8DI "VNx2BI") (VNx8DF "VNx2BI") > > + (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI") > > (V4SI "VNx4BI") (V2DI "VNx2BI")]) > > > > ;; ...and again in lower case. 
> > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c > > b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c > > new file mode 100644 > > index 00000000000..8e349efe390 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c > > @@ -0,0 +1,88 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -march=armv8.2-a+sve -fno-vect-cost-model > > +-fno-schedule-insns -fno-schedule-insns2" } */ > > +/* { dg-final { check-function-bodies "**" "" "" } } */ > > + > > +/* > > +** f_v4hi: > > +** ptrue (p[0-7]).b, all > > +** ldr d([0-9]+), \[x0\] > > +** cnt z\2.h, \1/m, z\2.h > > +** str d\2, \[x1\] > > +** ret > > +*/ > > +void > > +f_v4hi (unsigned short *__restrict b, unsigned short *__restrict d) { > > + d[0] = __builtin_popcount (b[0]); > > + d[1] = __builtin_popcount (b[1]); > > + d[2] = __builtin_popcount (b[2]); > > + d[3] = __builtin_popcount (b[3]); > > +} > > + > > +/* > > +** f_v8hi: > > +** ptrue (p[0-7]).b, all > > +** ldr q([0-9]+), \[x0\] > > +** cnt z\2.h, \1/m, z\2.h > > +** str q\2, \[x1\] > > +** ret > > +*/ > > +void > > +f_v8hi (unsigned short *__restrict b, unsigned short *__restrict d) { > > + d[0] = __builtin_popcount (b[0]); > > + d[1] = __builtin_popcount (b[1]); > > + d[2] = __builtin_popcount (b[2]); > > + d[3] = __builtin_popcount (b[3]); > > + d[4] = __builtin_popcount (b[4]); > > + d[5] = __builtin_popcount (b[5]); > > + d[6] = __builtin_popcount (b[6]); > > + d[7] = __builtin_popcount (b[7]); > > +} > > + > > +/* > > +** f_v2si: > > +** ptrue (p[0-7]).b, all > > +** ldr d([0-9]+), \[x0\] > > +** cnt z\2.s, \1/m, z\2.s > > +** str d\2, \[x1\] > > +** ret > > +*/ > > +void > > +f_v2si (unsigned int *__restrict b, unsigned int *__restrict d) { > > + d[0] = __builtin_popcount (b[0]); > > + d[1] = __builtin_popcount (b[1]); > > +} > > + > > +/* > > +** f_v4si: > > +** ptrue (p[0-7]).b, all > > +** ldr q([0-9]+), \[x0\] > > +** cnt z\2.s, \1/m, z\2.s > > +** str q\2, \[x1\] > > +** ret > > +*/ > > +void > > +f_v4si 
(unsigned int *__restrict b, unsigned int *__restrict d) { > > + d[0] = __builtin_popcount (b[0]); > > + d[1] = __builtin_popcount (b[1]); > > + d[2] = __builtin_popcount (b[2]); > > + d[3] = __builtin_popcount (b[3]); > > +} > > + > > +/* > > +** f_v2di: > > +** ptrue (p[0-7]).b, all > > +** ldr q([0-9]+), \[x0\] > > +** cnt z\2.d, \1/m, z\2.d > > +** str q\2, \[x1\] > > +** ret > > +*/ > > +void > > +f_v2di (unsigned long *__restrict b, unsigned long *__restrict d) { > > + d[0] = __builtin_popcountll (b[0]); > > + d[1] = __builtin_popcountll (b[1]); }