> Sorry for the slow review.
> 
> Pengxuan Zheng <quic_pzh...@quicinc.com> writes:
> > This patch improves the Advanced SIMD popcount expansion by using SVE
> > if available.
> >
> > For example, GCC currently generates the following code sequence for V2DI:
> >   cnt     v31.16b, v31.16b
> >   uaddlp  v31.8h, v31.16b
> >   uaddlp  v31.4s, v31.8h
> >   uaddlp  v31.2d, v31.4s
> >
> > However, by using SVE, we can generate the following sequence instead:
> >   ptrue   p7.b, all
> >   cnt     z31.d, p7/m, z31.d
> >
> > Similar improvements can be made for V4HI, V8HI, V2SI and V4SI too.
> >
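
For reference, a minimal source example (essentially the f_v2di case from the
new test below, with an illustrative function name) that exercises this V2DI
path under the test's flags (-O2 -march=armv8.2-a+sve -fno-vect-cost-model):

    void
    popcount_pair (unsigned long *__restrict b, unsigned long *__restrict d)
    {
      /* Under the flags above, the vectorizer combines these two lane-wise
         popcounts into a single V2DI popcount, which the new expansion turns
         into the ptrue + cnt sequence shown above.  */
      d[0] = __builtin_popcountll (b[0]);
      d[1] = __builtin_popcountll (b[1]);
    }
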
> > The scalar popcount expansion can also be improved similarly by using
> > SVE and those changes will be included in a separate patch.
> >
> >     PR target/113860
> >
> > gcc/ChangeLog:
> >
> >     * config/aarch64/aarch64-simd.md (popcount<mode>2): Add TARGET_SVE
> >     support.
> >     * config/aarch64/aarch64-sve.md (@aarch64_pred_popcount<mode>): New
> >     insn.
> >     * config/aarch64/iterators.md (VPRED): Add V4HI, V8HI and V2SI.
> >
> > gcc/testsuite/ChangeLog:
> >
> >     * gcc.target/aarch64/popcnt-sve.c: New test.
> >
> > Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com>
> > ---
> >  gcc/config/aarch64/aarch64-simd.md            |  9 ++
> >  gcc/config/aarch64/aarch64-sve.md             | 12 +++
> >  gcc/config/aarch64/iterators.md               |  1 +
> >  gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 88 +++++++++++++++++++
> >  4 files changed, 110 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> > index bbeee221f37..895d6e5eab5 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -3508,6 +3508,15 @@ (define_expand "popcount<mode>2"
> >     (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
> >    "TARGET_SIMD"
> >    {
> > +    if (TARGET_SVE)
> > +      {
> > +   rtx p = aarch64_ptrue_reg (<VPRED>mode);
> > +   emit_insn (gen_aarch64_pred_popcount<mode> (operands[0],
> > +                                               p,
> > +                                               operands[1]));
> > +   DONE;
> > +      }
> > +
> >      /* Generate a byte popcount.  */
> >      machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
> >      rtx tmp = gen_reg_rtx (mode);
> > diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> > index 5331e7121d5..b5021dd2da0 100644
> > --- a/gcc/config/aarch64/aarch64-sve.md
> > +++ b/gcc/config/aarch64/aarch64-sve.md
> > @@ -3168,6 +3168,18 @@ (define_insn "*cond_<optab><mode>_any"
> >    }
> >  )
> >
> > +;; Popcount predicated with a PTRUE.
> > +(define_insn "@aarch64_pred_popcount<mode>"
> > +  [(set (match_operand:VDQHSD 0 "register_operand" "=w")
> > +   (unspec:VDQHSD
> > +     [(match_operand:<VPRED> 1 "register_operand" "Upl")
> > +      (popcount:VDQHSD
> > +        (match_operand:VDQHSD 2 "register_operand" "0"))]
> > +     UNSPEC_PRED_X))]
> > +  "TARGET_SVE"
> > +  "cnt\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>"
> > +)
> > +
> 
> Could you instead change:
> 
> (define_insn "@aarch64_pred_<optab><mode>"
>   [(set (match_operand:SVE_I 0 "register_operand")
>       (unspec:SVE_I
>         [(match_operand:<VPRED> 1 "register_operand")
>          (SVE_INT_UNARY:SVE_I
>            (match_operand:SVE_I 2 "register_operand"))]
>         UNSPEC_PRED_X))]
>   "TARGET_SVE"
>   {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
>      [ w        , Upl , 0 ; *              ] <sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>
>      [ ?&w      , Upl , w ; yes            ] movprfx\t%0, %2\;<sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>
>   }
> )
> 
> to use a new iterator SVE_VDQ_I, defined as:
> 
> (define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I])
> 
> ?  That will give the benefit of the movprfx handling and avoid code
> duplication.  It will define some patterns that are initially unused,
> but that's ok.  I think the direction of travel would be to use some
> of the others eventually.
> 
> OK with that change if there are no other comments in 24 hours.

Thanks, Richard. Here's the patch updated according to your feedback.
https://gcc.gnu.org/pipermail/gcc-patches/2024-August/658929.html

I'll commit it if there are no other comments in 24 hours.

Thanks,
Pengxuan
> 
> Thanks,
> Richard
> 
> >  ;; -------------------------------------------------------------------------
> >  ;; ---- [INT] General unary arithmetic corresponding to unspecs
> >  ;; -------------------------------------------------------------------------
> > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> > index f527b2cfeb8..a06159b23ea 100644
> > --- a/gcc/config/aarch64/iterators.md
> > +++ b/gcc/config/aarch64/iterators.md
> > @@ -2278,6 +2278,7 @@ (define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI "VNx8BI")
> >                      (VNx32BF "VNx8BI")
> >                      (VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
> >                      (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
> > +                    (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
> >                      (V4SI "VNx4BI") (V2DI "VNx2BI")])
> >
> >  ;; ...and again in lower case.
> > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
> > new file mode 100644
> > index 00000000000..8e349efe390
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
> > @@ -0,0 +1,88 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -march=armv8.2-a+sve -fno-vect-cost-model -fno-schedule-insns -fno-schedule-insns2" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +/*
> > +** f_v4hi:
> > +** ptrue   (p[0-7]).b, all
> > +** ldr     d([0-9]+), \[x0\]
> > +** cnt     z\2.h, \1/m, z\2.h
> > +** str     d\2, \[x1\]
> > +** ret
> > +*/
> > +void
> > +f_v4hi (unsigned short *__restrict b, unsigned short *__restrict d)
> > +{
> > +  d[0] = __builtin_popcount (b[0]);
> > +  d[1] = __builtin_popcount (b[1]);
> > +  d[2] = __builtin_popcount (b[2]);
> > +  d[3] = __builtin_popcount (b[3]);
> > +}
> > +
> > +/*
> > +** f_v8hi:
> > +** ptrue   (p[0-7]).b, all
> > +** ldr     q([0-9]+), \[x0\]
> > +** cnt     z\2.h, \1/m, z\2.h
> > +** str     q\2, \[x1\]
> > +** ret
> > +*/
> > +void
> > +f_v8hi (unsigned short *__restrict b, unsigned short *__restrict d)
> > +{
> > +  d[0] = __builtin_popcount (b[0]);
> > +  d[1] = __builtin_popcount (b[1]);
> > +  d[2] = __builtin_popcount (b[2]);
> > +  d[3] = __builtin_popcount (b[3]);
> > +  d[4] = __builtin_popcount (b[4]);
> > +  d[5] = __builtin_popcount (b[5]);
> > +  d[6] = __builtin_popcount (b[6]);
> > +  d[7] = __builtin_popcount (b[7]);
> > +}
> > +
> > +/*
> > +** f_v2si:
> > +** ptrue   (p[0-7]).b, all
> > +** ldr     d([0-9]+), \[x0\]
> > +** cnt     z\2.s, \1/m, z\2.s
> > +** str     d\2, \[x1\]
> > +** ret
> > +*/
> > +void
> > +f_v2si (unsigned int *__restrict b, unsigned int *__restrict d)
> > +{
> > +  d[0] = __builtin_popcount (b[0]);
> > +  d[1] = __builtin_popcount (b[1]);
> > +}
> > +
> > +/*
> > +** f_v4si:
> > +** ptrue   (p[0-7]).b, all
> > +** ldr     q([0-9]+), \[x0\]
> > +** cnt     z\2.s, \1/m, z\2.s
> > +** str     q\2, \[x1\]
> > +** ret
> > +*/
> > +void
> > +f_v4si (unsigned int *__restrict b, unsigned int *__restrict d)
> > +{
> > +  d[0] = __builtin_popcount (b[0]);
> > +  d[1] = __builtin_popcount (b[1]);
> > +  d[2] = __builtin_popcount (b[2]);
> > +  d[3] = __builtin_popcount (b[3]);
> > +}
> > +
> > +/*
> > +** f_v2di:
> > +** ptrue   (p[0-7]).b, all
> > +** ldr     q([0-9]+), \[x0\]
> > +** cnt     z\2.d, \1/m, z\2.d
> > +** str     q\2, \[x1\]
> > +** ret
> > +*/
> > +void
> > +f_v2di (unsigned long *__restrict b, unsigned long *__restrict d)
> > +{
> > +  d[0] = __builtin_popcountll (b[0]);
> > +  d[1] = __builtin_popcountll (b[1]);
> > +}
