RE: [PATCH v5] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-27 Thread Pengxuan Zheng (QUIC)
Thanks, Richard! I've updated the patch accordingly.

https://gcc.gnu.org/pipermail/gcc-patches/2024-June/655912.html

Please let me know if any other changes are needed.

Thanks,
Pengxuan
> Sorry for the slow reply.
> 
> Pengxuan Zheng  writes:
> > This patch improves GCC’s vectorization of __builtin_popcount for
> > aarch64 target by adding popcount patterns for vector modes besides
> > QImode, i.e., HImode, SImode and DImode.
> >
> > With this patch, we now generate the following for V8HI:
> >   cnt v1.16b, v0.16b
> >   uaddlp  v2.8h, v1.16b
> >
> > For V4HI, we generate:
> >   cnt v1.8b, v0.8b
> >   uaddlp  v2.4h, v1.8b
> >
> > For V4SI, we generate:
> >   cnt v1.16b, v0.16b
> >   uaddlp  v2.8h, v1.16b
> >   uaddlp  v3.4s, v2.8h
> >
> > For V4SI with TARGET_DOTPROD, we generate the following instead:
> >   moviv0.4s, #0
> >   moviv1.16b, #1
> >   cnt v3.16b, v2.16b
> >   udotv0.4s, v3.16b, v1.16b
> >
> > For V2SI, we generate:
> >   cnt v1.8b, v.8b
> >   uaddlp  v2.4h, v1.8b
> >   uaddlp  v3.2s, v2.4h
> >
> > For V2SI with TARGET_DOTPROD, we generate the following instead:
> >   moviv0.8b, #0
> >   moviv1.8b, #1
> >   cnt v3.8b, v2.8b
> >   udotv0.2s, v3.8b, v1.8b
> >
> > For V2DI, we generate:
> >   cnt v1.16b, v.16b
> >   uaddlp  v2.8h, v1.16b
> >   uaddlp  v3.4s, v2.8h
> >   uaddlp  v4.2d, v3.4s
> >
> > For V4SI with TARGET_DOTPROD, we generate the following instead:
> >   moviv0.4s, #0
> >   moviv1.16b, #1
> >   cnt v3.16b, v2.16b
> >   udotv0.4s, v3.16b, v1.16b
> >   uaddlp  v0.2d, v0.4s
> >
> > PR target/113859
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-simd.md (aarch64_addlp):
> Rename to...
> > (@aarch64_addlp): ... This.
> > (popcount2): New define_expand.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/popcnt-udot.c: New test.
> > * gcc.target/aarch64/popcnt-vec.c: New test.
> >
> > Signed-off-by: Pengxuan Zheng 
> > ---
> >  gcc/config/aarch64/aarch64-simd.md| 51 +-
> >  .../gcc.target/aarch64/popcnt-udot.c  | 58 
> >  gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69
> > +++
> >  3 files changed, 177 insertions(+), 1 deletion(-)  create mode 100644
> > gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index 0bb39091a38..1c76123a518 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -3461,7 +3461,7 @@ (define_insn
> "*aarch64_addlv_ze"
> >[(set_attr "type" "neon_reduc_add")]
> >  )
> >
> > -(define_expand "aarch64_addlp"
> > +(define_expand "@aarch64_addlp"
> >[(set (match_operand: 0 "register_operand")
> > (plus:
> >   (vec_select:
> > @@ -3517,6 +3517,55 @@ (define_insn
> "popcount2"
> >[(set_attr "type" "neon_cnt")]
> >  )
> >
> > +(define_expand "popcount2"
> > +  [(set (match_operand:VDQHSD 0 "register_operand")
> > +(popcount:VDQHSD (match_operand:VDQHSD 1
> > +"register_operand")))]
> > +  "TARGET_SIMD"
> > +  {
> > +/* Generate a byte popcount. */
> > +machine_mode mode =  == 64 ? V8QImode : V16QImode;
> > +rtx tmp = gen_reg_rtx (mode);
> > +auto icode = optab_handler (popcount_optab, mode);
> > +emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode,
> > +operands[1])));
> > +
> > +if (TARGET_DOTPROD)
> > +  {
> > +/* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator
> and a
> > +   1 multiplicand. For V2DI, another UAADDLP is needed. */
> > +if (mode == SImode || mode == DImode)
> 
> How about combining these into a single if:
> 
>   if (TARGET_DOTPROD
>   && (mode == SImode || mode == DImode))
> 
> > +  {
> > +machine_mode dp_mode =  == 64 ? V2SImode : V4SImode;
> > +rtx ones = force_reg (mode, CONST1_RTX (mode));
> > +rtx zeros = CONST0_RTX (dp_mode);
> > +rtx dp = gen_reg_rtx (dp_mode);
> > +auto dp_icode = optab_handler (udot_prod_optab, mode);
> > +emit_move_insn (dp, zeros);
> > +emit_insn (GEN_FCN (dp_icode) (dp, tmp, ones, dp));
> > +if (mode == V2DImode)
> > +  {
> > +emit_insn (gen_aarch64_uaddlpv4si (operands[0], dp));
> > +DONE;
> > +  }
> > +emit_move_insn (operands[0], dp);
> > +DONE;
> > +  }
> 
> It's minor, but I think we should write this as something like:
> 
> {
>   rtx ones = force_reg (mode, CONST1_RTX (mode));
>   mode =  == 64 ? V2SImode : V4SImode;
>   rtx dest = mode == mode ? operands[0] : gen_reg_rtx (mode);
>   rtx zeros = force_reg (mode, CONST0_RTX (mode));
>   auto dp_icode = optab_handler (udot_prod_optab, mode);
>   emit_insn (GEN_FCN (dp_icode

Re: [PATCH v5] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-27 Thread Richard Sandiford
Sorry for the slow reply.

Pengxuan Zheng  writes:
> This patch improves GCC’s vectorization of __builtin_popcount for aarch64 
> target
> by adding popcount patterns for vector modes besides QImode, i.e., HImode,
> SImode and DImode.
>
> With this patch, we now generate the following for V8HI:
>   cnt v1.16b, v0.16b
>   uaddlp  v2.8h, v1.16b
>
> For V4HI, we generate:
>   cnt v1.8b, v0.8b
>   uaddlp  v2.4h, v1.8b
>
> For V4SI, we generate:
>   cnt v1.16b, v0.16b
>   uaddlp  v2.8h, v1.16b
>   uaddlp  v3.4s, v2.8h
>
> For V4SI with TARGET_DOTPROD, we generate the following instead:
>   moviv0.4s, #0
>   moviv1.16b, #1
>   cnt v3.16b, v2.16b
>   udotv0.4s, v3.16b, v1.16b
>
> For V2SI, we generate:
>   cnt v1.8b, v.8b
>   uaddlp  v2.4h, v1.8b
>   uaddlp  v3.2s, v2.4h
>
> For V2SI with TARGET_DOTPROD, we generate the following instead:
>   moviv0.8b, #0
>   moviv1.8b, #1
>   cnt v3.8b, v2.8b
>   udotv0.2s, v3.8b, v1.8b
>
> For V2DI, we generate:
>   cnt v1.16b, v.16b
>   uaddlp  v2.8h, v1.16b
>   uaddlp  v3.4s, v2.8h
>   uaddlp  v4.2d, v3.4s
>
> For V4SI with TARGET_DOTPROD, we generate the following instead:
>   moviv0.4s, #0
>   moviv1.16b, #1
>   cnt v3.16b, v2.16b
>   udotv0.4s, v3.16b, v1.16b
>   uaddlp  v0.2d, v0.4s
>
>   PR target/113859
>
> gcc/ChangeLog:
>
>   * config/aarch64/aarch64-simd.md (aarch64_addlp): Rename to...
>   (@aarch64_addlp): ... This.
>   (popcount2): New define_expand.
>
> gcc/testsuite/ChangeLog:
>
>   * gcc.target/aarch64/popcnt-udot.c: New test.
>   * gcc.target/aarch64/popcnt-vec.c: New test.
>
> Signed-off-by: Pengxuan Zheng 
> ---
>  gcc/config/aarch64/aarch64-simd.md| 51 +-
>  .../gcc.target/aarch64/popcnt-udot.c  | 58 
>  gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69 +++
>  3 files changed, 177 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index 0bb39091a38..1c76123a518 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3461,7 +3461,7 @@ (define_insn 
> "*aarch64_addlv_ze"
>[(set_attr "type" "neon_reduc_add")]
>  )
>  
> -(define_expand "aarch64_addlp"
> +(define_expand "@aarch64_addlp"
>[(set (match_operand: 0 "register_operand")
>   (plus:
> (vec_select:
> @@ -3517,6 +3517,55 @@ (define_insn "popcount2"
>[(set_attr "type" "neon_cnt")]
>  )
>  
> +(define_expand "popcount2"
> +  [(set (match_operand:VDQHSD 0 "register_operand")
> +(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
> +  "TARGET_SIMD"
> +  {
> +/* Generate a byte popcount. */
> +machine_mode mode =  == 64 ? V8QImode : V16QImode;
> +rtx tmp = gen_reg_rtx (mode);
> +auto icode = optab_handler (popcount_optab, mode);
> +emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode, operands[1])));
> +
> +if (TARGET_DOTPROD)
> +  {
> +/* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator 
> and a
> +   1 multiplicand. For V2DI, another UAADDLP is needed. */
> +if (mode == SImode || mode == DImode)

How about combining these into a single if:

  if (TARGET_DOTPROD
  && (mode == SImode || mode == DImode))

> +  {
> +machine_mode dp_mode =  == 64 ? V2SImode : V4SImode;
> +rtx ones = force_reg (mode, CONST1_RTX (mode));
> +rtx zeros = CONST0_RTX (dp_mode);
> +rtx dp = gen_reg_rtx (dp_mode);
> +auto dp_icode = optab_handler (udot_prod_optab, mode);
> +emit_move_insn (dp, zeros);
> +emit_insn (GEN_FCN (dp_icode) (dp, tmp, ones, dp));
> +if (mode == V2DImode)
> +  {
> +emit_insn (gen_aarch64_uaddlpv4si (operands[0], dp));
> +DONE;
> +  }
> +emit_move_insn (operands[0], dp);
> +DONE;
> +  }

It's minor, but I think we should write this as something like:

{
  rtx ones = force_reg (mode, CONST1_RTX (mode));
  mode =  == 64 ? V2SImode : V4SImode;
  rtx dest = mode == mode ? operands[0] : gen_reg_rtx (mode);
  rtx zeros = force_reg (mode, CONST0_RTX (mode));
  auto dp_icode = optab_handler (udot_prod_optab, mode);
  emit_insn (GEN_FCN (dp_icode) (dest, tmp, ones, zeros));
  tmp = dest;
}

so that we don't repeat the final SI->DI stage.

> +  }
> +
> +/* Use a sequence of UADDLPs to accumulate the counts. Each step doubles
> +   the element size and halves the number of elements. */
> +do
> +  {
> +auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE (tmp));
> +mode = insn_data[icode].operand[0].mode;
> +rtx dest = mode == mode ? operan

[PATCH v5] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-18 Thread Pengxuan Zheng
This patch improves GCC’s vectorization of __builtin_popcount for aarch64 target
by adding popcount patterns for vector modes besides QImode, i.e., HImode,
SImode and DImode.

With this patch, we now generate the following for V8HI:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b

For V4HI, we generate:
  cnt v1.8b, v0.8b
  uaddlp  v2.4h, v1.8b

For V4SI, we generate:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h

For V4SI with TARGET_DOTPROD, we generate the following instead:
  moviv0.4s, #0
  moviv1.16b, #1
  cnt v3.16b, v2.16b
  udotv0.4s, v3.16b, v1.16b

For V2SI, we generate:
  cnt v1.8b, v.8b
  uaddlp  v2.4h, v1.8b
  uaddlp  v3.2s, v2.4h

For V2SI with TARGET_DOTPROD, we generate the following instead:
  moviv0.8b, #0
  moviv1.8b, #1
  cnt v3.8b, v2.8b
  udotv0.2s, v3.8b, v1.8b

For V2DI, we generate:
  cnt v1.16b, v.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h
  uaddlp  v4.2d, v3.4s

For V4SI with TARGET_DOTPROD, we generate the following instead:
  moviv0.4s, #0
  moviv1.16b, #1
  cnt v3.16b, v2.16b
  udotv0.4s, v3.16b, v1.16b
  uaddlp  v0.2d, v0.4s

PR target/113859

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (aarch64_addlp): Rename to...
(@aarch64_addlp): ... This.
(popcount2): New define_expand.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-udot.c: New test.
* gcc.target/aarch64/popcnt-vec.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-simd.md| 51 +-
 .../gcc.target/aarch64/popcnt-udot.c  | 58 
 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69 +++
 3 files changed, 177 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 0bb39091a38..1c76123a518 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3461,7 +3461,7 @@ (define_insn 
"*aarch64_addlv_ze"
   [(set_attr "type" "neon_reduc_add")]
 )
 
-(define_expand "aarch64_addlp"
+(define_expand "@aarch64_addlp"
   [(set (match_operand: 0 "register_operand")
(plus:
  (vec_select:
@@ -3517,6 +3517,55 @@ (define_insn "popcount2"
   [(set_attr "type" "neon_cnt")]
 )
 
+(define_expand "popcount2"
+  [(set (match_operand:VDQHSD 0 "register_operand")
+(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  "TARGET_SIMD"
+  {
+/* Generate a byte popcount. */
+machine_mode mode =  == 64 ? V8QImode : V16QImode;
+rtx tmp = gen_reg_rtx (mode);
+auto icode = optab_handler (popcount_optab, mode);
+emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode, operands[1])));
+
+if (TARGET_DOTPROD)
+  {
+/* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator and a
+   1 multiplicand. For V2DI, another UAADDLP is needed. */
+if (mode == SImode || mode == DImode)
+  {
+machine_mode dp_mode =  == 64 ? V2SImode : V4SImode;
+rtx ones = force_reg (mode, CONST1_RTX (mode));
+rtx zeros = CONST0_RTX (dp_mode);
+rtx dp = gen_reg_rtx (dp_mode);
+auto dp_icode = optab_handler (udot_prod_optab, mode);
+emit_move_insn (dp, zeros);
+emit_insn (GEN_FCN (dp_icode) (dp, tmp, ones, dp));
+if (mode == V2DImode)
+  {
+emit_insn (gen_aarch64_uaddlpv4si (operands[0], dp));
+DONE;
+  }
+emit_move_insn (operands[0], dp);
+DONE;
+  }
+  }
+
+/* Use a sequence of UADDLPs to accumulate the counts. Each step doubles
+   the element size and halves the number of elements. */
+do
+  {
+auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE (tmp));
+mode = insn_data[icode].operand[0].mode;
+rtx dest = mode == mode ? operands[0] : gen_reg_rtx (mode);
+emit_insn (GEN_FCN (icode) (dest, tmp));
+tmp = dest;
+  }
+while (mode != mode);
+DONE;
+  }
+)
+
 ;; 'across lanes' max and min ops.
 
 ;; Template for outputting a scalar, so we can create __builtins which can be
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
new file mode 100644
index 000..150ff746361
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+dotprod -fno-vect-cost-model" } */
+
+/*
+** bar:
+** ldr q([0-9]+), \[x0\]
+** moviv([0-9]+).16b, 0x1
+** moviv([0-9]+).4s, 0
+** cnt v([0-9]+).16b, v\1.16b
+** udotv\3.4s, v\4.16b, v\2.16b
+** str q\3, \[x1\]
+** ret
+*/
+void
+bar (unsigned int *_