Christophe Lyon <christophe.l...@linaro.org> writes:
> This patch adds vec_unpack<US>_hi_<mode>, vec_unpack<US>_lo_<mode>,
> vec_pack_trunc_<mode> patterns for MVE.
>
> It does so by moving the unpack patterns from neon.md to
> vec-common.md, while adding MVE support to them. The pack expander is
> derived from the Neon one (which in turn is renamed to
> neon_quad_vec_pack_trunc_<mode>).
>
> The patch introduces mve_vec_pack_trunc_<mode> to avoid the
> zero-initialized temporary that would be needed if the
> vec_pack_trunc_<mode> expander called @mve_vmovn[bt]q_<supf><mode>
> instead.
>
> With this patch, we can now vectorize the 16 and 8-bit versions of
> vclz and vshl, although the generated code could still be improved.
> For test_clz_s16, we now generate
>         vldrh.16        q3, [r1]
>         vmovlb.s16      q2, q3
>         vmovlt.s16      q3, q3
>         vclz.i32        q2, q2
>         vclz.i32        q3, q3
>         vmovnb.i32      q1, q2
>         vmovnt.i32      q1, q3
>         vstrh.16        q1, [r0]
> which could be improved to
>         vldrh.16        q3, [r1]
>         vclz.i16        q1, q3
>         vstrh.16        q1, [r0]
> if we could avoid the need for unpack/pack steps.

Yeah, there was a PR about fixing this for popcount.  I guess the same
approach would apply here too.
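
(FWIW, for the unsigned case the widening really is redundant, since

    clz32 ((uint32_t) x) == 16 + clz16 (x)

holds for every 16-bit x, given that the vector forms define clz of 0
as the element width.  The signed case needs a bit more care because
the operand is promoted to int.)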

> For reference, clang-12 generates:
>       vldrh.s32       q0, [r1]
>       vldrh.s32       q1, [r1, #8]
>       vclz.i32        q0, q0
>       vstrh.32        q0, [r0]
>       vclz.i32        q0, q1
>       vstrh.32        q0, [r0, #8]
>
> 2021-06-03  Christophe Lyon  <christophe.l...@linaro.org>
>
>       gcc/
>       * config/arm/mve.md (mve_vmovltq_<supf><mode>): Prefix with '@'.
>       (mve_vmovlbq_<supf><mode>): Likewise.
>       (mve_vmovnbq_<supf><mode>): Likewise.
>       (mve_vmovntq_<supf><mode>): Likewise.
>       (@mve_vec_pack_trunc_<mode>): New pattern.
>       * config/arm/neon.md (vec_unpack<US>_hi_<mode>): Move to
>       vec-common.md.
>       (vec_unpack<US>_lo_<mode>): Likewise.
>       (vec_pack_trunc_<mode>): Rename to
>       neon_quad_vec_pack_trunc_<mode>.
>       * config/arm/vec-common.md (vec_unpack<US>_hi_<mode>): New
>       pattern.
>       (vec_unpack<US>_lo_<mode>): New.
>       (vec_pack_trunc_<mode>): New.
>
>       gcc/testsuite/
>       * gcc.target/arm/simd/mve-vclz.c: Update expected results.
>       * gcc.target/arm/simd/mve-vshl.c: Likewise.
> ---
>  gcc/config/arm/mve.md                        | 20 ++++-
>  gcc/config/arm/neon.md                       | 39 +--------
>  gcc/config/arm/vec-common.md                 | 89 ++++++++++++++++++++
>  gcc/testsuite/gcc.target/arm/simd/mve-vclz.c |  7 +-
>  gcc/testsuite/gcc.target/arm/simd/mve-vshl.c |  5 +-
>  5 files changed, 114 insertions(+), 46 deletions(-)
>
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index 99e46d0bc69..b18292c07d3 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -510,7 +510,7 @@ (define_insn "mve_vrev32q_<supf><mode>"
>  ;;
>  ;; [vmovltq_u, vmovltq_s])
>  ;;
> -(define_insn "mve_vmovltq_<supf><mode>"
> +(define_insn "@mve_vmovltq_<supf><mode>"
>    [
>     (set (match_operand:<V_double_width> 0 "s_register_operand" "=w")
>       (unspec:<V_double_width> [(match_operand:MVE_3 1 "s_register_operand" "w")]
> @@ -524,7 +524,7 @@ (define_insn "mve_vmovltq_<supf><mode>"
>  ;;
>  ;; [vmovlbq_s, vmovlbq_u])
>  ;;
> -(define_insn "mve_vmovlbq_<supf><mode>"
> +(define_insn "@mve_vmovlbq_<supf><mode>"
>    [
>     (set (match_operand:<V_double_width> 0 "s_register_operand" "=w")
>       (unspec:<V_double_width> [(match_operand:MVE_3 1 "s_register_operand" "w")]
> @@ -2187,7 +2187,7 @@ (define_insn "mve_vmlsldavxq_s<mode>"
>  ;;
>  ;; [vmovnbq_u, vmovnbq_s])
>  ;;
> -(define_insn "mve_vmovnbq_<supf><mode>"
> +(define_insn "@mve_vmovnbq_<supf><mode>"
>    [
>     (set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w")
>       (unspec:<V_narrow_pack> [(match_operand:<V_narrow_pack> 1 "s_register_operand" "0")
> @@ -2202,7 +2202,7 @@ (define_insn "mve_vmovnbq_<supf><mode>"
>  ;;
>  ;; [vmovntq_s, vmovntq_u])
>  ;;
> -(define_insn "mve_vmovntq_<supf><mode>"
> +(define_insn "@mve_vmovntq_<supf><mode>"
>    [
>     (set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w")
>       (unspec:<V_narrow_pack> [(match_operand:<V_narrow_pack> 1 "s_register_operand" "0")
> @@ -2214,6 +2214,18 @@ (define_insn "mve_vmovntq_<supf><mode>"
>    [(set_attr "type" "mve_move")
>  ])
>  
> +(define_insn "@mve_vec_pack_trunc_<mode>"
> + [(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w")
> +       (vec_concat:<V_narrow_pack>
> +             (truncate:<V_narrow>
> +                     (match_operand:MVE_5 1 "register_operand" "w"))
> +             (truncate:<V_narrow>
> +                     (match_operand:MVE_5 2 "register_operand" "w"))))]
> + "TARGET_HAVE_MVE"
> + "vmovnb.i<V_sz_elem>        %q0, %q1\;vmovnt.i<V_sz_elem>   %q0, %q2"
> +  [(set_attr "type" "mve_move")]
> +)
> +

I realise this is (like you say) based on the neon.md pattern,
but we should use separate vmovnb and vmovnt instructions instead
of putting two instructions into a single pattern.

One specific advantage to using separate patterns is that it would
avoid the imprecision of the earlyclobber: the output only conflicts
with operand 2 (which is still live after the first instruction) and
can be tied to operand 1.
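
For reference, the expander variant you describe as needing the
zero-initialised temporary, i.e. calling the existing
@mve_vmovn[bt]q_<supf><mode> patterns via the '@' builders this patch
adds, might look like this (completely untested; the s/u distinction
doesn't matter for a plain truncation):

    rtx tmp = gen_reg_rtx (<V_narrow_pack>mode);
    /* Initialise TMP only to satisfy dataflow on the "0" ties below;
       the vmovnb/vmovnt pair writes every lane of TMP.  */
    emit_move_insn (tmp, CONST0_RTX (<V_narrow_pack>mode));
    emit_insn (gen_mve_vmovnbq (VMOVNBQ_S, <MODE>mode, tmp, tmp,
                                operands[1]));
    emit_insn (gen_mve_vmovntq (VMOVNTQ_S, <MODE>mode, tmp, tmp,
                                operands[2]));
    emit_move_insn (operands[0], tmp);
    DONE;

Separate patterns would also let the register allocator see the
precise conflicts, as noted above.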

>  ;;
>  ;; [vmulq_f])
>  ;;
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index 0fdffaf4ec4..392d9607919 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -5924,43 +5924,6 @@ (define_insn "neon_vec_unpack<US>_hi_<mode>"
>    [(set_attr "type" "neon_shift_imm_long")]
>  )
>  
> -(define_expand "vec_unpack<US>_hi_<mode>"
> -  [(match_operand:<V_unpack> 0 "register_operand")
> -   (SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
> - "TARGET_NEON && !BYTES_BIG_ENDIAN"
> -  {
> -   rtvec v = rtvec_alloc (<V_mode_nunits>/2)  ;
> -   rtx t1;
> -   int i;
> -   for (i = 0; i < (<V_mode_nunits>/2); i++)
> -     RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
> -  
> -   t1 = gen_rtx_PARALLEL (<MODE>mode, v);
> -   emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0], 
> -                                                 operands[1], 
> -                                              t1));
> -   DONE;
> -  }
> -)
> -
> -(define_expand "vec_unpack<US>_lo_<mode>"
> -  [(match_operand:<V_unpack> 0 "register_operand")
> -   (SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
> - "TARGET_NEON && !BYTES_BIG_ENDIAN"
> -  {
> -   rtvec v = rtvec_alloc (<V_mode_nunits>/2)  ;
> -   rtx t1;
> -   int i;
> -   for (i = 0; i < (<V_mode_nunits>/2) ; i++)
> -     RTVEC_ELT (v, i) = GEN_INT (i);
> -   t1 = gen_rtx_PARALLEL (<MODE>mode, v);
> -   emit_insn (gen_neon_vec_unpack<US>_lo_<mode> (operands[0], 
> -                                                 operands[1], 
> -                                              t1));
> -   DONE;
> -  }
> -)
> -
>  (define_insn "neon_vec_<US>mult_lo_<mode>"
>   [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
>         (mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
> @@ -6176,7 +6139,7 @@ (define_expand "vec_widen_<US>shiftl_lo_<mode>"
>  ; because the ordering of vector elements in Q registers is different from what
>  ; the semantics of the instructions require.
>  
> -(define_insn "vec_pack_trunc_<mode>"
> +(define_insn "neon_quad_vec_pack_trunc_<mode>"
>   [(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w")
>         (vec_concat:<V_narrow_pack> 
>               (truncate:<V_narrow> 
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> index 1ba1e5eb008..0ffc7a9322c 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -638,3 +638,92 @@ (define_expand "clz<mode>2"
>      emit_insn (gen_mve_vclzq_s (<MODE>mode, operands[0], operands[1]));
>    DONE;
>  })
> +
> +;; vmovl[tb] are not available for V4SI on MVE
> +(define_expand "vec_unpack<US>_hi_<mode>"
> +  [(match_operand:<V_unpack> 0 "register_operand")
> +   (SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
> + "ARM_HAVE_<MODE>_ARITH
> +  && !TARGET_REALLY_IWMMXT
> +  && ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE)
> +  && !BYTES_BIG_ENDIAN"
> +  {
> +    if (TARGET_NEON)
> +      {
> +     rtvec v = rtvec_alloc (<V_mode_nunits>/2);
> +     rtx t1;
> +     int i;
> +     for (i = 0; i < (<V_mode_nunits>/2); i++)
> +       RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
> +
> +     t1 = gen_rtx_PARALLEL (<MODE>mode, v);
> +     emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0],
> +                                                   operands[1],
> +                                                   t1));
> +      }
> +    else
> +      {
> +     emit_insn (gen_mve_vmovltq (VMOVLTQ_S, <MODE>mode, operands[0],
> +                                 operands[1]));

It would be good to use the same rtx pattern for the MVE instructions,
since the compiler can optimise it better than an unspec.

It would be good to have separate tests for the packs and unpacks
as well, in case the vclz thing is fixed in future.
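
Something like this (just a sketch, function names made up) would
exercise the pack and unpack in isolation:

    #include <stdint.h>

    /* Narrowing copy: vec_pack_trunc, i.e. vmovnb/vmovnt on MVE.  */
    void
    pack_s32_s16 (int16_t *dest, int32_t *src, int n)
    {
      for (int i = 0; i < n; i++)
        dest[i] = src[i];
    }

    /* Widening copy: vec_unpack, i.e. vmovlb/vmovlt on MVE.  */
    void
    unpack_s16_s32 (int32_t *dest, int16_t *src, int n)
    {
      for (int i = 0; i < n; i++)
        dest[i] = src[i];
    }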

Thanks,
Richard

> +      }
> +    DONE;
> +  }
> +)
> +
> +;; vmovl[tb] are not available for V4SI on MVE
> +(define_expand "vec_unpack<US>_lo_<mode>"
> +  [(match_operand:<V_unpack> 0 "register_operand")
> +   (SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
> + "ARM_HAVE_<MODE>_ARITH
> +  && !TARGET_REALLY_IWMMXT
> +  && ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE)
> +  && !BYTES_BIG_ENDIAN"
> +  {
> +    if (TARGET_NEON)
> +      {
> +     rtvec v = rtvec_alloc (<V_mode_nunits>/2);
> +     rtx t1;
> +     int i;
> +     for (i = 0; i < (<V_mode_nunits>/2) ; i++)
> +       RTVEC_ELT (v, i) = GEN_INT (i);
> +
> +     t1 = gen_rtx_PARALLEL (<MODE>mode, v);
> +     emit_insn (gen_neon_vec_unpack<US>_lo_<mode> (operands[0],
> +                                                   operands[1],
> +                                                   t1));
> +      }
> +    else
> +      {
> +     emit_insn (gen_mve_vmovlbq (VMOVLBQ_S, <MODE>mode, operands[0],
> +                                 operands[1]));
> +      }
> +    DONE;
> +  }
> +)
> +
> +;; vmovn[tb] are not available for V2DI on MVE
> +(define_expand "vec_pack_trunc_<mode>"
> + [(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w")
> +       (vec_concat:<V_narrow_pack>
> +             (truncate:<V_narrow>
> +                     (match_operand:VN 1 "register_operand" "w"))
> +             (truncate:<V_narrow>
> +                     (match_operand:VN 2 "register_operand" "w"))))]
> + "ARM_HAVE_<MODE>_ARITH
> +  && !TARGET_REALLY_IWMMXT
> +  && ! (<MODE>mode == V2DImode && TARGET_HAVE_MVE)
> +  && !BYTES_BIG_ENDIAN"
> + {
> +   if (TARGET_NEON)
> +     {
> +       emit_insn (gen_neon_quad_vec_pack_trunc_<mode> (operands[0], operands[1],
> +                                                    operands[2]));
> +     }
> +   else
> +     {
> +       emit_insn (gen_mve_vec_pack_trunc (<MODE>mode, operands[0], operands[1],
> +                                       operands[2]));
> +     }
> +   DONE;
> + }
> +)
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c b/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c
> index 7068736bc28..5d6e991cfc6 100644
> --- a/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c
> @@ -21,8 +21,9 @@ FUNC(u, uint, 16, clz)
>  FUNC(s, int, 8, clz)
>  FUNC(u, uint, 8, clz)
>  
> -/* 16 and 8-bit versions are not vectorized because they need pack/unpack
> -   patterns since __builtin_clz uses 32-bit parameter and return value.  */
> -/* { dg-final { scan-assembler-times {vclz\.i32  q[0-9]+, q[0-9]+} 2 } } */
> +/* 16 and 8-bit versions still use 32-bit intermediate temporaries, so for
> +   instance instead of using vclz.i8, we need 4 vclz.i32, leading to a total of
> +   14 vclz.i32 expected in this testcase.  */
> +/* { dg-final { scan-assembler-times {vclz\.i32  q[0-9]+, q[0-9]+} 14 } } */
>  /* { dg-final { scan-assembler-times {vclz\.i16  q[0-9]+, q[0-9]+} 2 { xfail *-*-* } } } */
>  /* { dg-final { scan-assembler-times {vclz\.i8  q[0-9]+, q[0-9]+} 2 { xfail *-*-* } } } */
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> index 7a0644997c8..91dd942d818 100644
> --- a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
> @@ -56,7 +56,10 @@ FUNC_IMM(u, uint, 8, 16, <<, vshlimm)
>  /* MVE has only 128-bit vectors, so we can vectorize only half of the
>     functions above.  */
>  /* We only emit vshl.u, which is equivalent to vshl.s anyway.  */
> -/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 2 } } */
> +/* 16 and 8-bit versions still use 32-bit intermediate temporaries, so for
> +   instance instead of using vshl.u8, we need 4 vshl.i32, leading to a total of
> +   14 vshl.i32 expected in this testcase.  */
> +/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 14 } } */
>  
>  /* We emit vshl.i when the shift amount is an immediate.  */
>  /* { dg-final { scan-assembler-times {vshl.i[0-9]+\tq[0-9]+, q[0-9]+} 6 } } */