On Mon, 14 Jun 2021 at 18:01, Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> Christophe Lyon <christophe.l...@linaro.org> writes:
> > On Fri, 11 Jun 2021 at 18:25, Richard Sandiford
> > <richard.sandif...@arm.com> wrote:
> >>
> >> Christophe Lyon <christophe.l...@linaro.org> writes:
> >> > Thanks for the feedback. How about v2 attached?
> >> > Do you want me to merge neon_vec_unpack<US> and
> >> > mve_vec_unpack<US>  and only have different assembly?
> >> > if (TARGET_HAVE_MVE)
> >> >   return "vmovlb.<US>%#<V_sz_elem> %q0, %q1";
> >> > else
> >> >   return "vmovlb.<US>%#<V_sz_elem> %q0, %q1";
> >>
> >> I think it'd be better to keep them separate, given that they have
> >> different attributes.
> >>
> >> > @@ -2199,10 +2222,23 @@ (define_insn "mve_vmovnbq_<supf><mode>"
> >> >    [(set_attr "type" "mve_move")
> >> >  ])
> >> >
> >> > +;; vmovnb pattern used by the vec_pack_trunc expander to avoid the
> >> > +;; need for an extra register.
> >>
> >> “to avoid the need for an uninitialized input operand” might
> >> be clearer.
> >>
> > Done
> >
> >> > +(define_insn "@mve_vec_pack_trunc_lo_<mode>"
> >> > +  [
> >> > +   (set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w")
> >> > +     (unspec:<V_narrow_pack> [(match_operand:MVE_5 1 "s_register_operand" "w")]
> >> > +      VMOVNBQ_S))
> >> > +  ]
> >> > +  "TARGET_HAVE_MVE"
> >> > +  "vmovnb.i%#<V_sz_elem>     %q0, %q1"
> >> > +  [(set_attr "type" "mve_move")
> >> > +])
> >> > +
> >> > […]
> >> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> >> > index 430a92ce966..3afb3e1d891 100644
> >> > --- a/gcc/config/arm/vec-common.md
> >> > +++ b/gcc/config/arm/vec-common.md
> >> > @@ -632,3 +632,91 @@ (define_expand "clz<mode>2"
> >> >    "ARM_HAVE_<MODE>_ARITH
> >> >     && !TARGET_REALLY_IWMMXT"
> >> >  )
> >> > +
> >> > +;; vmovl[tb] are not available for V4SI on MVE
> >> > +(define_expand "vec_unpack<US>_hi_<mode>"
> >> > +  [(match_operand:<V_unpack> 0 "register_operand")
> >> > +   (SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
> >> > + "ARM_HAVE_<MODE>_ARITH
> >> > +  && !TARGET_REALLY_IWMMXT
> >> > +  && ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE)
> >> > +  && !BYTES_BIG_ENDIAN"
> >> > +  {
> >> > +    rtvec v = rtvec_alloc (<V_mode_nunits>/2);
> >> > +    rtx t1;
> >> > +    int i;
> >> > +    for (i = 0; i < (<V_mode_nunits>/2); i++)
> >> > +      RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
> >> > +
> >> > +    t1 = gen_rtx_PARALLEL (<MODE>mode, v);
> >> > +
> >> > +    if (TARGET_NEON)
> >> > +      emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0],
> >> > +                                                 operands[1],
> >> > +                                                 t1));
> >> > +    else
> >> > +      emit_insn (gen_mve_vec_unpack<US>_hi_<mode> (operands[0],
> >> > +                                                operands[1],
> >> > +                                                t1));
> >> > +    DONE;
> >> > +  }
> >> > +)
> >>
> >> Since the patterns are the same for both MVE and Neon (a good thing!),
> >> it would be simpler to write this as:
> >>
> >> (define_insn "mve_vec_unpack<US>_lo_<mode>"
> > I guess you mean define_expand?
>
> Yes, sorry.
>
> >>   [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
> >>         (SE:<V_unpack> (vec_select:<V_HALF>
> >>                           (match_operand:VU 1 "register_operand" "w")
> >>                           (match_dup 2))))]
> >>   …
> >>
> >> and then set operands[2] to what is t1 above.  There would then be
> >> no need to call emit_insn & DONE, and we could use the natural
> >> iterator (MVE_3) for the MVE patterns.
> >>
> >> Same idea for the lo version.
> >>
> >> > […]
> >> > +;; vmovn[tb] are not available for V2DI on MVE
> >> > +(define_expand "vec_pack_trunc_<mode>"
> >> > + [(set (match_operand:<V_narrow_pack> 0 "register_operand")
> >> > +       (vec_concat:<V_narrow_pack>
> >> > +             (truncate:<V_narrow>
> >> > +                     (match_operand:VN 1 "register_operand"))
> >> > +             (truncate:<V_narrow>
> >> > +                     (match_operand:VN 2 "register_operand"))))]
> >> > + "ARM_HAVE_<MODE>_ARITH
> >> > +  && !TARGET_REALLY_IWMMXT
> >> > +  && ! (<MODE>mode == V2DImode && TARGET_HAVE_MVE)
> >> > +  && !BYTES_BIG_ENDIAN"
> >> > + {
> >> > +   if (TARGET_NEON)
> >> > +     {
> >> > +       emit_insn (gen_neon_quad_vec_pack_trunc_<mode> (operands[0], operands[1],
> >> > +                                                    operands[2]));
> >> > +     }
> >> > +   else
> >> > +     {
> >> > +       rtx tmpreg = gen_reg_rtx (<V_narrow_pack>mode);
> >> > +       emit_insn (gen_mve_vec_pack_trunc_lo (<MODE>mode, tmpreg, operands[1]));
> >> > +       emit_insn (gen_mve_vmovntq (VMOVNTQ_S, <MODE>mode,
> >> > +                                        tmpreg, tmpreg, operands[2]));
> >> > +       emit_move_insn (operands[0], tmpreg);
> >>
> >> vmovntq can assign directly to operands[0], without a separate move.
> >>
> >> OK with those changes, thanks.
> >
> > Just to be sure, here is v3 which hopefully matches what you meant.
>
> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > index 430a92ce966..1d6d4379855 100644
> > --- a/gcc/config/arm/vec-common.md
> > +++ b/gcc/config/arm/vec-common.md
> > @@ -632,3 +632,78 @@ (define_expand "clz<mode>2"
> >    "ARM_HAVE_<MODE>_ARITH
> >     && !TARGET_REALLY_IWMMXT"
> >  )
> > +
> > +;; vmovl[tb] are not available for V4SI on MVE
> > +(define_expand "vec_unpack<US>_hi_<mode>"
> > +  [(set (match_operand:<V_unpack> 0 "register_operand")
> > +       (SE:<V_unpack> (vec_select:<V_HALF>
> > +                             (match_operand:VU 1 "register_operand")
> > +                     (match_dup 2))))]
>
> The formatting of the vec_unpack<US>_lo_<mode> pattern looks nicer.
>
> > + "ARM_HAVE_<MODE>_ARITH
> > +  && !TARGET_REALLY_IWMMXT
> > +  && ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE)
> > +  && !BYTES_BIG_ENDIAN"
> > +  {
> > +    rtvec v = rtvec_alloc (<V_mode_nunits>/2);
> > +    rtx t1;
> > +    int i;
> > +    for (i = 0; i < (<V_mode_nunits>/2); i++)
> > +      RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
> > +
> > +    t1 = gen_rtx_PARALLEL (<MODE>mode, v);
> > +
> > +    operands[2] = t1;
>
> t1 doesn't seem worth it now, might as well assign directly to
> operands[2].
>
> > […]
> > +       emit_insn (gen_mve_vmovntq (VMOVNTQ_S, <MODE>mode,
> > +                                        operands[0], tmpreg, operands[2]));
>
> The indentation looks odd here (operands[0] should line up with
> VMOVNTQ_S), but that might be a mailer issue.
>

That was the intent, but there was a mix of tabs and spaces; it is
now fixed (here and elsewhere).

I have attached what I have committed.

Thanks
Christophe

> > +     }
> > +   DONE;
>
>
> OK with those changes, thanks.
>
> Richard
From d10a87dd7b8ce3b3cccb0cb5c0b9ee21e085d19d Mon Sep 17 00:00:00 2001
From: Christophe Lyon <christophe.l...@linaro.org>
Date: Thu, 3 Jun 2021 14:35:50 +0000
Subject: [PATCH v4] arm: Auto-vectorization for MVE: add pack/unpack patterns

This patch adds vec_unpack<US>_hi_<mode>, vec_unpack<US>_lo_<mode>,
vec_pack_trunc_<mode> patterns for MVE.

It does so by moving the unpack patterns from neon.md to
vec-common.md, extending them to support MVE. The pack expander is
derived from the Neon one (which in turn is renamed to
neon_quad_vec_pack_trunc_<mode>).

The patch introduces mve_vec_unpack<US>_lo_<mode> and
mve_vec_unpack<US>_hi_<mode> which are similar to their Neon
counterparts, except for the assembly syntax.

The patch introduces mve_vec_pack_trunc_lo_<mode> to avoid the
zero-initialized temporary that would be needed if the
vec_pack_trunc_<mode> expander called @mve_vmovn[bt]q_<supf><mode>
instead.
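
(For illustration only: the existing vmovnbq insn ties an extra
<V_narrow_pack> input to its output, so calling it from the expander
would need something like the sketch below, where `tmp' exists only
to satisfy the tied operand. This is not part of the patch, and it
assumes mve_vmovnbq were also '@'-prefixed like mve_vmovntq is.)

  rtx tmp = gen_reg_rtx (<V_narrow_pack>mode);
  /* Extra step: the input tied to the output must be initialized.  */
  emit_move_insn (tmp, CONST0_RTX (<V_narrow_pack>mode));
  emit_insn (gen_mve_vmovnbq (VMOVNBQ_S, <MODE>mode,
			      tmp, tmp, operands[1]));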

With this patch, we can now vectorize the 16 and 8-bit versions of
vclz and vshl, although the generated code could still be improved.
For test_clz_s16, we now generate
	vldrh.16	q3, [r1]
	vmovlb.s16	q2, q3
	vmovlt.s16	q3, q3
	vclz.i32	q2, q2
	vclz.i32	q3, q3
	vmovnb.i32	q1, q2
	vmovnt.i32	q1, q3
	vstrh.16	q1, [r0]
which could be improved to
	vldrh.16	q3, [r1]
	vclz.i16	q1, q3
	vstrh.16	q1, [r0]
if we could avoid the need for unpack/pack steps.

For reference, clang-12 generates:
	vldrh.s32       q0, [r1]
	vldrh.s32       q1, [r1, #8]
	vclz.i32        q0, q0
	vstrh.32        q0, [r0]
	vclz.i32        q0, q1
	vstrh.32        q0, [r0, #8]
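
For reference, test_clz_s16 is generated by the FUNC macro in
mve-vclz.c; expanded by hand it is roughly the sketch below (the trip
count N is illustrative). __builtin_clz takes and returns 32-bit
values, which is what forces the unpack/pack steps around the 32-bit
vclz.

  #include <stdint.h>

  #define N 64  /* illustrative trip count */

  void
  test_clz_s16 (int16_t * __restrict__ dest, int16_t *a)
  {
    int i;
    for (i = 0; i < N; i++)
      /* a[i] is promoted to 32 bits for __builtin_clz; the 32-bit
	 result is truncated back to 16 bits by the store.  */
      dest[i] = __builtin_clz (a[i]);
  }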

2021-06-11  Christophe Lyon  <christophe.l...@linaro.org>

	gcc/
	* config/arm/mve.md (mve_vec_unpack<US>_lo_<mode>): New pattern.
	(mve_vec_unpack<US>_hi_<mode>): New pattern.
	(@mve_vec_pack_trunc_lo_<mode>): New pattern.
	(mve_vmovntq_<supf><mode>): Prefix with '@'.
	* config/arm/neon.md (vec_unpack<US>_hi_<mode>): Move to
	vec-common.md.
	(vec_unpack<US>_lo_<mode>): Likewise.
	(vec_pack_trunc_<mode>): Rename to
	neon_quad_vec_pack_trunc_<mode>.
	* config/arm/vec-common.md (vec_unpack<US>_hi_<mode>): New
	pattern.
	(vec_unpack<US>_lo_<mode>): New.
	(vec_pack_trunc_<mode>): New.

	gcc/testsuite/
	* gcc.target/arm/simd/mve-vclz.c: Update expected results.
	* gcc.target/arm/simd/mve-vshl.c: Likewise.
	* gcc.target/arm/simd/mve-vec-pack.c: New test.
	* gcc.target/arm/simd/mve-vec-unpack.c: New test.
---
 gcc/config/arm/mve.md                         | 35 +++++++++-
 gcc/config/arm/neon.md                        | 39 +----------
 gcc/config/arm/vec-common.md                  | 70 +++++++++++++++++++
 gcc/testsuite/gcc.target/arm/simd/mve-vclz.c  |  7 +-
 .../gcc.target/arm/simd/mve-vec-pack.c        | 26 +++++++
 .../gcc.target/arm/simd/mve-vec-unpack.c      | 29 ++++++++
 gcc/testsuite/gcc.target/arm/simd/mve-vshl.c  |  5 +-
 7 files changed, 168 insertions(+), 43 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vec-pack.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vec-unpack.c

diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 99e46d0bc69..e393518ea88 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -535,6 +535,26 @@ (define_insn "mve_vmovlbq_<supf><mode>"
   [(set_attr "type" "mve_move")
 ])
 
+(define_insn "mve_vec_unpack<US>_lo_<mode>"
+  [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
+	(SE:<V_unpack> (vec_select:<V_HALF>
+			  (match_operand:MVE_3 1 "register_operand" "w")
+			  (match_operand:MVE_3 2 "vect_par_constant_low" ""))))]
+  "TARGET_HAVE_MVE"
+  "vmovlb.<US>%#<V_sz_elem> %q0, %q1"
+  [(set_attr "type" "mve_move")]
+)
+
+(define_insn "mve_vec_unpack<US>_hi_<mode>"
+  [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
+	(SE:<V_unpack> (vec_select:<V_HALF>
+			  (match_operand:MVE_3 1 "register_operand" "w")
+			  (match_operand:MVE_3 2 "vect_par_constant_high" ""))))]
+  "TARGET_HAVE_MVE"
+  "vmovlt.<US>%#<V_sz_elem> %q0, %q1"
+  [(set_attr "type" "mve_move")]
+)
+
 ;;
 ;; [vcvtpq_s, vcvtpq_u])
 ;;
@@ -2199,10 +2219,23 @@ (define_insn "mve_vmovnbq_<supf><mode>"
   [(set_attr "type" "mve_move")
 ])
 
+;; vmovnb pattern used by the vec_pack_trunc expander to avoid the
+;; need for an uninitialized input operand.
+(define_insn "@mve_vec_pack_trunc_lo_<mode>"
+  [
+   (set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w")
+	(unspec:<V_narrow_pack> [(match_operand:MVE_5 1 "s_register_operand" "w")]
+	 VMOVNBQ_S))
+  ]
+  "TARGET_HAVE_MVE"
+  "vmovnb.i%#<V_sz_elem>	%q0, %q1"
+  [(set_attr "type" "mve_move")
+])
+
 ;;
 ;; [vmovntq_s, vmovntq_u])
 ;;
-(define_insn "mve_vmovntq_<supf><mode>"
+(define_insn "@mve_vmovntq_<supf><mode>"
   [
    (set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w")
 	(unspec:<V_narrow_pack> [(match_operand:<V_narrow_pack> 1 "s_register_operand" "0")
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 0fdffaf4ec4..392d9607919 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -5924,43 +5924,6 @@ (define_insn "neon_vec_unpack<US>_hi_<mode>"
   [(set_attr "type" "neon_shift_imm_long")]
 )
 
-(define_expand "vec_unpack<US>_hi_<mode>"
-  [(match_operand:<V_unpack> 0 "register_operand")
-   (SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
- "TARGET_NEON && !BYTES_BIG_ENDIAN"
-  {
-   rtvec v = rtvec_alloc (<V_mode_nunits>/2)  ;
-   rtx t1;
-   int i;
-   for (i = 0; i < (<V_mode_nunits>/2); i++)
-     RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
-  
-   t1 = gen_rtx_PARALLEL (<MODE>mode, v);
-   emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0], 
-                                                 operands[1], 
-					         t1));
-   DONE;
-  }
-)
-
-(define_expand "vec_unpack<US>_lo_<mode>"
-  [(match_operand:<V_unpack> 0 "register_operand")
-   (SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
- "TARGET_NEON && !BYTES_BIG_ENDIAN"
-  {
-   rtvec v = rtvec_alloc (<V_mode_nunits>/2)  ;
-   rtx t1;
-   int i;
-   for (i = 0; i < (<V_mode_nunits>/2) ; i++)
-     RTVEC_ELT (v, i) = GEN_INT (i);
-   t1 = gen_rtx_PARALLEL (<MODE>mode, v);
-   emit_insn (gen_neon_vec_unpack<US>_lo_<mode> (operands[0], 
-                                                 operands[1], 
-				   	         t1));
-   DONE;
-  }
-)
-
 (define_insn "neon_vec_<US>mult_lo_<mode>"
  [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
        (mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
@@ -6176,7 +6139,7 @@ (define_expand "vec_widen_<US>shiftl_lo_<mode>"
 ; because the ordering of vector elements in Q registers is different from what
 ; the semantics of the instructions require.
 
-(define_insn "vec_pack_trunc_<mode>"
+(define_insn "neon_quad_vec_pack_trunc_<mode>"
  [(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w")
        (vec_concat:<V_narrow_pack> 
 		(truncate:<V_narrow> 
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 430a92ce966..f90afa4cdb9 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -632,3 +632,73 @@ (define_expand "clz<mode>2"
   "ARM_HAVE_<MODE>_ARITH
    && !TARGET_REALLY_IWMMXT"
 )
+
+;; vmovl[tb] are not available for V4SI on MVE
+(define_expand "vec_unpack<US>_hi_<mode>"
+  [(set (match_operand:<V_unpack> 0 "register_operand")
+	(SE:<V_unpack> (vec_select:<V_HALF>
+			 (match_operand:VU 1 "register_operand")
+			 (match_dup 2))))]
+ "ARM_HAVE_<MODE>_ARITH
+  && !TARGET_REALLY_IWMMXT
+  && ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE)
+  && !BYTES_BIG_ENDIAN"
+  {
+    rtvec v = rtvec_alloc (<V_mode_nunits>/2);
+    int i;
+    for (i = 0; i < (<V_mode_nunits>/2); i++)
+      RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
+
+    operands[2] = gen_rtx_PARALLEL (<MODE>mode, v);
+  }
+)
+
+;; vmovl[tb] are not available for V4SI on MVE
+(define_expand "vec_unpack<US>_lo_<mode>"
+  [(set (match_operand:<V_unpack> 0 "register_operand")
+	(SE:<V_unpack> (vec_select:<V_HALF>
+			 (match_operand:VU 1 "register_operand")
+			 (match_dup 2))))]
+ "ARM_HAVE_<MODE>_ARITH
+  && !TARGET_REALLY_IWMMXT
+  && ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE)
+  && !BYTES_BIG_ENDIAN"
+  {
+    rtvec v = rtvec_alloc (<V_mode_nunits>/2);
+    int i;
+    for (i = 0; i < (<V_mode_nunits>/2) ; i++)
+      RTVEC_ELT (v, i) = GEN_INT (i);
+
+    operands[2] = gen_rtx_PARALLEL (<MODE>mode, v);
+
+  }
+)
+
+;; vmovn[tb] are not available for V2DI on MVE
+(define_expand "vec_pack_trunc_<mode>"
+ [(set (match_operand:<V_narrow_pack> 0 "register_operand")
+       (vec_concat:<V_narrow_pack>
+		(truncate:<V_narrow>
+			(match_operand:VN 1 "register_operand"))
+		(truncate:<V_narrow>
+			(match_operand:VN 2 "register_operand"))))]
+ "ARM_HAVE_<MODE>_ARITH
+  && !TARGET_REALLY_IWMMXT
+  && ! (<MODE>mode == V2DImode && TARGET_HAVE_MVE)
+  && !BYTES_BIG_ENDIAN"
+ {
+   if (TARGET_NEON)
+     {
+       emit_insn (gen_neon_quad_vec_pack_trunc_<mode> (operands[0], operands[1],
+						       operands[2]));
+     }
+   else
+     {
+       rtx tmpreg = gen_reg_rtx (<V_narrow_pack>mode);
+       emit_insn (gen_mve_vec_pack_trunc_lo (<MODE>mode, tmpreg, operands[1]));
+       emit_insn (gen_mve_vmovntq (VMOVNTQ_S, <MODE>mode,
+				   operands[0], tmpreg, operands[2]));
+     }
+   DONE;
+ }
+)
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c b/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c
index 7068736bc28..5d6e991cfc6 100644
--- a/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c
@@ -21,8 +21,9 @@ FUNC(u, uint, 16, clz)
 FUNC(s, int, 8, clz)
 FUNC(u, uint, 8, clz)
 
-/* 16 and 8-bit versions are not vectorized because they need pack/unpack
-   patterns since __builtin_clz uses 32-bit parameter and return value.  */
-/* { dg-final { scan-assembler-times {vclz\.i32  q[0-9]+, q[0-9]+} 2 } } */
+/* 16 and 8-bit versions still use 32-bit intermediate temporaries, so for
+   instance instead of using vclz.i8, we need 4 vclz.i32, leading to a total of
+   14 vclz.i32 expected in this testcase.  */
+/* { dg-final { scan-assembler-times {vclz\.i32  q[0-9]+, q[0-9]+} 14 } } */
 /* { dg-final { scan-assembler-times {vclz\.i16  q[0-9]+, q[0-9]+} 2 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {vclz\.i8  q[0-9]+, q[0-9]+} 2 { xfail *-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vec-pack.c b/gcc/testsuite/gcc.target/arm/simd/mve-vec-pack.c
new file mode 100644
index 00000000000..43642b2fec5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vec-pack.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+
+#include <stdint.h>
+
+#define FUNC(SIGN, TYPE, DSTBITS, BITS, NAME)				\
+  void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##DSTBITS##_t * __restrict__ dest, \
+					  TYPE##BITS##_t *a) {		\
+    int i;								\
+    for (i=0; i < (256 / BITS); i++) {					\
+      dest[i] = a[i];							\
+    }									\
+  }
+
+FUNC(s, int, 16, 32, pack)
+FUNC(u, uint, 16, 32, pack)
+FUNC(s, int, 8, 16, pack)
+FUNC(u, uint, 8, 16, pack)
+
+/* { dg-final { scan-assembler-times {vmovnt\.i32\tq[0-9]+, q[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {vmovnb\.i32\tq[0-9]+, q[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {vmovnt\.i16\tq[0-9]+, q[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {vmovnb\.i16\tq[0-9]+, q[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-not {vldr\.64\td[0-9]+, \.L} } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vec-unpack.c b/gcc/testsuite/gcc.target/arm/simd/mve-vec-unpack.c
new file mode 100644
index 00000000000..cdc62f854ad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vec-unpack.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+
+#include <stdint.h>
+
+#define FUNC(SIGN, TYPE, DSTBITS, BITS, NAME)				\
+  void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##DSTBITS##_t * __restrict__ dest, \
+					  TYPE##BITS##_t *a) {		\
+    int i;								\
+    for (i=0; i < (128 / BITS); i++) {					\
+      dest[i] = a[i];							\
+    }									\
+  }
+
+FUNC(s, int, 32, 16, unpack)
+FUNC(u, uint, 32, 16, unpack)
+FUNC(s, int, 16, 8, unpack)
+FUNC(u, uint, 16, 8, unpack)
+
+/* { dg-final { scan-assembler-times {vmovlt\.s16 q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmovlb\.s16 q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmovlt\.u16 q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmovlb\.u16 q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmovlt\.s8 q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmovlb\.s8 q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmovlt\.u8 q[0-9]+, q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmovlb\.u8 q[0-9]+, q[0-9]+} 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
index 7a0644997c8..91dd942d818 100644
--- a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
@@ -56,7 +56,10 @@ FUNC_IMM(u, uint, 8, 16, <<, vshlimm)
 /* MVE has only 128-bit vectors, so we can vectorize only half of the
    functions above.  */
 /* We only emit vshl.u, which is equivalent to vshl.s anyway.  */
-/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 2 } } */
+/* 16 and 8-bit versions still use 32-bit intermediate temporaries, so for
+   instance instead of using vshl.u8, we need 4 vshl.u32, leading to a total of
+   14 vshl.u32 expected in this testcase.  */
+/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 14 } } */
 
 /* We emit vshl.i when the shift amount is an immediate.  */
 /* { dg-final { scan-assembler-times {vshl.i[0-9]+\tq[0-9]+, q[0-9]+} 6 } } */
-- 
2.25.1
