This patch adds vec_unpack<US>_hi_<mode>, vec_unpack<US>_lo_<mode>, and
vec_pack_trunc_<mode> patterns for MVE.

It does so by moving the unpack patterns from neon.md to
vec-common.md, extending them to support MVE. The pack expander is
derived from the Neon one (which in turn is renamed into
neon_quad_vec_pack_trunc_<mode>).

The patch introduces mve_vec_pack_trunc_<mode> to avoid the need for a
zero-initialized temporary, which is needed if the
vec_pack_trunc_<mode> expander calls @mve_vmovn[bt]q_<supf><mode>
instead.

With this patch, we can now vectorize the 16 and 8-bit versions of
vclz and vshl, although the generated code could still be improved.
For test_clz_s16, we now generate
        vldrh.16        q3, [r1]
        vmovlb.s16   q2, q3
        vmovlt.s16   q3, q3
        vclz.i32  q2, q2
        vclz.i32  q3, q3
        vmovnb.i32      q1, q2
        vmovnt.i32      q1, q3
        vstrh.16        q1, [r0]
which could be improved to
        vldrh.16        q3, [r1]
        vclz.i16        q1, q3
        vstrh.16        q1, [r0]
if we could avoid the need for unpack/pack steps.

For reference, clang-12 generates:
        vldrh.s32       q0, [r1]
        vldrh.s32       q1, [r1, #8]
        vclz.i32        q0, q0
        vstrh.32        q0, [r0]
        vclz.i32        q0, q1
        vstrh.32        q0, [r0, #8]

2021-06-03  Christophe Lyon  <christophe.l...@linaro.org>

        gcc/
        * config/arm/mve.md (mve_vmovltq_<supf><mode>): Prefix with '@'.
        (mve_vmovlbq_<supf><mode>): Likewise.
        (mve_vmovnbq_<supf><mode>): Likewise.
        (mve_vmovntq_<supf><mode>): Likewise.
        (@mve_vec_pack_trunc_<mode>): New pattern.
        * config/arm/neon.md (vec_unpack<US>_hi_<mode>): Move to
        vec-common.md.
        (vec_unpack<US>_lo_<mode>): Likewise.
        (vec_pack_trunc_<mode>): Rename to
        neon_quad_vec_pack_trunc_<mode>.
        * config/arm/vec-common.md (vec_unpack<US>_hi_<mode>): New
        pattern.
        (vec_unpack<US>_lo_<mode>): New.
        (vec_pack_trunc_<mode>): New.

        gcc/testsuite/
        * gcc.target/arm/simd/mve-vclz.c: Update expected results.
        * gcc.target/arm/simd/mve-vshl.c: Likewise.
---
 gcc/config/arm/mve.md                        | 20 ++++-
 gcc/config/arm/neon.md                       | 39 +--------
 gcc/config/arm/vec-common.md                 | 89 ++++++++++++++++++++
 gcc/testsuite/gcc.target/arm/simd/mve-vclz.c |  7 +-
 gcc/testsuite/gcc.target/arm/simd/mve-vshl.c |  5 +-
 5 files changed, 114 insertions(+), 46 deletions(-)

diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 99e46d0bc69..b18292c07d3 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -510,7 +510,7 @@ (define_insn "mve_vrev32q_<supf><mode>"
 ;;
 ;; [vmovltq_u, vmovltq_s])
 ;;
-(define_insn "mve_vmovltq_<supf><mode>"
+(define_insn "@mve_vmovltq_<supf><mode>"
   [
    (set (match_operand:<V_double_width> 0 "s_register_operand" "=w")
        (unspec:<V_double_width> [(match_operand:MVE_3 1 "s_register_operand" 
"w")]
@@ -524,7 +524,7 @@ (define_insn "mve_vmovltq_<supf><mode>"
 ;;
 ;; [vmovlbq_s, vmovlbq_u])
 ;;
-(define_insn "mve_vmovlbq_<supf><mode>"
+(define_insn "@mve_vmovlbq_<supf><mode>"
   [
    (set (match_operand:<V_double_width> 0 "s_register_operand" "=w")
        (unspec:<V_double_width> [(match_operand:MVE_3 1 "s_register_operand" 
"w")]
@@ -2187,7 +2187,7 @@ (define_insn "mve_vmlsldavxq_s<mode>"
 ;;
 ;; [vmovnbq_u, vmovnbq_s])
 ;;
-(define_insn "mve_vmovnbq_<supf><mode>"
+(define_insn "@mve_vmovnbq_<supf><mode>"
   [
    (set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w")
        (unspec:<V_narrow_pack> [(match_operand:<V_narrow_pack> 1 
"s_register_operand" "0")
@@ -2202,7 +2202,7 @@ (define_insn "mve_vmovnbq_<supf><mode>"
 ;;
 ;; [vmovntq_s, vmovntq_u])
 ;;
-(define_insn "mve_vmovntq_<supf><mode>"
+(define_insn "@mve_vmovntq_<supf><mode>"
   [
    (set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w")
        (unspec:<V_narrow_pack> [(match_operand:<V_narrow_pack> 1 
"s_register_operand" "0")
@@ -2214,6 +2214,18 @@ (define_insn "mve_vmovntq_<supf><mode>"
   [(set_attr "type" "mve_move")
 ])
 
+(define_insn "@mve_vec_pack_trunc_<mode>"
+ [(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w")
+       (vec_concat:<V_narrow_pack>
+               (truncate:<V_narrow>
+                       (match_operand:MVE_5 1 "register_operand" "w"))
+               (truncate:<V_narrow>
+                       (match_operand:MVE_5 2 "register_operand" "w"))))]
+ "TARGET_HAVE_MVE"
+ "vmovnb.i<V_sz_elem>  %q0, %q1\;vmovnt.i<V_sz_elem>   %q0, %q2"
+  [(set_attr "type" "mve_move")]
+)
+
 ;;
 ;; [vmulq_f])
 ;;
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 0fdffaf4ec4..392d9607919 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -5924,43 +5924,6 @@ (define_insn "neon_vec_unpack<US>_hi_<mode>"
   [(set_attr "type" "neon_shift_imm_long")]
 )
 
-(define_expand "vec_unpack<US>_hi_<mode>"
-  [(match_operand:<V_unpack> 0 "register_operand")
-   (SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
- "TARGET_NEON && !BYTES_BIG_ENDIAN"
-  {
-   rtvec v = rtvec_alloc (<V_mode_nunits>/2)  ;
-   rtx t1;
-   int i;
-   for (i = 0; i < (<V_mode_nunits>/2); i++)
-     RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
-  
-   t1 = gen_rtx_PARALLEL (<MODE>mode, v);
-   emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0], 
-                                                 operands[1], 
-                                                t1));
-   DONE;
-  }
-)
-
-(define_expand "vec_unpack<US>_lo_<mode>"
-  [(match_operand:<V_unpack> 0 "register_operand")
-   (SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
- "TARGET_NEON && !BYTES_BIG_ENDIAN"
-  {
-   rtvec v = rtvec_alloc (<V_mode_nunits>/2)  ;
-   rtx t1;
-   int i;
-   for (i = 0; i < (<V_mode_nunits>/2) ; i++)
-     RTVEC_ELT (v, i) = GEN_INT (i);
-   t1 = gen_rtx_PARALLEL (<MODE>mode, v);
-   emit_insn (gen_neon_vec_unpack<US>_lo_<mode> (operands[0], 
-                                                 operands[1], 
-                                                t1));
-   DONE;
-  }
-)
-
 (define_insn "neon_vec_<US>mult_lo_<mode>"
  [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
        (mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
@@ -6176,7 +6139,7 @@ (define_expand "vec_widen_<US>shiftl_lo_<mode>"
 ; because the ordering of vector elements in Q registers is different from what
 ; the semantics of the instructions require.
 
-(define_insn "vec_pack_trunc_<mode>"
+(define_insn "neon_quad_vec_pack_trunc_<mode>"
  [(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w")
        (vec_concat:<V_narrow_pack> 
                (truncate:<V_narrow> 
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 1ba1e5eb008..0ffc7a9322c 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -638,3 +638,92 @@ (define_expand "clz<mode>2"
     emit_insn (gen_mve_vclzq_s (<MODE>mode, operands[0], operands[1]));
   DONE;
 })
+
+;; vmovl[tb] are not available for V4SI on MVE
+(define_expand "vec_unpack<US>_hi_<mode>"
+  [(match_operand:<V_unpack> 0 "register_operand")
+   (SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
+ "ARM_HAVE_<MODE>_ARITH
+  && !TARGET_REALLY_IWMMXT
+  && ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE)
+  && !BYTES_BIG_ENDIAN"
+  {
+    if (TARGET_NEON)
+      {
+       rtvec v = rtvec_alloc (<V_mode_nunits>/2);
+       rtx t1;
+       int i;
+       for (i = 0; i < (<V_mode_nunits>/2); i++)
+         RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
+
+       t1 = gen_rtx_PARALLEL (<MODE>mode, v);
+       emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0],
+                                                     operands[1],
+                                                     t1));
+      }
+    else
+      {
+       emit_insn (gen_mve_vmovltq (VMOVLTQ_S, <MODE>mode, operands[0],
+                                   operands[1]));
+      }
+    DONE;
+  }
+)
+
+;; vmovl[tb] are not available for V4SI on MVE
+(define_expand "vec_unpack<US>_lo_<mode>"
+  [(match_operand:<V_unpack> 0 "register_operand")
+   (SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
+ "ARM_HAVE_<MODE>_ARITH
+  && !TARGET_REALLY_IWMMXT
+  && ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE)
+  && !BYTES_BIG_ENDIAN"
+  {
+    if (TARGET_NEON)
+      {
+       rtvec v = rtvec_alloc (<V_mode_nunits>/2);
+       rtx t1;
+       int i;
+       for (i = 0; i < (<V_mode_nunits>/2) ; i++)
+         RTVEC_ELT (v, i) = GEN_INT (i);
+
+       t1 = gen_rtx_PARALLEL (<MODE>mode, v);
+       emit_insn (gen_neon_vec_unpack<US>_lo_<mode> (operands[0],
+                                                     operands[1],
+                                                     t1));
+      }
+    else
+      {
+       emit_insn (gen_mve_vmovlbq (VMOVLBQ_S, <MODE>mode, operands[0],
+                                   operands[1]));
+      }
+    DONE;
+  }
+)
+
+;; vmovn[tb] are not available for V2DI on MVE
+(define_expand "vec_pack_trunc_<mode>"
+ [(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w")
+       (vec_concat:<V_narrow_pack>
+               (truncate:<V_narrow>
+                       (match_operand:VN 1 "register_operand" "w"))
+               (truncate:<V_narrow>
+                       (match_operand:VN 2 "register_operand" "w"))))]
+ "ARM_HAVE_<MODE>_ARITH
+  && !TARGET_REALLY_IWMMXT
+  && ! (<MODE>mode == V2DImode && TARGET_HAVE_MVE)
+  && !BYTES_BIG_ENDIAN"
+ {
+   if (TARGET_NEON)
+     {
+       emit_insn (gen_neon_quad_vec_pack_trunc_<mode> (operands[0], 
operands[1],
+                                                      operands[2]));
+     }
+   else
+     {
+       emit_insn (gen_mve_vec_pack_trunc (<MODE>mode, operands[0], operands[1],
+                                         operands[2]));
+     }
+   DONE;
+ }
+)
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c 
b/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c
index 7068736bc28..5d6e991cfc6 100644
--- a/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c
@@ -21,8 +21,9 @@ FUNC(u, uint, 16, clz)
 FUNC(s, int, 8, clz)
 FUNC(u, uint, 8, clz)
 
-/* 16 and 8-bit versions are not vectorized because they need pack/unpack
-   patterns since __builtin_clz uses 32-bit parameter and return value.  */
-/* { dg-final { scan-assembler-times {vclz\.i32  q[0-9]+, q[0-9]+} 2 } } */
+/* 16 and 8-bit versions still use 32-bit intermediate temporaries, so for
+   instance instead of using vclz.i8, we need 4 vclz.i32, leading to a total of
+   14 vclz.i32 expected in this testcase.  */
+/* { dg-final { scan-assembler-times {vclz\.i32  q[0-9]+, q[0-9]+} 14 } } */
 /* { dg-final { scan-assembler-times {vclz\.i16  q[0-9]+, q[0-9]+} 2 { xfail 
*-*-* } } } */
 /* { dg-final { scan-assembler-times {vclz\.i8  q[0-9]+, q[0-9]+} 2 { xfail 
*-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c 
b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
index 7a0644997c8..91dd942d818 100644
--- a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c
@@ -56,7 +56,10 @@ FUNC_IMM(u, uint, 8, 16, <<, vshlimm)
 /* MVE has only 128-bit vectors, so we can vectorize only half of the
    functions above.  */
 /* We only emit vshl.u, which is equivalent to vshl.s anyway.  */
-/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 2 } } */
+/* 16 and 8-bit versions still use 32-bit intermediate temporaries, so for
+   instance instead of using vshl.u8, we need 4 vshl.i32, leading to a total of
+   14 vshl.i32 expected in this testcase.  */
+/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 14 } } */
 
 /* We emit vshl.i when the shift amount is an immediate.  */
 /* { dg-final { scan-assembler-times {vshl.i[0-9]+\tq[0-9]+, q[0-9]+} 6 } } */
-- 
2.25.1

Reply via email to