This patch adds vec_unpack<US>_hi_<mode>, vec_unpack<US>_lo_<mode>, vec_pack_trunc_<mode> patterns for MVE.
It does so by moving the unpack patterns from neon.md to vec-common.md, while adding them support for MVE. The pack expander is derived from the Neon one (which in turn is renamed into neon_quad_vec_pack_trunc_<mode>). The patch introduces mve_vec_pack_trunc_<mode> to avoid the need for a zero-initialized temporary, which is needed if the vec_pack_trunc_<mode> expander calls @mve_vmovn[bt]q_<supf><mode> instead. With this patch, we can now vectorize the 16 and 8-bit versions of vclz and vshl, although the generated code could still be improved. For test_clz_s16, we now generate vldrh.16 q3, [r1] vmovlb.s16 q2, q3 vmovlt.s16 q3, q3 vclz.i32 q2, q2 vclz.i32 q3, q3 vmovnb.i32 q1, q2 vmovnt.i32 q1, q3 vstrh.16 q1, [r0] which could be improved to vldrh.16 q3, [r1] vclz.i16 q1, q3 vstrh.16 q1, [r0] if we could avoid the need for unpack/pack steps. For reference, clang-12 generates: vldrh.s32 q0, [r1] vldrh.s32 q1, [r1, #8] vclz.i32 q0, q0 vstrh.32 q0, [r0] vclz.i32 q0, q1 vstrh.32 q0, [r0, #8] 2021-06-03 Christophe Lyon <christophe.l...@linaro.org> gcc/ * config/arm/mve.md (mve_vmovltq_<supf><mode>): Prefix with '@'. (mve_vmovlbq_<supf><mode>): Likewise. (mve_vmovnbq_<supf><mode>): Likewise. (mve_vmovntq_<supf><mode>): Likewise. (@mve_vec_pack_trunc_<mode>): New pattern. * config/arm/neon.md (vec_unpack<US>_hi_<mode>): Move to vec-common.md. (vec_unpack<US>_lo_<mode>): Likewise. (vec_pack_trunc_<mode>): Rename to neon_quad_vec_pack_trunc_<mode>. * config/arm/vec-common.md (vec_unpack<US>_hi_<mode>): New pattern. (vec_unpack<US>_lo_<mode>): New. (vec_pack_trunc_<mode>): New. gcc/testsuite/ * gcc.target/arm/simd/mve-vclz.c: Update expected results. * gcc.target/arm/simd/mve-vshl.c: Likewise. --- gcc/config/arm/mve.md | 20 ++++- gcc/config/arm/neon.md | 39 +-------- gcc/config/arm/vec-common.md | 89 ++++++++++++++++++++ gcc/testsuite/gcc.target/arm/simd/mve-vclz.c | 7 +- gcc/testsuite/gcc.target/arm/simd/mve-vshl.c | 5 +- 5 files changed, 114 insertions(+), 46 deletions(-) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 99e46d0bc69..b18292c07d3 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -510,7 +510,7 @@ (define_insn "mve_vrev32q_<supf><mode>" ;; ;; [vmovltq_u, vmovltq_s]) ;; -(define_insn "mve_vmovltq_<supf><mode>" +(define_insn "@mve_vmovltq_<supf><mode>" [ (set (match_operand:<V_double_width> 0 "s_register_operand" "=w") (unspec:<V_double_width> [(match_operand:MVE_3 1 "s_register_operand" "w")] @@ -524,7 +524,7 @@ (define_insn "mve_vmovltq_<supf><mode>" ;; ;; [vmovlbq_s, vmovlbq_u]) ;; -(define_insn "mve_vmovlbq_<supf><mode>" +(define_insn "@mve_vmovlbq_<supf><mode>" [ (set (match_operand:<V_double_width> 0 "s_register_operand" "=w") (unspec:<V_double_width> [(match_operand:MVE_3 1 "s_register_operand" "w")] @@ -2187,7 +2187,7 @@ (define_insn "mve_vmlsldavxq_s<mode>" ;; ;; [vmovnbq_u, vmovnbq_s]) ;; -(define_insn "mve_vmovnbq_<supf><mode>" +(define_insn "@mve_vmovnbq_<supf><mode>" [ (set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w") (unspec:<V_narrow_pack> [(match_operand:<V_narrow_pack> 1 "s_register_operand" "0") @@ -2202,7 +2202,7 @@ (define_insn "mve_vmovnbq_<supf><mode>" ;; ;; [vmovntq_s, vmovntq_u]) ;; -(define_insn "mve_vmovntq_<supf><mode>" +(define_insn "@mve_vmovntq_<supf><mode>" [ (set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w") (unspec:<V_narrow_pack> [(match_operand:<V_narrow_pack> 1 "s_register_operand" "0") @@ -2214,6 +2214,18 @@ (define_insn "mve_vmovntq_<supf><mode>" [(set_attr "type" "mve_move") ]) +(define_insn "@mve_vec_pack_trunc_<mode>" + [(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w") + (vec_concat:<V_narrow_pack> + (truncate:<V_narrow> + (match_operand:MVE_5 1 "register_operand" "w")) + (truncate:<V_narrow> + (match_operand:MVE_5 2 "register_operand" "w"))))] + "TARGET_HAVE_MVE" + "vmovnb.i<V_sz_elem> %q0, %q1\;vmovnt.i<V_sz_elem> %q0, %q2" + [(set_attr "type" "mve_move")] +) + ;; ;; [vmulq_f]) ;; diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 0fdffaf4ec4..392d9607919 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -5924,43 +5924,6 @@ (define_insn "neon_vec_unpack<US>_hi_<mode>" [(set_attr "type" "neon_shift_imm_long")] ) -(define_expand "vec_unpack<US>_hi_<mode>" - [(match_operand:<V_unpack> 0 "register_operand") - (SE:<V_unpack> (match_operand:VU 1 "register_operand"))] - "TARGET_NEON && !BYTES_BIG_ENDIAN" - { - rtvec v = rtvec_alloc (<V_mode_nunits>/2) ; - rtx t1; - int i; - for (i = 0; i < (<V_mode_nunits>/2); i++) - RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i); - - t1 = gen_rtx_PARALLEL (<MODE>mode, v); - emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0], - operands[1], - t1)); - DONE; - } -) - -(define_expand "vec_unpack<US>_lo_<mode>" - [(match_operand:<V_unpack> 0 "register_operand") - (SE:<V_unpack> (match_operand:VU 1 "register_operand"))] - "TARGET_NEON && !BYTES_BIG_ENDIAN" - { - rtvec v = rtvec_alloc (<V_mode_nunits>/2) ; - rtx t1; - int i; - for (i = 0; i < (<V_mode_nunits>/2) ; i++) - RTVEC_ELT (v, i) = GEN_INT (i); - t1 = gen_rtx_PARALLEL (<MODE>mode, v); - emit_insn (gen_neon_vec_unpack<US>_lo_<mode> (operands[0], - operands[1], - t1)); - DONE; - } -) - (define_insn "neon_vec_<US>mult_lo_<mode>" [(set (match_operand:<V_unpack> 0 "register_operand" "=w") (mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF> @@ -6176,7 +6139,7 @@ (define_expand "vec_widen_<US>shiftl_lo_<mode>" ; because the ordering of vector elements in Q registers is different from what ; the semantics of the instructions require. -(define_insn "vec_pack_trunc_<mode>" +(define_insn "neon_quad_vec_pack_trunc_<mode>" [(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w") (vec_concat:<V_narrow_pack> (truncate:<V_narrow> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 1ba1e5eb008..0ffc7a9322c 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -638,3 +638,92 @@ (define_expand "clz<mode>2" emit_insn (gen_mve_vclzq_s (<MODE>mode, operands[0], operands[1])); DONE; }) + +;; vmovl[tb] are not available for V4SI on MVE +(define_expand "vec_unpack<US>_hi_<mode>" + [(match_operand:<V_unpack> 0 "register_operand") + (SE:<V_unpack> (match_operand:VU 1 "register_operand"))] + "ARM_HAVE_<MODE>_ARITH + && !TARGET_REALLY_IWMMXT + && ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE) + && !BYTES_BIG_ENDIAN" + { + if (TARGET_NEON) + { + rtvec v = rtvec_alloc (<V_mode_nunits>/2); + rtx t1; + int i; + for (i = 0; i < (<V_mode_nunits>/2); i++) + RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i); + + t1 = gen_rtx_PARALLEL (<MODE>mode, v); + emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0], + operands[1], + t1)); + } + else + { + emit_insn (gen_mve_vmovltq (VMOVLTQ_S, <MODE>mode, operands[0], + operands[1])); + } + DONE; + } +) + +;; vmovl[tb] are not available for V4SI on MVE +(define_expand "vec_unpack<US>_lo_<mode>" + [(match_operand:<V_unpack> 0 "register_operand") + (SE:<V_unpack> (match_operand:VU 1 "register_operand"))] + "ARM_HAVE_<MODE>_ARITH + && !TARGET_REALLY_IWMMXT + && ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE) + && !BYTES_BIG_ENDIAN" + { + if (TARGET_NEON) + { + rtvec v = rtvec_alloc (<V_mode_nunits>/2); + rtx t1; + int i; + for (i = 0; i < (<V_mode_nunits>/2) ; i++) + RTVEC_ELT (v, i) = GEN_INT (i); + + t1 = gen_rtx_PARALLEL (<MODE>mode, v); + emit_insn (gen_neon_vec_unpack<US>_lo_<mode> (operands[0], + operands[1], + t1)); + } + else + { + emit_insn (gen_mve_vmovlbq (VMOVLBQ_S, <MODE>mode, operands[0], + operands[1])); + } + DONE; + } +) + +;; vmovn[tb] are not available for V2DI on MVE +(define_expand "vec_pack_trunc_<mode>" + [(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w") + (vec_concat:<V_narrow_pack> + (truncate:<V_narrow> + (match_operand:VN 1 "register_operand" "w")) + (truncate:<V_narrow> + (match_operand:VN 2 "register_operand" "w"))))] + "ARM_HAVE_<MODE>_ARITH + && !TARGET_REALLY_IWMMXT + && ! (<MODE>mode == V2DImode && TARGET_HAVE_MVE) + && !BYTES_BIG_ENDIAN" + { + if (TARGET_NEON) + { + emit_insn (gen_neon_quad_vec_pack_trunc_<mode> (operands[0], operands[1], + operands[2])); + } + else + { + emit_insn (gen_mve_vec_pack_trunc (<MODE>mode, operands[0], operands[1], + operands[2])); + } + DONE; + } +) diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c b/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c index 7068736bc28..5d6e991cfc6 100644 --- a/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vclz.c @@ -21,8 +21,9 @@ FUNC(u, uint, 16, clz) FUNC(s, int, 8, clz) FUNC(u, uint, 8, clz) -/* 16 and 8-bit versions are not vectorized because they need pack/unpack - patterns since __builtin_clz uses 32-bit parameter and return value. */ -/* { dg-final { scan-assembler-times {vclz\.i32 q[0-9]+, q[0-9]+} 2 } } */ +/* 16 and 8-bit versions still use 32-bit intermediate temporaries, so for + instance instead of using vclz.i8, we need 4 vclz.i32, leading to a total of + 14 vclz.i32 expected in this testcase. */ +/* { dg-final { scan-assembler-times {vclz\.i32 q[0-9]+, q[0-9]+} 14 } } */ /* { dg-final { scan-assembler-times {vclz\.i16 q[0-9]+, q[0-9]+} 2 { xfail *-*-* } } } */ /* { dg-final { scan-assembler-times {vclz\.i8 q[0-9]+, q[0-9]+} 2 { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c index 7a0644997c8..91dd942d818 100644 --- a/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshl.c @@ -56,7 +56,10 @@ FUNC_IMM(u, uint, 8, 16, <<, vshlimm) /* MVE has only 128-bit vectors, so we can vectorize only half of the functions above. */ /* We only emit vshl.u, which is equivalent to vshl.s anyway. */ -/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 2 } } */ +/* 16 and 8-bit versions still use 32-bit intermediate temporaries, so for + instance instead of using vshl.u8, we need 4 vshl.i32, leading to a total of + 14 vshl.i32 expected in this testcase. */ +/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 14 } } */ /* We emit vshl.i when the shift amount is an immediate. */ /* { dg-final { scan-assembler-times {vshl.i[0-9]+\tq[0-9]+, q[0-9]+} 6 } } */ -- 2.25.1