Jonathan Wright <jonathan.wri...@arm.com> writes:
> Hi,
>
> Version 2 of the patch adds tests to verify the benefit of this change.
>
> Ok for master?
>
> Thanks,
> Jonathan
>
> ---
> gcc/ChangeLog:
>
> 2021-06-14  Jonathan Wright  <jonathan.wri...@arm.com>
>
>         * config/aarch64/aarch64-simd-builtins.def: Split generator
>         for aarch64_sqmovun builtins into scalar and vector variants.
>         * config/aarch64/aarch64-simd.md (aarch64_sqmovun<mode>):
>         Split into scalar and vector variants. Change vector variant
>         to an expander that emits the correct instruction depending
>         on endianness.
>         (aarch64_sqmovun<mode>_insn_le): Define.
>         (aarch64_sqmovun<mode>_insn_be): Define.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/narrow_zero_high_half.c: Add new tests.

OK, thanks.

> From: Gcc-patches <gcc-patches-bounces+jonathan.wright=arm....@gcc.gnu.org> on behalf of Jonathan Wright via Gcc-patches <gcc-patches@gcc.gnu.org>
> Sent: 15 June 2021 10:52
> To: gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org>
> Subject: [PATCH] aarch64: Model zero-high-half semantics of SQXTUN instruction in RTL
>
> Hi,
>
> As subject, this patch first splits the aarch64_sqmovun<mode> pattern
> into separate scalar and vector variants. It then further splits the vector
> pattern into big/little endian variants that model the zero-high-half
> semantics of the underlying instruction. Modeling these semantics
> allows for better RTL combinations while also removing some register
> allocation issues, as the compiler now knows that the operation is
> totally destructive: the high half of the destination register is
> zeroed rather than merged with any previous value.
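>
> As a concrete illustration (this mirrors the new tests rather than
> adding anything beyond the patch; the function name is just for
> exposition), the following intrinsics sequence can now compile to a
> single SQXTUN, because the explicit zeroing of the high half is folded
> into the instruction's own semantics:
>
>   #include <arm_neon.h>
>
>   uint8x16_t
>   foo (int16x8_t a)
>   {
>     /* vqmovun_s16 narrows with unsigned saturation into the low 64
>        bits; combining the result with a zero vector matches the new
>        aarch64_sqmovun<mode>_insn_le/_be patterns, so no separate
>        DUP/MOVI of zero needs to be emitted.  */
>     return vcombine_u8 (vqmovun_s16 (a), vdup_n_u8 (0));
>   }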
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.
>
> Ok for master?
>
> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-06-14  Jonathan Wright  <jonathan.wri...@arm.com>
>
>         * config/aarch64/aarch64-simd-builtins.def: Split generator
>         for aarch64_sqmovun builtins into scalar and vector variants.
>         * config/aarch64/aarch64-simd.md (aarch64_sqmovun<mode>):
>         Split into scalar and vector variants. Change vector variant
>         to an expander that emits the correct instruction depending
>         on endianness.
>         (aarch64_sqmovun<mode>_insn_le): Define.
>         (aarch64_sqmovun<mode>_insn_be): Define.
>
> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> index 18baa6720b09b2ebda8577b809f8a8683f8b44f0..2adb4b127527794d19b2bbd4859f089d3da47763 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> @@ -263,7 +263,9 @@
>    BUILTIN_VQ_HSI (TERNOP, smlal_hi_n, 0, NONE)
>    BUILTIN_VQ_HSI (TERNOPU, umlal_hi_n, 0, NONE)
>  
> -  BUILTIN_VSQN_HSDI (UNOPUS, sqmovun, 0, NONE)
> +  /* Implemented by aarch64_sqmovun<mode>.  */
> +  BUILTIN_VQN (UNOPUS, sqmovun, 0, NONE)
> +  BUILTIN_SD_HSDI (UNOPUS, sqmovun, 0, NONE)
>  
>    /* Implemented by aarch64_sqxtun2<mode>.  */
>    BUILTIN_VQN (BINOP_UUS, sqxtun2, 0, NONE)
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index b23556b551cbbef420950007e9714acf190a534d..59779b851fbeecb17cd2cddbb0ed8770a22762b5 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -4870,17 +4870,6 @@
>    [(set_attr "type" "neon_qadd<q>")]
>  )
>  
> -;; sqmovun
> -
> -(define_insn "aarch64_sqmovun<mode>"
> -  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
> -     (unspec:<VNARROWQ> [(match_operand:VSQN_HSDI 1 "register_operand" "w")]
> -                            UNSPEC_SQXTUN))]
> -   "TARGET_SIMD"
> -   "sqxtun\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
> -   [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
> -)
> -
>  ;; sqmovn and uqmovn
>  
>  (define_insn "aarch64_<su>qmovn<mode>"
> @@ -4931,6 +4920,61 @@
>    }
>  )
>  
> +;; sqmovun
> +
> +(define_insn "aarch64_sqmovun<mode>"
> +  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
> +     (unspec:<VNARROWQ> [(match_operand:SD_HSDI 1 "register_operand" "w")]
> +                        UNSPEC_SQXTUN))]
> +   "TARGET_SIMD"
> +   "sqxtun\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
> +   [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
> +)
> +
> +(define_insn "aarch64_sqmovun<mode>_insn_le"
> +  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
> +     (vec_concat:<VNARROWQ2>
> +       (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")]
> +                          UNSPEC_SQXTUN)
> +       (match_operand:<VNARROWQ> 2 "aarch64_simd_or_scalar_imm_zero")))]
> +  "TARGET_SIMD && !BYTES_BIG_ENDIAN"
> +  "sqxtun\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
> +  [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
> +)
> +
> +(define_insn "aarch64_sqmovun<mode>_insn_be"
> +  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
> +     (vec_concat:<VNARROWQ2>
> +       (match_operand:<VNARROWQ> 2 "aarch64_simd_or_scalar_imm_zero")
> +       (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")]
> +                          UNSPEC_SQXTUN)))]
> +  "TARGET_SIMD && BYTES_BIG_ENDIAN"
> +  "sqxtun\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
> +  [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
> +)
> +
> +(define_expand "aarch64_sqmovun<mode>"
> +  [(set (match_operand:<VNARROWQ> 0 "register_operand")
> +     (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand")]
> +                        UNSPEC_SQXTUN))]
> +  "TARGET_SIMD"
> +  {
> +    rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
> +    if (BYTES_BIG_ENDIAN)
> +      emit_insn (gen_aarch64_sqmovun<mode>_insn_be (tmp, operands[1],
> +                             CONST0_RTX (<VNARROWQ>mode)));
> +    else
> +      emit_insn (gen_aarch64_sqmovun<mode>_insn_le (tmp, operands[1],
> +                             CONST0_RTX (<VNARROWQ>mode)));
> +
> +    /* The intrinsic expects a narrow result, so emit a subreg that will get
> +       optimized away as appropriate.  */
> +    emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
> +                                              <VNARROWQ2>mode));
> +    DONE;
> +  }
> +)
> +
>  (define_insn "aarch64_sqxtun2<mode>_le"
>    [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
>       (vec_concat:<VNARROWQ2>
> diff --git a/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c b/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
> index 78c474f3025cbf56d14323d8f05bfb73e003ebfd..27660be963c080a05f489e94c453044588f30f5f 100644
> --- a/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
> +++ b/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
> @@ -63,6 +63,10 @@ TEST_UNARY (vmovn, uint8x16_t, uint16x8_t, u16, u8)
>  TEST_UNARY (vmovn, uint16x8_t, uint32x4_t, u32, u16)
>  TEST_UNARY (vmovn, uint32x4_t, uint64x2_t, u64, u32)
>  
> +TEST_UNARY (vqmovun, uint8x16_t, int16x8_t, s16, u8)
> +TEST_UNARY (vqmovun, uint16x8_t, int32x4_t, s32, u16)
> +TEST_UNARY (vqmovun, uint32x4_t, int64x2_t, s64, u32)
> +
>  /* { dg-final { scan-assembler-not "dup\\t" } } */
>  
>  /* { dg-final { scan-assembler-times "\\trshrn\\tv" 6} }  */
> @@ -74,3 +78,4 @@ TEST_UNARY (vmovn, uint32x4_t, uint64x2_t, u64, u32)
>  /* { dg-final { scan-assembler-times "\\tsqrshrn\\tv" 3} }  */
>  /* { dg-final { scan-assembler-times "\\tuqrshrn\\tv" 3} }  */
>  /* { dg-final { scan-assembler-times "\\txtn\\tv" 6} }  */
> +/* { dg-final { scan-assembler-times "\\tsqxtun\\tv" 3} }  */
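>
> (For reference: TEST_UNARY is the pre-existing helper macro in this
> test file. Assuming it keeps its current shape, each of the new tests
> expands to essentially the example given earlier in this message, e.g.
>
>   uint8x16_t test_vqmovun_s16 (int16x8_t a)
>   {
>     return vcombine_u8 (vqmovun_s16 (a), vdup_n_u8 (0));
>   }
>
> so the scan-assembler-not "dup\\t" directive checks that the zeroed
> high half comes from SQXTUN itself rather than from a separate DUP.)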
