On 2016-10-18 21:07:30 +0300, Martin Storsjö wrote:
> This work is sponsored by, and copyright, Google.
> 
> For the transforms up to 8x8, we can fit all the data (including
> temporaries) in registers and just do a straightforward transform
> of all the data. For 16x16, we do a transform of 4x16 pixels in
> 4 slices, using a temporary buffer. For 32x32, we transform 4x32
> pixels at a time, in two steps of 4x16 pixels each.
> 
> Examples of relative speedup compared to the C version, from checkasm:
>                          Cortex       A7     A8     A9    A53
> vp9_inv_adst_adst_4x4_add_neon:     3.39   5.80   4.18   3.92
> vp9_inv_adst_adst_8x8_add_neon:     3.94   4.82   4.25   3.89
> vp9_inv_adst_adst_16x16_add_neon:   3.33   4.27   4.08   4.05
> vp9_inv_dct_dct_4x4_add_neon:       3.73   5.06   4.26   4.28
> vp9_inv_dct_dct_8x8_add_neon:       4.59   5.81   5.03   4.73
> vp9_inv_dct_dct_16x16_add_neon:     3.40   3.39   3.33   3.68
> vp9_inv_dct_dct_32x32_add_neon:     4.00   3.51   3.80   4.40
> vp9_inv_wht_wht_4x4_add_neon:       3.24   5.16   3.52   3.67
> 
> Thus, the speedup vs C code is around 3-5x.
> 
> This is mostly marginally faster than the corresponding routines
> in libvpx on most cores, tested with their 32x32 idct (compared to
> vpx_idct32x32_1024_add_neon). These numbers are slightly in libvpx's
> favour since their version doesn't clear the input buffer like ours
> do (although the effect of that on the total runtime probably is
> negligible.)
> 
>                            Cortex       A7       A8       A9      A53
> vp9_inv_dct_dct_32x32_add_neon:    18852.0  16831.6  14217.4  11988.6
> libvpx vpx_idct32x32_1024_add_neon 20789.0  13344.3  15049.9  13030.5
> 
> Only on the Cortex A8, the libvpx function is faster. On the other cores,
> ours is slightly faster even though ours has got source block clearing
> integrated.
> ---
> v2: Updated some broken macro comments, optimized the transposes by
> using the q registers for part of transposes.
> 
> Suggestions very much welcome on names for the macros - no idea if
> the current ones make sense or what one commonly would call these
> combinations.
> 
> I'm a bit reluctant to expanding the macros (to be able to schedule
> instructions better), in order to keep things readable. (Although,
> I guess this is kinda write-only code, which nobody ever touches
> afterwards).
> ---
>  libavcodec/arm/Makefile          |    3 +-
>  libavcodec/arm/vp9dsp_init_arm.c |   51 +-
>  libavcodec/arm/vp9itxfm_neon.S   | 1166 
> ++++++++++++++++++++++++++++++++++++++
>  3 files changed, 1218 insertions(+), 2 deletions(-)
>  create mode 100644 libavcodec/arm/vp9itxfm_neon.S
> 
> diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
> index 2638230..01630ac 100644
> --- a/libavcodec/arm/Makefile
> +++ b/libavcodec/arm/Makefile
> @@ -139,4 +139,5 @@ NEON-OBJS-$(CONFIG_RV40_DECODER)       += 
> arm/rv34dsp_neon.o            \
>                                            arm/rv40dsp_neon.o
>  NEON-OBJS-$(CONFIG_VORBIS_DECODER)     += arm/vorbisdsp_neon.o
>  NEON-OBJS-$(CONFIG_VP6_DECODER)        += arm/vp6dsp_neon.o
> -NEON-OBJS-$(CONFIG_VP9_DECODER)        += arm/vp9mc_neon.o
> +NEON-OBJS-$(CONFIG_VP9_DECODER)        += arm/vp9itxfm_neon.o           \
> +                                          arm/vp9mc_neon.o
> diff --git a/libavcodec/arm/vp9dsp_init_arm.c 
> b/libavcodec/arm/vp9dsp_init_arm.c
> index db8c683..2ba2644 100644
> --- a/libavcodec/arm/vp9dsp_init_arm.c
> +++ b/libavcodec/arm/vp9dsp_init_arm.c
> @@ -94,7 +94,7 @@ define_8tap_2d_funcs(8)
>  define_8tap_2d_funcs(4)
>  
>  
> -av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp)
> +static av_cold void vp9dsp_mc_init_arm(VP9DSPContext *dsp)
>  {
>      int cpu_flags = av_get_cpu_flags();
>  
> @@ -138,3 +138,52 @@ av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp)
>          init_mc_funcs_dirs(4, 4);
>      }
>  }
> +
> +#define define_itxfm(type_a, type_b, sz)                                   \
> +void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst,    \
> +                                                         ptrdiff_t stride, \
> +                                                         int16_t *_block, 
> int eob)
> +
> +#define define_itxfm_funcs(sz)      \
> +    define_itxfm(idct,  idct,  sz); \
> +    define_itxfm(iadst, idct,  sz); \
> +    define_itxfm(idct,  iadst, sz); \
> +    define_itxfm(iadst, iadst, sz)
> +
> +define_itxfm_funcs(4);
> +define_itxfm_funcs(8);
> +define_itxfm_funcs(16);
> +define_itxfm(idct, idct, 32);
> +define_itxfm(iwht, iwht, 4);
> +
> +
> +static av_cold void vp9dsp_itxfm_init_arm(VP9DSPContext *dsp)
> +{
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (have_neon(cpu_flags)) {
> +#define init_itxfm(tx, sz)                                             \
> +    dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_neon;  \
> +    dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_neon; \
> +    dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_neon; \
> +    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon
> +
> +#define init_idct(tx, nm)           \
> +    dsp->itxfm_add[tx][DCT_DCT]   = \
> +    dsp->itxfm_add[tx][ADST_DCT]  = \
> +    dsp->itxfm_add[tx][DCT_ADST]  = \
> +    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon
> +
> +        init_itxfm(TX_4X4, 4x4);
> +        init_itxfm(TX_8X8, 8x8);
> +        init_itxfm(TX_16X16, 16x16);
> +        init_idct(TX_32X32, idct_idct_32x32);
> +        init_idct(4, iwht_iwht_4x4);
> +    }
> +}
> +
> +av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp)
> +{
> +    vp9dsp_mc_init_arm(dsp);
> +    vp9dsp_itxfm_init_arm(dsp);
> +}
> diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
> new file mode 100644
> index 0000000..96dc3a9
> --- /dev/null
> +++ b/libavcodec/arm/vp9itxfm_neon.S
> @@ -0,0 +1,1166 @@
> +/*
> + * Copyright (c) 2016 Google Inc.
> + *
> + * This file is part of Libav.
> + *
> + * Libav is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * Libav is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with Libav; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
> USA
> + */
> +
> +#include "libavutil/arm/asm.S"
> +#include "neon.S"
> +
> +const itxfm4_coeffs, align=4
> +        .short  11585, 6270, 15137, 0
> +iadst4_coeffs:
> +        .short  5283, 15212, 9929, 13377
> +endconst
> +
> +const iadst8_coeffs, align=4
> +        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
> +endconst
> +
> +const idct_coeffs, align=4
> +        .short  11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
> +        .short  16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
> +        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
> +        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
> +endconst
> +
> +const iadst16_coeffs, align=4
> +        .short  16364, 804, 15893, 3981, 14811, 7005, 13160, 9760
> +        .short  11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207
> +endconst
> +
> +@ Do two 4x4 transposes, using q registers for the subtransposes that don't

it's four 4x4 transposes, not two

> +@ need to address the individual d registers.
> +@ r0,r1 == rq1, r2,r3 == rq1, etc
> +.macro transpose16_q_2x_4x4 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, 
> r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
> +        vtrn.32          \rq0, \rq1
> +        vtrn.32          \rq2, \rq3
> +        vtrn.32          \rq4, \rq5
> +        vtrn.32          \rq6, \rq7
> +        vtrn.16          \r0,  \r1
> +        vtrn.16          \r2,  \r3
> +        vtrn.16          \r4,  \r5
> +        vtrn.16          \r6,  \r7
> +        vtrn.16          \r8,  \r9
> +        vtrn.16          \r10, \r11
> +        vtrn.16          \r12, \r13
> +        vtrn.16          \r14, \r15
> +.endm
> +
> +@ out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
> +@ out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
> +@ in/out are d registers
> +.macro mbutterfly0 out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4, neg=0
> +        vadd.s16        \tmpd1, \in1,  \in2
> +        vsub.s16        \tmpd2, \in1,  \in2
> +        vmull.s16       \tmpq3, \tmpd1, d0[0]
> +        vmull.s16       \tmpq4, \tmpd2, d0[0]
> +.if \neg > 0
> +        vneg.s32        \tmpq3, \tmpq3
> +.endif
> +        vrshrn.s32      \out1, \tmpq3, #14
> +        vrshrn.s32      \out2, \tmpq4, #14
> +.endm

an empty line after each .endm would improve readability

> +@ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
> +@ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
> +@ Same as mbutterfly0, but with input being 2 q registers, output
> +@ being 4 d registers.
> +@ This can do with either 4 or 6 temporary q registers.
> +.macro dmbutterfly0 out1, out2, out3, out4, in1, in2, tmpq1, tmpq2, tmpd11, 
> tmpd12, tmpd21, tmpd22, tmpq3, tmpq4, tmpq5, tmpq6
> +        vadd.s16        \tmpq1, \in1,  \in2
> +        vsub.s16        \tmpq2, \in1,  \in2
> +        vmull.s16       \tmpq3, \tmpd11, d0[0]
> +        vmull.s16       \tmpq4, \tmpd12, d0[0]
> +.ifb \tmpq5
> +        vrshrn.s32      \out1, \tmpq3, #14
> +        vrshrn.s32      \out2, \tmpq4, #14
> +        vmull.s16       \tmpq3, \tmpd21, d0[0]
> +        vmull.s16       \tmpq4, \tmpd22, d0[0]
> +        vrshrn.s32      \out3, \tmpq3, #14
> +        vrshrn.s32      \out4, \tmpq4, #14
> +.else
> +        vmull.s16       \tmpq5, \tmpd21, d0[0]
> +        vmull.s16       \tmpq6, \tmpd22, d0[0]
> +        vrshrn.s32      \out1, \tmpq3, #14
> +        vrshrn.s32      \out2, \tmpq4, #14
> +        vrshrn.s32      \out3, \tmpq5, #14
> +        vrshrn.s32      \out4, \tmpq6, #14
> +.endif
> +.endm
> +@ out1 = in1 * coef1 - in2 * coef2
> +@ out2 = in1 * coef2 + in2 * coef1
> +@ out are 2 q registers, in are 2 d registers
> +.macro mbutterfly_l out1, out2, in1, in2, coef1, coef2
> +        vmull.s16       \out1, \in1, \coef1
> +        vmlsl.s16       \out1, \in2, \coef2
> +        vmull.s16       \out2, \in1, \coef2

doing the second vmull before the vmlsl is preferable on in-order cores
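For example, something like this (untested sketch), so that neither multiply-accumulate immediately follows the vmull whose accumulator it updates:

        vmull.s16       \out1, \in1, \coef1
        vmull.s16       \out2, \in1, \coef2
        vmlsl.s16       \out1, \in2, \coef2
        vmlal.s16       \out2, \in2, \coef1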

> +        vmlal.s16       \out2, \in2, \coef1
> +.endm
> +@ out1,out2 = in1,in2 * coef1 - in3,in4 * coef2
> +@ out3,out4 = in1,in2 * coef2 + in3,in4 * coef1
> +@ out are 4 q registers, in are 4 d registers
> +.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, in3, in4, coef1, coef2
> +        vmull.s16       \out1, \in1, \coef1
> +        vmull.s16       \out2, \in2, \coef1
> +        vmull.s16       \out3, \in1, \coef2
> +        vmull.s16       \out4, \in2, \coef2
> +        vmlsl.s16       \out1, \in3, \coef2
> +        vmlsl.s16       \out2, \in4, \coef2
> +        vmlal.s16       \out3, \in3, \coef1
> +        vmlal.s16       \out4, \in4, \coef1
> +.endm
> +@ in1 = (in1 * coef1 - in2 * coef2 + (1 << 13)) >> 14
> +@ in2 = (in1 * coef2 + in2 * coef1 + (1 << 13)) >> 14
> +@ in are 2 d registers, tmp are 2 q registers
> +.macro mbutterfly in1, in2, coef1, coef2, tmp1, tmp2, neg=0
> +        mbutterfly_l    \tmp1, \tmp2, \in1, \in2, \coef1, \coef2
> +.if \neg > 0
> +        vneg.s32        \tmp2, \tmp2
> +.endif
> +        vrshrn.s32      \in1, \tmp1,  #14
> +        vrshrn.s32      \in2, \tmp2,  #14
> +.endm
> +@ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 
> 13)) >> 14
> +@ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 
> 13)) >> 14
> +@ inout are 4 d registers, tmp are 4 q registers
> +.macro dmbutterfly inout1, inout2, inout3, inout4, coef1, coef2, tmp1, tmp2, 
> tmp3, tmp4
> +        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, 
> \inout3, \inout4, \coef1, \coef2
> +        vrshrn.s32      \inout1, \tmp1,  #14
> +        vrshrn.s32      \inout2, \tmp2,  #14
> +        vrshrn.s32      \inout3, \tmp3,  #14
> +        vrshrn.s32      \inout4, \tmp4,  #14
> +.endm
> +.macro mbutterfly_neg in1, in2, coef1, coef2, tmp1, tmp2
> +        mbutterfly      \in1, \in2, \coef1, \coef2, \tmp1, \tmp2, 1
> +.endm

this macro is a little pointless, readability is not really worse for

mbutterfly ..., neg=1 vs mbutterfly_neg ...

> +@ out1 = in1 + in2
> +@ out2 = in1 - in2
> +.macro butterfly out1, out2, in1, in2
> +        vadd.s16        \out1, \in1, \in2
> +        vsub.s16        \out2, \in1, \in2
> +.endm
> +@ out1 = in1 - in2
> +@ out2 = in1 + in2
> +.macro butterfly_r out1, out2, in1, in2
> +        vsub.s16        \out1, \in1, \in2
> +        vadd.s16        \out2, \in1, \in2
> +.endm
> +@ out1 = (in1 + in2 + (1 << 13)) >> 14
> +@ out2 = (in1 - in2 + (1 << 13)) >> 14
> +@ out are 2 d registers, in are 2 q registers, tmp are 2 q registers
> +.macro butterfly_n out1, out2, in1, in2, tmp1, tmp2
> +        vadd.s32        \tmp1, \in1, \in2
> +        vsub.s32        \tmp2, \in1, \in2
> +        vrshrn.s32      \out1, \tmp1,  #14
> +        vrshrn.s32      \out2, \tmp2,  #14
> +.endm
> +@ out1,out2 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
> +@ out3,out4 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
> +@ out are 4 d registers, in are 4 q registers, tmp are 4 q registers
> +.macro dbutterfly_n out1, out2, out3, out4, in1, in2, in3, in4, tmp1, tmp2, 
> tmp3, tmp4
> +        vadd.s32        \tmp1, \in1, \in3
> +        vadd.s32        \tmp2, \in2, \in4
> +        vsub.s32        \tmp3, \in1, \in3
> +        vsub.s32        \tmp4, \in2, \in4
> +        vrshrn.s32      \out1, \tmp1,  #14
> +        vrshrn.s32      \out2, \tmp2,  #14
> +        vrshrn.s32      \out3, \tmp3,  #14
> +        vrshrn.s32      \out4, \tmp4,  #14
> +.endm
> +
> +
> +.macro iwht4 c0, c1, c2, c3
> +        vadd.i16        \c0,  \c0,  \c1
> +        vsub.i16        d17,  \c2,  \c3
> +        vsub.i16        d16,  \c0,  d17
> +        vshr.s16        d16,  d16,  #1
> +        vsub.i16        \c2,  d16,  \c1
> +        vsub.i16        \c1,  d16,  \c3
> +        vadd.i16        \c3,  d17,  \c2
> +        vsub.i16        \c0,  \c0,  \c1
> +.endm
> +
> +.macro idct4 c0, c1, c2, c3
> +        vadd.i16        d16,  \c0,  \c2
> +        vsub.i16        d17,  \c0,  \c2
> +        vmull.s16       q11,  \c1,  d0[1]
> +        vmull.s16       q12,  \c3,  d0[2]

vmlsl.s16 q11, \c3,  d0[2] instead, and reorder for in-order cores

> +        vmull.s16       q13,  \c1,  d0[2]
> +        vmull.s16       q14,  \c3,  d0[1]

likewise vmlal.s16 q13, \c3,  d0[1] here
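Folding both odd-coefficient products into multiply-accumulates also makes the later vadd.i32/vsub.i32 on q11/q13 unnecessary; roughly (untested sketch, with the DC multiplies interleaved to hide the accumulator latency):

        vmull.s16       q11,  \c1,  d0[1]
        vmull.s16       q13,  \c1,  d0[2]
        vmull.s16       q9,   d16,  d0[0]
        vmull.s16       q10,  d17,  d0[0]
        vmlsl.s16       q11,  \c3,  d0[2]   @ c1*d0[1] - c3*d0[2]
        vmlal.s16       q13,  \c3,  d0[1]   @ c1*d0[2] + c3*d0[1]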

> +        vmull.s16       q9,   d16,  d0[0]
> +        vmull.s16       q10,  d17,  d0[0]
> +        vadd.i32        q13,  q13,  q14
> +        vsub.i32        q11,  q11,  q12
> +        vrshrn.s32      d16,  q9,   #14
> +        vrshrn.s32      d19,  q13,  #14
> +        vrshrn.s32      d17,  q10,  #14
> +        vrshrn.s32      d18,  q11,  #14
> +        vadd.i16        \c0,  d16,  d19
> +        vadd.i16        \c1,  d17,  d18
> +        vsub.i16        \c2,  d17,  d18
> +        vsub.i16        \c3,  d16,  d19
> +.endm
> +
> +.macro iadst4 c0, c1, c2, c3
> +        vmull.s16       q10,  \c0,  d1[0]
> +        vmlal.s16       q10,  \c2,  d1[1]
> +        vmlal.s16       q10,  \c3,  d1[2]
> +        vmull.s16       q11,  \c0,  d1[2]
> +        vmlsl.s16       q11,  \c2,  d1[0]
> +        vsub.s16        \c0,  \c0,  \c2
> +        vmlsl.s16       q11,  \c3,  d1[1]
> +        vadd.s16        \c0,  \c0,  \c3
> +        vmull.s16       q13,  \c1,  d1[3]
> +        vmull.s16       q12,  \c0,  d1[3]
> +        vadd.s32        q14,  q10,  q13
> +        vadd.s32        q1,   q11,  q13
> +        vrshrn.s32      \c0,  q14,  #14
> +        vadd.s32        q10,  q10,  q11
> +        vrshrn.s32      \c1,  q1,   #14
> +        vsub.s32        q10,  q10,  q13
> +        vrshrn.s32      \c2,  q12,  #14
> +        vrshrn.s32      \c3,  q10,  #14

instruction scheduling can be optimized for this one too

> +.endm
> +
> +@ The public functions in this file have got the following signature:
> +@ void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
> +
> +.macro itxfm_func4x4 txfm1, txfm2
> +function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
> +.ifc \txfm1,\txfm2
> +.ifc \txfm1,idct
> +        movrel          r12, itxfm4_coeffs
> +        vld1.16         {d0}, [r12,:64]
> +.endif
> +.ifc \txfm1,iadst
> +        movrel          r12, iadst4_coeffs
> +        vld1.16         {d1}, [r12,:64]
> +.endif
> +.else
> +        movrel          r12, itxfm4_coeffs
> +        vld1.16         {q0}, [r12,:128]
> +.endif

aligned 8-byte and 16-byte loads are equally fast, so this just adds complexity without gain
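In other words, the whole .ifc block could be reduced to unconditionally loading both halves, as the mixed case already does (untested sketch; d0 ends up with the idct coefficients and d1 with the iadst4 ones in every variant):

        movrel          r12, itxfm4_coeffs
        vld1.16         {q0}, [r12,:128]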

> +
> +        vmov.i16        q15, #0
> +.ifc \txfm1,idct
> +.ifc \txfm2,idct
> +        cmp             r3,  #1
> +        bne             1f
> +        @ DC-only for idct/idct
> +        vld1.16         {d4[]},   [r2]

alignment

> +        vmull.s16       q2,  d4,  d0[0]
> +        vrshrn.s32      d4,  q2,  #14
> +        vmull.s16       q2,  d4,  d0[0]
> +        vrshrn.s32      d4,  q2,  #14
> +        vst1.16         {d30[0]}, [r2]

same

> +        vdup.16         q2,  d4[0]
> +        vmov            q3,  q2

vdup first to q3 to avoid data dependency
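That is, two independent dups instead of a dup followed by a dependent vmov (untested):

        vdup.16         q3,  d4[0]
        vdup.16         q2,  d4[0]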

> +        b               2f
> +.endif
> +.endif
> +
> +1:
> +        vld1.16         {d4-d7},  [r2,:128]
> +        vst1.16         {q15}, [r2,:128]!
> +
> +.ifc \txfm1,iwht
> +        vshr.s16        q2,  q2,  #2
> +        vshr.s16        q3,  q3,  #2
> +.endif
> +
> +        \txfm1\()4      d4,  d5,  d6,  d7
> +
> +        vst1.16         {q15}, [r2,:128]!
> +        @ Transpose 4x4 with 16 bit elements
> +        vtrn.16         d4,  d5
> +        vtrn.16         d6,  d7
> +        vtrn.32         d4,  d6
> +        vtrn.32         d5,  d7

vtrn.32 q2, q3

> +
> +        \txfm2\()4      d4,  d5,  d6,  d7
> +2:
> +        vld1.32         {d0[]},   [r0,:32], r1
> +        vld1.32         {d0[1]},  [r0,:32], r1
> +.ifnc \txfm1,iwht
> +        vrshr.s16       q2,  q2,  #4
> +        vrshr.s16       q3,  q3,  #4
> +.endif
> +        vaddw.u8        q2,  q2,  d0
> +        vld1.32         {d1[]},   [r0,:32], r1
> +        vld1.32         {d1[1]},  [r0,:32], r1
> +        vqmovun.s16     d0,  q2
> +        sub             r0,  r0,  r1, lsl #2

since we have free GP registers I'd use a different register for the loads and the stores. probably not faster though
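A possible shape (untested sketch; it assumes r3, which held eob, is dead by this point and can serve as a second pointer, removing the sub):

        mov             r3,  r0          @ keep r0 advancing for the loads, r3 for the stores
        @ ... loads via [r0,:32], r1 as before ...
        vst1.32         {d0[0]},  [r3,:32], r1
        vst1.32         {d0[1]},  [r3,:32], r1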

> +
> +        vaddw.u8        q3,  q3,  d1
> +        vst1.32         {d0[0]},  [r0,:32], r1
> +        vqmovun.s16     d1,  q3
> +
> +        vst1.32         {d0[1]},  [r0,:32], r1
> +        vst1.32         {d1[0]},  [r0,:32], r1
> +        vst1.32         {d1[1]},  [r0,:32], r1
> +
> +        bx              lr
> +endfunc
> +.endm
> +
> +itxfm_func4x4 idct,  idct
> +itxfm_func4x4 iadst, idct
> +itxfm_func4x4 idct,  iadst
> +itxfm_func4x4 iadst, iadst
> +itxfm_func4x4 iwht,  iwht
> +
> +
> +.macro idct8
> +        dmbutterfly0    d16, d17, d24, d25, q8,  q12, q2, q4, d4, d5, d8, 
> d9, q3, q2, q5, q4 @ q8 = t0a, q12 = t1a
> +        dmbutterfly     d20, d21, d28, d29, d0[1], d0[2], q2,  q3,  q4,  q5 
> @ q10 = t2a, q14 = t3a
> +        dmbutterfly     d18, d19, d30, d31, d0[3], d1[0], q2,  q3,  q4,  q5 
> @ q9  = t4a, q15 = t7a
> +        dmbutterfly     d26, d27, d22, d23, d1[1], d1[2], q2,  q3,  q4,  q5 
> @ q13 = t5a, q11 = t6a
> +
> +        butterfly       q2,  q14, q8,  q14 @ q2 = t0, q14 = t3
> +        butterfly       q3,  q10, q12, q10 @ q3 = t1, q10 = t2
> +        butterfly       q4,  q13, q9,  q13 @ q4 = t4, q13 = t5a
> +        butterfly       q5,  q11, q15, q11 @ q5 = t7, q11 = t6a
> +
> +        butterfly       q8,  q15, q2,  q5  @ q8 = out[0], q15 = out[7]
> +
> +        dmbutterfly0    d4,  d5,  d10, d11, q11, q13, q9,  q13, d18, d19, 
> d26, d27, q2,  q5, q11, q12 @ q2 = t6, q5 = t5
> +
> +        butterfly       q11, q12, q14, q4  @ q11 = out[3], q12 = out[4]
> +        butterfly       q9,  q14, q3,  q2  @ q9 = out[1],  q14 = out[6]
> +        butterfly_r     q13, q10, q10, q5  @ q13 = out[5], q10 = out[2]
> +.endm
> +
> +.macro iadst8
> +        dmbutterfly_l   q4,  q5,  q2,  q3,  d30, d31, d16, d17, d2[1], d2[0] 
> @ q4,q5  = t1a, q2,q3 = t0a
> +        dmbutterfly_l   q8,  q15, q6,  q7,  d22, d23, d24, d25, d3[1], d3[0] 
> @ q8,q15 = t5a, q6,q7 = t4a
> +
> +        dbutterfly_n    d22, d23, d4,  d5,  q2,  q3,  q6,  q7,  q11, q12, 
> q2,  q3 @ q11 = t0, q2 = t4
> +
> +        dbutterfly_n    d24, d25, d6,  d7,  q4,  q5,  q8,  q15, q12, q3,  
> q6,  q7 @ q12 = t1, q3 = t5
> +
> +        dmbutterfly_l   q6,  q7,  q4,  q5,  d26, d27, d20, d21, d2[3], d2[2] 
> @ q6,q7 = t3a, q4,q5 = t2a
> +        dmbutterfly_l   q10, q13, q8,  q15, d18, d19, d28, d29, d3[3], d3[2] 
> @ q10,q13 = t7a, q8,q15 = t6a
> +
> +        dbutterfly_n    d18, d19, d8,  d9,  q4,  q5,  q8,  q15, q9,  q14, 
> q4, q5 @ q9 = t2, q4 = t6
> +        dbutterfly_n    d16, d17, d12, d13, q6,  q7,  q10, q13, q8,  q15, 
> q6, q7 @ q8 = t3, q6 = t7
> +
> +        butterfly       q15, q12, q12, q8 @ q15 = -out[7], q12 = t3
> +        vneg.s16        q15, q15          @ q15 = out[7]
> +        butterfly       q8,  q9,  q11, q9 @ q8 = out[0], q9 = t2
> +
> +        dmbutterfly_l   q10, q11, q5,  q7,  d4,  d5,  d6,  d7,  d0[1], d0[2] 
> @ q10,q11 = t5a, q5,q7 = t4a
> +        dmbutterfly_l   q2,  q3,  q13, q14, d12, d13, d8,  d9,  d0[2], d0[1] 
> @ q2,q3 = t6a, q13,q14 = t7a
> +
> +        dbutterfly_n    d28, d29, d8,  d9,  q10, q11, q13, q14, q4,  q6,  
> q10, q11 @ q14 = out[6], q4 = t7
> +
> +        dmbutterfly0    d22, d23, d24, d25, q9,  q12, q6, q13, d12, d13, 
> d26, d27, q9, q10 @ q11 = -out[3], q12 = out[4]
> +        vneg.s16        q11, q11      @ q11 = out[3]
> +
> +        dbutterfly_n    d18, d19, d4,  d5,  q5,  q7,  q2,  q3,  q9, q10, q2, 
>  q3 @ q9 = -out[1], q2 = t6
> +        vneg.s16        q9,  q9       @ q9 = out[1]
> +
> +        dmbutterfly0    d20, d21, d26, d27, q2,  q4,  q3, q5,  d6,  d7,  
> d10, d11, q6,  q7 @ q10 = out[2], q13 = -out[5]
> +        vneg.s16        q13, q13      @ q13 = out[5]
> +.endm
> +
> +
> +.macro itxfm_func8x8 txfm1, txfm2
> +function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
> +        @ Push q4-q7 if iadst is used, idct requires
> +        @ a few scratch registers less, so only push q4-q5
> +        @ if only idct is involved.
> +        @ The iadst also uses a few coefficients from
> +        @ idct, so those always need to be loaded.
> +        movrel          r12, idct_coeffs

move this into the last else below

> +        vld1.16         {q0}, [r12,:128]

this can follow after this block if the iadst8_coeffs load uses post-increment

> +.ifc \txfm1,iadst
> +        movrel          r12, iadst8_coeffs
> +        vld1.16         {q1}, [r12,:128]
> +        vpush           {q4-q7}
> +.else
> +.ifc \txfm2,iadst

does .elseifc work?

> +        movrel          r12, iadst8_coeffs
> +        vld1.16         {q1}, [r12,:128]
> +        vpush           {q4-q7}
> +.else
> +        vpush           {q4-q5}
> +.endif
> +.endif
> +
> +        vmov.i16        q2, #0
> +        vmov.i16        q3, #0
> +
> +.ifc \txfm1,idct
> +.ifc \txfm2,idct
> +        cmp             r3,  #1
> +        bne             1f
> +        @ DC-only for idct/idct
> +        vld1.16         {d16[]},   [r2]

alignment

> +        vmull.s16       q8,  d16, d0[0]
> +        vrshrn.s32      d16, q8,  #14
> +        vmull.s16       q8,  d16, d0[0]
> +        vrshrn.s32      d16, q8,  #14
> +        vdup.16         q8,  d16[0]
> +        vmov            q9,  q8
> +        vmov            q10, q8
> +        vmov            q11, q8
> +        vmov            q12, q8
> +        vmov            q13, q8
> +        vmov            q14, q8
> +        vmov            q15, q8

these can all be vdup'd from d16[0] instead of the vmov chain
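That is (untested), each dup reads d16 directly rather than waiting on the q8 dup:

        vdup.16         q9,  d16[0]
        vdup.16         q10, d16[0]
        vdup.16         q11, d16[0]
        vdup.16         q12, d16[0]
        vdup.16         q13, d16[0]
        vdup.16         q14, d16[0]
        vdup.16         q15, d16[0]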

> +        vst1.16         {d4[0]}, [r2]

alignment

> +        b               2f
> +.endif
> +.endif
> +1:
> +        vld1.16         {q8-q9},    [r2,:128]!
> +        vld1.16         {q10-q11},  [r2,:128]!
> +        vld1.16         {q12-q13},  [r2,:128]!
> +        vld1.16         {q14-q15},  [r2,:128]!
> +        sub             r2,  r2,  #128
> +        vst1.16         {q2-q3}, [r2,:128]!
> +        vst1.16         {q2-q3}, [r2,:128]!
> +        vst1.16         {q2-q3}, [r2,:128]!
> +        vst1.16         {q2-q3}, [r2,:128]!
> +
> +        \txfm1\()8
> +
> +        @ Transpose 8x8 with 16 bit elements
> +        vswp            d17, d24
> +        vswp            d19, d26
> +        vswp            d21, d28
> +        vswp            d23, d30
> +        transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15
> +
> +        \txfm2\()8
> +2:
> +        @ Add into the destination
> +        vld1.8          {d4},  [r0,:64], r1
> +        vrshr.s16       q8,  q8,  #5
> +        vld1.8          {d5},  [r0,:64], r1
> +        vrshr.s16       q9,  q9,  #5
> +        vld1.8          {d6},  [r0,:64], r1
> +        vrshr.s16       q10, q10, #5
> +        vaddw.u8        q8,  q8,  d4
> +        vld1.8          {d7},  [r0,:64], r1
> +        vrshr.s16       q11, q11, #5
> +        vaddw.u8        q9,  q9,  d5
> +        vld1.8          {d8},  [r0,:64], r1
> +        vrshr.s16       q12, q12, #5
> +        vaddw.u8        q10, q10, d6
> +        vqmovun.s16     d4,  q8
> +        vld1.8          {d9},  [r0,:64], r1
> +        vrshr.s16       q13, q13, #5
> +        vaddw.u8        q11, q11, d7
> +        vqmovun.s16     d5,  q9
> +        vld1.8          {d10}, [r0,:64], r1
> +        vrshr.s16       q14, q14, #5
> +        vaddw.u8        q12, q12, d8
> +        vqmovun.s16     d6,  q10
> +        vld1.8          {d11}, [r0,:64], r1
> +        vrshr.s16       q15, q15, #5
> +        vaddw.u8        q13, q13, d9
> +        vqmovun.s16     d7,  q11
> +        sub             r0,  r0,  r1, lsl #3

could use a different register for the loads and stores

> +
> +        vst1.8          {d4},  [r0,:64], r1
> +        vaddw.u8        q14, q14, d10
> +        vst1.8          {d5},  [r0,:64], r1
> +        vqmovun.s16     d8,  q12
> +        vst1.8          {d6},  [r0,:64], r1
> +        vaddw.u8        q15, q15, d11
> +        vst1.8          {d7},  [r0,:64], r1
> +        vqmovun.s16     d9,  q13
> +        vst1.8          {d8},  [r0,:64], r1
> +        vqmovun.s16     d10, q14
> +        vst1.8          {d9},  [r0,:64], r1
> +        vqmovun.s16     d11, q15
> +
> +        vst1.8          {d10}, [r0,:64], r1
> +        vst1.8          {d11}, [r0,:64], r1
> +
> +.ifc \txfm1,iadst
> +        vpop            {q4-q7}
> +.else
> +.ifc \txfm2,iadst
> +        vpop            {q4-q7}
> +.else
> +        vpop            {q4-q5}
> +.endif
> +.endif
> +        bx              lr
> +endfunc
> +.endm
> +
> +itxfm_func8x8 idct,  idct
> +itxfm_func8x8 iadst, idct
> +itxfm_func8x8 idct,  iadst
> +itxfm_func8x8 iadst, iadst
> +
> +
> +function idct16x16_dc_add_neon
> +        movrel          r12, idct_coeffs
> +        vld1.16         {d0}, [r12,:64]
> +
> +        vmov.i16        q2, #0
> +
> +        vld1.16         {d16[]},   [r2]

alignment

> +        vmull.s16       q8,  d16, d0[0]
> +        vrshrn.s32      d16, q8,  #14
> +        vmull.s16       q8,  d16, d0[0]
> +        vrshrn.s32      d16, q8,  #14
> +        vdup.16         q8,  d16[0]
> +        vst1.16         {d4[0]}, [r2]

alignment

> +
> +        vrshr.s16       q8,  q8,  #6
> +
> +        mov             r12, #16
> +1:
> +        @ Loop to add the constant from q8 into all 16x16 outputs
> +        vld1.8          {q3},  [r0,:128]
> +        vaddw.u8        q10, q8,  d6
> +        vaddw.u8        q11, q8,  d7
> +        vqmovun.s16     d6,  q10
> +        vqmovun.s16     d7,  q11
> +        vst1.8          {q3},  [r0,:128], r1
> +        subs            r12, r12, #1
> +        bne             1b
> +
> +        bx              lr
> +endfunc
> +
> +.macro idct16
> +        mbutterfly0     d16, d24, d16, d24, d4, d6,  q2,  q3 @ d16 = t0a,  
> d24 = t1a
> +        mbutterfly      d20, d28, d0[1], d0[2], q2,  q3  @ d20 = t2a,  d28 = 
> t3a
> +        mbutterfly      d18, d30, d0[3], d1[0], q2,  q3  @ d18 = t4a,  d30 = 
> t7a
> +        mbutterfly      d26, d22, d1[1], d1[2], q2,  q3  @ d26 = t5a,  d22 = 
> t6a
> +        mbutterfly      d17, d31, d1[3], d2[0], q2,  q3  @ d17 = t8a,  d31 = 
> t15a
> +        mbutterfly      d25, d23, d2[1], d2[2], q2,  q3  @ d25 = t9a,  d23 = 
> t14a
> +        mbutterfly      d21, d27, d2[3], d3[0], q2,  q3  @ d21 = t10a, d27 = 
> t13a
> +        mbutterfly      d29, d19, d3[1], d3[2], q2,  q3  @ d29 = t11a, d19 = 
> t12a
> +
> +        butterfly       d4,  d28, d16, d28               @ d4  = t0,   d28 = 
> t3
> +        butterfly       d5,  d20, d24, d20               @ d5  = t1,   d20 = 
> t2
> +        butterfly       d6,  d26, d18, d26               @ d6  = t4,   d26 = 
> t5
> +        butterfly       d7,  d22, d30, d22               @ d7  = t7,   d22 = 
> t6
> +        butterfly       d16, d25, d17, d25               @ d16 = t8,   d25 = 
> t9
> +        butterfly       d24, d21, d29, d21               @ d24 = t11,  d21 = 
> t10
> +        butterfly       d17, d27, d19, d27               @ d17 = t12,  d27 = 
> t13
> +        butterfly       d29, d23, d31, d23               @ d29 = t15,  d23 = 
> t14
> +
> +        mbutterfly0     d22, d26, d22, d26, d18, d30, q9,  q15 @ d22 = t6a, 
> d26 = t5a
> +        mbutterfly      d23, d25, d0[1], d0[2], q9,  q15 @ d23 = t9a, d25 = 
> t14a
> +        mbutterfly_neg  d27, d21, d0[1], d0[2], q9,  q15 @ d27 = t13a, d21 = 
> t10a
> +
> +        butterfly       d18, d7,  d4,  d7                @ d18 = t0a,  d7  = 
> t7a
> +        butterfly       d19, d22, d5,  d22               @ d19 = t1a,  d22 = 
> t6
> +        butterfly       d4,  d26, d20, d26               @ d4  = t2a,  d26 = 
> t5
> +        butterfly       d5,  d6,  d28, d6                @ d5  = t3a,  d6  = 
> t4
> +        butterfly       d20, d28, d16, d24               @ d20 = t8a,  d28 = 
> t11a
> +        butterfly       d24, d21, d23, d21               @ d24 = t9,   d21 = 
> t10
> +        butterfly       d23, d27, d25, d27               @ d23 = t14,  d27 = 
> t13
> +        butterfly       d25, d29, d29, d17               @ d25 = t15a, d29 = 
> t12a
> +
> +        mbutterfly0     d27, d21, d27, d21, d16, d30, q8, q15 @ d27 = t13a, 
> d21 = t10a
> +        mbutterfly0     d29, d28, d29, d28, d16, d30, q8, q15 @ d29 = t12,  
> d28 = t11
> +
> +        vswp            d27, d29                         @ d27 = t12, d29 = 
> t13a
> +        vswp            d28, d27                         @ d28 = t12, d27 = 
> t11
> +        butterfly       d16, d31, d18, d25               @ d16 = out[0], d31 
> = out[15]
> +        butterfly       d17, d30, d19, d23               @ d17 = out[1], d30 
> = out[14]
> +        butterfly_r     d25, d22, d22, d24               @ d25 = out[9], d22 
> = out[6]
> +        butterfly       d23, d24, d7,  d20               @ d23 = out[7], d24 
> = out[8]
> +        butterfly       d18, d29, d4,  d29               @ d18 = out[2], d29 
> = out[13]
> +        butterfly       d19, d28, d5,  d28               @ d19 = out[3], d28 
> = out[12]
> +        vmov            d4,  d21                         @ d4  = t10a
> +        butterfly       d20, d27, d6,  d27               @ d20 = out[4], d27 
> = out[11]
> +        butterfly       d21, d26, d26, d4                @ d21 = out[5], d26 
> = out[10]
> +.endm
> +
> +.macro iadst16
> +        movrel          r12, iadst16_coeffs
> +        vld1.16         {q0-q1}, [r12,:128]
> +
> +        mbutterfly_l    q3,  q2,  d31, d16, d0[1], d0[0] @ q3  = t1,   q2  = 
> t0
> +        mbutterfly_l    q5,  q4,  d23, d24, d2[1], d2[0] @ q5  = t9,   q4  = 
> t8
> +        butterfly_n     d31, d24, q3,  q5,  q6,  q5      @ d31 = t1a,  d24 = 
> t9a
> +        mbutterfly_l    q7,  q6,  d29, d18, d0[3], d0[2] @ q7  = t3,   q6  = 
> t2
> +        butterfly_n     d16, d23, q2,  q4,  q3,  q4      @ d16 = t0a,  d23 = 
> t8a
> +
> +        mbutterfly_l    q3,  q2,  d21, d26, d2[3], d2[2] @ q3  = t11,  q2  = 
> t10
> +        butterfly_n     d29, d26, q7,  q3,  q4,  q3      @ d29 = t3a,  d26 = 
> t11a
> +        mbutterfly_l    q5,  q4,  d27, d20, d1[1], d1[0] @ q5  = t5,   q4  = 
> t4
> +        butterfly_n     d18, d21, q6,  q2,  q3,  q2      @ d18 = t2a,  d21 = 
> t10a
> +
> +        mbutterfly_l    q7,  q6,  d19, d28, d3[1], d3[0] @ q7  = t13,  q6  = 
> t12
> +        butterfly_n     d20, d28, q5,  q7,  q2,  q7      @ d20 = t5a,  d28 = 
> t13a
> +        mbutterfly_l    q3,  q2,  d25, d22, d1[3], d1[2] @ q3  = t7,   q2  = 
> t6
> +        butterfly_n     d27, d19, q4,  q6,  q5,  q6      @ d27 = t4a,  d19 = 
> t12a
> +
> +        mbutterfly_l    q5,  q4,  d17, d30, d3[3], d3[2] @ q5  = t15,  q4  = 
> t14
> +        movrel          r12, idct_coeffs
> +        vld1.16         {q0}, [r12,:128]
> +        butterfly_n     d22, d30, q3,  q5,  q6,  q5      @ d22 = t7a,  d30 = 
> t15a
> +        mbutterfly_l    q7,  q6,  d23, d24, d0[3], d1[0] @ q7  = t9,   q6  = 
> t8
> +        butterfly_n     d25, d17, q2,  q4,  q3,  q4      @ d25 = t6a,  d17 = 
> t14a
> +
> +        mbutterfly_l    q2,  q3,  d28, d19, d1[0], d0[3] @ q2  = t12,  q3  = 
> t13
> +        butterfly_n     d23, d19, q6,  q2,  q4,  q2      @ d23 = t8a,  d19 = 
> t12a
> +        mbutterfly_l    q5,  q4,  d21, d26, d1[1], d1[2] @ q5  = t11,  q4  = 
> t10
> +        butterfly_r     d4,  d27, d16, d27               @ d4  = t4,   d27 = 
> t0
> +        butterfly_n     d24, d28, q7,  q3,  q6,  q3      @ d24 = t9a,  d28 = 
> t13a
> +
> +        mbutterfly_l    q6,  q7,  d30, d17, d1[2], d1[1] @ q6  = t14,  q7  = 
> t15
> +        butterfly_r     d5,  d20, d31, d20               @ d5  = t5,   d20 = 
> t1
> +        butterfly_n     d21, d17, q4,  q6,  q3,  q6      @ d21 = t10a, d17 = 
> t14a
> +        butterfly_n     d26, d30, q5,  q7,  q4,  q7      @ d26 = t11a, d30 = 
> t15a
> +
> +        butterfly_r     d6,  d25, d18, d25               @ d6  = t6,   d25 = 
> t2
> +        butterfly_r     d7,  d22, d29, d22               @ d7  = t7,   d22 = 
> t3
> +
> +        mbutterfly_l    q5,  q4,  d19, d28, d0[1], d0[2] @ q5  = t13,  q4  = 
> t12
> +        mbutterfly_l    q6,  q7,  d30, d17, d0[2], d0[1] @ q6  = t14,  q7  = 
> t15
> +
> +        butterfly_n     d18, d30, q4,  q6,  q8,  q6      @ d18 = out[2],   
> d30 = t14a
> +        butterfly_n     d29, d17, q5,  q7,  q6,  q7      @ d29 = -out[13], 
> d17 = t15a
> +        vneg.s16        d29, d29                         @ d29 = out[13]
> +
> +        mbutterfly_l    q5,  q4,  d4,  d5,  d0[1], d0[2] @ q5  = t5a,  q4  = 
> t4a
> +        mbutterfly_l    q6,  q7,  d7,  d6,  d0[2], d0[1] @ q6  = t6a,  q7  = 
> t7a
> +
> +        butterfly       d2,  d6,  d27, d25               @ d2 = out[0], d6 = 
> t2a
> +        butterfly       d3,  d7,  d23, d21               @ d3 =-out[1], d7 = 
> t10
> +
> +        butterfly_n     d19, d31, q4,  q6,  q2,  q4      @ d19 = -out[3],  
> d31 = t6
> +        vneg.s16        d19, d19                         @ d19 = out[3]
> +        butterfly_n     d28, d16, q5,  q7,  q2,  q5      @ d28 = out[12],  
> d16 = t7
> +
> +        butterfly       d5,  d8,  d20, d22               @ d5 =-out[15],d8 = 
> t3a
> +        butterfly       d4,  d9,  d24, d26               @ d4 = out[14],d9 = 
> t11
> +
> +        mbutterfly0     d23, d24, d6,  d8,  d10, d11, q6,  q7, 1 @ d23 = 
> out[7], d24 = out[8]
> +        mbutterfly0     d20, d27, d16, d31, d10, d11, q6,  q7    @ d20 = 
> out[4], d27 = out[11]
> +        mbutterfly0     d22, d25, d9,  d7,  d10, d11, q6,  q7    @ d22 = 
> out[6], d25 = out[9]
> +        mbutterfly0     d21, d26, d30, d17, d10, d11, q6,  q7, 1 @ d21 = 
> out[5], d26 = out[10]
> +
> +        vneg.s16        d31, d5                          @ d31 = out[15]
> +        vneg.s16        d17, d3                          @ d17 = out[1]
> +
> +        vmov            d16, d2
> +        vmov            d30, d4
> +.endm
> +
> +.macro itxfm16_1d_funcs txfm
> +@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
> +@ transpose into a horizontal 16x4 slice and store.
> +@ r0 = dst (temp buffer)
> +@ r1 = unused
> +@ r2 = src
> +@ r3 = slice offset
> +function \txfm\()16_1d_4x16_pass1_neon
> +        mov             r12, #32
> +        vmov.s16        q2, #0
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> +        vld1.16         {d\i}, [r2,:64]
> +        vst1.16         {d4},  [r2,:64], r12
> +.endr
> +
> +        \txfm\()16
> +
> +        @ Do four 4x4 transposes. Originally, d16-d31 contain the
> +        @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
> +        @ contain the transposed 4x4 blocks.
> +        transpose16_q_2x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, 
> d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
> +
> +        @ Store the transposed 4x4 blocks horizontally.
> +        cmp             r3,  #12
> +        beq             1f
> +.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
> +        vst1.16         {d\i}, [r0,:64]!
> +.endr
> +        bx              lr
> +1:
> +        @ Special case: For the last input column (r3 == 12),
> +        @ which would be stored as the last row in the temp buffer,
> +        @ don't store the first 4x4 block, but keep it in registers
> +        @ for the first slice of the second pass (where it is the
> +        @ last 4x4 block).
> +        add             r0,  r0,  #8
> +.irp i, 20, 24, 28
> +        vst1.16         {d\i}, [r0,:64]!
> +.endr
> +        add             r0,  r0,  #8
> +.irp i, 21, 25, 29
> +        vst1.16         {d\i}, [r0,:64]!
> +.endr
> +        add             r0,  r0,  #8
> +.irp i, 22, 26, 30
> +        vst1.16         {d\i}, [r0,:64]!
> +.endr
> +        add             r0,  r0,  #8
> +.irp i, 23, 27, 31
> +        vst1.16         {d\i}, [r0,:64]!
> +.endr
> +        vmov            d28, d16
> +        vmov            d29, d17
> +        vmov            d30, d18
> +        vmov            d31, d19
> +        bx              lr
> +endfunc
> +
> +@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
> +@ load the destination pixels (from a similar 4x16 slice), add and store 
> back.
> +@ r0 = dst
> +@ r1 = dst stride
> +@ r2 = src (temp buffer)
> +@ r3 = slice offset
> +function \txfm\()16_1d_4x16_pass2_neon
> +        mov             r12, #32
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
> +        vld1.16         {d\i}, [r2,:64], r12
> +.endr
> +        cmp             r3,  #0
> +        beq             1f
> +.irp i, 28, 29, 30, 31
> +        vld1.16         {d\i}, [r2,:64], r12
> +.endr
> +1:
> +
> +        \txfm\()16
> +
> +.macro load_add_store coef0, coef1, coef2, coef3
> +        vrshr.s16       \coef0, \coef0, #6
> +        vrshr.s16       \coef1, \coef1, #6
> +
> +        vld1.32         {d4[]},   [r0,:32], r1
> +        vld1.32         {d4[1]},  [r0,:32], r1
> +        vrshr.s16       \coef2, \coef2, #6
> +        vrshr.s16       \coef3, \coef3, #6
> +        vld1.32         {d5[]},   [r0,:32], r1
> +        vld1.32         {d5[1]},  [r0,:32], r1
> +        vaddw.u8        \coef0, \coef0, d4
> +        vld1.32         {d6[]},   [r0,:32], r1
> +        vld1.32         {d6[1]},  [r0,:32], r1
> +        vaddw.u8        \coef1, \coef1, d5
> +        vld1.32         {d7[]},   [r0,:32], r1
> +        vld1.32         {d7[1]},  [r0,:32], r1
> +
> +        vqmovun.s16     d4,  \coef0
> +        vqmovun.s16     d5,  \coef1
> +        sub             r0,  r0,  r1, lsl #3

could use an additional register

> +        vaddw.u8        \coef2, \coef2, d6
> +        vaddw.u8        \coef3, \coef3, d7
> +        vst1.32         {d4[0]},  [r0,:32], r1
> +        vst1.32         {d4[1]},  [r0,:32], r1
> +        vqmovun.s16     d6,  \coef2
> +        vst1.32         {d5[0]},  [r0,:32], r1
> +        vst1.32         {d5[1]},  [r0,:32], r1
> +        vqmovun.s16     d7,  \coef3
> +
> +        vst1.32         {d6[0]},  [r0,:32], r1
> +        vst1.32         {d6[1]},  [r0,:32], r1
> +        vst1.32         {d7[0]},  [r0,:32], r1
> +        vst1.32         {d7[1]},  [r0,:32], r1
> +.endm
> +        load_add_store  q8,  q9,  q10, q11
> +        load_add_store  q12, q13, q14, q15
> +.purgem load_add_store
> +
> +        bx              lr
> +endfunc
> +.endm
> +
> +itxfm16_1d_funcs idct
> +itxfm16_1d_funcs iadst
> +
> +.macro itxfm_func16x16 txfm1, txfm2
> +function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
> +.ifc \txfm1,idct
> +.ifc \txfm2,idct
> +        cmp             r3,  #1
> +        beq             idct16x16_dc_add_neon
> +.endif
> +.endif
> +1:

unused label

> +        push            {r4-r7,lr}
> +.ifc \txfm1,iadst
> +        vpush           {q4-q7}
> +.else
> +.ifc \txfm2,iadst
> +        vpush           {q4-q7}
> +.endif
> +.endif
> +        mov             r7,  sp
> +
> +        @ Align the stack, allocate a temp buffer
> +T       mov             r12, sp
> +T       bic             r12, r12, #15
> +T       sub             r12, r12, #512
> +T       mov             sp,  r12
> +A       bic             sp,  sp,  #15
> +A       sub             sp,  sp,  #512
> +
> +        mov             r4,  r0
> +        mov             r5,  r1
> +        mov             r6,  r2
> +
> +.ifc \txfm1,idct
> +        movrel          r12, idct_coeffs
> +        vld1.16         {q0-q1}, [r12,:128]
> +.endif
> +
> +.irp i, 0, 4, 8, 12
> +        add             r0,  sp,  #(\i*32)
> +        add             r2,  r6,  #(\i*2)
> +        mov             r3,  #\i
> +        bl              \txfm1\()16_1d_4x16_pass1_neon
> +.endr
> +.ifc \txfm2,idct
> +        movrel          r12, idct_coeffs
> +        vld1.16         {q0-q1}, [r12,:128]
> +.endif
> +.irp i, 0, 4, 8, 12
> +        add             r0,  r4,  #(\i)
> +        mov             r1,  r5
> +        add             r2,  sp,  #(\i*2)
> +        mov             r3,  #\i
> +        bl              \txfm2\()16_1d_4x16_pass2_neon
> +.endr
> +
> +        mov             sp,  r7
> +.ifc \txfm1,iadst
> +        vpop            {q4-q7}
> +.else
> +.ifc \txfm2,iadst
> +        vpop            {q4-q7}
> +.endif
> +.endif
> +        pop             {r4-r7,pc}
> +endfunc
> +.endm
> +
> +itxfm_func16x16 idct,  idct
> +itxfm_func16x16 iadst, idct
> +itxfm_func16x16 idct,  iadst
> +itxfm_func16x16 iadst, iadst
> +
> +
> +function idct32x32_dc_add_neon
> +        movrel          r12, idct_coeffs
> +        vld1.16         {d0}, [r12,:64]
> +
> +        vmov.i16        q2, #0
> +
> +        vld1.16         {d16[]},   [r2]

alignment

> +        vmull.s16       q8,  d16, d0[0]
> +        vrshrn.s32      d16, q8,  #14
> +        vmull.s16       q8,  d16, d0[0]
> +        vrshrn.s32      d16, q8,  #14
> +        vdup.16         q8,  d16[0]
> +        vst1.16         {d4[0]}, [r2]

ditto

> +
> +        vrshr.s16       q8,  q8,  #6
> +
> +        mov             r12, #32
> +1:
> +        @ Loop to add the constant from q8 into all 32x32 outputs
> +        vld1.8          {q2-q3},  [r0,:128]
> +        vaddw.u8        q10, q8,  d4
> +        vaddw.u8        q11, q8,  d5
> +        vaddw.u8        q12, q8,  d6
> +        vaddw.u8        q13, q8,  d7
> +        vqmovun.s16     d4,  q10
> +        vqmovun.s16     d5,  q11
> +        vqmovun.s16     d6,  q12
> +        vqmovun.s16     d7,  q13
> +        vst1.8          {q2-q3},  [r0,:128], r1
> +        subs            r12, r12, #1
> +        bne             1b
> +
> +        bx              lr
> +endfunc
> +
> +.macro idct32_odd
> +        movrel          r12, idct_coeffs
> +        add             r12, r12, #32
> +        vld1.16         {q0-q1}, [r12,:128]
> +
> +        mbutterfly      d16, d31, d0[0], d0[1], q2, q3 @ d16 = t16a, d31 = 
> t31a
> +        mbutterfly      d24, d23, d0[2], d0[3], q2, q3 @ d24 = t17a, d23 = 
> t30a
> +        mbutterfly      d20, d27, d1[0], d1[1], q2, q3 @ d20 = t18a, d27 = 
> t29a
> +        mbutterfly      d28, d19, d1[2], d1[3], q2, q3 @ d28 = t19a, d19 = 
> t28a
> +        mbutterfly      d18, d29, d2[0], d2[1], q2, q3 @ d18 = t20a, d29 = 
> t27a
> +        mbutterfly      d26, d21, d2[2], d2[3], q2, q3 @ d26 = t21a, d21 = 
> t26a
> +        mbutterfly      d22, d25, d3[0], d3[1], q2, q3 @ d22 = t22a, d25 = 
> t25a
> +        mbutterfly      d30, d17, d3[2], d3[3], q2, q3 @ d30 = t23a, d17 = 
> t24a
> +
> +        sub             r12, r12, #32
> +        vld1.16         {q0}, [r12,:128]
> +
> +        butterfly       d4,  d24, d16, d24 @ d4  = t16, d24 = t17
> +        butterfly       d5,  d20, d28, d20 @ d5  = t19, d20 = t18
> +        butterfly       d6,  d26, d18, d26 @ d6  = t20, d26 = t21
> +        butterfly       d7,  d22, d30, d22 @ d7  = t23, d22 = t22
> +        butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
> +        butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
> +        butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
> +        butterfly       d31, d27, d19, d27 @ d31 = t28, d27 = t29
> +
> +        mbutterfly      d23, d24, d0[3], d1[0], q8, q9 @ d23 = t17a, d24 = 
> t30a
> +        mbutterfly_neg  d27, d20, d0[3], d1[0], q8, q9 @ d27 = t29a, d20 = 
> t18a
> +        mbutterfly      d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a, d26 = 
> t26a
> +        mbutterfly_neg  d25, d22, d1[1], d1[2], q8, q9 @ d25 = t25a, d22 = 
> t22a
> +
> +        butterfly       d16, d5,  d4,  d5  @ d16 = t16a, d5  = t19a
> +        butterfly       d17, d20, d23, d20 @ d17 = t17,  d20 = t18
> +        butterfly       d18, d6,  d7,  d6  @ d18 = t23a, d6  = t20a
> +        butterfly       d19, d21, d22, d21 @ d19 = t22,  d21 = t21
> +        butterfly       d4,  d28, d28, d30 @ d4  = t24a, d28 = t27a
> +        butterfly       d23, d26, d25, d26 @ d23 = t25,  d26 = t26
> +        butterfly       d7,  d29, d29, d31 @ d7  = t31a, d29 = t28a
> +        butterfly       d22, d27, d24, d27 @ d22 = t30,  d27 = t29
> +
> +        mbutterfly      d27, d20, d0[1], d0[2], q12, q15 @ d27 = t18a, d20 = 
> t29a
> +        mbutterfly      d29, d5,  d0[1], d0[2], q12, q15 @ d29 = t19,  d5  = 
> t28
> +        mbutterfly_neg  d28, d6,  d0[1], d0[2], q12, q15 @ d28 = t27,  d6  = 
> t20
> +        mbutterfly_neg  d26, d21, d0[1], d0[2], q12, q15 @ d26 = t26a, d21 = 
> t21a
> +
> +        butterfly       d31, d24, d7,  d4  @ d31 = t31,  d24 = t24
> +        butterfly       d30, d25, d22, d23 @ d30 = t30a, d25 = t25a
> +        butterfly_r     d23, d16, d16, d18 @ d23 = t23,  d16 = t16
> +        butterfly_r     d22, d17, d17, d19 @ d22 = t22a, d17 = t17a
> +        butterfly       d18, d21, d27, d21 @ d18 = t18,  d21 = t21
> +        butterfly_r     d27, d28, d5,  d28 @ d27 = t27a, d28 = t28a
> +        butterfly       d4,  d26, d20, d26 @ d4  = t29,  d26 = t26
> +        butterfly       d19, d20, d29, d6  @ d19 = t19a, d20 = t20
> +        vmov            d29, d4            @ d29 = t29
> +
> +        mbutterfly0     d27, d20, d27, d20, d4, d6, q2, q3 @ d27 = t27,  d20 
> = t20
> +        mbutterfly0     d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 
> = t21a
> +        mbutterfly0     d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25,  d22 
> = t22
> +        mbutterfly0     d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 
> = t23a
> +.endm
> +
> +@ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
> +@ We don't have register space to do a single pass IDCT of 4x32 though,
> +@ but the 32-point IDCT can be decomposed into two 16-point IDCTs;
> +@ a normal IDCT16 with every other input component (the even ones, with
> +@ each output written twice), followed by a separate 16-point IDCT
> +@ of the odd inputs, added/subtracted onto the outputs of the first idct16.
> +@ r0 = dst (temp buffer)
> +@ r1 = unused
> +@ r2 = src
> +function idct32_1d_4x32_pass1_neon
> +        movrel          r12, idct_coeffs
> +        vld1.16         {q0-q1}, [r12,:128]
> +
> +        @ Double stride of the input, since we only read every other line
> +        mov             r12, #128
> +        vmov.s16        d4, #0
> +
> +        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> +        vld1.16         {d\i}, [r2,:64]
> +        vst1.16         {d4},  [r2,:64], r12
> +.endr
> +
> +        idct16
> +
> +        @ Do four 4x4 transposes. Originally, d16-d31 contain the
> +        @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
> +        @ contain the transposed 4x4 blocks.
> +        transpose16_q_2x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, 
> d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
> +        @ Store the registers a, b, c, d horizontally, followed
> +        @ by the same registers d, c, b, a mirrored.
> +.macro store_rev a, b, c, d
> +.irp i, \a, \b, \c, \d
> +        vst1.16         {d\i}, [r0,:64]!
> +        vrev64.16       d\i, d\i
> +.endr
> +.irp i, \d, \c, \b, \a
> +        vst1.16         {d\i}, [r0,:64]!
> +.endr
> +.endm
> +        store_rev       16, 20, 24, 28
> +        store_rev       17, 21, 25, 29
> +        store_rev       18, 22, 26, 30
> +        store_rev       19, 23, 27, 31
> +        sub             r0,  r0,  #256
> +.purgem store_rev
> +
> +        @ Move r2 back to the start of the input, and move
> +        @ to the first odd row
> +        sub             r2,  r2,  r12, lsl #4
> +        add             r2,  r2,  #64
> +
> +        vmov.s16        d4, #0
> +        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> +        vld1.16         {d\i}, [r2,:64]
> +        vst1.16         {d4},  [r2,:64], r12
> +.endr
> +
> +        idct32_odd
> +
> +        transpose16_q_2x_4x4 q15, q14, q13, q12, q11, q10, q9,  q8,  d31, 
> d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
> +
> +        @ Store the registers a, b, c, d horizontally,
> +        @ adding into the output first, and then mirrored, subtracted
> +        @ from the output.
> +.macro store_rev a, b, c, d
> +.irp i, \a, \b, \c, \d
> +        vld1.16         {d4},  [r0,:64]
> +        vadd.s16        d4, d4, d\i
> +        vst1.16         {d4},  [r0,:64]!
> +        vrev64.16       d\i, d\i
> +.endr
> +.irp i, \d, \c, \b, \a
> +        vld1.16         {d4},  [r0,:64]
> +        vsub.s16        d4, d4, d\i
> +        vst1.16         {d4},  [r0,:64]!
> +.endr
> +.endm
> +
> +        store_rev 31, 27, 23, 19
> +        store_rev 30, 26, 22, 18
> +        store_rev 29, 25, 21, 17
> +        store_rev 28, 24, 20, 16
> +.purgem store_rev
> +        bx              lr
> +endfunc
> +
> +@ This is mostly the same as 4x32_pass1, but without the transpose,
> +@ and use the source as temp buffer between the two idct passes, and
> +@ add into the destination.
> +@ r0 = dst
> +@ r1 = dst stride
> +@ r2 = src (temp buffer)
> +function idct32_1d_4x32_pass2_neon
> +        movrel          r12, idct_coeffs
> +        vld1.16         {q0-q1}, [r12,:128]
> +
> +        mov             r12, #128
> +        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> +        vld1.16         {d\i}, [r2,:64], r12
> +.endr
> +        sub             r2,  r2,  r12, lsl #4
> +
> +        idct16
> +
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> +        vst1.16         {d\i}, [r2,:64], r12
> +.endr
> +
> +        sub             r2,  r2,  r12, lsl #4
> +        add             r2,  r2,  #64
> +
> +        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> +        vld1.16         {d\i}, [r2,:64], r12
> +.endr
> +        sub             r2,  r2,  r12, lsl #4
> +        sub             r2,  r2,  #64
> +
> +        idct32_odd
> +
> +        mov             r12,  #128
> +.macro load_acc_store a, b, c, d, neg=0
> +        vld1.16         {d4},  [r2,:64], r12
> +        vld1.16         {d5},  [r2,:64], r12
> +.if \neg == 0
> +        vadd.s16        d4, d4, d\a
> +        vld1.16         {d6},  [r2,:64], r12
> +        vadd.s16        d5, d5, d\b
> +        vld1.16         {d7},  [r2,:64], r12
> +        vadd.s16        d6, d6, d\c
> +        vadd.s16        d7, d7, d\d
> +.else
> +        vsub.s16        d4, d4, d\a
> +        vld1.16         {d6},  [r2,:64], r12
> +        vsub.s16        d5, d5, d\b
> +        vld1.16         {d7},  [r2,:64], r12
> +        vsub.s16        d6, d6, d\c
> +        vsub.s16        d7, d7, d\d
> +.endif
> +        vld1.32         {d2[]},   [r0,:32], r1
> +        vld1.32         {d2[1]},  [r0,:32], r1
> +        vrshr.s16       q2, q2, #6
> +        vld1.32         {d3[]},   [r0,:32], r1
> +        vrshr.s16       q3, q3, #6
> +        vld1.32         {d3[1]},  [r0,:32], r1
> +        sub             r0,  r0,  r1, lsl #2
> +        vaddw.u8        q2,  q2,  d2
> +        vaddw.u8        q3,  q3,  d3
> +        vqmovun.s16     d4,  q2
> +        vqmovun.s16     d5,  q3
> +        vst1.32         {d4[0]},  [r0,:32], r1
> +        vst1.32         {d4[1]},  [r0,:32], r1
> +        vst1.32         {d5[0]},  [r0,:32], r1
> +        vst1.32         {d5[1]},  [r0,:32], r1
> +.endm
> +        load_acc_store  31, 30, 29, 28
> +        load_acc_store  27, 26, 25, 24
> +        load_acc_store  23, 22, 21, 20
> +        load_acc_store  19, 18, 17, 16
> +        sub             r2,  r2,  r12
> +        neg             r12, r12
> +        load_acc_store  16, 17, 18, 19, 1
> +        load_acc_store  20, 21, 22, 23, 1
> +        load_acc_store  24, 25, 26, 27, 1
> +        load_acc_store  28, 29, 30, 31, 1
> +.purgem load_acc_store
> +        bx              lr
> +endfunc
> +
> +function ff_vp9_idct_idct_32x32_add_neon, export=1
> +        cmp             r3,  #1
> +        beq             idct32x32_dc_add_neon
> +1:

unused label

> +        push            {r4-r7,lr}
> +        vpush           {q4-q7}
> +        mov             r7,  sp
> +
> +        @ Align the stack, allocate a temp buffer
> +T       mov             r12, sp
> +T       bic             r12, r12, #15
> +T       sub             r12, r12, #2048
> +T       mov             sp,  r12
> +A       bic             sp,  sp,  #15
> +A       sub             sp,  sp,  #2048
> +
> +        mov             r4,  r0
> +        mov             r5,  r1
> +        mov             r6,  r2
> +
> +.irp i, 0, 4, 8, 12, 16, 20, 24, 28
> +        add             r0,  sp,  #(\i*64)
> +        add             r2,  r6,  #(\i*2)
> +        bl              idct32_1d_4x32_pass1_neon
> +.endr
> +.irp i, 0, 4, 8, 12, 16, 20, 24, 28
> +        add             r0,  r4,  #(\i)
> +        mov             r1,  r5
> +        add             r2,  sp,  #(\i*2)
> +        bl              idct32_1d_4x32_pass2_neon
> +.endr
> +
> +        mov             sp,  r7
> +        vpop            {q4-q7}
> +        pop             {r4-r7,pc}
> +endfunc

patch ok with nits fixed. instruction rescheduling can be done in a 
separate commit.

Janne