On 2016-10-18 21:07:30 +0300, Martin Storsjö wrote:
> This work is sponsored by, and copyright, Google.
>
> For the transforms up to 8x8, we can fit all the data (including
> temporaries) in registers and just do a straightforward transform
> of all the data. For 16x16, we do a transform of 4x16 pixels in
> 4 slices, using a temporary buffer. For 32x32, we transform 4x32
> pixels at a time, in two steps of 4x16 pixels each.
>
> Examples of relative speedup compared to the C version, from checkasm:
>                                      Cortex A7     A8     A9    A53
> vp9_inv_adst_adst_4x4_add_neon:           3.39   5.80   4.18   3.92
> vp9_inv_adst_adst_8x8_add_neon:           3.94   4.82   4.25   3.89
> vp9_inv_adst_adst_16x16_add_neon:         3.33   4.27   4.08   4.05
> vp9_inv_dct_dct_4x4_add_neon:             3.73   5.06   4.26   4.28
> vp9_inv_dct_dct_8x8_add_neon:             4.59   5.81   5.03   4.73
> vp9_inv_dct_dct_16x16_add_neon:           3.40   3.39   3.33   3.68
> vp9_inv_dct_dct_32x32_add_neon:           4.00   3.51   3.80   4.40
> vp9_inv_wht_wht_4x4_add_neon:             3.24   5.16   3.52   3.67
>
> Thus, the speedup vs C code is around 3-5x.
>
> This is mostly marginally faster than the corresponding routines
> in libvpx on most cores, tested with their 32x32 idct (compared to
> vpx_idct32x32_1024_add_neon). These numbers are slightly in libvpx's
> favour since their version doesn't clear the input buffer like ours
> do (although the effect of that on the total runtime probably is
> negligible.)
>
>                                      Cortex A7       A8       A9      A53
> vp9_inv_dct_dct_32x32_add_neon:        18852.0  16831.6  14217.4  11988.6
> libvpx vpx_idct32x32_1024_add_neon     20789.0  13344.3  15049.9  13030.5
>
> Only on the Cortex A8, the libvpx function is faster. On the other cores,
> ours is slightly faster even though ours has got source block clearing
> integrated.
> ---
> v2: Updated some broken macro comments, optimized the transposes by
> using the q registers for part of transposes.
>
> Suggestions very much welcome on names for the macros - no idea if
> the current ones make sense or what one commonly would call these
> combinations.
>
> I'm a bit reluctant to expanding the macros (to be able to schedule
> instructions better), in order to keep things readable. (Although,
> I guess this is kinda write-only code, which nobody ever touches
> afterwards).
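As a rough C-level picture of the 4x16 slicing strategy described in the
commit message, the 16x16 case boils down to two passes over a 16x16
temporary buffer (the helper names below are placeholders made up for this
note, not functions from the patch):

    int16_t tmp[16 * 16];
    /* pass 1: transform vertical 4x16 slices of the input and store them
       transposed into the temp buffer (placeholder helper) */
    for (int i = 0; i < 16; i += 4)
        pass1_transform_4x16(tmp + i * 16, block + i);
    /* pass 2: transform 4x16 slices of the temp buffer and add the rounded
       result into the destination (placeholder helper) */
    for (int i = 0; i < 16; i += 4)
        pass2_transform_add_4x16(dst + i, stride, tmp + i);

The 32x32 function follows the same two-pass pattern with eight 4x32 slices
per pass, where each 32-point column transform is internally decomposed into
two 16-point halves.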
> --- > libavcodec/arm/Makefile | 3 +- > libavcodec/arm/vp9dsp_init_arm.c | 51 +- > libavcodec/arm/vp9itxfm_neon.S | 1166 > ++++++++++++++++++++++++++++++++++++++ > 3 files changed, 1218 insertions(+), 2 deletions(-) > create mode 100644 libavcodec/arm/vp9itxfm_neon.S > > diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile > index 2638230..01630ac 100644 > --- a/libavcodec/arm/Makefile > +++ b/libavcodec/arm/Makefile > @@ -139,4 +139,5 @@ NEON-OBJS-$(CONFIG_RV40_DECODER) += > arm/rv34dsp_neon.o \ > arm/rv40dsp_neon.o > NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o > NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_neon.o > -NEON-OBJS-$(CONFIG_VP9_DECODER) += arm/vp9mc_neon.o > +NEON-OBJS-$(CONFIG_VP9_DECODER) += arm/vp9itxfm_neon.o \ > + arm/vp9mc_neon.o > diff --git a/libavcodec/arm/vp9dsp_init_arm.c > b/libavcodec/arm/vp9dsp_init_arm.c > index db8c683..2ba2644 100644 > --- a/libavcodec/arm/vp9dsp_init_arm.c > +++ b/libavcodec/arm/vp9dsp_init_arm.c > @@ -94,7 +94,7 @@ define_8tap_2d_funcs(8) > define_8tap_2d_funcs(4) > > > -av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp) > +static av_cold void vp9dsp_mc_init_arm(VP9DSPContext *dsp) > { > int cpu_flags = av_get_cpu_flags(); > > @@ -138,3 +138,52 @@ av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp) > init_mc_funcs_dirs(4, 4); > } > } > + > +#define define_itxfm(type_a, type_b, sz) \ > +void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst, \ > + ptrdiff_t stride, \ > + int16_t *_block, > int eob) > + > +#define define_itxfm_funcs(sz) \ > + define_itxfm(idct, idct, sz); \ > + define_itxfm(iadst, idct, sz); \ > + define_itxfm(idct, iadst, sz); \ > + define_itxfm(iadst, iadst, sz) > + > +define_itxfm_funcs(4); > +define_itxfm_funcs(8); > +define_itxfm_funcs(16); > +define_itxfm(idct, idct, 32); > +define_itxfm(iwht, iwht, 4); > + > + > +static av_cold void vp9dsp_itxfm_init_arm(VP9DSPContext *dsp) > +{ > + int cpu_flags = av_get_cpu_flags(); > + > + if (have_neon(cpu_flags)) { > +#define init_itxfm(tx, sz) \ > + dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_neon; \ > + dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_neon; \ > + dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_neon; \ > + dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon > + > +#define init_idct(tx, nm) \ > + dsp->itxfm_add[tx][DCT_DCT] = \ > + dsp->itxfm_add[tx][ADST_DCT] = \ > + dsp->itxfm_add[tx][DCT_ADST] = \ > + dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon > + > + init_itxfm(TX_4X4, 4x4); > + init_itxfm(TX_8X8, 8x8); > + init_itxfm(TX_16X16, 16x16); > + init_idct(TX_32X32, idct_idct_32x32); > + init_idct(4, iwht_iwht_4x4); > + } > +} > + > +av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp) > +{ > + vp9dsp_mc_init_arm(dsp); > + vp9dsp_itxfm_init_arm(dsp); > +} > diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S > new file mode 100644 > index 0000000..96dc3a9 > --- /dev/null > +++ b/libavcodec/arm/vp9itxfm_neon.S > @@ -0,0 +1,1166 @@ > +/* > + * Copyright (c) 2016 Google Inc. > + * > + * This file is part of Libav. > + * > + * Libav is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. 
> + * > + * Libav is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with Libav; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#include "libavutil/arm/asm.S" > +#include "neon.S" > + > +const itxfm4_coeffs, align=4 > + .short 11585, 6270, 15137, 0 > +iadst4_coeffs: > + .short 5283, 15212, 9929, 13377 > +endconst > + > +const iadst8_coeffs, align=4 > + .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679 > +endconst > + > +const idct_coeffs, align=4 > + .short 11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606 > + .short 16305, 12665, 10394, 7723, 14449, 15679, 4756, 0 > + .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520 > + .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404 > +endconst > + > +const iadst16_coeffs, align=4 > + .short 16364, 804, 15893, 3981, 14811, 7005, 13160, 9760 > + .short 11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207 > +endconst > + > +@ Do two 4x4 transposes, using q registers for the subtransposes that don't
it's four 4x4 transposes > +@ need to address the individual d registers. > +@ r0,r1 == rq1, r2,r3 == rq1, etc > +.macro transpose16_q_2x_4x4 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, > r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 > + vtrn.32 \rq0, \rq1 > + vtrn.32 \rq2, \rq3 > + vtrn.32 \rq4, \rq5 > + vtrn.32 \rq6, \rq7 > + vtrn.16 \r0, \r1 > + vtrn.16 \r2, \r3 > + vtrn.16 \r4, \r5 > + vtrn.16 \r6, \r7 > + vtrn.16 \r8, \r9 > + vtrn.16 \r10, \r11 > + vtrn.16 \r12, \r13 > + vtrn.16 \r14, \r15 > +.endm > + > +@ out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14 > +@ out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14 > +@ in/out are d registers > +.macro mbutterfly0 out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4, neg=0 > + vadd.s16 \tmpd1, \in1, \in2 > + vsub.s16 \tmpd2, \in1, \in2 > + vmull.s16 \tmpq3, \tmpd1, d0[0] > + vmull.s16 \tmpq4, \tmpd2, d0[0] > +.if \neg > 0 > + vneg.s32 \tmpq3, \tmpq3 > +.endif > + vrshrn.s32 \out1, \tmpq3, #14 > + vrshrn.s32 \out2, \tmpq4, #14 > +.endm an empty line after .endm improves the readability > +@ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14 > +@ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14 > +@ Same as mbutterfly0, but with input being 2 q registers, output > +@ being 4 d registers. > +@ This can do with either 4 or 6 temporary q registers. > +.macro dmbutterfly0 out1, out2, out3, out4, in1, in2, tmpq1, tmpq2, tmpd11, > tmpd12, tmpd21, tmpd22, tmpq3, tmpq4, tmpq5, tmpq6 > + vadd.s16 \tmpq1, \in1, \in2 > + vsub.s16 \tmpq2, \in1, \in2 > + vmull.s16 \tmpq3, \tmpd11, d0[0] > + vmull.s16 \tmpq4, \tmpd12, d0[0] > +.ifb \tmpq5 > + vrshrn.s32 \out1, \tmpq3, #14 > + vrshrn.s32 \out2, \tmpq4, #14 > + vmull.s16 \tmpq3, \tmpd21, d0[0] > + vmull.s16 \tmpq4, \tmpd22, d0[0] > + vrshrn.s32 \out3, \tmpq3, #14 > + vrshrn.s32 \out4, \tmpq4, #14 > +.else > + vmull.s16 \tmpq5, \tmpd21, d0[0] > + vmull.s16 \tmpq6, \tmpd22, d0[0] > + vrshrn.s32 \out1, \tmpq3, #14 > + vrshrn.s32 \out2, \tmpq4, #14 > + vrshrn.s32 \out3, \tmpq5, #14 > + vrshrn.s32 \out4, \tmpq6, #14 > +.endif > +.endm > +@ out1 = in1 * coef1 - in2 * coef2 > +@ out2 = in1 * coef2 + in2 * coef1 > +@ out are 2 q registers, in are 2 d registers > +.macro mbutterfly_l out1, out2, in1, in2, coef1, coef2 > + vmull.s16 \out1, \in1, \coef1 > + vmlsl.s16 \out1, \in2, \coef2 > + vmull.s16 \out2, \in1, \coef2 doing the second vmull before the preferable on in-order units > + vmlal.s16 \out2, \in2, \coef1 > +.endm > +@ out1,out2 = in1,in2 * coef1 - in3,in4 * coef2 > +@ out3,out4 = in1,in2 * coef2 + in3,in4 * coef1 > +@ out are 4 q registers, in are 4 d registers > +.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, in3, in4, coef1, coef2 > + vmull.s16 \out1, \in1, \coef1 > + vmull.s16 \out2, \in2, \coef1 > + vmull.s16 \out3, \in1, \coef2 > + vmull.s16 \out4, \in2, \coef2 > + vmlsl.s16 \out1, \in3, \coef2 > + vmlsl.s16 \out2, \in4, \coef2 > + vmlal.s16 \out3, \in3, \coef1 > + vmlal.s16 \out4, \in4, \coef1 > +.endm > +@ in1 = (in1 * coef1 - in2 * coef2 + (1 << 13)) >> 14 > +@ in2 = (in1 * coef2 + in2 * coef1 + (1 << 13)) >> 14 > +@ in are 2 d registers, tmp are 2 q registers > +.macro mbutterfly in1, in2, coef1, coef2, tmp1, tmp2, neg=0 > + mbutterfly_l \tmp1, \tmp2, \in1, \in2, \coef1, \coef2 > +.if \neg > 0 > + vneg.s32 \tmp2, \tmp2 > +.endif > + vrshrn.s32 \in1, \tmp1, #14 > + vrshrn.s32 \in2, \tmp2, #14 > +.endm > +@ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << > 13)) >> 14 > +@ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << > 13)) >> 14 > 
+@ inout are 4 d registers, tmp are 4 q registers > +.macro dmbutterfly inout1, inout2, inout3, inout4, coef1, coef2, tmp1, tmp2, > tmp3, tmp4 > + dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, > \inout3, \inout4, \coef1, \coef2 > + vrshrn.s32 \inout1, \tmp1, #14 > + vrshrn.s32 \inout2, \tmp2, #14 > + vrshrn.s32 \inout3, \tmp3, #14 > + vrshrn.s32 \inout4, \tmp4, #14 > +.endm > +.macro mbutterfly_neg in1, in2, coef1, coef2, tmp1, tmp2 > + mbutterfly \in1, \in2, \coef1, \coef2, \tmp1, \tmp2, 1 > +.endm tis macro is a little pointless, readability is not really worse for mbutterfly ..., neg=1 vs mbutterfly_neg ... > +@ out1 = in1 + in2 > +@ out2 = in1 - in2 > +.macro butterfly out1, out2, in1, in2 > + vadd.s16 \out1, \in1, \in2 > + vsub.s16 \out2, \in1, \in2 > +.endm > +@ out1 = in1 - in2 > +@ out2 = in1 + in2 > +.macro butterfly_r out1, out2, in1, in2 > + vsub.s16 \out1, \in1, \in2 > + vadd.s16 \out2, \in1, \in2 > +.endm > +@ out1 = (in1 + in2 + (1 << 13)) >> 14 > +@ out2 = (in1 - in2 + (1 << 13)) >> 14 > +@ out are 2 d registers, in are 2 q registers, tmp are 2 q registers > +.macro butterfly_n out1, out2, in1, in2, tmp1, tmp2 > + vadd.s32 \tmp1, \in1, \in2 > + vsub.s32 \tmp2, \in1, \in2 > + vrshrn.s32 \out1, \tmp1, #14 > + vrshrn.s32 \out2, \tmp2, #14 > +.endm > +@ out1,out2 = (in1,in2 + in3,in4 + (1 << 13)) >> 14 > +@ out3,out4 = (in1,in2 - in3,in4 + (1 << 13)) >> 14 > +@ out are 4 d registers, in are 4 q registers, tmp are 4 q registers > +.macro dbutterfly_n out1, out2, out3, out4, in1, in2, in3, in4, tmp1, tmp2, > tmp3, tmp4 > + vadd.s32 \tmp1, \in1, \in3 > + vadd.s32 \tmp2, \in2, \in4 > + vsub.s32 \tmp3, \in1, \in3 > + vsub.s32 \tmp4, \in2, \in4 > + vrshrn.s32 \out1, \tmp1, #14 > + vrshrn.s32 \out2, \tmp2, #14 > + vrshrn.s32 \out3, \tmp3, #14 > + vrshrn.s32 \out4, \tmp4, #14 > +.endm > + > + > +.macro iwht4 c0, c1, c2, c3 > + vadd.i16 \c0, \c0, \c1 > + vsub.i16 d17, \c2, \c3 > + vsub.i16 d16, \c0, d17 > + vshr.s16 d16, d16, #1 > + vsub.i16 \c2, d16, \c1 > + vsub.i16 \c1, d16, \c3 > + vadd.i16 \c3, d17, \c2 > + vsub.i16 \c0, \c0, \c1 > +.endm > + > +.macro idct4 c0, c1, c2, c3 > + vadd.i16 d16, \c0, \c2 > + vsub.i16 d17, \c0, \c2 > + vmull.s16 q11, \c1, d0[1] > + vmull.s16 q12, \c3, d0[2] vmlsl.s16 q11, \c3, d0[2] and reorder for in-order > + vmull.s16 q13, \c1, d0[2] > + vmull.s16 q14, \c3, d0[1] vmlal.s16 q13, \c3, d0[1] > + vmull.s16 q9, d16, d0[0] > + vmull.s16 q10, d17, d0[0] > + vadd.i32 q13, q13, q14 > + vsub.i32 q11, q11, q12 > + vrshrn.s32 d16, q9, #14 > + vrshrn.s32 d19, q13, #14 > + vrshrn.s32 d17, q10, #14 > + vrshrn.s32 d18, q11, #14 > + vadd.i16 \c0, d16, d19 > + vadd.i16 \c1, d17, d18 > + vsub.i16 \c2, d17, d18 > + vsub.i16 \c3, d16, d19 > +.endm > + > +.macro iadst4 c0, c1, c2, c3 > + vmull.s16 q10, \c0, d1[0] > + vmlal.s16 q10, \c2, d1[1] > + vmlal.s16 q10, \c3, d1[2] > + vmull.s16 q11, \c0, d1[2] > + vmlsl.s16 q11, \c2, d1[0] > + vsub.s16 \c0, \c0, \c2 > + vmlsl.s16 q11, \c3, d1[1] > + vadd.s16 \c0, \c0, \c3 > + vmull.s16 q13, \c1, d1[3] > + vmull.s16 q12, \c0, d1[3] > + vadd.s32 q14, q10, q13 > + vadd.s32 q1, q11, q13 > + vrshrn.s32 \c0, q14, #14 > + vadd.s32 q10, q10, q11 > + vrshrn.s32 \c1, q1, #14 > + vsub.s32 q10, q10, q13 > + vrshrn.s32 \c2, q12, #14 > + vrshrn.s32 \c3, q10, #14 instruction scheduling can be optimized for this one too > +.endm > + > +@ The public functions in this file have got the following signature: > +@ void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); > + > +.macro itxfm_func4x4 txfm1, txfm2 > +function 
ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1 > +.ifc \txfm1,\txfm2 > +.ifc \txfm1,idct > + movrel r12, itxfm4_coeffs > + vld1.16 {d0}, [r12,:64] > +.endif > +.ifc \txfm1,iadst > + movrel r12, iadst4_coeffs > + vld1.16 {d1}, [r12,:64] > +.endif > +.else > + movrel r12, itxfm4_coeffs > + vld1.16 {q0}, [r12,:128] > +.endif aligned 8 byte and 16 byte loads are equally fast so this adds just complexity without gain > + > + vmov.i16 q15, #0 > +.ifc \txfm1,idct > +.ifc \txfm2,idct > + cmp r3, #1 > + bne 1f > + @ DC-only for idct/idct > + vld1.16 {d4[]}, [r2] alignment > + vmull.s16 q2, d4, d0[0] > + vrshrn.s32 d4, q2, #14 > + vmull.s16 q2, d4, d0[0] > + vrshrn.s32 d4, q2, #14 > + vst1.16 {d30[0]}, [r2] same > + vdup.16 q2, d4[0] > + vmov q3, q2 vdup first to q3 to avoid data dependency > + b 2f > +.endif > +.endif > + > +1: > + vld1.16 {d4-d7}, [r2,:128] > + vst1.16 {q15}, [r2,:128]! > + > +.ifc \txfm1,iwht > + vshr.s16 q2, q2, #2 > + vshr.s16 q3, q3, #2 > +.endif > + > + \txfm1\()4 d4, d5, d6, d7 > + > + vst1.16 {q15}, [r2,:128]! > + @ Transpose 4x4 with 16 bit elements > + vtrn.16 d4, d5 > + vtrn.16 d6, d7 > + vtrn.32 d4, d6 > + vtrn.32 d5, d7 vtrn.32 q2, q3 > + > + \txfm2\()4 d4, d5, d6, d7 > +2: > + vld1.32 {d0[]}, [r0,:32], r1 > + vld1.32 {d0[1]}, [r0,:32], r1 > +.ifnc \txfm1,iwht > + vrshr.s16 q2, q2, #4 > + vrshr.s16 q3, q3, #4 > +.endif > + vaddw.u8 q2, q2, d0 > + vld1.32 {d1[]}, [r0,:32], r1 > + vld1.32 {d1[1]}, [r0,:32], r1 > + vqmovun.s16 d0, q2 > + sub r0, r0, r1, lsl #2 since we have free gp registers I'd use different register for load and store. probably not faster though > + > + vaddw.u8 q3, q3, d1 > + vst1.32 {d0[0]}, [r0,:32], r1 > + vqmovun.s16 d1, q3 > + > + vst1.32 {d0[1]}, [r0,:32], r1 > + vst1.32 {d1[0]}, [r0,:32], r1 > + vst1.32 {d1[1]}, [r0,:32], r1 > + > + bx lr > +endfunc > +.endm > + > +itxfm_func4x4 idct, idct > +itxfm_func4x4 iadst, idct > +itxfm_func4x4 idct, iadst > +itxfm_func4x4 iadst, iadst > +itxfm_func4x4 iwht, iwht > + > + > +.macro idct8 > + dmbutterfly0 d16, d17, d24, d25, q8, q12, q2, q4, d4, d5, d8, > d9, q3, q2, q5, q4 @ q8 = t0a, q12 = t1a > + dmbutterfly d20, d21, d28, d29, d0[1], d0[2], q2, q3, q4, q5 > @ q10 = t2a, q14 = t3a > + dmbutterfly d18, d19, d30, d31, d0[3], d1[0], q2, q3, q4, q5 > @ q9 = t4a, q15 = t7a > + dmbutterfly d26, d27, d22, d23, d1[1], d1[2], q2, q3, q4, q5 > @ q13 = t5a, q11 = t6a > + > + butterfly q2, q14, q8, q14 @ q2 = t0, q14 = t3 > + butterfly q3, q10, q12, q10 @ q3 = t1, q10 = t2 > + butterfly q4, q13, q9, q13 @ q4 = t4, q13 = t5a > + butterfly q5, q11, q15, q11 @ q5 = t7, q11 = t6a > + > + butterfly q8, q15, q2, q5 @ q8 = out[0], q15 = out[7] > + > + dmbutterfly0 d4, d5, d10, d11, q11, q13, q9, q13, d18, d19, > d26, d27, q2, q5, q11, q12 @ q2 = t6, q5 = t5 > + > + butterfly q11, q12, q14, q4 @ q11 = out[3], q12 = out[4] > + butterfly q9, q14, q3, q2 @ q9 = out[1], q14 = out[6] > + butterfly_r q13, q10, q10, q5 @ q13 = out[5], q10 = out[2] > +.endm > + > +.macro iadst8 > + dmbutterfly_l q4, q5, q2, q3, d30, d31, d16, d17, d2[1], d2[0] > @ q4,q5 = t1a, q2,q3 = t0a > + dmbutterfly_l q8, q15, q6, q7, d22, d23, d24, d25, d3[1], d3[0] > @ q8,q15 = t5a, q6,q7 = t4a > + > + dbutterfly_n d22, d23, d4, d5, q2, q3, q6, q7, q11, q12, > q2, q3 @ q11 = t0, q2 = t4 > + > + dbutterfly_n d24, d25, d6, d7, q4, q5, q8, q15, q12, q3, > q6, q7 @ q12 = t1, q3 = t5 > + > + dmbutterfly_l q6, q7, q4, q5, d26, d27, d20, d21, d2[3], d2[2] > @ q6,q7 = t3a, q4,q5 = t2a > + dmbutterfly_l q10, q13, q8, q15, d18, d19, d28, d29, d3[3], d3[2] > @ 
q10,q13 = t7a, q8,q15 = t6a > + > + dbutterfly_n d18, d19, d8, d9, q4, q5, q8, q15, q9, q14, > q4, q5 @ q9 = t2, q4 = t6 > + dbutterfly_n d16, d17, d12, d13, q6, q7, q10, q13, q8, q15, > q6, q7 @ q8 = t3, q6 = t7 > + > + butterfly q15, q12, q12, q8 @ q15 = -out[7], q12 = t3 > + vneg.s16 q15, q15 @ q15 = out[7] > + butterfly q8, q9, q11, q9 @ q8 = out[0], q9 = t2 > + > + dmbutterfly_l q10, q11, q5, q7, d4, d5, d6, d7, d0[1], d0[2] > @ q10,q11 = t5a, q5,q7 = t4a > + dmbutterfly_l q2, q3, q13, q14, d12, d13, d8, d9, d0[2], d0[1] > @ q2,q3 = t6a, q13,q14 = t7a > + > + dbutterfly_n d28, d29, d8, d9, q10, q11, q13, q14, q4, q6, > q10, q11 @ q14 = out[6], q4 = t7 > + > + dmbutterfly0 d22, d23, d24, d25, q9, q12, q6, q13, d12, d13, > d26, d27, q9, q10 @ q11 = -out[3], q12 = out[4] > + vneg.s16 q11, q11 @ q11 = out[3] > + > + dbutterfly_n d18, d19, d4, d5, q5, q7, q2, q3, q9, q10, q2, > q3 @ q9 = -out[1], q2 = t6 > + vneg.s16 q9, q9 @ q9 = out[1] > + > + dmbutterfly0 d20, d21, d26, d27, q2, q4, q3, q5, d6, d7, > d10, d11, q6, q7 @ q10 = out[2], q13 = -out[5] > + vneg.s16 q13, q13 @ q13 = out[5] > +.endm > + > + > +.macro itxfm_func8x8 txfm1, txfm2 > +function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1 > + @ Push q4-q7 if iadst is used, idct requires > + @ a few scratch registers less, so only push q4-q5 > + @ if only idct is involved. > + @ The iadst also uses a few coefficients from > + @ idct, so those always need to be loaded. > + movrel r12, idct_coeffs move this into the last else below > + vld1.16 {q0}, [r12,:128] this can follow after this block if the iadst8_coeffs load uses post increment > +.ifc \txfm1,iadst > + movrel r12, iadst8_coeffs > + vld1.16 {q1}, [r12,:128] > + vpush {q4-q7} > +.else > +.ifc \txfm2,iadst does .elseifc work? > + movrel r12, iadst8_coeffs > + vld1.16 {q1}, [r12,:128] > + vpush {q4-q7} > +.else > + vpush {q4-q5} > +.endif > +.endif > + > + vmov.i16 q2, #0 > + vmov.i16 q3, #0 > + > +.ifc \txfm1,idct > +.ifc \txfm2,idct > + cmp r3, #1 > + bne 1f > + @ DC-only for idct/idct > + vld1.16 {d16[]}, [r2] alignment > + vmull.s16 q8, d16, d0[0] > + vrshrn.s32 d16, q8, #14 > + vmull.s16 q8, d16, d0[0] > + vrshrn.s32 d16, q8, #14 > + vdup.16 q8, d16[0] > + vmov q9, q8 > + vmov q10, q8 > + vmov q11, q8 > + vmov q12, q8 > + vmov q13, q8 > + vmov q14, q8 > + vmov q15, q8 all duped from d16[0] > + vst1.16 {d4[0]}, [r2] alignment > + b 2f > +.endif > +.endif > +1: > + vld1.16 {q8-q9}, [r2,:128]! > + vld1.16 {q10-q11}, [r2,:128]! > + vld1.16 {q12-q13}, [r2,:128]! > + vld1.16 {q14-q15}, [r2,:128]! > + sub r2, r2, #128 > + vst1.16 {q2-q3}, [r2,:128]! > + vst1.16 {q2-q3}, [r2,:128]! > + vst1.16 {q2-q3}, [r2,:128]! > + vst1.16 {q2-q3}, [r2,:128]! 
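For reference, the DC-only special case a few lines up (the
vld1.16 {d16[]} / vmull.s16 / vrshrn.s32 sequence) amounts to roughly the
following in C terms; this is only an illustration, with 11585 being the
first idct_coeffs entry (d0[0]):

    int dc = block[0];
    dc = (dc * 11585 + (1 << 13)) >> 14;  /* scale once for the first 1-D pass */
    dc = (dc * 11585 + (1 << 13)) >> 14;  /* and once for the second pass */
    /* dc is then broadcast to all coefficients, so only the final rounding
       and the add into the destination pixels remain to be done. */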
> + > + \txfm1\()8 > + > + @ Transpose 8x8 with 16 bit elements > + vswp d17, d24 > + vswp d19, d26 > + vswp d21, d28 > + vswp d23, d30 > + transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15 > + > + \txfm2\()8 > +2: > + @ Add into the destination > + vld1.8 {d4}, [r0,:64], r1 > + vrshr.s16 q8, q8, #5 > + vld1.8 {d5}, [r0,:64], r1 > + vrshr.s16 q9, q9, #5 > + vld1.8 {d6}, [r0,:64], r1 > + vrshr.s16 q10, q10, #5 > + vaddw.u8 q8, q8, d4 > + vld1.8 {d7}, [r0,:64], r1 > + vrshr.s16 q11, q11, #5 > + vaddw.u8 q9, q9, d5 > + vld1.8 {d8}, [r0,:64], r1 > + vrshr.s16 q12, q12, #5 > + vaddw.u8 q10, q10, d6 > + vqmovun.s16 d4, q8 > + vld1.8 {d9}, [r0,:64], r1 > + vrshr.s16 q13, q13, #5 > + vaddw.u8 q11, q11, d7 > + vqmovun.s16 d5, q9 > + vld1.8 {d10}, [r0,:64], r1 > + vrshr.s16 q14, q14, #5 > + vaddw.u8 q12, q12, d8 > + vqmovun.s16 d6, q10 > + vld1.8 {d11}, [r0,:64], r1 > + vrshr.s16 q15, q15, #5 > + vaddw.u8 q13, q13, d9 > + vqmovun.s16 d7, q11 > + sub r0, r0, r1, lsl #3 could use a different register loads and stores > + > + vst1.8 {d4}, [r0,:64], r1 > + vaddw.u8 q14, q14, d10 > + vst1.8 {d5}, [r0,:64], r1 > + vqmovun.s16 d8, q12 > + vst1.8 {d6}, [r0,:64], r1 > + vaddw.u8 q15, q15, d11 > + vst1.8 {d7}, [r0,:64], r1 > + vqmovun.s16 d9, q13 > + vst1.8 {d8}, [r0,:64], r1 > + vqmovun.s16 d10, q14 > + vst1.8 {d9}, [r0,:64], r1 > + vqmovun.s16 d11, q15 > + > + vst1.8 {d10}, [r0,:64], r1 > + vst1.8 {d11}, [r0,:64], r1 > + > +.ifc \txfm1,iadst > + vpop {q4-q7} > +.else > +.ifc \txfm2,iadst > + vpop {q4-q7} > +.else > + vpop {q4-q5} > +.endif > +.endif > + bx lr > +endfunc > +.endm > + > +itxfm_func8x8 idct, idct > +itxfm_func8x8 iadst, idct > +itxfm_func8x8 idct, iadst > +itxfm_func8x8 iadst, iadst > + > + > +function idct16x16_dc_add_neon > + movrel r12, idct_coeffs > + vld1.16 {d0}, [r12,:64] > + > + vmov.i16 q2, #0 > + > + vld1.16 {d16[]}, [r2] alignment > + vmull.s16 q8, d16, d0[0] > + vrshrn.s32 d16, q8, #14 > + vmull.s16 q8, d16, d0[0] > + vrshrn.s32 d16, q8, #14 > + vdup.16 q8, d16[0] > + vst1.16 {d4[0]}, [r2] alignment > + > + vrshr.s16 q8, q8, #6 > + > + mov r12, #16 > +1: > + @ Loop to add the constant from q8 into all 16x16 outputs > + vld1.8 {q3}, [r0,:128] > + vaddw.u8 q10, q8, d6 > + vaddw.u8 q11, q8, d7 > + vqmovun.s16 d6, q10 > + vqmovun.s16 d7, q11 > + vst1.8 {q3}, [r0,:128], r1 > + subs r12, r12, #1 > + bne 1b > + > + bx lr > +endfunc > + > +.macro idct16 > + mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, > d24 = t1a > + mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = > t3a > + mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = > t7a > + mbutterfly d26, d22, d1[1], d1[2], q2, q3 @ d26 = t5a, d22 = > t6a > + mbutterfly d17, d31, d1[3], d2[0], q2, q3 @ d17 = t8a, d31 = > t15a > + mbutterfly d25, d23, d2[1], d2[2], q2, q3 @ d25 = t9a, d23 = > t14a > + mbutterfly d21, d27, d2[3], d3[0], q2, q3 @ d21 = t10a, d27 = > t13a > + mbutterfly d29, d19, d3[1], d3[2], q2, q3 @ d29 = t11a, d19 = > t12a > + > + butterfly d4, d28, d16, d28 @ d4 = t0, d28 = > t3 > + butterfly d5, d20, d24, d20 @ d5 = t1, d20 = > t2 > + butterfly d6, d26, d18, d26 @ d6 = t4, d26 = > t5 > + butterfly d7, d22, d30, d22 @ d7 = t7, d22 = > t6 > + butterfly d16, d25, d17, d25 @ d16 = t8, d25 = > t9 > + butterfly d24, d21, d29, d21 @ d24 = t11, d21 = > t10 > + butterfly d17, d27, d19, d27 @ d17 = t12, d27 = > t13 > + butterfly d29, d23, d31, d23 @ d29 = t15, d23 = > t14 > + > + mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, > d26 = t5a > + mbutterfly d23, d25, d0[1], d0[2], q9, 
q15 @ d23 = t9a, d25 = > t14a > + mbutterfly_neg d27, d21, d0[1], d0[2], q9, q15 @ d27 = t13a, d21 = > t10a > + > + butterfly d18, d7, d4, d7 @ d18 = t0a, d7 = > t7a > + butterfly d19, d22, d5, d22 @ d19 = t1a, d22 = > t6 > + butterfly d4, d26, d20, d26 @ d4 = t2a, d26 = > t5 > + butterfly d5, d6, d28, d6 @ d5 = t3a, d6 = > t4 > + butterfly d20, d28, d16, d24 @ d20 = t8a, d28 = > t11a > + butterfly d24, d21, d23, d21 @ d24 = t9, d21 = > t10 > + butterfly d23, d27, d25, d27 @ d23 = t14, d27 = > t13 > + butterfly d25, d29, d29, d17 @ d25 = t15a, d29 = > t12a > + > + mbutterfly0 d27, d21, d27, d21, d16, d30, q8, q15 @ d27 = t13a, > d21 = t10a > + mbutterfly0 d29, d28, d29, d28, d16, d30, q8, q15 @ d29 = t12, > d28 = t11 > + > + vswp d27, d29 @ d27 = t12, d29 = > t13a > + vswp d28, d27 @ d28 = t12, d27 = > t11 > + butterfly d16, d31, d18, d25 @ d16 = out[0], d31 > = out[15] > + butterfly d17, d30, d19, d23 @ d17 = out[1], d30 > = out[14] > + butterfly_r d25, d22, d22, d24 @ d25 = out[9], d22 > = out[6] > + butterfly d23, d24, d7, d20 @ d23 = out[7], d24 > = out[8] > + butterfly d18, d29, d4, d29 @ d18 = out[2], d29 > = out[13] > + butterfly d19, d28, d5, d28 @ d19 = out[3], d28 > = out[12] > + vmov d4, d21 @ d4 = t10a > + butterfly d20, d27, d6, d27 @ d20 = out[4], d27 > = out[11] > + butterfly d21, d26, d26, d4 @ d21 = out[5], d26 > = out[10] > +.endm > + > +.macro iadst16 > + movrel r12, iadst16_coeffs > + vld1.16 {q0-q1}, [r12,:128] > + > + mbutterfly_l q3, q2, d31, d16, d0[1], d0[0] @ q3 = t1, q2 = > t0 > + mbutterfly_l q5, q4, d23, d24, d2[1], d2[0] @ q5 = t9, q4 = > t8 > + butterfly_n d31, d24, q3, q5, q6, q5 @ d31 = t1a, d24 = > t9a > + mbutterfly_l q7, q6, d29, d18, d0[3], d0[2] @ q7 = t3, q6 = > t2 > + butterfly_n d16, d23, q2, q4, q3, q4 @ d16 = t0a, d23 = > t8a > + > + mbutterfly_l q3, q2, d21, d26, d2[3], d2[2] @ q3 = t11, q2 = > t10 > + butterfly_n d29, d26, q7, q3, q4, q3 @ d29 = t3a, d26 = > t11a > + mbutterfly_l q5, q4, d27, d20, d1[1], d1[0] @ q5 = t5, q4 = > t4 > + butterfly_n d18, d21, q6, q2, q3, q2 @ d18 = t2a, d21 = > t10a > + > + mbutterfly_l q7, q6, d19, d28, d3[1], d3[0] @ q7 = t13, q6 = > t12 > + butterfly_n d20, d28, q5, q7, q2, q7 @ d20 = t5a, d28 = > t13a > + mbutterfly_l q3, q2, d25, d22, d1[3], d1[2] @ q3 = t7, q2 = > t6 > + butterfly_n d27, d19, q4, q6, q5, q6 @ d27 = t4a, d19 = > t12a > + > + mbutterfly_l q5, q4, d17, d30, d3[3], d3[2] @ q5 = t15, q4 = > t14 > + movrel r12, idct_coeffs > + vld1.16 {q0}, [r12,:128] > + butterfly_n d22, d30, q3, q5, q6, q5 @ d22 = t7a, d30 = > t15a > + mbutterfly_l q7, q6, d23, d24, d0[3], d1[0] @ q7 = t9, q6 = > t8 > + butterfly_n d25, d17, q2, q4, q3, q4 @ d25 = t6a, d17 = > t14a > + > + mbutterfly_l q2, q3, d28, d19, d1[0], d0[3] @ q2 = t12, q3 = > t13 > + butterfly_n d23, d19, q6, q2, q4, q2 @ d23 = t8a, d19 = > t12a > + mbutterfly_l q5, q4, d21, d26, d1[1], d1[2] @ q5 = t11, q4 = > t10 > + butterfly_r d4, d27, d16, d27 @ d4 = t4, d27 = > t0 > + butterfly_n d24, d28, q7, q3, q6, q3 @ d24 = t9a, d28 = > t13a > + > + mbutterfly_l q6, q7, d30, d17, d1[2], d1[1] @ q6 = t14, q7 = > t15 > + butterfly_r d5, d20, d31, d20 @ d5 = t5, d20 = > t1 > + butterfly_n d21, d17, q4, q6, q3, q6 @ d21 = t10a, d17 = > t14a > + butterfly_n d26, d30, q5, q7, q4, q7 @ d26 = t11a, d30 = > t15a > + > + butterfly_r d6, d25, d18, d25 @ d6 = t6, d25 = > t2 > + butterfly_r d7, d22, d29, d22 @ d7 = t7, d22 = > t3 > + > + mbutterfly_l q5, q4, d19, d28, d0[1], d0[2] @ q5 = t13, q4 = > t12 > + mbutterfly_l q6, q7, d30, d17, d0[2], d0[1] @ q6 = t14, q7 = > t15 
> + > + butterfly_n d18, d30, q4, q6, q8, q6 @ d18 = out[2], > d30 = t14a > + butterfly_n d29, d17, q5, q7, q6, q7 @ d29 = -out[13], > d17 = t15a > + vneg.s16 d29, d29 @ d29 = out[13] > + > + mbutterfly_l q5, q4, d4, d5, d0[1], d0[2] @ q5 = t5a, q4 = > t4a > + mbutterfly_l q6, q7, d7, d6, d0[2], d0[1] @ q6 = t6a, q7 = > t7a > + > + butterfly d2, d6, d27, d25 @ d2 = out[0], d6 = > t2a > + butterfly d3, d7, d23, d21 @ d3 =-out[1], d7 = > t10 > + > + butterfly_n d19, d31, q4, q6, q2, q4 @ d19 = -out[3], > d31 = t6 > + vneg.s16 d19, d19 @ d19 = out[3] > + butterfly_n d28, d16, q5, q7, q2, q5 @ d28 = out[12], > d16 = t7 > + > + butterfly d5, d8, d20, d22 @ d5 =-out[15],d8 = > t3a > + butterfly d4, d9, d24, d26 @ d4 = out[14],d9 = > t11 > + > + mbutterfly0 d23, d24, d6, d8, d10, d11, q6, q7, 1 @ d23 = > out[7], d24 = out[8] > + mbutterfly0 d20, d27, d16, d31, d10, d11, q6, q7 @ d20 = > out[4], d27 = out[11] > + mbutterfly0 d22, d25, d9, d7, d10, d11, q6, q7 @ d22 = > out[6], d25 = out[9] > + mbutterfly0 d21, d26, d30, d17, d10, d11, q6, q7, 1 @ d21 = > out[5], d26 = out[10] > + > + vneg.s16 d31, d5 @ d31 = out[15] > + vneg.s16 d17, d3 @ d17 = out[1] > + > + vmov d16, d2 > + vmov d30, d4 > +.endm > + > +.macro itxfm16_1d_funcs txfm > +@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, > +@ transpose into a horizontal 16x4 slice and store. > +@ r0 = dst (temp buffer) > +@ r1 = unused > +@ r2 = src > +@ r3 = slice offset > +function \txfm\()16_1d_4x16_pass1_neon > + mov r12, #32 > + vmov.s16 q2, #0 > +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > + vld1.16 {d\i}, [r2,:64] > + vst1.16 {d4}, [r2,:64], r12 > +.endr > + > + \txfm\()16 > + > + @ Do four 4x4 transposes. Originally, d16-d31 contain the > + @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 > + @ contain the transposed 4x4 blocks. > + transpose16_q_2x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, > d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 > + > + @ Store the transposed 4x4 blocks horizontally. > + cmp r3, #12 > + beq 1f > +.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 > + vst1.16 {d\i}, [r0,:64]! > +.endr > + bx lr > +1: > + @ Special case: For the last input column (r3 == 12), > + @ which would be stored as the last row in the temp buffer, > + @ don't store the first 4x4 block, but keep it in registers > + @ for the first slice of the second pass (where it is the > + @ last 4x4 block). > + add r0, r0, #8 > +.irp i, 20, 24, 28 > + vst1.16 {d\i}, [r0,:64]! > +.endr > + add r0, r0, #8 > +.irp i, 21, 25, 29 > + vst1.16 {d\i}, [r0,:64]! > +.endr > + add r0, r0, #8 > +.irp i, 22, 26, 30 > + vst1.16 {d\i}, [r0,:64]! > +.endr > + add r0, r0, #8 > +.irp i, 23, 27, 31 > + vst1.16 {d\i}, [r0,:64]! > +.endr > + vmov d28, d16 > + vmov d29, d17 > + vmov d30, d18 > + vmov d31, d19 > + bx lr > +endfunc > + > +@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, > +@ load the destination pixels (from a similar 4x16 slice), add and store > back. 
> +@ r0 = dst > +@ r1 = dst stride > +@ r2 = src (temp buffer) > +@ r3 = slice offset > +function \txfm\()16_1d_4x16_pass2_neon > + mov r12, #32 > +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 > + vld1.16 {d\i}, [r2,:64], r12 > +.endr > + cmp r3, #0 > + beq 1f > +.irp i, 28, 29, 30, 31 > + vld1.16 {d\i}, [r2,:64], r12 > +.endr > +1: > + > + \txfm\()16 > + > +.macro load_add_store coef0, coef1, coef2, coef3 > + vrshr.s16 \coef0, \coef0, #6 > + vrshr.s16 \coef1, \coef1, #6 > + > + vld1.32 {d4[]}, [r0,:32], r1 > + vld1.32 {d4[1]}, [r0,:32], r1 > + vrshr.s16 \coef2, \coef2, #6 > + vrshr.s16 \coef3, \coef3, #6 > + vld1.32 {d5[]}, [r0,:32], r1 > + vld1.32 {d5[1]}, [r0,:32], r1 > + vaddw.u8 \coef0, \coef0, d4 > + vld1.32 {d6[]}, [r0,:32], r1 > + vld1.32 {d6[1]}, [r0,:32], r1 > + vaddw.u8 \coef1, \coef1, d5 > + vld1.32 {d7[]}, [r0,:32], r1 > + vld1.32 {d7[1]}, [r0,:32], r1 > + > + vqmovun.s16 d4, \coef0 > + vqmovun.s16 d5, \coef1 > + sub r0, r0, r1, lsl #3 could use an additional register > + vaddw.u8 \coef2, \coef2, d6 > + vaddw.u8 \coef3, \coef3, d7 > + vst1.32 {d4[0]}, [r0,:32], r1 > + vst1.32 {d4[1]}, [r0,:32], r1 > + vqmovun.s16 d6, \coef2 > + vst1.32 {d5[0]}, [r0,:32], r1 > + vst1.32 {d5[1]}, [r0,:32], r1 > + vqmovun.s16 d7, \coef3 > + > + vst1.32 {d6[0]}, [r0,:32], r1 > + vst1.32 {d6[1]}, [r0,:32], r1 > + vst1.32 {d7[0]}, [r0,:32], r1 > + vst1.32 {d7[1]}, [r0,:32], r1 > +.endm > + load_add_store q8, q9, q10, q11 > + load_add_store q12, q13, q14, q15 > +.purgem load_add_store > + > + bx lr > +endfunc > +.endm > + > +itxfm16_1d_funcs idct > +itxfm16_1d_funcs iadst > + > +.macro itxfm_func16x16 txfm1, txfm2 > +function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 > +.ifc \txfm1,idct > +.ifc \txfm2,idct > + cmp r3, #1 > + beq idct16x16_dc_add_neon > +.endif > +.endif > +1: unused label > + push {r4-r7,lr} > +.ifc \txfm1,iadst > + vpush {q4-q7} > +.else > +.ifc \txfm2,iadst > + vpush {q4-q7} > +.endif > +.endif > + mov r7, sp > + > + @ Align the stack, allocate a temp buffer > +T mov r12, sp > +T bic r12, r12, #15 > +T sub r12, r12, #512 > +T mov sp, r12 > +A bic sp, sp, #15 > +A sub sp, sp, #512 > + > + mov r4, r0 > + mov r5, r1 > + mov r6, r2 > + > +.ifc \txfm1,idct > + movrel r12, idct_coeffs > + vld1.16 {q0-q1}, [r12,:128] > +.endif > + > +.irp i, 0, 4, 8, 12 > + add r0, sp, #(\i*32) > + add r2, r6, #(\i*2) > + mov r3, #\i > + bl \txfm1\()16_1d_4x16_pass1_neon > +.endr > +.ifc \txfm2,idct > + movrel r12, idct_coeffs > + vld1.16 {q0-q1}, [r12,:128] > +.endif > +.irp i, 0, 4, 8, 12 > + add r0, r4, #(\i) > + mov r1, r5 > + add r2, sp, #(\i*2) > + mov r3, #\i > + bl \txfm2\()16_1d_4x16_pass2_neon > +.endr > + > + mov sp, r7 > +.ifc \txfm1,iadst > + vpop {q4-q7} > +.else > +.ifc \txfm2,iadst > + vpop {q4-q7} > +.endif > +.endif > + pop {r4-r7,pc} > +endfunc > +.endm > + > +itxfm_func16x16 idct, idct > +itxfm_func16x16 iadst, idct > +itxfm_func16x16 idct, iadst > +itxfm_func16x16 iadst, iadst > + > + > +function idct32x32_dc_add_neon > + movrel r12, idct_coeffs > + vld1.16 {d0}, [r12,:64] > + > + vmov.i16 q2, #0 > + > + vld1.16 {d16[]}, [r2] alignment > + vmull.s16 q8, d16, d0[0] > + vrshrn.s32 d16, q8, #14 > + vmull.s16 q8, d16, d0[0] > + vrshrn.s32 d16, q8, #14 > + vdup.16 q8, d16[0] > + vst1.16 {d4[0]}, [r2] dito > + > + vrshr.s16 q8, q8, #6 > + > + mov r12, #32 > +1: > + @ Loop to add the constant from q8 into all 32x32 outputs > + vld1.8 {q2-q3}, [r0,:128] > + vaddw.u8 q10, q8, d4 > + vaddw.u8 q11, q8, d5 > + vaddw.u8 q12, q8, d6 > + vaddw.u8 q13, q8, d7 > + vqmovun.s16 d4, 
q10 > + vqmovun.s16 d5, q11 > + vqmovun.s16 d6, q12 > + vqmovun.s16 d7, q13 > + vst1.8 {q2-q3}, [r0,:128], r1 > + subs r12, r12, #1 > + bne 1b > + > + bx lr > +endfunc > + > +.macro idct32_odd > + movrel r12, idct_coeffs > + add r12, r12, #32 > + vld1.16 {q0-q1}, [r12,:128] > + > + mbutterfly d16, d31, d0[0], d0[1], q2, q3 @ d16 = t16a, d31 = > t31a > + mbutterfly d24, d23, d0[2], d0[3], q2, q3 @ d24 = t17a, d23 = > t30a > + mbutterfly d20, d27, d1[0], d1[1], q2, q3 @ d20 = t18a, d27 = > t29a > + mbutterfly d28, d19, d1[2], d1[3], q2, q3 @ d28 = t19a, d19 = > t28a > + mbutterfly d18, d29, d2[0], d2[1], q2, q3 @ d18 = t20a, d29 = > t27a > + mbutterfly d26, d21, d2[2], d2[3], q2, q3 @ d26 = t21a, d21 = > t26a > + mbutterfly d22, d25, d3[0], d3[1], q2, q3 @ d22 = t22a, d25 = > t25a > + mbutterfly d30, d17, d3[2], d3[3], q2, q3 @ d30 = t23a, d17 = > t24a > + > + sub r12, r12, #32 > + vld1.16 {q0}, [r12,:128] > + > + butterfly d4, d24, d16, d24 @ d4 = t16, d24 = t17 > + butterfly d5, d20, d28, d20 @ d5 = t19, d20 = t18 > + butterfly d6, d26, d18, d26 @ d6 = t20, d26 = t21 > + butterfly d7, d22, d30, d22 @ d7 = t23, d22 = t22 > + butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25 > + butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26 > + butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30 > + butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29 > + > + mbutterfly d23, d24, d0[3], d1[0], q8, q9 @ d23 = t17a, d24 = > t30a > + mbutterfly_neg d27, d20, d0[3], d1[0], q8, q9 @ d27 = t29a, d20 = > t18a > + mbutterfly d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a, d26 = > t26a > + mbutterfly_neg d25, d22, d1[1], d1[2], q8, q9 @ d25 = t25a, d22 = > t22a > + > + butterfly d16, d5, d4, d5 @ d16 = t16a, d5 = t19a > + butterfly d17, d20, d23, d20 @ d17 = t17, d20 = t18 > + butterfly d18, d6, d7, d6 @ d18 = t23a, d6 = t20a > + butterfly d19, d21, d22, d21 @ d19 = t22, d21 = t21 > + butterfly d4, d28, d28, d30 @ d4 = t24a, d28 = t27a > + butterfly d23, d26, d25, d26 @ d23 = t25, d26 = t26 > + butterfly d7, d29, d29, d31 @ d7 = t31a, d29 = t28a > + butterfly d22, d27, d24, d27 @ d22 = t30, d27 = t29 > + > + mbutterfly d27, d20, d0[1], d0[2], q12, q15 @ d27 = t18a, d20 = > t29a > + mbutterfly d29, d5, d0[1], d0[2], q12, q15 @ d29 = t19, d5 = > t28 > + mbutterfly_neg d28, d6, d0[1], d0[2], q12, q15 @ d28 = t27, d6 = > t20 > + mbutterfly_neg d26, d21, d0[1], d0[2], q12, q15 @ d26 = t26a, d21 = > t21a > + > + butterfly d31, d24, d7, d4 @ d31 = t31, d24 = t24 > + butterfly d30, d25, d22, d23 @ d30 = t30a, d25 = t25a > + butterfly_r d23, d16, d16, d18 @ d23 = t23, d16 = t16 > + butterfly_r d22, d17, d17, d19 @ d22 = t22a, d17 = t17a > + butterfly d18, d21, d27, d21 @ d18 = t18, d21 = t21 > + butterfly_r d27, d28, d5, d28 @ d27 = t27a, d28 = t28a > + butterfly d4, d26, d20, d26 @ d4 = t29, d26 = t26 > + butterfly d19, d20, d29, d6 @ d19 = t19a, d20 = t20 > + vmov d29, d4 @ d29 = t29 > + > + mbutterfly0 d27, d20, d27, d20, d4, d6, q2, q3 @ d27 = t27, d20 > = t20 > + mbutterfly0 d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 > = t21a > + mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22 > = t22 > + mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 > = t23a > +.endm > + > +@ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix. 
> +@ We don't have register space to do a single pass IDCT of 4x32 though, > +@ but the 32-point IDCT can be decomposed into two 16-point IDCTs; > +@ a normal IDCT16 with every other input component (the even ones, with > +@ each output written twice), followed by a separate 16-point IDCT > +@ of the odd inputs, added/subtracted onto the outputs of the first idct16. > +@ r0 = dst (temp buffer) > +@ r1 = unused > +@ r2 = src > +function idct32_1d_4x32_pass1_neon > + movrel r12, idct_coeffs > + vld1.16 {q0-q1}, [r12,:128] > + > + @ Double stride of the input, since we only read every other line > + mov r12, #128 > + vmov.s16 d4, #0 > + > + @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30) > +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > + vld1.16 {d\i}, [r2,:64] > + vst1.16 {d4}, [r2,:64], r12 > +.endr > + > + idct16 > + > + @ Do four 4x4 transposes. Originally, d16-d31 contain the > + @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 > + @ contain the transposed 4x4 blocks. > + transpose16_q_2x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, > d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 > + @ Store the registers a, b, c, d horizontally, followed > + @ by the same registers d, c, b, a mirrored. > +.macro store_rev a, b, c, d > +.irp i, \a, \b, \c, \d > + vst1.16 {d\i}, [r0,:64]! > + vrev64.16 d\i, d\i > +.endr > +.irp i, \d, \c, \b, \a > + vst1.16 {d\i}, [r0,:64]! > +.endr > +.endm > + store_rev 16, 20, 24, 28 > + store_rev 17, 21, 25, 29 > + store_rev 18, 22, 26, 30 > + store_rev 19, 23, 27, 31 > + sub r0, r0, #256 > +.purgem store_rev > + > + @ Move r2 back to the start of the input, and move > + @ to the first odd row > + sub r2, r2, r12, lsl #4 > + add r2, r2, #64 > + > + vmov.s16 d4, #0 > + @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31) > +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > + vld1.16 {d\i}, [r2,:64] > + vst1.16 {d4}, [r2,:64], r12 > +.endr > + > + idct32_odd > + > + transpose16_q_2x_4x4 q15, q14, q13, q12, q11, q10, q9, q8, d31, > d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 > + > + @ Store the registers a, b, c, d horizontally, > + @ adding into the output first, and then mirrored, subtracted > + @ from the output. > +.macro store_rev a, b, c, d > +.irp i, \a, \b, \c, \d > + vld1.16 {d4}, [r0,:64] > + vadd.s16 d4, d4, d\i > + vst1.16 {d4}, [r0,:64]! > + vrev64.16 d\i, d\i > +.endr > +.irp i, \d, \c, \b, \a > + vld1.16 {d4}, [r0,:64] > + vsub.s16 d4, d4, d\i > + vst1.16 {d4}, [r0,:64]! > +.endr > +.endm > + > + store_rev 31, 27, 23, 19 > + store_rev 30, 26, 22, 18 > + store_rev 29, 25, 21, 17 > + store_rev 28, 24, 20, 16 > +.purgem store_rev > + bx lr > +endfunc > + > +@ This is mostly the same as 4x32_pass1, but without the transpose, > +@ and use the source as temp buffer between the two idct passes, and > +@ add into the destination. > +@ r0 = dst > +@ r1 = dst stride > +@ r2 = src (temp buffer) > +function idct32_1d_4x32_pass2_neon > + movrel r12, idct_coeffs > + vld1.16 {q0-q1}, [r12,:128] > + > + mov r12, #128 > + @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30) > +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > + vld1.16 {d\i}, [r2,:64], r12 > +.endr > + sub r2, r2, r12, lsl #4 > + > + idct16 > + > +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > + vst1.16 {d\i}, [r2,:64], r12 > +.endr > + > + sub r2, r2, r12, lsl #4 > + add r2, r2, #64 > + > + @ d16 = IN(1), d17 = IN(3) ... 
d31 = IN(31) > +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > + vld1.16 {d\i}, [r2,:64], r12 > +.endr > + sub r2, r2, r12, lsl #4 > + sub r2, r2, #64 > + > + idct32_odd > + > + mov r12, #128 > +.macro load_acc_store a, b, c, d, neg=0 > + vld1.16 {d4}, [r2,:64], r12 > + vld1.16 {d5}, [r2,:64], r12 > +.if \neg == 0 > + vadd.s16 d4, d4, d\a > + vld1.16 {d6}, [r2,:64], r12 > + vadd.s16 d5, d5, d\b > + vld1.16 {d7}, [r2,:64], r12 > + vadd.s16 d6, d6, d\c > + vadd.s16 d7, d7, d\d > +.else > + vsub.s16 d4, d4, d\a > + vld1.16 {d6}, [r2,:64], r12 > + vsub.s16 d5, d5, d\b > + vld1.16 {d7}, [r2,:64], r12 > + vsub.s16 d6, d6, d\c > + vsub.s16 d7, d7, d\d > +.endif > + vld1.32 {d2[]}, [r0,:32], r1 > + vld1.32 {d2[1]}, [r0,:32], r1 > + vrshr.s16 q2, q2, #6 > + vld1.32 {d3[]}, [r0,:32], r1 > + vrshr.s16 q3, q3, #6 > + vld1.32 {d3[1]}, [r0,:32], r1 > + sub r0, r0, r1, lsl #2 > + vaddw.u8 q2, q2, d2 > + vaddw.u8 q3, q3, d3 > + vqmovun.s16 d4, q2 > + vqmovun.s16 d5, q3 > + vst1.32 {d4[0]}, [r0,:32], r1 > + vst1.32 {d4[1]}, [r0,:32], r1 > + vst1.32 {d5[0]}, [r0,:32], r1 > + vst1.32 {d5[1]}, [r0,:32], r1 > +.endm > + load_acc_store 31, 30, 29, 28 > + load_acc_store 27, 26, 25, 24 > + load_acc_store 23, 22, 21, 20 > + load_acc_store 19, 18, 17, 16 > + sub r2, r2, r12 > + neg r12, r12 > + load_acc_store 16, 17, 18, 19, 1 > + load_acc_store 20, 21, 22, 23, 1 > + load_acc_store 24, 25, 26, 27, 1 > + load_acc_store 28, 29, 30, 31, 1 > +.purgem load_acc_store > + bx lr > +endfunc > + > +function ff_vp9_idct_idct_32x32_add_neon, export=1 > + cmp r3, #1 > + beq idct32x32_dc_add_neon > +1: unused label > + push {r4-r7,lr} > + vpush {q4-q7} > + mov r7, sp > + > + @ Align the stack, allocate a temp buffer > +T mov r12, sp > +T bic r12, r12, #15 > +T sub r12, r12, #2048 > +T mov sp, r12 > +A bic sp, sp, #15 > +A sub sp, sp, #2048 > + > + mov r4, r0 > + mov r5, r1 > + mov r6, r2 > + > +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 > + add r0, sp, #(\i*64) > + add r2, r6, #(\i*2) > + bl idct32_1d_4x32_pass1_neon > +.endr > +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 > + add r0, r4, #(\i) > + mov r1, r5 > + add r2, sp, #(\i*2) > + bl idct32_1d_4x32_pass2_neon > +.endr > + > + mov sp, r7 > + vpop {q4-q7} > + pop {r4-r7,pc} > +endfunc patch ok with nits fixed. instruction rescheduling can be done in a separate commit. Janne _______________________________________________ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel