On Fri, 11 Nov 2016, Janne Grunau wrote:
On 2016-10-18 21:07:30 +0300, Martin Storsjö wrote:

This work is sponsored by, and copyright, Google.

For the transforms up to 8x8, we can fit all the data (including
temporaries) in registers and just do a straightforward transform of all
the data. For 16x16, we do a transform of 4x16 pixels in 4 slices, using
a temporary buffer. For 32x32, we transform 4x32 pixels at a time, in
two steps of 4x16 pixels each.

Examples of relative speedup compared to the C version, from checkasm:

                             Cortex       A7     A8     A9    A53
vp9_inv_adst_adst_4x4_add_neon:         3.39   5.80   4.18   3.92
vp9_inv_adst_adst_8x8_add_neon:         3.94   4.82   4.25   3.89
vp9_inv_adst_adst_16x16_add_neon:       3.33   4.27   4.08   4.05
vp9_inv_dct_dct_4x4_add_neon:           3.73   5.06   4.26   4.28
vp9_inv_dct_dct_8x8_add_neon:           4.59   5.81   5.03   4.73
vp9_inv_dct_dct_16x16_add_neon:         3.40   3.39   3.33   3.68
vp9_inv_dct_dct_32x32_add_neon:         4.00   3.51   3.80   4.40
vp9_inv_wht_wht_4x4_add_neon:           3.24   5.16   3.52   3.67

Thus, the speedup vs C code is around 3-5x.

This is mostly marginally faster than the corresponding routines in
libvpx on most cores, tested with their 32x32 idct (compared to
vpx_idct32x32_1024_add_neon). These numbers are slightly in libvpx's
favour since their version doesn't clear the input buffer like ours does
(although the effect of that on the total runtime probably is
negligible.)

                             Cortex        A7       A8       A9      A53
vp9_inv_dct_dct_32x32_add_neon:      18852.0  16831.6  14217.4  11988.6
libvpx vpx_idct32x32_1024_add_neon:  20789.0  13344.3  15049.9  13030.5

Only on the Cortex A8 is the libvpx function faster. On the other cores,
ours is slightly faster even though it has got the source block clearing
integrated.
---
v2: Updated some broken macro comments, optimized the transposes by
using the q registers for parts of the transposes.

Suggestions are very much welcome on names for the macros - no idea if
the current ones make sense, or what one commonly would call these
combinations.

I'm a bit reluctant to expand the macros (to be able to schedule
instructions better), in order to keep things readable. (Although, I
guess this is kinda write-only code, which nobody ever touches
afterwards.)
---
 libavcodec/arm/Makefile          |    3 +-
 libavcodec/arm/vp9dsp_init_arm.c |   51 +-
 libavcodec/arm/vp9itxfm_neon.S   | 1166 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 1218 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/arm/vp9itxfm_neon.S
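
In scalar terms, the 16x16 strategy described above amounts to the
following C sketch (the helper names are illustrative assumptions, not
the actual functions from the patch):

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative prototypes; the real slices are NEON functions. */
    void transform_4x16_pass1(int16_t *dst, int16_t *src);
    void transform_4x16_pass2(uint8_t *dst, ptrdiff_t stride, int16_t *src);

    /* Two passes over four 4x16 slices, with a temporary buffer
     * holding the transposed intermediate between the passes. */
    void itxfm16_add_model(uint8_t *dst, ptrdiff_t stride, int16_t *block)
    {
        int16_t tmp[16 * 16];
        for (int i = 0; i < 16; i += 4)   /* pass 1: vertical slices */
            transform_4x16_pass1(tmp + i * 16, block + i);
        for (int i = 0; i < 16; i += 4)   /* pass 2: add into dst */
            transform_4x16_pass2(dst + i, stride, tmp + i);
    }

The 32x32 case follows the same pattern with eight 4x32 slices per pass.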

+
+const itxfm4_coeffs, align=4
+        .short  11585, 6270, 15137, 0
+iadst4_coeffs:
+        .short  5283, 15212, 9929, 13377
+endconst
+
+const iadst8_coeffs, align=4
+        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
+endconst
+
+const idct_coeffs, align=4
+        .short  11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
+        .short  16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
+        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+endconst
+
+const iadst16_coeffs, align=4
+        .short  16364, 804, 15893, 3981, 14811, 7005, 13160, 9760
+        .short  11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207
+endconst
+
+@ Do two 4x4 transposes, using q registers for the subtransposes that don't

it's four 4x4 transposes

Indeed; fixed the comment and the macro name

+@ need to address the individual d registers.
+@ r0,r1 == rq0, r2,r3 == rq1, etc
+.macro transpose16_q_2x_4x4 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+        vtrn.32         \rq0, \rq1
+        vtrn.32         \rq2, \rq3
+        vtrn.32         \rq4, \rq5
+        vtrn.32         \rq6, \rq7
+        vtrn.16         \r0,  \r1
+        vtrn.16         \r2,  \r3
+        vtrn.16         \r4,  \r5
+        vtrn.16         \r6,  \r7
+        vtrn.16         \r8,  \r9
+        vtrn.16         \r10, \r11
+        vtrn.16         \r12, \r13
+        vtrn.16         \r14, \r15
+.endm
+
+@ out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
+@ out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+@ in/out are d registers
+.macro mbutterfly0 out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4, neg=0
+        vadd.s16        \tmpd1, \in1, \in2
+        vsub.s16        \tmpd2, \in1, \in2
+        vmull.s16       \tmpq3, \tmpd1, d0[0]
+        vmull.s16       \tmpq4, \tmpd2, d0[0]
+.if \neg > 0
+        vneg.s32        \tmpq3, \tmpq3
+.endif
+        vrshrn.s32      \out1, \tmpq3, #14
+        vrshrn.s32      \out2, \tmpq4, #14
+.endm

an empty line after .endm improves the readability

Done (likewise for the aarch64 version)
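
Per lane, mbutterfly0 computes the following (scalar C model; d0[0]
holds 11585, i.e. round(2^14/sqrt(2)), as loaded from the coefficient
tables above):

    #include <stdint.h>

    /* One lane of mbutterfly0: sum/difference scaled by 11585 in Q14,
     * with (1 << 13) rounding before the narrowing shift. */
    static void mbutterfly0_model(int16_t *out1, int16_t *out2,
                                  int16_t in1, int16_t in2)
    {
        *out1 = (int16_t)(((in1 + in2) * 11585 + (1 << 13)) >> 14);
        *out2 = (int16_t)(((in1 - in2) * 11585 + (1 << 13)) >> 14);
    }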

+@ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
+@ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+@ Same as mbutterfly0, but with input being 2 q registers, output
+@ being 4 d registers.
+@ This can do with either 4 or 6 temporary q registers.
+.macro dmbutterfly0 out1, out2, out3, out4, in1, in2, tmpq1, tmpq2, tmpd11, tmpd12, tmpd21, tmpd22, tmpq3, tmpq4, tmpq5, tmpq6
+        vadd.s16        \tmpq1, \in1, \in2
+        vsub.s16        \tmpq2, \in1, \in2
+        vmull.s16       \tmpq3, \tmpd11, d0[0]
+        vmull.s16       \tmpq4, \tmpd12, d0[0]
+.ifb \tmpq5
+        vrshrn.s32      \out1, \tmpq3, #14
+        vrshrn.s32      \out2, \tmpq4, #14
+        vmull.s16       \tmpq3, \tmpd21, d0[0]
+        vmull.s16       \tmpq4, \tmpd22, d0[0]
+        vrshrn.s32      \out3, \tmpq3, #14
+        vrshrn.s32      \out4, \tmpq4, #14
+.else
+        vmull.s16       \tmpq5, \tmpd21, d0[0]
+        vmull.s16       \tmpq6, \tmpd22, d0[0]
+        vrshrn.s32      \out1, \tmpq3, #14
+        vrshrn.s32      \out2, \tmpq4, #14
+        vrshrn.s32      \out3, \tmpq5, #14
+        vrshrn.s32      \out4, \tmpq6, #14
+.endif
+.endm
+@ out1 = in1 * coef1 - in2 * coef2
+@ out2 = in1 * coef2 + in2 * coef1
+@ out are 2 q registers, in are 2 d registers
+.macro mbutterfly_l out1, out2, in1, in2, coef1, coef2
+        vmull.s16       \out1, \in1, \coef1
+        vmlsl.s16       \out1, \in2, \coef2
+        vmull.s16       \out2, \in1, \coef2

doing the second vmull before the vmlsl is preferable on in-order units

That's what I would have expected as well, but it seems to have a negative effect on A8 and A53 (and A9!); only A7 seems to gain from it.
Current version:
vp9_inv_adst_adst_16x16_add_neon:        4622.3  2989.9  2901.8  2609.6

With the vmlsl/vmull swapped:
vp9_inv_adst_adst_16x16_add_neon:        4119.0  3242.2  3204.3  2907.5

Thus keeping it in the current form.

+        vmlal.s16       \out2, \in2, \coef1
+.endm
+@ out1,out2 = in1,in2 * coef1 - in3,in4 * coef2
+@ out3,out4 = in1,in2 * coef2 + in3,in4 * coef1
+@ out are 4 q registers, in are 4 d registers
+.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, in3, in4, coef1, coef2
+        vmull.s16       \out1, \in1, \coef1
+        vmull.s16       \out2, \in2, \coef1
+        vmull.s16       \out3, \in1, \coef2
+        vmull.s16       \out4, \in2, \coef2
+        vmlsl.s16       \out1, \in3, \coef2
+        vmlsl.s16       \out2, \in4, \coef2
+        vmlal.s16       \out3, \in3, \coef1
+        vmlal.s16       \out4, \in4, \coef1
+.endm
+@ in1 = (in1 * coef1 - in2 * coef2 + (1 << 13)) >> 14
+@ in2 = (in1 * coef2 + in2 * coef1 + (1 << 13)) >> 14
+@ in are 2 d registers, tmp are 2 q registers
+.macro mbutterfly in1, in2, coef1, coef2, tmp1, tmp2, neg=0
+        mbutterfly_l    \tmp1, \tmp2, \in1, \in2, \coef1, \coef2
+.if \neg > 0
+        vneg.s32        \tmp2, \tmp2
+.endif
+        vrshrn.s32      \in1, \tmp1, #14
+        vrshrn.s32      \in2, \tmp2, #14
+.endm
+@ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14
+@ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14
+@ inout are 4 d registers, tmp are 4 q registers
+.macro dmbutterfly inout1, inout2, inout3, inout4, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \inout3, \inout4, \coef1, \coef2
+        vrshrn.s32      \inout1, \tmp1, #14
+        vrshrn.s32      \inout2, \tmp2, #14
+        vrshrn.s32      \inout3, \tmp3, #14
+        vrshrn.s32      \inout4, \tmp4, #14
+.endm
+.macro mbutterfly_neg in1, in2, coef1, coef2, tmp1, tmp2
+        mbutterfly      \in1, \in2, \coef1, \coef2, \tmp1, \tmp2, 1
+.endm

this macro is a little pointless, readability is not really worse for
mbutterfly ..., neg=1 vs mbutterfly_neg ...

Yeah; the only advantage is that it fits the "neg" part in a place where we already would have had whitespace:
        mbutterfly      d23, d24, d0[3], d1[0], q8,  q9 @ d23 = t17a, d24 = t30a
        mbutterfly_neg  d27, d20, d0[3], d1[0], q8,  q9 @ d27 = t29a, d20 = t18a
        mbutterfly      d21, d26, d1[1], d1[2], q8,  q9 @ d21 = t21a, d26 = t26a
        mbutterfly_neg  d25, d22, d1[1], d1[2], q8,  q9 @ d25 = t25a, d22 = t22a

vs

        mbutterfly      d23, d24, d0[3], d1[0], q8,  q9        @ d23 = t17a, d24 = t30a
        mbutterfly      d27, d20, d0[3], d1[0], q8,  q9, neg=1 @ d27 = t29a, d20 = t18a
        mbutterfly      d21, d26, d1[1], d1[2], q8,  q9        @ d21 = t21a, d26 = t26a
        mbutterfly      d25, d22, d1[1], d1[2], q8,  q9, neg=1 @ d25 = t25a, d22 = t22a

Now looking at it afterwards, it doesn't look all that bad though, and
the lines are already very long, so I'll change it and reduce the macro
soup by getting rid of this unnecessary one.

+@ out1 = in1 + in2
+@ out2 = in1 - in2
+.macro butterfly out1, out2, in1, in2
+        vadd.s16        \out1, \in1, \in2
+        vsub.s16        \out2, \in1, \in2
+.endm
+@ out1 = in1 - in2
+@ out2 = in1 + in2
+.macro butterfly_r out1, out2, in1, in2
+        vsub.s16        \out1, \in1, \in2
+        vadd.s16        \out2, \in1, \in2
+.endm
+@ out1 = (in1 + in2 + (1 << 13)) >> 14
+@ out2 = (in1 - in2 + (1 << 13)) >> 14
+@ out are 2 d registers, in are 2 q registers, tmp are 2 q registers
+.macro butterfly_n out1, out2, in1, in2, tmp1, tmp2
+        vadd.s32        \tmp1, \in1, \in2
+        vsub.s32        \tmp2, \in1, \in2
+        vrshrn.s32      \out1, \tmp1, #14
+        vrshrn.s32      \out2, \tmp2, #14
+.endm
+@ out1,out2 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
+@ out3,out4 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
+@ out are 4 d registers, in are 4 q registers, tmp are 4 q registers
+.macro dbutterfly_n out1, out2, out3, out4, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
+        vadd.s32        \tmp1, \in1, \in3
+        vadd.s32        \tmp2, \in2, \in4
+        vsub.s32        \tmp3, \in1, \in3
+        vsub.s32        \tmp4, \in2, \in4
+        vrshrn.s32      \out1, \tmp1, #14
+        vrshrn.s32      \out2, \tmp2, #14
+        vrshrn.s32      \out3, \tmp3, #14
+        vrshrn.s32      \out4, \tmp4, #14
+.endm
+
+
+.macro iwht4 c0, c1, c2, c3
+        vadd.i16        \c0, \c0, \c1
+        vsub.i16        d17, \c2, \c3
+        vsub.i16        d16, \c0, d17
+        vshr.s16        d16, d16, #1
+        vsub.i16        \c2, d16, \c1
+        vsub.i16        \c1, d16, \c3
+        vadd.i16        \c3, d17, \c2
+        vsub.i16        \c0, \c0, \c1
+.endm
+
+.macro idct4 c0, c1, c2, c3
+        vadd.i16        d16, \c0, \c2
+        vsub.i16        d17, \c0, \c2
+        vmull.s16       q11, \c1, d0[1]
+        vmull.s16       q12, \c3, d0[2]

vmlsl.s16 q11, \c3, d0[2] and reorder for in-order

+        vmull.s16       q13, \c1, d0[2]
+        vmull.s16       q14, \c3, d0[1]

vmlal.s16 q13, \c3, d0[1]

Done.

Before:
vp9_inv_dct_dct_4x4_add_neon:             117.0   69.0   85.0   79.0
After:
vp9_inv_dct_dct_4x4_add_neon:             108.7   65.0   79.0   78.0
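
For reference, the scalar dataflow of the idct4 macro (coefficients
11585, 6270, 15137 from itxfm4_coeffs); the suggested vmlsl/vmlal
simply fold the t2/t3 subtraction and addition into the multiplies:

    #include <stdint.h>

    /* Scalar model of idct4: a 4-point IDCT with Q14 coefficients
     * and (1 << 13) rounding, matching the NEON dataflow above. */
    static void idct4_model(int16_t c[4])
    {
        int t0 = ((c[0] + c[2]) * 11585 + (1 << 13)) >> 14;
        int t1 = ((c[0] - c[2]) * 11585 + (1 << 13)) >> 14;
        int t2 = (c[1] * 6270  - c[3] * 15137 + (1 << 13)) >> 14;
        int t3 = (c[1] * 15137 + c[3] * 6270  + (1 << 13)) >> 14;
        c[0] = (int16_t)(t0 + t3);
        c[1] = (int16_t)(t1 + t2);
        c[2] = (int16_t)(t1 - t2);
        c[3] = (int16_t)(t0 - t3);
    }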

+        vmull.s16       q9,  d16, d0[0]
+        vmull.s16       q10, d17, d0[0]
+        vadd.i32        q13, q13, q14
+        vsub.i32        q11, q11, q12
+        vrshrn.s32      d16, q9,  #14
+        vrshrn.s32      d19, q13, #14
+        vrshrn.s32      d17, q10, #14
+        vrshrn.s32      d18, q11, #14
+        vadd.i16        \c0, d16, d19
+        vadd.i16        \c1, d17, d18
+        vsub.i16        \c2, d17, d18
+        vsub.i16        \c3, d16, d19
+.endm
+
+.macro iadst4 c0, c1, c2, c3
+        vmull.s16       q10, \c0, d1[0]
+        vmlal.s16       q10, \c2, d1[1]
+        vmlal.s16       q10, \c3, d1[2]
+        vmull.s16       q11, \c0, d1[2]
+        vmlsl.s16       q11, \c2, d1[0]
+        vsub.s16        \c0, \c0, \c2
+        vmlsl.s16       q11, \c3, d1[1]
+        vadd.s16        \c0, \c0, \c3
+        vmull.s16       q13, \c1, d1[3]
+        vmull.s16       q12, \c0, d1[3]
+        vadd.s32        q14, q10, q13
+        vadd.s32        q1,  q11, q13
+        vrshrn.s32      \c0, q14, #14
+        vadd.s32        q10, q10, q11
+        vrshrn.s32      \c1, q1,  #14
+        vsub.s32        q10, q10, q13
+        vrshrn.s32      \c2, q12, #14
+        vrshrn.s32      \c3, q10, #14

instruction scheduling can be optimized for this one too

I've done a bit of testing with different orderings on this one already, and while I can improve quite a bit on the A7, the A8 and A53 immediately get slower at the same time; I haven't found any better compromise yet.
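
The dataflow being scheduled there, as a scalar C model (coefficients
5283, 15212, 9929, 13377 from iadst4_coeffs):

    #include <stdint.h>

    /* Scalar model of the iadst4 macro quoted above. */
    static void iadst4_model(int16_t c[4])
    {
        int x0 = c[0] * 5283 + c[2] * 15212 + c[3] * 9929;
        int x1 = c[0] * 9929 - c[2] * 5283  - c[3] * 15212;
        int x2 = (c[0] - c[2] + c[3]) * 13377;
        int x3 = c[1] * 13377;
        c[0] = (int16_t)((x0 + x3      + (1 << 13)) >> 14);
        c[1] = (int16_t)((x1 + x3      + (1 << 13)) >> 14);
        c[2] = (int16_t)((x2           + (1 << 13)) >> 14);
        c[3] = (int16_t)((x0 + x1 - x3 + (1 << 13)) >> 14);
    }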

+.endm
+
+@ The public functions in this file have got the following signature:
+@ void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+
+.macro itxfm_func4x4 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
+.ifc \txfm1,\txfm2
+.ifc \txfm1,idct
+        movrel          r12, itxfm4_coeffs
+        vld1.16         {d0}, [r12,:64]
+.endif
+.ifc \txfm1,iadst
+        movrel          r12, iadst4_coeffs
+        vld1.16         {d1}, [r12,:64]
+.endif
+.else
+        movrel          r12, itxfm4_coeffs
+        vld1.16         {q0}, [r12,:128]
+.endif

aligned 8 byte and 16 byte loads are equally fast so this adds just
complexity without gain

checkasm --bench disagrees; A7 and A53 get around one cycle slower on dct_dct_4x4 and adst_adst_4x4, if I just load the full q0 from itxfm4_coeffs.
Leaving this as is.

+
+        vmov.i16        q15, #0
+.ifc \txfm1,idct
+.ifc \txfm2,idct
+        cmp             r3,  #1
+        bne             1f
+        @ DC-only for idct/idct
+        vld1.16         {d4[]},   [r2]

alignment

Added :16 alignment here

+        vmull.s16       q2,  d4,  d0[0]
+        vrshrn.s32      d4,  q2,  #14
+        vmull.s16       q2,  d4,  d0[0]
+        vrshrn.s32      d4,  q2,  #14
+        vst1.16         {d30[0]}, [r2]

same

Done

+        vdup.16         q2,  d4[0]
+        vmov            q3,  q2

vdup first to q3 to avoid data dependency

Do you mean vdup.16 q2, d4[0], vdup.16 q3, d4[0]? (I don't see the point here in duping into q3 and doing vmov q2, q3, if that is what you meant.) That turns out to be 1 cycle slower on A53 and A8, 1.5 cycles slower on A9.
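
For context, a scalar model of what the DC-only shortcut computes (the
11585 gain is applied once per 1-D pass; the final rounding shift is #4
for 4x4, #5 for 8x8 and #6 for 16x16/32x32):

    #include <stdint.h>

    /* Per-pixel delta of the 4x4 idct/idct DC-only path; the result is
     * added to every destination pixel and clipped (vqmovun.s16). */
    static int dc_delta_4x4(int dc_coef)
    {
        int dc = (dc_coef * 11585 + (1 << 13)) >> 14; /* first pass  */
        dc     = (dc      * 11585 + (1 << 13)) >> 14; /* second pass */
        return (dc + (1 << 3)) >> 4;                  /* vrshr #4    */
    }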

+        b               2f
+.endif
+.endif
+
+1:
+        vld1.16         {d4-d7},  [r2,:128]
+        vst1.16         {q15},    [r2,:128]!
+
+.ifc \txfm1,iwht
+        vshr.s16        q2,  q2,  #2
+        vshr.s16        q3,  q3,  #2
+.endif
+
+        \txfm1\()4      d4,  d5,  d6,  d7
+
+        vst1.16         {q15},    [r2,:128]!
+        @ Transpose 4x4 with 16 bit elements
+        vtrn.16         d4,  d5
+        vtrn.16         d6,  d7
+        vtrn.32         d4,  d6
+        vtrn.32         d5,  d7

vtrn.32 q2, q3

Done
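
In scalar terms, the vtrn.16/vtrn.32 sequence builds the 4x4 transpose
from 2x2 swaps; the q-register form suggested above performs the same
two 32-bit swaps in a single instruction:

    #include <stdint.h>

    #define SWAP16(a, b) do { int16_t t_ = (a); (a) = (b); (b) = t_; } while (0)

    /* C model of the 4x4 16-bit transpose: vtrn.16 transposes each
     * 2x2 sub-block of lanes, vtrn.32 then swaps the off-diagonal
     * 2x2 blocks as 32-bit pairs. */
    static void transpose_4x4_model(int16_t m[4][4])
    {
        SWAP16(m[0][1], m[1][0]);        /* vtrn.16 d4, d5 */
        SWAP16(m[0][3], m[1][2]);
        SWAP16(m[2][1], m[3][0]);        /* vtrn.16 d6, d7 */
        SWAP16(m[2][3], m[3][2]);
        for (int j = 0; j < 2; j++) {    /* vtrn.32 q2, q3 */
            SWAP16(m[0][2 + j], m[2][j]);
            SWAP16(m[1][2 + j], m[3][j]);
        }
    }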

+
+        \txfm2\()4      d4,  d5,  d6,  d7
+2:
+        vld1.32         {d0[]},   [r0,:32], r1
+        vld1.32         {d0[1]},  [r0,:32], r1
+.ifnc \txfm1,iwht
+        vrshr.s16       q2,  q2,  #4
+        vrshr.s16       q3,  q3,  #4
+.endif
+        vaddw.u8        q2,  q2,  d0
+        vld1.32         {d1[]},   [r0,:32], r1
+        vld1.32         {d1[1]},  [r0,:32], r1
+        vqmovun.s16     d0,  q2
+        sub             r0,  r0,  r1, lsl #2

since we have free gp registers I'd use a different register for load
and store. probably not faster though

Around one cycle slower on A7 and A53, so skipped. I also tried using separate registers for storing odd/even lines, with double stride, but that was a pretty large loss on A8, so skipped that for now as well.

+
+        vaddw.u8        q3,  q3,  d1
+        vst1.32         {d0[0]},  [r0,:32], r1
+        vqmovun.s16     d1,  q3
+
+        vst1.32         {d0[1]},  [r0,:32], r1
+        vst1.32         {d1[0]},  [r0,:32], r1
+        vst1.32         {d1[1]},  [r0,:32], r1
+
+        bx              lr
+endfunc
+.endm
+
+itxfm_func4x4 idct,  idct
+itxfm_func4x4 iadst, idct
+itxfm_func4x4 idct,  iadst
+itxfm_func4x4 iadst, iadst
+itxfm_func4x4 iwht,  iwht
+
+
+.macro idct8
+        dmbutterfly0    d16, d17, d24, d25, q8,  q12, q2, q4, d4, d5, d8, d9, q3, q2, q5, q4 @ q8 = t0a, q12 = t1a
+        dmbutterfly     d20, d21, d28, d29, d0[1], d0[2], q2, q3, q4, q5 @ q10 = t2a, q14 = t3a
+        dmbutterfly     d18, d19, d30, d31, d0[3], d1[0], q2, q3, q4, q5 @ q9 = t4a, q15 = t7a
+        dmbutterfly     d26, d27, d22, d23, d1[1], d1[2], q2, q3, q4, q5 @ q13 = t5a, q11 = t6a
+
+        butterfly       q2,  q14, q8,  q14 @ q2 = t0, q14 = t3
+        butterfly       q3,  q10, q12, q10 @ q3 = t1, q10 = t2
+        butterfly       q4,  q13, q9,  q13 @ q4 = t4, q13 = t5a
+        butterfly       q5,  q11, q15, q11 @ q5 = t7, q11 = t6a
+
+        butterfly       q8,  q15, q2,  q5  @ q8 = out[0], q15 = out[7]
+
+        dmbutterfly0    d4,  d5,  d10, d11, q11, q13, q9, q13, d18, d19, d26, d27, q2, q5, q11, q12 @ q2 = t6, q5 = t5
+
+        butterfly       q11, q12, q14, q4  @ q11 = out[3], q12 = out[4]
+        butterfly       q9,  q14, q3,  q2  @ q9 = out[1],  q14 = out[6]
+        butterfly_r     q13, q10, q10, q5  @ q13 = out[5], q10 = out[2]
+.endm
+
+.macro iadst8
+        dmbutterfly_l   q4,  q5,  q2,  q3,  d30, d31, d16, d17, d2[1], d2[0] @ q4,q5 = t1a, q2,q3 = t0a
+        dmbutterfly_l   q8,  q15, q6,  q7,  d22, d23, d24, d25, d3[1], d3[0] @ q8,q15 = t5a, q6,q7 = t4a
+
+        dbutterfly_n    d22, d23, d4,  d5,  q2,  q3,  q6,  q7,  q11, q12, q2,  q3 @ q11 = t0, q2 = t4
+
+        dbutterfly_n    d24, d25, d6,  d7,  q4,  q5,  q8,  q15, q12, q3,  q6,  q7 @ q12 = t1, q3 = t5
+
+        dmbutterfly_l   q6,  q7,  q4,  q5,  d26, d27, d20, d21, d2[3], d2[2] @ q6,q7 = t3a, q4,q5 = t2a
+        dmbutterfly_l   q10, q13, q8,  q15, d18, d19, d28, d29, d3[3], d3[2] @ q10,q13 = t7a, q8,q15 = t6a
+
+        dbutterfly_n    d18, d19, d8,  d9,  q4,  q5,  q8,  q15, q9,  q14, q4,  q5 @ q9 = t2, q4 = t6
+        dbutterfly_n    d16, d17, d12, d13, q6,  q7,  q10, q13, q8,  q15, q6,  q7 @ q8 = t3, q6 = t7
+
+        butterfly       q15, q12, q12, q8  @ q15 = -out[7], q12 = t3
+        vneg.s16        q15, q15           @ q15 = out[7]
+        butterfly       q8,  q9,  q11, q9  @ q8 = out[0], q9 = t2
+
+        dmbutterfly_l   q10, q11, q5,  q7,  d4,  d5,  d6,  d7,  d0[1], d0[2] @ q10,q11 = t5a, q5,q7 = t4a
+        dmbutterfly_l   q2,  q3,  q13, q14, d12, d13, d8,  d9,  d0[2], d0[1] @ q2,q3 = t6a, q13,q14 = t7a
+
+        dbutterfly_n    d28, d29, d8,  d9,  q10, q11, q13, q14, q4,  q6,  q10, q11 @ q14 = out[6], q4 = t7
+
+        dmbutterfly0    d22, d23, d24, d25, q9,  q12, q6, q13, d12, d13, d26, d27, q9, q10 @ q11 = -out[3], q12 = out[4]
+        vneg.s16        q11, q11           @ q11 = out[3]
+
+        dbutterfly_n    d18, d19, d4,  d5,  q5,  q7,  q2,  q3,  q9, q10, q2, q3 @ q9 = -out[1], q2 = t6
+        vneg.s16        q9,  q9            @ q9 = out[1]
+
+        dmbutterfly0    d20, d21, d26, d27, q2,  q4,  q3, q5, d6, d7, d10, d11, q6, q7 @ q10 = out[2], q13 = -out[5]
+        vneg.s16        q13, q13           @ q13 = out[5]
+.endm
+
+
+.macro itxfm_func8x8 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
+        @ Push q4-q7 if iadst is used, idct requires
+        @ a few scratch registers less, so only push q4-q5
+        @ if only idct is involved.
+        @ The iadst also uses a few coefficients from
+        @ idct, so those always need to be loaded.
+        movrel          r12, idct_coeffs

move this into the last else below

+        vld1.16         {q0}, [r12,:128]

this can follow after this block if the iadst8_coeffs load uses post
increment

Done

+.ifc \txfm1,iadst
+        movrel          r12, iadst8_coeffs
+        vld1.16         {q1}, [r12,:128]
+        vpush           {q4-q7}
+.else
+.ifc \txfm2,iadst

does .elseifc work?

No, unfortunately it doesn't, so afaik there's no way to do .elseif with string comparisons

+        movrel          r12, iadst8_coeffs
+        vld1.16         {q1}, [r12,:128]
+        vpush           {q4-q7}
+.else
+        vpush           {q4-q5}
+.endif
+.endif
+
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+
+.ifc \txfm1,idct
+.ifc \txfm2,idct
+        cmp             r3,  #1
+        bne             1f
+        @ DC-only for idct/idct
+        vld1.16         {d16[]},  [r2]

alignment

Done

+        vmull.s16       q8,  d16, d0[0]
+        vrshrn.s32      d16, q8,  #14
+        vmull.s16       q8,  d16, d0[0]
+        vrshrn.s32      d16, q8,  #14
+        vdup.16         q8,  d16[0]
+        vmov            q9,  q8
+        vmov            q10, q8
+        vmov            q11, q8
+        vmov            q12, q8
+        vmov            q13, q8
+        vmov            q14, q8
+        vmov            q15, q8

all duped from d16[0]

That ends up consistently around 1 cycle slower on most cores, so not done

+        vst1.16         {d4[0]},  [r2]

alignment

Done

+        b               2f
+.endif
+.endif
+1:
+        vld1.16         {q8-q9},   [r2,:128]!
+        vld1.16         {q10-q11}, [r2,:128]!
+        vld1.16         {q12-q13}, [r2,:128]!
+        vld1.16         {q14-q15}, [r2,:128]!
+        sub             r2,  r2,  #128
+        vst1.16         {q2-q3},   [r2,:128]!
+        vst1.16         {q2-q3},   [r2,:128]!
+        vst1.16         {q2-q3},   [r2,:128]!
+        vst1.16         {q2-q3},   [r2,:128]!
+
+        \txfm1\()8
+
+        @ Transpose 8x8 with 16 bit elements
+        vswp            d17, d24
+        vswp            d19, d26
+        vswp            d21, d28
+        vswp            d23, d30
+        transpose16_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15
+
+        \txfm2\()8
+2:
+        @ Add into the destination
+        vld1.8          {d4},  [r0,:64], r1
+        vrshr.s16       q8,  q8,  #5
+        vld1.8          {d5},  [r0,:64], r1
+        vrshr.s16       q9,  q9,  #5
+        vld1.8          {d6},  [r0,:64], r1
+        vrshr.s16       q10, q10, #5
+        vaddw.u8        q8,  q8,  d4
+        vld1.8          {d7},  [r0,:64], r1
+        vrshr.s16       q11, q11, #5
+        vaddw.u8        q9,  q9,  d5
+        vld1.8          {d8},  [r0,:64], r1
+        vrshr.s16       q12, q12, #5
+        vaddw.u8        q10, q10, d6
+        vqmovun.s16     d4,  q8
+        vld1.8          {d9},  [r0,:64], r1
+        vrshr.s16       q13, q13, #5
+        vaddw.u8        q11, q11, d7
+        vqmovun.s16     d5,  q9
+        vld1.8          {d10}, [r0,:64], r1
+        vrshr.s16       q14, q14, #5
+        vaddw.u8        q12, q12, d8
+        vqmovun.s16     d6,  q10
+        vld1.8          {d11}, [r0,:64], r1
+        vrshr.s16       q15, q15, #5
+        vaddw.u8        q13, q13, d9
+        vqmovun.s16     d7,  q11
+        sub             r0,  r0,  r1, lsl #3

could use a different register for loads and stores

Seems to help a little here, thus done
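
The add-into-destination tail interleaved above reduces, per row, to
the following scalar C (shift #5 for 8x8; the 4x4 version uses #4 and
the 16x16/32x32 versions #6):

    #include <stdint.h>

    static uint8_t clip_uint8(int v) /* mirrors vqmovun.s16 */
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* One 8-pixel row: round the coefficients, widen-add the
     * destination pixels, narrow with saturation. */
    static void add_coef_row_8(uint8_t *dst, const int16_t *coef)
    {
        for (int x = 0; x < 8; x++)
            dst[x] = clip_uint8(dst[x] + ((coef[x] + (1 << 4)) >> 5));
    }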

+
+        vst1.8          {d4},  [r0,:64], r1
+        vaddw.u8        q14, q14, d10
+        vst1.8          {d5},  [r0,:64], r1
+        vqmovun.s16     d8,  q12
+        vst1.8          {d6},  [r0,:64], r1
+        vaddw.u8        q15, q15, d11
+        vst1.8          {d7},  [r0,:64], r1
+        vqmovun.s16     d9,  q13
+        vst1.8          {d8},  [r0,:64], r1
+        vqmovun.s16     d10, q14
+        vst1.8          {d9},  [r0,:64], r1
+        vqmovun.s16     d11, q15
+
+        vst1.8          {d10}, [r0,:64], r1
+        vst1.8          {d11}, [r0,:64], r1
+
+.ifc \txfm1,iadst
+        vpop            {q4-q7}
+.else
+.ifc \txfm2,iadst
+        vpop            {q4-q7}
+.else
+        vpop            {q4-q5}
+.endif
+.endif
+        bx              lr
+endfunc
+.endm
+
+itxfm_func8x8 idct,  idct
+itxfm_func8x8 iadst, idct
+itxfm_func8x8 idct,  iadst
+itxfm_func8x8 iadst, iadst
+
+
+function idct16x16_dc_add_neon
+        movrel          r12, idct_coeffs
+        vld1.16         {d0}, [r12,:64]
+
+        vmov.i16        q2,  #0
+
+        vld1.16         {d16[]},  [r2]

alignment

Done

+        vmull.s16       q8,  d16, d0[0]
+        vrshrn.s32      d16, q8,  #14
+        vmull.s16       q8,  d16, d0[0]
+        vrshrn.s32      d16, q8,  #14
+        vdup.16         q8,  d16[0]
+        vst1.16         {d4[0]},  [r2]

alignment

Done

+
+        vrshr.s16       q8,  q8,  #6
+
+        mov             r12, #16
+1:
+        @ Loop to add the constant from q8 into all 16x16 outputs
+        vld1.8          {q3},  [r0,:128]
+        vaddw.u8        q10, q8,  d6
+        vaddw.u8        q11, q8,  d7
+        vqmovun.s16     d6,  q10
+        vqmovun.s16     d7,  q11
+        vst1.8          {q3},  [r0,:128], r1
+        subs            r12, r12, #1
+        bne             1b
+
+        bx              lr
+endfunc
+
+.macro idct16
+        mbutterfly0     d16, d24, d16, d24, d4,  d6,  q2,  q3 @ d16 = t0a,  d24 = t1a
+        mbutterfly      d20, d28, d0[1], d0[2], q2,  q3 @ d20 = t2a,  d28 = t3a
+        mbutterfly      d18, d30, d0[3], d1[0], q2,  q3 @ d18 = t4a,  d30 = t7a
+        mbutterfly      d26, d22, d1[1], d1[2], q2,  q3 @ d26 = t5a,  d22 = t6a
+        mbutterfly      d17, d31, d1[3], d2[0], q2,  q3 @ d17 = t8a,  d31 = t15a
+        mbutterfly      d25, d23, d2[1], d2[2], q2,  q3 @ d25 = t9a,  d23 = t14a
+        mbutterfly      d21, d27, d2[3], d3[0], q2,  q3 @ d21 = t10a, d27 = t13a
+        mbutterfly      d29, d19, d3[1], d3[2], q2,  q3 @ d29 = t11a, d19 = t12a
+
+        butterfly       d4,  d28, d16, d28 @ d4  = t0,  d28 = t3
+        butterfly       d5,  d20, d24, d20 @ d5  = t1,  d20 = t2
+        butterfly       d6,  d26, d18, d26 @ d6  = t4,  d26 = t5
+        butterfly       d7,  d22, d30, d22 @ d7  = t7,  d22 = t6
+        butterfly       d16, d25, d17, d25 @ d16 = t8,  d25 = t9
+        butterfly       d24, d21, d29, d21 @ d24 = t11, d21 = t10
+        butterfly       d17, d27, d19, d27 @ d17 = t12, d27 = t13
+        butterfly       d29, d23, d31, d23 @ d29 = t15, d23 = t14
+
+        mbutterfly0     d22, d26, d22, d26, d18, d30, q9,  q15 @ d22 = t6a, d26 = t5a
+        mbutterfly      d23, d25, d0[1], d0[2], q9,  q15 @ d23 = t9a,  d25 = t14a
+        mbutterfly_neg  d27, d21, d0[1], d0[2], q9,  q15 @ d27 = t13a, d21 = t10a
+
+        butterfly       d18, d7,  d4,  d7  @ d18 = t0a,  d7  = t7a
+        butterfly       d19, d22, d5,  d22 @ d19 = t1a,  d22 = t6
+        butterfly       d4,  d26, d20, d26 @ d4  = t2a,  d26 = t5
+        butterfly       d5,  d6,  d28, d6  @ d5  = t3a,  d6  = t4
+        butterfly       d20, d28, d16, d24 @ d20 = t8a,  d28 = t11a
+        butterfly       d24, d21, d23, d21 @ d24 = t9,   d21 = t10
+        butterfly       d23, d27, d25, d27 @ d23 = t14,  d27 = t13
+        butterfly       d25, d29, d29, d17 @ d25 = t15a, d29 = t12a
+
+        mbutterfly0     d27, d21, d27, d21, d16, d30, q8, q15 @ d27 = t13a, d21 = t10a
+        mbutterfly0     d29, d28, d29, d28, d16, d30, q8, q15 @ d29 = t12,  d28 = t11
+
+        vswp            d27, d29           @ d27 = t12, d29 = t13a
+        vswp            d28, d27           @ d28 = t12, d27 = t11
+        butterfly       d16, d31, d18, d25 @ d16 = out[0], d31 = out[15]
+        butterfly       d17, d30, d19, d23 @ d17 = out[1], d30 = out[14]
+        butterfly_r     d25, d22, d22, d24 @ d25 = out[9], d22 = out[6]
+        butterfly       d23, d24, d7,  d20 @ d23 = out[7], d24 = out[8]
+        butterfly       d18, d29, d4,  d29 @ d18 = out[2], d29 = out[13]
+        butterfly       d19, d28, d5,  d28 @ d19 = out[3], d28 = out[12]
+        vmov            d4,  d21           @ d4 = t10a
+        butterfly       d20, d27, d6,  d27 @ d20 = out[4], d27 = out[11]
+        butterfly       d21, d26, d26, d4  @ d21 = out[5], d26 = out[10]
+.endm
+
+.macro iadst16
+        movrel          r12, iadst16_coeffs
+        vld1.16         {q0-q1}, [r12,:128]
+
+        mbutterfly_l    q3,  q2,  d31, d16, d0[1], d0[0] @ q3 = t1, q2 = t0
+        mbutterfly_l    q5,  q4,  d23, d24, d2[1], d2[0] @ q5 = t9, q4 = t8
+        butterfly_n     d31, d24, q3,  q5,  q6,  q5      @ d31 = t1a, d24 = t9a
+        mbutterfly_l    q7,  q6,  d29, d18, d0[3], d0[2] @ q7 = t3, q6 = t2
+        butterfly_n     d16, d23, q2,  q4,  q3,  q4      @ d16 = t0a, d23 = t8a
+
+        mbutterfly_l    q3,  q2,  d21, d26, d2[3], d2[2] @ q3 = t11, q2 = t10
+        butterfly_n     d29, d26, q7,  q3,  q4,  q3      @ d29 = t3a, d26 = t11a
+        mbutterfly_l    q5,  q4,  d27, d20, d1[1], d1[0] @ q5 = t5, q4 = t4
+        butterfly_n     d18, d21, q6,  q2,  q3,  q2      @ d18 = t2a, d21 = t10a
+
+        mbutterfly_l    q7,  q6,  d19, d28, d3[1], d3[0] @ q7 = t13, q6 = t12
+        butterfly_n     d20, d28, q5,  q7,  q2,  q7      @ d20 = t5a, d28 = t13a
+        mbutterfly_l    q3,  q2,  d25, d22, d1[3], d1[2] @ q3 = t7, q2 = t6
+        butterfly_n     d27, d19, q4,  q6,  q5,  q6      @ d27 = t4a, d19 = t12a
+
+        mbutterfly_l    q5,  q4,  d17, d30, d3[3], d3[2] @ q5 = t15, q4 = t14
+        movrel          r12, idct_coeffs
+        vld1.16         {q0}, [r12,:128]
+        butterfly_n     d22, d30, q3,  q5,  q6,  q5      @ d22 = t7a, d30 = t15a
+        mbutterfly_l    q7,  q6,  d23, d24, d0[3], d1[0] @ q7 = t9, q6 = t8
+        butterfly_n     d25, d17, q2,  q4,  q3,  q4      @ d25 = t6a, d17 = t14a
+
+        mbutterfly_l    q2,  q3,  d28, d19, d1[0], d0[3] @ q2 = t12, q3 = t13
+        butterfly_n     d23, d19, q6,  q2,  q4,  q2      @ d23 = t8a, d19 = t12a
+        mbutterfly_l    q5,  q4,  d21, d26, d1[1], d1[2] @ q5 = t11, q4 = t10
+        butterfly_r     d4,  d27, d16, d27               @ d4 = t4, d27 = t0
+        butterfly_n     d24, d28, q7,  q3,  q6,  q3      @ d24 = t9a, d28 = t13a
+
+        mbutterfly_l    q6,  q7,  d30, d17, d1[2], d1[1] @ q6 = t14, q7 = t15
+        butterfly_r     d5,  d20, d31, d20               @ d5 = t5, d20 = t1
+        butterfly_n     d21, d17, q4,  q6,  q3,  q6      @ d21 = t10a, d17 = t14a
+        butterfly_n     d26, d30, q5,  q7,  q4,  q7      @ d26 = t11a, d30 = t15a
+
+        butterfly_r     d6,  d25, d18, d25               @ d6 = t6, d25 = t2
+        butterfly_r     d7,  d22, d29, d22               @ d7 = t7, d22 = t3
+
+        mbutterfly_l    q5,  q4,  d19, d28, d0[1], d0[2] @ q5 = t13, q4 = t12
+        mbutterfly_l    q6,  q7,  d30, d17, d0[2], d0[1] @ q6 = t14, q7 = t15
+
+        butterfly_n     d18, d30, q4,  q6,  q8,  q6      @ d18 = out[2], d30 = t14a
+        butterfly_n     d29, d17, q5,  q7,  q6,  q7      @ d29 = -out[13], d17 = t15a
+        vneg.s16        d29, d29                         @ d29 = out[13]
+
+        mbutterfly_l    q5,  q4,  d4,  d5,  d0[1], d0[2] @ q5 = t5a, q4 = t4a
+        mbutterfly_l    q6,  q7,  d7,  d6,  d0[2], d0[1] @ q6 = t6a, q7 = t7a
+
+        butterfly       d2,  d6,  d27, d25 @ d2 = out[0], d6 = t2a
+        butterfly       d3,  d7,  d23, d21 @ d3 = -out[1], d7 = t10
+
+        butterfly_n     d19, d31, q4,  q6,  q2,  q4      @ d19 = -out[3], d31 = t6
+        vneg.s16        d19, d19                         @ d19 = out[3]
+        butterfly_n     d28, d16, q5,  q7,  q2,  q5      @ d28 = out[12], d16 = t7
+
+        butterfly       d5,  d8,  d20, d22 @ d5 = -out[15], d8 = t3a
+        butterfly       d4,  d9,  d24, d26 @ d4 = out[14],  d9 = t11
+
+        mbutterfly0     d23, d24, d6,  d8,  d10, d11, q6, q7, 1 @ d23 = out[7], d24 = out[8]
+        mbutterfly0     d20, d27, d16, d31, d10, d11, q6, q7    @ d20 = out[4], d27 = out[11]
+        mbutterfly0     d22, d25, d9,  d7,  d10, d11, q6, q7    @ d22 = out[6], d25 = out[9]
+        mbutterfly0     d21, d26, d30, d17, d10, d11, q6, q7, 1 @ d21 = out[5], d26 = out[10]
+
+        vneg.s16        d31, d5  @ d31 = out[15]
+        vneg.s16        d17, d3  @ d17 = out[1]
+
+        vmov            d16, d2
+        vmov            d30, d4
+.endm
+
+.macro itxfm16_1d_funcs txfm
+@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+@ transpose into a horizontal 16x4 slice and store.
+@ r0 = dst (temp buffer)
+@ r1 = unused
+@ r2 = src
+@ r3 = slice offset
+function \txfm\()16_1d_4x16_pass1_neon
+        mov             r12, #32
+        vmov.s16        q2,  #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vld1.16         {d\i}, [r2,:64]
+        vst1.16         {d4},  [r2,:64], r12
+.endr
+
+        \txfm\()16
+
+        @ Do four 4x4 transposes. Originally, d16-d31 contain the
+        @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
+        @ contain the transposed 4x4 blocks.
+        transpose16_q_2x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+        @ Store the transposed 4x4 blocks horizontally.
+        cmp             r3,  #12
+        beq             1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        bx              lr
+1:
+        @ Special case: For the last input column (r3 == 12),
+        @ which would be stored as the last row in the temp buffer,
+        @ don't store the first 4x4 block, but keep it in registers
+        @ for the first slice of the second pass (where it is the
+        @ last 4x4 block).
+        add             r0,  r0,  #8
+.irp i, 20, 24, 28
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        add             r0,  r0,  #8
+.irp i, 21, 25, 29
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        add             r0,  r0,  #8
+.irp i, 22, 26, 30
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        add             r0,  r0,  #8
+.irp i, 23, 27, 31
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        vmov            d28, d16
+        vmov            d29, d17
+        vmov            d30, d18
+        vmov            d31, d19
+        bx              lr
+endfunc
+
+@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+@ load the destination pixels (from a similar 4x16 slice), add and store back.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = src (temp buffer)
+@ r3 = slice offset
+function \txfm\()16_1d_4x16_pass2_neon
+        mov             r12, #32
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+        cmp             r3,  #0
+        beq             1f
+.irp i, 28, 29, 30, 31
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+1:
+
+        \txfm\()16
+
+.macro load_add_store coef0, coef1, coef2, coef3
+        vrshr.s16       \coef0, \coef0, #6
+        vrshr.s16       \coef1, \coef1, #6
+
+        vld1.32         {d4[]},   [r0,:32], r1
+        vld1.32         {d4[1]},  [r0,:32], r1
+        vrshr.s16       \coef2, \coef2, #6
+        vrshr.s16       \coef3, \coef3, #6
+        vld1.32         {d5[]},   [r0,:32], r1
+        vld1.32         {d5[1]},  [r0,:32], r1
+        vaddw.u8        \coef0, \coef0, d4
+        vld1.32         {d6[]},   [r0,:32], r1
+        vld1.32         {d6[1]},  [r0,:32], r1
+        vaddw.u8        \coef1, \coef1, d5
+        vld1.32         {d7[]},   [r0,:32], r1
+        vld1.32         {d7[1]},  [r0,:32], r1
+
+        vqmovun.s16     d4,  \coef0
+        vqmovun.s16     d5,  \coef1
+        sub             r0,  r0,  r1, lsl #3

could use an additional register

Done, but by using two registers for loading/storing alternating rows, which gave a larger speedup on all cores (but requiring two subs inbetween instead)
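
One plausible reading of that scheme in scalar terms (the exact
register split in the final patch is an assumption here; the same
double-stride layout was a loss for the 4x4 case discussed earlier):

    #include <stddef.h>
    #include <stdint.h>

    /* Even rows go through one base pointer, odd rows through another,
     * both stepping 2*stride, so consecutive loads/stores don't
     * serialize on a single post-incremented address register. */
    static void add_coef_alternating(uint8_t *dst, ptrdiff_t stride,
                                     const int16_t *coef, int rows, int cols)
    {
        uint8_t *even = dst;           /* rows 0, 2, 4, ... */
        uint8_t *odd  = dst + stride;  /* rows 1, 3, 5, ... */
        for (int y = 0; y < rows; y += 2) {
            for (int x = 0; x < cols; x++) {
                int a = even[x] + coef[y * cols + x];
                int b = odd[x]  + coef[(y + 1) * cols + x];
                even[x] = a < 0 ? 0 : a > 255 ? 255 : (uint8_t)a;
                odd[x]  = b < 0 ? 0 : b > 255 ? 255 : (uint8_t)b;
            }
            even += 2 * stride;  /* rewinding these afterwards is the */
            odd  += 2 * stride;  /* pair of subs mentioned above      */
        }
    }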

+        vaddw.u8        \coef2, \coef2, d6
+        vaddw.u8        \coef3, \coef3, d7
+        vst1.32         {d4[0]},  [r0,:32], r1
+        vst1.32         {d4[1]},  [r0,:32], r1
+        vqmovun.s16     d6,  \coef2
+        vst1.32         {d5[0]},  [r0,:32], r1
+        vst1.32         {d5[1]},  [r0,:32], r1
+        vqmovun.s16     d7,  \coef3
+
+        vst1.32         {d6[0]},  [r0,:32], r1
+        vst1.32         {d6[1]},  [r0,:32], r1
+        vst1.32         {d7[0]},  [r0,:32], r1
+        vst1.32         {d7[1]},  [r0,:32], r1
+.endm
+        load_add_store  q8,  q9,  q10, q11
+        load_add_store  q12, q13, q14, q15
+.purgem load_add_store
+
+        bx              lr
+endfunc
+.endm
+
+itxfm16_1d_funcs idct
+itxfm16_1d_funcs iadst
+
+.macro itxfm_func16x16 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
+.ifc \txfm1,idct
+.ifc \txfm2,idct
+        cmp             r3,  #1
+        beq             idct16x16_dc_add_neon
+.endif
+.endif
+1:

unused label

Removed

+        push            {r4-r7,lr}
+.ifc \txfm1,iadst
+        vpush           {q4-q7}
+.else
+.ifc \txfm2,iadst
+        vpush           {q4-q7}
+.endif
+.endif
+        mov             r7,  sp
+
+        @ Align the stack, allocate a temp buffer
+T       mov             r12, sp
+T       bic             r12, r12, #15
+T       sub             r12, r12, #512
+T       mov             sp,  r12
+A       bic             sp,  sp,  #15
+A       sub             sp,  sp,  #512
+
+        mov             r4,  r0
+        mov             r5,  r1
+        mov             r6,  r2
+
+.ifc \txfm1,idct
+        movrel          r12, idct_coeffs
+        vld1.16         {q0-q1}, [r12,:128]
+.endif
+
+.irp i, 0, 4, 8, 12
+        add             r0,  sp,  #(\i*32)
+        add             r2,  r6,  #(\i*2)
+        mov             r3,  #\i
+        bl              \txfm1\()16_1d_4x16_pass1_neon
+.endr
+.ifc \txfm2,idct
+        movrel          r12, idct_coeffs
+        vld1.16         {q0-q1}, [r12,:128]
+.endif
+.irp i, 0, 4, 8, 12
+        add             r0,  r4,  #(\i)
+        mov             r1,  r5
+        add             r2,  sp,  #(\i*2)
+        mov             r3,  #\i
+        bl              \txfm2\()16_1d_4x16_pass2_neon
+.endr
+
+        mov             sp,  r7
+.ifc \txfm1,iadst
+        vpop            {q4-q7}
+.else
+.ifc \txfm2,iadst
+        vpop            {q4-q7}
+.endif
+.endif
+        pop             {r4-r7,pc}
+endfunc
+.endm
+
+itxfm_func16x16 idct,  idct
+itxfm_func16x16 iadst, idct
+itxfm_func16x16 idct,  iadst
+itxfm_func16x16 iadst, iadst
+
+
+function idct32x32_dc_add_neon
+        movrel          r12, idct_coeffs
+        vld1.16         {d0}, [r12,:64]
+
+        vmov.i16        q2,  #0
+
+        vld1.16         {d16[]},  [r2]

alignment

Done

+        vmull.s16       q8,  d16, d0[0]
+        vrshrn.s32      d16, q8,  #14
+        vmull.s16       q8,  d16, d0[0]
+        vrshrn.s32      d16, q8,  #14
+        vdup.16         q8,  d16[0]
+        vst1.16         {d4[0]},  [r2]

ditto

Done
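
For orientation in the long quoted block below: the 4x32 slices are
handled by splitting the 32-point IDCT into an idct16 of the even
inputs plus the idct32_odd half, combined mirrored. A scalar sketch
(illustrative names; idct16_odd_model stands in for idct32_odd):

    #include <stdint.h>

    void idct16_model(const int16_t *in, int16_t *out);     /* assumed */
    void idct16_odd_model(const int16_t *in, int16_t *out); /* assumed */

    /* Even/odd decomposition of the 32-point IDCT: the even outputs
     * are written twice, with the odd part added to the first half
     * and subtracted, mirrored, from the second half. */
    static void idct32_1d_model(const int16_t *in, int16_t *out)
    {
        int16_t even_in[16], odd_in[16], even[16], odd[16];
        for (int i = 0; i < 16; i++) {
            even_in[i] = in[2 * i];
            odd_in[i]  = in[2 * i + 1];
        }
        idct16_model(even_in, even);
        idct16_odd_model(odd_in, odd);
        for (int i = 0; i < 16; i++) {
            out[i]      = even[i] + odd[i];
            out[31 - i] = even[i] - odd[i];
        }
    }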

+
+        vrshr.s16       q8,  q8,  #6
+
+        mov             r12, #32
+1:
+        @ Loop to add the constant from q8 into all 32x32 outputs
+        vld1.8          {q2-q3},  [r0,:128]
+        vaddw.u8        q10, q8,  d4
+        vaddw.u8        q11, q8,  d5
+        vaddw.u8        q12, q8,  d6
+        vaddw.u8        q13, q8,  d7
+        vqmovun.s16     d4,  q10
+        vqmovun.s16     d5,  q11
+        vqmovun.s16     d6,  q12
+        vqmovun.s16     d7,  q13
+        vst1.8          {q2-q3},  [r0,:128], r1
+        subs            r12, r12, #1
+        bne             1b
+
+        bx              lr
+endfunc
+
+.macro idct32_odd
+        movrel          r12, idct_coeffs
+        add             r12, r12, #32
+        vld1.16         {q0-q1}, [r12,:128]
+
+        mbutterfly      d16, d31, d0[0], d0[1], q2, q3 @ d16 = t16a, d31 = t31a
+        mbutterfly      d24, d23, d0[2], d0[3], q2, q3 @ d24 = t17a, d23 = t30a
+        mbutterfly      d20, d27, d1[0], d1[1], q2, q3 @ d20 = t18a, d27 = t29a
+        mbutterfly      d28, d19, d1[2], d1[3], q2, q3 @ d28 = t19a, d19 = t28a
+        mbutterfly      d18, d29, d2[0], d2[1], q2, q3 @ d18 = t20a, d29 = t27a
+        mbutterfly      d26, d21, d2[2], d2[3], q2, q3 @ d26 = t21a, d21 = t26a
+        mbutterfly      d22, d25, d3[0], d3[1], q2, q3 @ d22 = t22a, d25 = t25a
+        mbutterfly      d30, d17, d3[2], d3[3], q2, q3 @ d30 = t23a, d17 = t24a
+
+        sub             r12, r12, #32
+        vld1.16         {q0}, [r12,:128]
+
+        butterfly       d4,  d24, d16, d24 @ d4  = t16, d24 = t17
+        butterfly       d5,  d20, d28, d20 @ d5  = t19, d20 = t18
+        butterfly       d6,  d26, d18, d26 @ d6  = t20, d26 = t21
+        butterfly       d7,  d22, d30, d22 @ d7  = t23, d22 = t22
+        butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
+        butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
+        butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
+        butterfly       d31, d27, d19, d27 @ d31 = t28, d27 = t29
+
+        mbutterfly      d23, d24, d0[3], d1[0], q8, q9 @ d23 = t17a, d24 = t30a
+        mbutterfly_neg  d27, d20, d0[3], d1[0], q8, q9 @ d27 = t29a, d20 = t18a
+        mbutterfly      d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a, d26 = t26a
+        mbutterfly_neg  d25, d22, d1[1], d1[2], q8, q9 @ d25 = t25a, d22 = t22a
+
+        butterfly       d16, d5,  d4,  d5  @ d16 = t16a, d5  = t19a
+        butterfly       d17, d20, d23, d20 @ d17 = t17,  d20 = t18
+        butterfly       d18, d6,  d7,  d6  @ d18 = t23a, d6  = t20a
+        butterfly       d19, d21, d22, d21 @ d19 = t22,  d21 = t21
+        butterfly       d4,  d28, d28, d30 @ d4  = t24a, d28 = t27a
+        butterfly       d23, d26, d25, d26 @ d23 = t25,  d26 = t26
+        butterfly       d7,  d29, d29, d31 @ d7  = t31a, d29 = t28a
+        butterfly       d22, d27, d24, d27 @ d22 = t30,  d27 = t29
+
+        mbutterfly      d27, d20, d0[1], d0[2], q12, q15 @ d27 = t18a, d20 = t29a
+        mbutterfly      d29, d5,  d0[1], d0[2], q12, q15 @ d29 = t19,  d5  = t28
+        mbutterfly_neg  d28, d6,  d0[1], d0[2], q12, q15 @ d28 = t27,  d6  = t20
+        mbutterfly_neg  d26, d21, d0[1], d0[2], q12, q15 @ d26 = t26a, d21 = t21a
+
+        butterfly       d31, d24, d7,  d4  @ d31 = t31,  d24 = t24
+        butterfly       d30, d25, d22, d23 @ d30 = t30a, d25 = t25a
+        butterfly_r     d23, d16, d16, d18 @ d23 = t23,  d16 = t16
+        butterfly_r     d22, d17, d17, d19 @ d22 = t22a, d17 = t17a
+        butterfly       d18, d21, d27, d21 @ d18 = t18,  d21 = t21
+        butterfly_r     d27, d28, d5,  d28 @ d27 = t27a, d28 = t28a
+        butterfly       d4,  d26, d20, d26 @ d4  = t29,  d26 = t26
+        butterfly       d19, d20, d29, d6  @ d19 = t19a, d20 = t20
+        vmov            d29, d4            @ d29 = t29
+
+        mbutterfly0     d27, d20, d27, d20, d4, d6, q2, q3 @ d27 = t27,  d20 = t20
+        mbutterfly0     d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a
+        mbutterfly0     d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25,  d22 = t22
+        mbutterfly0     d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a
+.endm
+
+@ Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
+@ We don't have register space to do a single pass IDCT of 4x32 though,
+@ but the 32-point IDCT can be decomposed into two 16-point IDCTs;
+@ a normal IDCT16 with every other input component (the even ones, with
+@ each output written twice), followed by a separate 16-point IDCT
+@ of the odd inputs, added/subtracted onto the outputs of the first idct16.
+@ r0 = dst (temp buffer)
+@ r1 = unused
+@ r2 = src
+function idct32_1d_4x32_pass1_neon
+        movrel          r12, idct_coeffs
+        vld1.16         {q0-q1}, [r12,:128]
+
+        @ Double stride of the input, since we only read every other line
+        mov             r12, #128
+        vmov.s16        d4,  #0
+
+        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vld1.16         {d\i}, [r2,:64]
+        vst1.16         {d4},  [r2,:64], r12
+.endr
+
+        idct16
+
+        @ Do four 4x4 transposes. Originally, d16-d31 contain the
+        @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
+        @ contain the transposed 4x4 blocks.
+        transpose16_q_2x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        @ Store the registers a, b, c, d horizontally, followed
+        @ by the same registers d, c, b, a mirrored.
+.macro store_rev a, b, c, d
+.irp i, \a, \b, \c, \d
+        vst1.16         {d\i}, [r0,:64]!
+        vrev64.16       d\i, d\i
+.endr
+.irp i, \d, \c, \b, \a
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+.endm
+        store_rev       16, 20, 24, 28
+        store_rev       17, 21, 25, 29
+        store_rev       18, 22, 26, 30
+        store_rev       19, 23, 27, 31
+        sub             r0,  r0,  #256
+.purgem store_rev
+
+        @ Move r2 back to the start of the input, and move
+        @ to the first odd row
+        sub             r2,  r2,  r12, lsl #4
+        add             r2,  r2,  #64
+
+        vmov.s16        d4,  #0
+        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vld1.16         {d\i}, [r2,:64]
+        vst1.16         {d4},  [r2,:64], r12
+.endr
+
+        idct32_odd
+
+        transpose16_q_2x_4x4 q15, q14, q13, q12, q11, q10, q9, q8, d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
+
+        @ Store the registers a, b, c, d horizontally,
+        @ adding into the output first, and then mirrored, subtracted
+        @ from the output.
+.macro store_rev a, b, c, d
+.irp i, \a, \b, \c, \d
+        vld1.16         {d4},  [r0,:64]
+        vadd.s16        d4,  d4,  d\i
+        vst1.16         {d4},  [r0,:64]!
+        vrev64.16       d\i, d\i
+.endr
+.irp i, \d, \c, \b, \a
+        vld1.16         {d4},  [r0,:64]
+        vsub.s16        d4,  d4,  d\i
+        vst1.16         {d4},  [r0,:64]!
+.endr
+.endm
+
+        store_rev       31, 27, 23, 19
+        store_rev       30, 26, 22, 18
+        store_rev       29, 25, 21, 17
+        store_rev       28, 24, 20, 16
+.purgem store_rev
+        bx              lr
+endfunc
+
+@ This is mostly the same as 4x32_pass1, but without the transpose,
+@ and use the source as temp buffer between the two idct passes, and
+@ add into the destination.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = src (temp buffer)
+function idct32_1d_4x32_pass2_neon
+        movrel          r12, idct_coeffs
+        vld1.16         {q0-q1}, [r12,:128]
+
+        mov             r12, #128
+        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+        sub             r2,  r2,  r12, lsl #4
+
+        idct16
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vst1.16         {d\i}, [r2,:64], r12
+.endr
+
+        sub             r2,  r2,  r12, lsl #4
+        add             r2,  r2,  #64
+
+        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+        sub             r2,  r2,  r12, lsl #4
+        sub             r2,  r2,  #64
+
+        idct32_odd
+
+        mov             r12, #128
+.macro load_acc_store a, b, c, d, neg=0
+        vld1.16         {d4},  [r2,:64], r12
+        vld1.16         {d5},  [r2,:64], r12
+.if \neg == 0
+        vadd.s16        d4,  d4,  d\a
+        vld1.16         {d6},  [r2,:64], r12
+        vadd.s16        d5,  d5,  d\b
+        vld1.16         {d7},  [r2,:64], r12
+        vadd.s16        d6,  d6,  d\c
+        vadd.s16        d7,  d7,  d\d
+.else
+        vsub.s16        d4,  d4,  d\a
+        vld1.16         {d6},  [r2,:64], r12
+        vsub.s16        d5,  d5,  d\b
+        vld1.16         {d7},  [r2,:64], r12
+        vsub.s16        d6,  d6,  d\c
+        vsub.s16        d7,  d7,  d\d
+.endif
+        vld1.32         {d2[]},   [r0,:32], r1
+        vld1.32         {d2[1]},  [r0,:32], r1
+        vrshr.s16       q2,  q2,  #6
+        vld1.32         {d3[]},   [r0,:32], r1
+        vrshr.s16       q3,  q3,  #6
+        vld1.32         {d3[1]},  [r0,:32], r1
+        sub             r0,  r0,  r1, lsl #2
+        vaddw.u8        q2,  q2,  d2
+        vaddw.u8        q3,  q3,  d3
+        vqmovun.s16     d4,  q2
+        vqmovun.s16     d5,  q3
+        vst1.32         {d4[0]},  [r0,:32], r1
+        vst1.32         {d4[1]},  [r0,:32], r1
+        vst1.32         {d5[0]},  [r0,:32], r1
+        vst1.32         {d5[1]},  [r0,:32], r1
+.endm
+        load_acc_store  31, 30, 29, 28
+        load_acc_store  27, 26, 25, 24
+        load_acc_store  23, 22, 21, 20
+        load_acc_store  19, 18, 17, 16
+        sub             r2,  r2,  r12
+        neg             r12, r12
+        load_acc_store  16, 17, 18, 19, 1
+        load_acc_store  20, 21, 22, 23, 1
+        load_acc_store  24, 25, 26, 27, 1
+        load_acc_store  28, 29, 30, 31, 1
+.purgem load_acc_store
+        bx              lr
+endfunc
+
+function ff_vp9_idct_idct_32x32_add_neon, export=1
+        cmp             r3,  #1
+        beq             idct32x32_dc_add_neon
+1:

unused label

+        push            {r4-r7,lr}
+        vpush           {q4-q7}
+        mov             r7,  sp
+
+        @ Align the stack, allocate a temp buffer
+T       mov             r12, sp
+T       bic             r12, r12, #15
+T       sub             r12, r12, #2048
+T       mov             sp,  r12
+A       bic             sp,  sp,  #15
+A       sub             sp,  sp,  #2048
+
+        mov             r4,  r0
+        mov             r5,  r1
+        mov             r6,  r2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r0,  sp,  #(\i*64)
+        add             r2,  r6,  #(\i*2)
+        bl              idct32_1d_4x32_pass1_neon
+.endr
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r0,  r4,  #(\i)
+        mov             r1,  r5
+        add             r2,  sp,  #(\i*2)
+        bl              idct32_1d_4x32_pass2_neon
+.endr
+
+        mov             sp,  r7
+        vpop            {q4-q7}
+        pop             {r4-r7,pc}
+endfunc

patch ok with nits fixed. instruction rescheduling can be done in a
separate commit.

Ok, thanks! Will push in a little while.

// Martin

_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel