The speedup compared to the C implementation is around 8x. --- libavcodec/arm/hevc_idct.S | 187 ++++++++++++++++++++++++++++++++++++++ libavcodec/arm/hevcdsp_init_arm.c | 4 + 2 files changed, 191 insertions(+)
diff --git a/libavcodec/arm/hevc_idct.S b/libavcodec/arm/hevc_idct.S index 4124fc8..b4279db 100644 --- a/libavcodec/arm/hevc_idct.S +++ b/libavcodec/arm/hevc_idct.S @@ -222,7 +222,194 @@ function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1 endfunc .endm +.macro butterfly e, o, tmp_p, tmp_m + vadd.s32 \tmp_p, \e, \o + vsub.s32 \tmp_m, \e, \o +.endm + +.macro tr16_8x4 in0, in1, in2, in3, in4, in5, in6, in7 + tr_4x4_8 \in0, \in2, \in4, \in6, q8, q9, q10, q11, q12, q13, q14, q15 + + vmull.s16 q12, \in1, \in0[0] + vmull.s16 q13, \in1, \in0[1] + vmull.s16 q14, \in1, \in0[2] + vmull.s16 q15, \in1, \in0[3] + sum_sub q12, \in3, \in0[1], + + sum_sub q13, \in3, \in0[3], - + sum_sub q14, \in3, \in0[0], - + sum_sub q15, \in3, \in0[2], - + + sum_sub q12, \in5, \in0[2], + + sum_sub q13, \in5, \in0[0], - + sum_sub q14, \in5, \in0[3], + + sum_sub q15, \in5, \in0[1], + + + sum_sub q12, \in7, \in0[3], + + sum_sub q13, \in7, \in0[2], - + sum_sub q14, \in7, \in0[1], + + sum_sub q15, \in7, \in0[0], - + + butterfly q8, q12, q0, q7 + butterfly q9, q13, q1, q6 + butterfly q10, q14, q2, q5 + butterfly q11, q15, q3, q4 + add r4, sp, #512 + vst1.s16 {q0-q1}, [r4, :128]! + vst1.s16 {q2-q3}, [r4, :128]! + vst1.s16 {q4-q5}, [r4, :128]! 
+ vst1.s16 {q6-q7}, [r4, :128] +.endm + +.macro load16 in0, in1, in2, in3, in4, in5, in6, in7 + vld1.s16 {\in0}, [r1, :64], r2 + vld1.s16 {\in1}, [r3, :64], r2 + vld1.s16 {\in2}, [r1, :64], r2 + vld1.s16 {\in3}, [r3, :64], r2 + vld1.s16 {\in4}, [r1, :64], r2 + vld1.s16 {\in5}, [r3, :64], r2 + vld1.s16 {\in6}, [r1, :64], r2 + vld1.s16 {\in7}, [r3, :64], r2 +.endm + +.macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7 + sum_sub q5, \in, \t0, \op0 + sum_sub q6, \in, \t1, \op1 + sum_sub q7, \in, \t2, \op2 + sum_sub q8, \in, \t3, \op3 + sum_sub q9, \in, \t4, \op4 + sum_sub q10, \in, \t5, \op5 + sum_sub q11, \in, \t6, \op6 + sum_sub q12, \in, \t7, \op7 +.endm + +.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7 + vadd.s32 q4, \in0, \in1 + vsub.s32 \in0, \in0, \in1 + vadd.s32 \in1, \in2, \in3 + vsub.s32 \in2, \in2, \in3 + vadd.s32 \in3, \in4, \in5 + vsub.s32 \in4, \in4, \in5 + vadd.s32 \in5, \in6, \in7 + vsub.s32 \in6, \in6, \in7 +.endm + +.macro store16 in0, in1, in2, in3, in4, in5, in6, in7 + vst1.s16 \in0, [r1, :64], r2 + vst1.s16 \in1, [r3, :64], r4 + vst1.s16 \in2, [r1, :64], r2 + vst1.s16 \in3, [r3, :64], r4 + vst1.s16 \in4, [r1, :64], r2 + vst1.s16 \in5, [r3, :64], r4 + vst1.s16 \in6, [r1, :64], r2 + vst1.s16 \in7, [r3, :64], r4 +.endm + +.macro scale out0, out1, out2, out3, out4, out5, out6, out7, in0, in1, in2, in3, in4, in5, in6, in7, shift + vqrshrn.s32 \out0, \in0, \shift + vqrshrn.s32 \out1, \in1, \shift + vqrshrn.s32 \out2, \in2, \shift + vqrshrn.s32 \out3, \in3, \shift + vqrshrn.s32 \out4, \in4, \shift + vqrshrn.s32 \out5, \in5, \shift + vqrshrn.s32 \out6, \in6, \shift + vqrshrn.s32 \out7, \in7, \shift +.endm + +.macro tr_16x4 horiz, shift, in, out + add r1, \in, \horiz + add r3, \in, #(\horiz + 64) + mov r2, #128 + load16 d0, d1, d2, d3, d4, d5, d6, d7 + movrel r1, trans + + tr16_8x4 d0, d1, d2, d3, d4, d5, d6, d7 + + add r1, \in, #(\horiz + 32) + add r3, \in, #(\horiz + 64 + 32) + mov r2, #128 + load16 d8, 
d9, d2, d3, d4, d5, d6, d7 + movrel r1, trans + 16 + vld1.s16 {q0}, [r1, :128] + vmull.s16 q5, d8, d0[0] + vmull.s16 q6, d8, d0[1] + vmull.s16 q7, d8, d0[2] + vmull.s16 q8, d8, d0[3] + vmull.s16 q9, d8, d1[0] + vmull.s16 q10, d8, d1[1] + vmull.s16 q11, d8, d1[2] + vmull.s16 q12, d8, d1[3] + + add_member d9, d0[1], d1[0], d1[3], d1[1], d0[2], d0[0], d0[3], d1[2], +, +, +, -, -, -, -, - + add_member d2, d0[2], d1[3], d0[3], d0[1], d1[2], d1[0], d0[0], d1[1], +, +, -, -, -, +, +, + + add_member d3, d0[3], d1[1], d0[1], d1[3], d0[0], d1[2], d0[2], d1[0], +, -, -, +, +, +, -, - + add_member d4, d1[0], d0[2], d1[2], d0[0], d1[3], d0[1], d1[1], d0[3], +, -, -, +, -, -, +, + + add_member d5, d1[1], d0[0], d1[0], d1[2], d0[1], d0[3], d1[3], d0[2], +, -, +, +, -, +, +, - + add_member d6, d1[2], d0[3], d0[0], d0[2], d1[1], d1[3], d1[0], d0[1], +, -, +, -, +, +, -, + + add_member d7, d1[3], d1[2], d1[1], d1[0], d0[3], d0[2], d0[1], d0[0], +, -, +, -, +, -, +, - + + add r4, sp, #512 + vld1.s16 {q0-q1}, [r4, :128]! + vld1.s16 {q2-q3}, [r4, :128]! + + butterfly16 q0, q5, q1, q6, q2, q7, q3, q8 + scale d26, d27, d28, d29, d30, d31, d16, d17, q4, q0, q5, q1, q6, q2, q7, q3, \shift + transpose8_4x4 d26, d28, d30, d16 + transpose8_4x4 d17, d31, d29, d27 + add r1, \out, #(\horiz*16) + add r3, \out, #(\horiz*16 + 24 +3*32) + mov r2, #32 + mov r4, #-32 + store16 d26, d27, d28, d29, d30, d31, d16, d17 + + add r4, sp, #576 + vld1.s16 {q0-q1}, [r4, :128]! 
+ vld1.s16 {q2-q3}, [r4, :128] + butterfly16 q0, q9, q1, q10, q2, q11, q3, q12 + scale d26, d27, d28, d29, d30, d31, d8, d9, q4, q0, q9, q1, q10, q2, q11, q3, \shift + transpose8_4x4 d26, d28, d30, d8 + transpose8_4x4 d9, d31, d29, d27 + + add r1, \out, #(\horiz*16 + 8) + add r3, \out, #(\horiz*16 + 16 + 3*32) + mov r2, #32 + mov r4, #-32 + store16 d26, d27, d28, d29, d30, d31, d8, d9 +.endm + +.macro idct_16x16 bitdepth +function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1 +@r0 - coeffs + push {r4, lr} + vpush {q4-q7} + + @ Align the stack, allocate a temp buffer +T mov lr, sp +T and lr, lr, #15 +A and lr, sp, #15 + add lr, lr, #640 + sub sp, sp, lr + + + tr_16x4 0, 7, r0, sp + tr_16x4 8, 7, r0, sp + tr_16x4 16, 7, r0, sp + tr_16x4 24, 7, r0, sp + + tr_16x4 0, 20 - \bitdepth, sp, r0 + tr_16x4 8, 20 - \bitdepth, sp, r0 + tr_16x4 16, 20 - \bitdepth, sp, r0 + tr_16x4 24, 20 - \bitdepth, sp, r0 + + add sp, sp, lr + + vpop {q4-q7} + pop {r4, pc} +endfunc +.endm + idct_4x4 8 idct_4x4 10 idct_8x8 8 idct_8x8 10 +idct_16x16 8 +idct_16x16 10 diff --git a/libavcodec/arm/hevcdsp_init_arm.c b/libavcodec/arm/hevcdsp_init_arm.c index 1e984e6..e61587f 100644 --- a/libavcodec/arm/hevcdsp_init_arm.c +++ b/libavcodec/arm/hevcdsp_init_arm.c @@ -27,8 +27,10 @@ void ff_hevc_idct_4x4_8_neon(int16_t *coeffs, int col_limit); void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit); +void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit); void ff_hevc_idct_4x4_10_neon(int16_t *coeffs, int col_limit); void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit); +void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit); av_cold void ff_hevc_dsp_init_arm(HEVCDSPContext *c, int bit_depth) { @@ -38,10 +40,12 @@ av_cold void ff_hevc_dsp_init_arm(HEVCDSPContext *c, int bit_depth) if (bit_depth == 8) { c->idct[0] = ff_hevc_idct_4x4_8_neon; c->idct[1] = ff_hevc_idct_8x8_8_neon; + c->idct[2] = ff_hevc_idct_16x16_8_neon; } if (bit_depth == 10) { c->idct[0] = 
ff_hevc_idct_4x4_10_neon; c->idct[1] = ff_hevc_idct_8x8_10_neon; + c->idct[2] = ff_hevc_idct_16x16_10_neon; } } } -- 2.10.2 _______________________________________________ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel