On Wed, 5 Apr 2017, Alexandra Hájková wrote:

The speedup vs C code is around 8x.

On my devboards, the speedup seems to be 6x on Cortex-A9, 7x on A7, 9x on A53, and 13x on A8. So this can probably be amended to "around 6-13x".

---
libavcodec/arm/hevc_idct.S        | 187 ++++++++++++++++++++++++++++++++++++++
libavcodec/arm/hevcdsp_init_arm.c |   4 +
2 files changed, 191 insertions(+)

diff --git a/libavcodec/arm/hevc_idct.S b/libavcodec/arm/hevc_idct.S
index 4124fc8..b4279db 100644
--- a/libavcodec/arm/hevc_idct.S
+++ b/libavcodec/arm/hevc_idct.S
@@ -222,7 +222,194 @@ function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
endfunc
.endm

+.macro butterfly e, o, tmp_p, tmp_m
+        vadd.s32        \tmp_p, \e, \o
+        vsub.s32        \tmp_m, \e, \o
+.endm
+
+.macro tr16_8x4 in0, in1, in2, in3, in4, in5, in6, in7
+        tr_4x4_8        \in0, \in2, \in4, \in6, q8, q9, q10, q11, q12, q13, q14, q15
+
+        vmull.s16       q12, \in1, \in0[0]
+        vmull.s16       q13, \in1, \in0[1]
+        vmull.s16       q14, \in1, \in0[2]
+        vmull.s16       q15, \in1, \in0[3]
+        sum_sub         q12, \in3, \in0[1], +
+        sum_sub         q13, \in3, \in0[3], -
+        sum_sub         q14, \in3, \in0[0], -
+        sum_sub         q15, \in3, \in0[2], -
+
+        sum_sub         q12, \in5, \in0[2], +
+        sum_sub         q13, \in5, \in0[0], -
+        sum_sub         q14, \in5, \in0[3], +
+        sum_sub         q15, \in5, \in0[1], +
+
+        sum_sub         q12, \in7, \in0[3], +
+        sum_sub         q13, \in7, \in0[2], -
+        sum_sub         q14, \in7, \in0[1], +
+        sum_sub         q15, \in7, \in0[0], -
+
+        butterfly       q8,  q12, q0, q7
+        butterfly       q9,  q13, q1, q6
+        butterfly       q10, q14, q2, q5
+        butterfly       q11, q15, q3, q4
+        add             r4,  sp,  #512
+        vst1.s16        {q0-q1}, [r4, :128]!
+        vst1.s16        {q2-q3}, [r4, :128]!
+        vst1.s16        {q4-q5}, [r4, :128]!
+        vst1.s16        {q6-q7}, [r4, :128]
+.endm
+
+.macro load16 in0, in1, in2, in3, in4, in5, in6, in7
+        vld1.s16        {\in0}, [r1, :64], r2
+        vld1.s16        {\in1}, [r3, :64], r2
+        vld1.s16        {\in2}, [r1, :64], r2
+        vld1.s16        {\in3}, [r3, :64], r2
+        vld1.s16        {\in4}, [r1, :64], r2
+        vld1.s16        {\in5}, [r3, :64], r2
+        vld1.s16        {\in6}, [r1, :64], r2
+        vld1.s16        {\in7}, [r3, :64], r2
+.endm
+
+.macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7
+        sum_sub q5,     \in, \t0, \op0
+        sum_sub q6,     \in, \t1, \op1
+        sum_sub q7,     \in, \t2, \op2
+        sum_sub q8,     \in, \t3, \op3
+        sum_sub q9,     \in, \t4, \op4
+        sum_sub q10,    \in, \t5, \op5
+        sum_sub q11,    \in, \t6, \op6
+        sum_sub q12,    \in, \t7, \op7
+.endm
+
+.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
+        vadd.s32        q4, \in0, \in1
+        vsub.s32        \in0, \in0, \in1
+        vadd.s32        \in1, \in2, \in3
+        vsub.s32        \in2, \in2, \in3
+        vadd.s32        \in3, \in4, \in5
+        vsub.s32        \in4, \in4, \in5
+        vadd.s32        \in5, \in6, \in7
+        vsub.s32        \in6, \in6, \in7
+.endm
+
+.macro store16 in0, in1, in2, in3, in4, in5, in6, in7
+        vst1.s16        \in0, [r1, :64], r2
+        vst1.s16        \in1, [r3, :64], r4
+        vst1.s16        \in2, [r1, :64], r2
+        vst1.s16        \in3, [r3, :64], r4
+        vst1.s16        \in4, [r1, :64], r2
+        vst1.s16        \in5, [r3, :64], r4
+        vst1.s16        \in6, [r1, :64], r2
+        vst1.s16        \in7, [r3, :64], r4
+.endm
+
+.macro scale out0, out1, out2, out3, out4, out5, out6, out7, in0, in1, in2, in3, in4, in5, in6, in7, shift
+        vqrshrn.s32     \out0, \in0, \shift
+        vqrshrn.s32     \out1, \in1, \shift
+        vqrshrn.s32     \out2, \in2, \shift
+        vqrshrn.s32     \out3, \in3, \shift
+        vqrshrn.s32     \out4, \in4, \shift
+        vqrshrn.s32     \out5, \in5, \shift
+        vqrshrn.s32     \out6, \in6, \shift
+        vqrshrn.s32     \out7, \in7, \shift
+.endm
+
+.macro tr_16x4 horiz, shift, in, out
+        add             r1,  \in, \horiz
+        add             r3,  \in, #(\horiz + 64)
+        mov             r2,  #128
+        load16          d0, d1, d2, d3, d4, d5, d6, d7
+        movrel          r1, trans
+
+        tr16_8x4        d0, d1, d2, d3, d4, d5, d6, d7
+
+        add             r1,  \in, #(\horiz + 32)
+        add             r3,  \in, #(\horiz + 64 + 32)
+        mov             r2,  #128
+        load16          d8, d9, d2, d3, d4, d5, d6, d7
+        movrel          r1, trans + 16
+        vld1.s16        {q0}, [r1, :128]
+        vmull.s16       q5, d8, d0[0]
+        vmull.s16       q6, d8, d0[1]
+        vmull.s16       q7, d8, d0[2]
+        vmull.s16       q8, d8, d0[3]
+        vmull.s16       q9, d8, d1[0]
+        vmull.s16       q10, d8, d1[1]
+        vmull.s16       q11, d8, d1[2]
+        vmull.s16       q12, d8, d1[3]
+
+        add_member      d9, d0[1], d1[0], d1[3], d1[1], d0[2], d0[0], d0[3], d1[2], +, +, +, -, -, -, -, -
+        add_member      d2, d0[2], d1[3], d0[3], d0[1], d1[2], d1[0], d0[0], d1[1], +, +, -, -, -, +, +, +
+        add_member      d3, d0[3], d1[1], d0[1], d1[3], d0[0], d1[2], d0[2], d1[0], +, -, -, +, +, +, -, -
+        add_member      d4, d1[0], d0[2], d1[2], d0[0], d1[3], d0[1], d1[1], d0[3], +, -, -, +, -, -, +, +
+        add_member      d5, d1[1], d0[0], d1[0], d1[2], d0[1], d0[3], d1[3], d0[2], +, -, +, +, -, +, +, -
+        add_member      d6, d1[2], d0[3], d0[0], d0[2], d1[1], d1[3], d1[0], d0[1], +, -, +, -, +, +, -, +
+        add_member      d7, d1[3], d1[2], d1[1], d1[0], d0[3], d0[2], d0[1], d0[0], +, -, +, -, +, -, +, -
+
+        add             r4, sp, #512
+        vld1.s16        {q0-q1}, [r4, :128]!
+        vld1.s16        {q2-q3}, [r4, :128]!
+
+        butterfly16     q0, q5, q1, q6, q2, q7, q3, q8
+        scale           d26, d27, d28, d29, d30, d31, d16, d17, q4, q0, q5, q1, q6, q2, q7, q3, \shift
+        transpose8_4x4  d26, d28, d30, d16
+        transpose8_4x4  d17, d31, d29, d27
+        add             r1, \out, #(\horiz*16)
+        add             r3, \out, #(\horiz*16 + 24 +3*32)
+        mov             r2, #32
+        mov             r4, #-32
+        store16         d26, d27, d28, d29, d30, d31, d16, d17
+
+        add             r4, sp, #576
+        vld1.s16        {q0-q1}, [r4, :128]!
+        vld1.s16        {q2-q3}, [r4, :128]
+        butterfly16     q0, q9, q1, q10, q2, q11, q3, q12
+        scale           d26, d27, d28, d29, d30, d31, d8, d9, q4, q0, q9, q1, q10, q2, q11, q3, \shift
+        transpose8_4x4  d26, d28, d30, d8
+        transpose8_4x4  d9, d31, d29, d27
+
+        add             r1, \out, #(\horiz*16 + 8)
+        add             r3, \out, #(\horiz*16 + 16 + 3*32)
+        mov             r2, #32
+        mov             r4, #-32
+        store16         d26, d27, d28, d29, d30, d31, d8, d9
+.endm
+
+.macro idct_16x16 bitdepth
+function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
+@r0 - coeffs
+        push            {r4, lr}
+        vpush           {q4-q7}
+
+        @ Align the stack, allocate a temp buffer
+T       mov             lr,  sp
+T       and             lr,  lr,  #15
+A       and             lr,  sp,  #15
+        add             lr,  lr,  #640
+        sub             sp,  sp,  lr
+
+
+        tr_16x4         0, 7, r0, sp
+        tr_16x4         8, 7, r0, sp
+        tr_16x4         16, 7, r0, sp
+        tr_16x4         24, 7, r0, sp
+
+        tr_16x4         0, 20 - \bitdepth, sp, r0
+        tr_16x4         8, 20 - \bitdepth, sp, r0
+        tr_16x4         16, 20 - \bitdepth, sp, r0
+        tr_16x4         24, 20 - \bitdepth, sp, r0
+

The overall structure of the code seems clear enough now.


These 8 macro invocations add 8 copies of nearly identical code though - the size of the object file is inflated a great deal because of this.

Before this patch, the code size ends up at 2280 bytes; after it, it ends up at 15976 bytes. This also makes builds fail on e.g. the Raspberry Pi, with "invalid literal constant: pool needs to be closer" - in other words, the function body has grown too large for the assembler to place the literal pool within reach of the loads that reference it.

You should be able to make a single tr_16x4 function (which works for both 8 and 10 bpp) and call it from here. In that form, the things you have as macro parameters right now would become parameters to that function, passed e.g. in r0-r3.
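
Roughly along these lines - an untested sketch with hypothetical names, just to illustrate the shape of it:

@ Untested sketch: one shared copy of the transform, with the former
@ macro parameters passed in registers instead of being baked in at
@ expansion time.
function func_tr_16x4
        @ r0 - horizontal offset into the buffers
        @ r1 - input buffer
        @ r2 - output buffer
        @ r3 - shift (but see the note on vqrshrn below)
        ...                             @ the current tr_16x4 body
        bx              lr
endfunc

.macro idct_16x16 bitdepth
function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
        push            {r4, lr}
        ...
        mov             r0,  #0         @ offset of the first 16x4 block
        ...                             @ r1 = coeffs, r2 = temp buffer
        mov             r3,  #7
        bl              func_tr_16x4
        mov             r0,  #8
        bl              func_tr_16x4
        ...
endfunc
.endm

One wrinkle: vqrshrn.s32 only takes an immediate shift amount, so with the shift in a register, the scale step would need something like vrshl.s32 with a negated shift vector followed by vqmovn.s32 - or alternatively, keep one instance of the function per shift value and drop the shift parameter.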

That should reduce the code size rather significantly (it should get rid of almost 15/16 of the increase!) without sacrificing performance.
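
For reference, the increase is 15976 - 2280 = 13696 bytes, and with roughly one copy of the transform body instead of 16 expansions (8 per bitdepth variant), only about 13696/16 ≈ 856 bytes of that should remain, putting the object file somewhere around 3 KB.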

// Martin