On Wed, 26 Apr 2017, Alexandra Hájková wrote:

From: Seppo Tomperi <seppo.tomp...@vtt.fi>

Optimized by Alexandra Hájková.
---
libavcodec/arm/hevc_idct.S        | 86 +++++++++++++++++++++++++++++++++++++++
libavcodec/arm/hevcdsp_init_arm.c | 15 +++++++
2 files changed, 101 insertions(+)

diff --git a/libavcodec/arm/hevc_idct.S b/libavcodec/arm/hevc_idct.S
index 41b1b29..833c3fe 100644
--- a/libavcodec/arm/hevc_idct.S
+++ b/libavcodec/arm/hevc_idct.S
@@ -30,6 +30,92 @@ const trans, align=4
        .short 57, 43, 25, 9
endconst

+function ff_hevc_add_residual_4x4_8_neon, export=1
+        vld1.16         {q0-q1}, [r1, :128]
+        vld1.32         d4[0], [r0, :32], r2
+        vld1.32         d4[1], [r0, :32], r2
+        vld1.32         d5[0], [r0, :32], r2
+        vld1.32         d5[1], [r0, :32], r2
+        sub             r0, r0, r2, lsl #2
+        vmovl.u8        q8, d4
+        vmovl.u8        q9, d5
+        vqadd.s16       q0, q0, q8
+        vqadd.s16       q1, q1, q9
+        vqmovun.s16     d0, q0
+        vqmovun.s16     d1, q1
+        vst1.32         d0[0], [r0], r2
+        vst1.32         d0[1], [r0], r2
+        vst1.32         d1[0], [r0], r2
+        vst1.32         d1[1], [r0], r2

Add the :32 alignment specifier to these four stores as well, to match the loads above.
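I.e. something along these lines (untested, just the same stores with the alignment hint added):

    vst1.32         d0[0], [r0, :32], r2
    vst1.32         d0[1], [r0, :32], r2
    vst1.32         d1[0], [r0, :32], r2
    vst1.32         d1[1], [r0, :32], r2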

+        bx              lr
+endfunc
+
+function ff_hevc_add_residual_8x8_8_neon, export=1
+        mov             r3,   #8
+1:      subs            r3,   #2
+        vld1.16         {q0-q1}, [r1, :128]!
+        vld1.8          {q8},    [r0, :128]

You can't do this; this loads 16 contiguous bytes, but the two output rows are actually r2 bytes apart in memory. It only happens to work in checkasm because checkasm uses a tightly packed buffer for the image data. You need to load d16 and d17 separately, one row each.
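For example, something like this might work (untested sketch, using a second row pointer in r12 and a doubled stride, the same approach as I suggest for the 16x16 function below):

    function ff_hevc_add_residual_8x8_8_neon, export=1
            add             r12,  r0,  r2       @ r12 points at the second row
            add             r2,   r2,  r2       @ double the stride
            mov             r3,   #8
    1:      subs            r3,   #2
            vld1.16         {q0-q1}, [r1, :128]!
            vld1.8          {d16},   [r0,  :64] @ load one row of pixels
            vld1.8          {d17},   [r12, :64] @ load the next row
            vmovl.u8        q9,   d16
            vmovl.u8        q8,   d17
            vqadd.s16       q0,   q9
            vqadd.s16       q1,   q8
            vqmovun.s16     d0,   q0
            vqmovun.s16     d1,   q1
            vst1.8          d0,   [r0,  :64], r2
            vst1.8          d1,   [r12, :64], r2
            bne             1b
            bx              lr
    endfunc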

+        vmovl.u8        q9,   d16
+        vmovl.u8        q8,   d17
+        vqadd.s16       q0,   q9
+        vqadd.s16       q1,   q8
+        vqmovun.s16     d0,   q0
+        vqmovun.s16     d1,   q1
+        vst1.8          d0,   [r0, :64], r2
+        vst1.8          d1,   [r0, :64], r2
+        bne             1b
+        bx              lr
+endfunc
+
+function ff_hevc_add_residual_16x16_8_neon, export=1
+        push            {lr}
+        mov             r3,   #16
+1:      subs            r3,   #2
+        vld1.16         {q0, q1}, [r1, :128]!
+        vld1.16         {q2, q3}, [r1, :128]!
+        vld1.8          {q8},     [r0, :128]
+        add             lr, r0, r2

You don't need to use lr as a temporary register here; you can use r12 instead. (You can't assume that r12 stays intact across calls to external functions, but there are no such calls here, so it is fine.) That avoids having to push/pop anything in this function.

I tested a few alternatives here, and the fastest seems to be this:

at the head of the function:
    add r12, r0,  r2
    add r2,  r2,  r2 @ double stride
at the start of the loop:
    vld1.8 {q8}, [r0, :128]
    vld1.16 {q0, q1}, [r1, :128]!
    vld1.8 {q11}, [r12, :128]
    vld1.16 {q2, q3}, [r1, :128]!

(You're using q8 and q11 long before you use q0-q3, so it's better to start those loads first.)
at the end of the loop:
    vst1.8 {q0}, [r0, :128], r2
    vst1.8 {q1}, [r12, :128], r2

This seems to shave off around 18 cycles (from 272 to 254) on A53 and just over 50 cycles (from 372 to 320) on A7.
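For reference, with those changes folded in, the whole function could look roughly like this (untested; the widen/add/narrow part in the middle isn't quoted above, so I'm assuming it follows the same pattern as the 8x8 version):

    function ff_hevc_add_residual_16x16_8_neon, export=1
            mov             r3,   #16
            add             r12,  r0,  r2           @ r12 points at the second row
            add             r2,   r2,  r2           @ double the stride
    1:      subs            r3,   #2
            vld1.8          {q8},     [r0,  :128]   @ pixels, row 0
            vld1.16         {q0, q1}, [r1,  :128]!  @ residual, row 0
            vld1.8          {q11},    [r12, :128]   @ pixels, row 1
            vld1.16         {q2, q3}, [r1,  :128]!  @ residual, row 1
            vmovl.u8        q9,   d16
            vmovl.u8        q8,   d17
            vmovl.u8        q12,  d22
            vmovl.u8        q13,  d23
            vqadd.s16       q0,   q9
            vqadd.s16       q1,   q8
            vqadd.s16       q2,   q12
            vqadd.s16       q3,   q13
            vqmovun.s16     d0,   q0
            vqmovun.s16     d1,   q1
            vqmovun.s16     d2,   q2
            vqmovun.s16     d3,   q3
            vst1.8          {q0}, [r0,  :128], r2
            vst1.8          {q1}, [r12, :128], r2
            bne             1b
            bx              lr
    endfunc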

// Martin