On Wed, 26 Apr 2017, Alexandra Hájková wrote:
From: Seppo Tomperi <seppo.tomp...@vtt.fi>
Optimized by Alexandra Hájková.
---
libavcodec/arm/hevc_idct.S | 86 +++++++++++++++++++++++++++++++++++++++
libavcodec/arm/hevcdsp_init_arm.c | 15 +++++++
2 files changed, 101 insertions(+)
diff --git a/libavcodec/arm/hevc_idct.S b/libavcodec/arm/hevc_idct.S
index 41b1b29..833c3fe 100644
--- a/libavcodec/arm/hevc_idct.S
+++ b/libavcodec/arm/hevc_idct.S
@@ -30,6 +30,92 @@ const trans, align=4
.short 57, 43, 25, 9
endconst
+function ff_hevc_add_residual_4x4_8_neon, export=1
+ vld1.16 {q0-q1}, [r1, :128]
+ vld1.32 d4[0], [r0, :32], r2
+ vld1.32 d4[1], [r0, :32], r2
+ vld1.32 d5[0], [r0, :32], r2
+ vld1.32 d5[1], [r0, :32], r2
+ sub r0, r0, r2, lsl #2
+ vmovl.u8 q8, d4
+ vmovl.u8 q9, d5
+ vqadd.s16 q0, q0, q8
+ vqadd.s16 q1, q1, q9
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst1.32 d0[0], [r0], r2
+ vst1.32 d0[1], [r0], r2
+ vst1.32 d1[0], [r0], r2
+ vst1.32 d1[1], [r0], r2
Add :32 alignment here as well
+ bx lr
+endfunc
+
+function ff_hevc_add_residual_8x8_8_neon, export=1
+ mov r3, #8
+1: subs r3, #2
+ vld1.16 {q0-q1}, [r1, :128]!
+ vld1.8 {q8}, [r0, :128]
You can't do this; this loads two rows from the output, which actually are
r2 bytes apart. This only happens to work in checkasm since checkasm has a
tightly packed buffer for the image data. You need to load d16 and d17
separately.
+ vmovl.u8 q9, d16
+ vmovl.u8 q8, d17
+ vqadd.s16 q0, q9
+ vqadd.s16 q1, q8
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst1.8 d0, [r0, :64], r2
+ vst1.8 d1, [r0, :64], r2
+ bne 1b
+ bx lr
+endfunc
+
+function ff_hevc_add_residual_16x16_8_neon, export=1
+ push {lr}
+ mov r3, #16
+1: subs r3, #2
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q2, q3}, [r1, :128]!
+ vld1.8 {q8}, [r0, :128]
+ add lr, r0, r2
You don't need to use lr as temp register here, you can use r12 as well.
(You can't assume that r12 is kept intact if you call external functions,
but here it is fine.) That avoids having to push anything here.
I tested a few alternatives here, and the fastest seems to be this:
at the head of the function:
add r12, r0, r2
add r2, r2, r2 @ double stride
at the start of the loop:
vld1.8 {q8}, [r0, :128]
vld1.16 {q0, q1}, [r1, :128]!
vld1.8 {q11}, [r12, :128]
vld1.16 {q2, q3}, [r1, :128]!
(You're using q8 and q11 long before you use q0-q3, so it's better to
start the load of that one first)
at the end of the loop:
vst1.8 {q0}, [r0, :128], r2
vst1.8 {q1}, [r12, :128], r2
This seems to shave off around 18 cycles (from 272 to 254) on A53 and
almost 50 cycles (from 372 to 320) on A7.
// Martin
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel