Re: [libav-devel] [PATCH] hevc: Optimize NEON 8x8 IDCT using col_limit

Martin Storsjö Wed, 12 Apr 2017 05:08:28 -0700

On Wed, 12 Apr 2017, Alexandra Hájková wrote:

---
libavcodec/arm/hevc_idct.S | 26 +++++++++++++++-----------
1 file changed, 15 insertions(+), 11 deletions(-)


diff --git a/libavcodec/arm/hevc_idct.S b/libavcodec/arm/hevc_idct.S
index 4124fc8..29135ad 100644
--- a/libavcodec/arm/hevc_idct.S
+++ b/libavcodec/arm/hevc_idct.S
@@ -58,7 +58,7 @@ endconst

.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, 
tmp3
         vshll.s16      \tmp0, \in0, #6
-         vld1.s16       {\in0}, [r1, :64]!
+         vld1.s16       {\in0}, [r4, :64]!
         vmov           \tmp1, \tmp0
         vmull.s16      \tmp2, \in1, \in0[1]
         vmull.s16      \tmp3, \in1, \in0[3]
@@ -67,14 +67,14 @@ endconst
         vmlal.s16      \tmp2, \in3, \in0[3] @o0
         vmlsl.s16      \tmp3, \in3, \in0[1] @o1

-         vld1.s16       {\in0}, [r1, :64]
+         vld1.s16       {\in0}, [r4, :64]

         vadd.s32       \out0, \tmp0, \tmp2
         vadd.s32       \out1, \tmp1, \tmp3
         vsub.s32       \out2, \tmp1, \tmp3
         vsub.s32       \out3, \tmp0, \tmp2

-         sub            r1,  r1,  #8
+         sub            r4,  r4,  #8
.endm

@ Do a 4x4 transpose, using q registers for the subtransposes that don't
@@ -166,21 +166,25 @@ endfunc
.macro idct_8x8 bitdepth
function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
@r0 - coeffs
+        push            {r4, lr}

If you actually only need one spare register, you don't need to push both r4 and lr - it would be enough to just push lr and use that instead of r4 below. (Since you don't do any subroutine calls in this function, it's ok to use lr as a scratch register.)

        vpush           {q4-q7}

-        mov             r1,  r0
+        mov             r4,  r0
        mov             r2,  #64
        add             r3,  r0,  #32
-        vld1.s16        {q0-q1}, [r1,:128], r2
+        vld1.s16        {q0-q1}, [r4,:128], r2
        vld1.s16        {q2-q3}, [r3,:128], r2
-        vld1.s16        {q4-q5}, [r1,:128], r2
+        vld1.s16        {q4-q5}, [r4,:128], r2
        vld1.s16        {q6-q7}, [r3,:128], r2

-        movrel          r1, trans
+        movrel          r4, trans

        tr_8x4          7, d0, d2, d4, d6, d8, d10, d12, d14
+        cmp             r1, #4
+        blt             1f
        tr_8x4          7, d1, d3, d5, d7, d9, d11, d13, d15

As far as I can see from the code in libavcodec/hevcdec.c right now, it's impossible to get col_limit < 4. So this won't actually ever have any effect, it seems.

I'm not sure what the logic of the current col_limit is, or whether it should be changed so that it behaves like this (which would make sense).


// Martin
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] hevc: Optimize NEON 8x8 IDCT using col_limit

Reply via email to