On Wed, 12 Apr 2017, Alexandra Hájková wrote:

---
libavcodec/arm/hevc_idct.S | 26 +++++++++++++++-----------
1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/libavcodec/arm/hevc_idct.S b/libavcodec/arm/hevc_idct.S
index 4124fc8..29135ad 100644
--- a/libavcodec/arm/hevc_idct.S
+++ b/libavcodec/arm/hevc_idct.S
@@ -58,7 +58,7 @@ endconst

.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, 
tmp3
         vshll.s16      \tmp0, \in0, #6
-         vld1.s16       {\in0}, [r1, :64]!
+         vld1.s16       {\in0}, [r4, :64]!
         vmov           \tmp1, \tmp0
         vmull.s16      \tmp2, \in1, \in0[1]
         vmull.s16      \tmp3, \in1, \in0[3]
@@ -67,14 +67,14 @@ endconst
         vmlal.s16      \tmp2, \in3, \in0[3] @o0
         vmlsl.s16      \tmp3, \in3, \in0[1] @o1

-         vld1.s16       {\in0}, [r1, :64]
+         vld1.s16       {\in0}, [r4, :64]

         vadd.s32       \out0, \tmp0, \tmp2
         vadd.s32       \out1, \tmp1, \tmp3
         vsub.s32       \out2, \tmp1, \tmp3
         vsub.s32       \out3, \tmp0, \tmp2

-         sub            r1,  r1,  #8
+         sub            r4,  r4,  #8
.endm

@ Do a 4x4 transpose, using q registers for the subtransposes that don't
@@ -166,21 +166,25 @@ endfunc
.macro idct_8x8 bitdepth
function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
@r0 - coeffs
+        push            {r4, lr}

If you actually only need one spare register, you don't need to push both r4 and lr - it would be enough to just push lr and use that instead of r4 below. (Since you don't do any subroutine calls in this function, it's ok to use lr as a scratch register.)

        vpush           {q4-q7}

-        mov             r1,  r0
+        mov             r4,  r0
        mov             r2,  #64
        add             r3,  r0,  #32
-        vld1.s16        {q0-q1}, [r1,:128], r2
+        vld1.s16        {q0-q1}, [r4,:128], r2
        vld1.s16        {q2-q3}, [r3,:128], r2
-        vld1.s16        {q4-q5}, [r1,:128], r2
+        vld1.s16        {q4-q5}, [r4,:128], r2
        vld1.s16        {q6-q7}, [r3,:128], r2

-        movrel          r1, trans
+        movrel          r4, trans

        tr_8x4          7, d0, d2, d4, d6, d8, d10, d12, d14
+        cmp             r1, #4
+        blt             1f
        tr_8x4          7, d1, d3, d5, d7, d9, d11, d13, d15


As far as I can see from the code currently in libavcodec/hevcdec.c, it's impossible to get col_limit < 4. So this check won't actually ever have any effect, it seems.

I'm not sure what the logic behind the current col_limit is, or whether it should be changed so that it behaves like this (which would make sense).

// Martin
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to