[libav-devel] [PATCH] aarch64: Make transpose_4x4H do a regular transpose

Martin Storsjö Fri, 25 Mar 2016 15:09:07 -0700

Previously, ff_h264_idct_add_neon (originally in the arm version) used
a non-regular transpose in order to be able to use more instructions
that deal with registers as 128 bit register pairs. The aarch64
translation doesn't use such paired operations to the same extent, but
it kept the same non-regular structure since it was a straight
translation of the arm code.


This makes the transpose_4x4H macro do a regular transpose, adjusting
ff_h264_idct_add_neon accordingly. (This makes the register pattern
in transpose_4x4H identical to transpose_4x8B, just with different
element widths.)
---
I also have a version of the patch that reshuffles more of
ff_h264_idct_add_neon, closer to the C version, potentially
making it a bit more readable, in case that is desired, instead of
just flipping registers to make it work after fixing the transpose.
(Neither version gives any slowdown on a Cortex A53.)
---
 libavcodec/aarch64/h264idct_neon.S |  8 ++++----
 libavcodec/aarch64/neon.S          | 12 ++++++------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S
index 78f780a..330145b 100644
--- a/libavcodec/aarch64/h264idct_neon.S
+++ b/libavcodec/aarch64/h264idct_neon.S
@@ -42,16 +42,16 @@ function ff_h264_idct_add_neon, export=1
 
         transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7
 
-        add             v4.4H,  v0.4H,  v3.4H
+        add             v4.4H,  v0.4H,  v2.4H
         ld1             {v18.S}[0], [x0], x2
-        sshr            v16.4H,  v2.4H,  #1
+        sshr            v16.4H,  v3.4H,  #1
         sshr            v17.4H,  v1.4H,  #1
         ld1             {v19.S}[1], [x0], x2
-        sub             v5.4H,  v0.4H,  v3.4H
+        sub             v5.4H,  v0.4H,  v2.4H
         ld1             {v18.S}[1], [x0], x2
         add             v6.4H,  v16.4H, v1.4H
         ins             v4.D[1],  v5.D[0]
-        sub             v7.4H,  v2.4H,  v17.4H
+        sub             v7.4H,  v3.4H,  v17.4H
         ld1             {v19.S}[0], [x0], x2
         ins             v6.D[1],  v7.D[0]
         sub             x0,  x0,  x2, lsl #2
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index 767bc9d..377009e 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -107,12 +107,12 @@
 .macro  transpose_4x4H  r0, r1, r2, r3, r4, r5, r6, r7
         trn1            \r4\().4H,  \r0\().4H,  \r1\().4H
         trn2            \r5\().4H,  \r0\().4H,  \r1\().4H
-        trn1            \r7\().4H,  \r2\().4H,  \r3\().4H
-        trn2            \r6\().4H,  \r2\().4H,  \r3\().4H
-        trn1            \r0\().2S,  \r4\().2S,  \r7\().2S
-        trn2            \r3\().2S,  \r4\().2S,  \r7\().2S
-        trn1            \r1\().2S,  \r5\().2S,  \r6\().2S
-        trn2            \r2\().2S,  \r5\().2S,  \r6\().2S
+        trn1            \r6\().4H,  \r2\().4H,  \r3\().4H
+        trn2            \r7\().4H,  \r2\().4H,  \r3\().4H
+        trn1            \r0\().2S,  \r4\().2S,  \r6\().2S
+        trn2            \r2\().2S,  \r4\().2S,  \r6\().2S
+        trn1            \r1\().2S,  \r5\().2S,  \r7\().2S
+        trn2            \r3\().2S,  \r5\().2S,  \r7\().2S
 .endm
 
 .macro  transpose_8x8H  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
-- 
1.8.1.2

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH] aarch64: Make transpose_4x4H do a regular transpose

Reply via email to