Previously, ff_h264_idct_add_neon (originally in the arm version) used
a non-regular transpose in order to be able to use more instructions
that operate on pairs of 64 bit registers as a single 128 bit register.
The aarch64 translation doesn't do this to the same extent, but it
kept the same structure since it was a straight translation.
This patch makes the transpose_4x4H macro do a regular transpose, and
adjusts ff_h264_idct_add_neon accordingly. (This makes the register
pattern in transpose_4x4H identical to the one in transpose_4x8B, just
with different element widths.)
---
I also have a version of the patch that reshuffles more of
ff_h264_idct_add_neon to bring it closer to the C version, potentially
making it a bit more readable, in case that is preferred over just
flipping registers to make it work after fixing the transpose.
(Neither version causes any slowdown on a Cortex A53.)
---
libavcodec/aarch64/h264idct_neon.S | 8 ++++----
libavcodec/aarch64/neon.S | 12 ++++++------
2 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S
index 78f780a..330145b 100644
--- a/libavcodec/aarch64/h264idct_neon.S
+++ b/libavcodec/aarch64/h264idct_neon.S
@@ -42,16 +42,16 @@ function ff_h264_idct_add_neon, export=1
transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
- add v4.4H, v0.4H, v3.4H
+ add v4.4H, v0.4H, v2.4H
ld1 {v18.S}[0], [x0], x2
- sshr v16.4H, v2.4H, #1
+ sshr v16.4H, v3.4H, #1
sshr v17.4H, v1.4H, #1
ld1 {v19.S}[1], [x0], x2
- sub v5.4H, v0.4H, v3.4H
+ sub v5.4H, v0.4H, v2.4H
ld1 {v18.S}[1], [x0], x2
add v6.4H, v16.4H, v1.4H
ins v4.D[1], v5.D[0]
- sub v7.4H, v2.4H, v17.4H
+ sub v7.4H, v3.4H, v17.4H
ld1 {v19.S}[0], [x0], x2
ins v6.D[1], v7.D[0]
sub x0, x0, x2, lsl #2
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index 767bc9d..377009e 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -107,12 +107,12 @@
.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
trn1 \r4\().4H, \r0\().4H, \r1\().4H
trn2 \r5\().4H, \r0\().4H, \r1\().4H
- trn1 \r7\().4H, \r2\().4H, \r3\().4H
- trn2 \r6\().4H, \r2\().4H, \r3\().4H
- trn1 \r0\().2S, \r4\().2S, \r7\().2S
- trn2 \r3\().2S, \r4\().2S, \r7\().2S
- trn1 \r1\().2S, \r5\().2S, \r6\().2S
- trn2 \r2\().2S, \r5\().2S, \r6\().2S
+ trn1 \r6\().4H, \r2\().4H, \r3\().4H
+ trn2 \r7\().4H, \r2\().4H, \r3\().4H
+ trn1 \r0\().2S, \r4\().2S, \r6\().2S
+ trn2 \r2\().2S, \r4\().2S, \r6\().2S
+ trn1 \r1\().2S, \r5\().2S, \r7\().2S
+ trn2 \r3\().2S, \r5\().2S, \r7\().2S
.endm
.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
--
1.8.1.2
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel