Re: [FFmpeg-devel] [PATCH v2 2/4] avcodec/aarch64/hevcdsp: port add_residual functions

2021-02-11 Thread Martin Storsjö

On Thu, 4 Feb 2021, Josh Dekker wrote:


From: Reimar Döffinger 

Speedup is fairly small, around 1.5%, but these are fairly simple.

Signed-off-by: Josh Dekker 
---
libavcodec/aarch64/hevcdsp_idct_neon.S| 190 ++
libavcodec/aarch64/hevcdsp_init_aarch64.c |  24 +++
2 files changed, 214 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index c70d6a906d..329038a958 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -36,6 +36,196 @@ const trans, align=4
.short 31, 22, 13, 4
endconst

+.macro clip10 in1, in2, c1, c2
+smax\in1, \in1, \c1
+smax\in2, \in2, \c1
+smin\in1, \in1, \c2
+smin\in2, \in2, \c2
+.endm
+
+function ff_hevc_add_residual_4x4_8_neon, export=1
+ld1 {v0.8h-v1.8h}, [x1]
+ld1 {v2.s}[0], [x0], x2
+ld1 {v2.s}[1], [x0], x2
+ld1 {v2.s}[2], [x0], x2
+ld1 {v2.s}[3], [x0], x2
+sub x0, x0, x2, lsl #2
+uxtlv6.8h,  v2.8B
+uxtl2   v7.8h,  v2.16B


Personal preference: I prefer the non-shouty forms like v2.16b instead of 
v2.16B.



+sqadd   v0.8h,  v0.8h, v6.8h
+sqadd   v1.8h,  v1.8h, v7.8h


Nit: Inconsistent alignment between columns 1-2 and 2-3. (And if one would 
want to make space for full sized operands like v16.16b, they'd all need 
another space.)



+sqxtun  v0.8B,  v0.8h
+sqxtun2 v0.16B, v1.8h
+st1 {v0.s}[0], [x0], x2
+st1 {v0.s}[1], [x0], x2
+st1 {v0.s}[2], [x0], x2
+st1 {v0.s}[3], [x0], x2
+ret
+endfunc
+
+function ff_hevc_add_residual_4x4_10_neon, export=1
+mov x12, x0
+ld1 {v0.8h-v1.8h}, [x1]
+ld1 {v2.d}[0], [x12], x2
+ld1 {v2.d}[1], [x12], x2
+ld1 {v3.d}[0], [x12], x2
+sqadd   v0.8h, v0.8h, v2.8h
+ld1 {V3.d}[1], [x12], x2
+moviv4.8h, #0
+sqadd   v1.8h, v1.8h, v3.8h
+mvniv5.8h, #0xFC, LSL #8 // movi #0x3FF
+clip10  v0.8h, v1.8h, v4.8h, v5.8h
+st1 {v0.d}[0], [x0], x2
+st1 {v0.d}[1], [x0], x2
+st1 {v1.d}[0], [x0], x2
+st1 {v1.d}[1], [x0], x2
+ret
+endfunc
+
+function ff_hevc_add_residual_8x8_8_neon, export=1
+add x12, x0, x2
+add x2,  x2, x2
+mov x3,   #8
+1:  subsx3,   x3, #2


Nit: Odd vertical alignment here?


+ld1 {v2.d}[0],   [x0]
+ld1 {v2.d}[1],   [x12]
+uxtlv3.8h,   v2.8B
+ld1 {v0.8h-v1.8h}, [x1], #32
+uxtl2   v2.8h,   v2.16B
+sqadd   v0.8h,   v0.8h,   v3.8h
+sqadd   v1.8h,   v1.8h,   v2.8h
+sqxtun  v0.8B,   v0.8h
+sqxtun2 v0.16B,  v1.8h
+st1 {v0.d}[0],   [x0], x2
+st1 {v0.d}[1],   [x12], x2
+bne 1b
+ret
+endfunc
+
+function ff_hevc_add_residual_8x8_10_neon, export=1
+add x12, x0, x2
+add x2,  x2, x2
+mov x3,  #8
+moviv4.8h, #0
+mvniv5.8h, #0xFC, LSL #8 // movi #0x3FF
+1:  subsx3,  x3, #2
+ld1 {v0.8h-v1.8h}, [x1], #32
+ld1 {v2.8h},[x0]
+sqadd   v0.8h, v0.8h, v2.8h
+ld1 {v3.8h},[x12]
+sqadd   v1.8h, v1.8h, v3.8h
+clip10  v0.8h, v1.8h, v4.8h, v5.8h
+st1 {v0.8h}, [x0], x2
+st1 {v1.8h}, [x12], x2
+bne 1b
+ret
+endfunc
+
+function ff_hevc_add_residual_16x16_8_neon, export=1
+mov x3,  #16
+add x12, x0, x2
+add x2,  x2, x2
+1:  subsx3,  x3, #2
+ld1 {v16.16B}, [x0]
+ld1 {v0.8h-v3.8h}, [x1], #64
+ld1 {v19.16B},[x12]
+uxtlv17.8h, v16.8B
+uxtl2   v18.8h, v16.16B
+uxtlv20.8h, v19.8B
+uxtl2   v21.8h, v19.16B
+sqadd   v0.8h,  v0.8h, v17.8h
+sqadd   v1.8h,  v1.8h, v18.8h
+sqadd   v2.8h,  v2.8h, v20.8h
+sqadd   v3.8h,  v3.8h, v21.8h
+sqxtun  v0.8B,  v0.8h
+sqxtun2 v0.16B, v1.8h
+sqxtun  v1.8B,  v2.8h
+sqxtun2 v1.16B, v3.8h
+st1 {v0.16B}, [x0], x2
+st1  

[FFmpeg-devel] [PATCH v2 2/4] avcodec/aarch64/hevcdsp: port add_residual functions

2021-02-04 Thread Josh Dekker
From: Reimar Döffinger 

Speedup is fairly small, around 1.5%, but these are fairly simple.

Signed-off-by: Josh Dekker 
---
 libavcodec/aarch64/hevcdsp_idct_neon.S| 190 ++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  24 +++
 2 files changed, 214 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index c70d6a906d..329038a958 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -36,6 +36,196 @@ const trans, align=4
 .short 31, 22, 13, 4
 endconst
 
+// clip10 in1, in2, c1, c2
+// Clamp two vector registers in place, element-wise, to [\c1, \c2] using
+// signed max/min.
+//   \in1, \in2  vectors to clamp (read and written)
+//   \c1         vector holding the lower bound in every lane
+//   \c2         vector holding the upper bound in every lane
+// The mnemonic must be separated from the first operand: "smax\in1" would
+// expand to a single bogus token such as "smaxv0.8h".
+.macro clip10 in1, in2, c1, c2
+        smax            \in1, \in1, \c1
+        smax            \in2, \in2, \c1
+        smin            \in1, \in1, \c2
+        smin            \in2, \in2, \c2
+.endm
+
+// Add a 4x4 block of 16-bit residuals to a 4x4 block of 8-bit pixels,
+// saturating the result to 0..255, and write it back in place.
+//   x0  pixel pointer; four rows of 4 bytes are read and then rewritten
+//   x1  residual pointer; 16 contiguous 16-bit values are read
+//   x2  byte offset between consecutive pixel rows
+// NOTE(review): the C prototype is not visible in this hunk; presumably
+// (dst, coeffs, stride) - confirm against hevcdsp_init_aarch64.c.
+// Clobbers: v0-v2, v6, v7. x0 is left pointing four rows past its entry value.
+function ff_hevc_add_residual_4x4_8_neon, export=1
+        ld1             {v0.8h-v1.8h}, [x1]
+        ld1             {v2.s}[0], [x0], x2
+        ld1             {v2.s}[1], [x0], x2
+        ld1             {v2.s}[2], [x0], x2
+        ld1             {v2.s}[3], [x0], x2
+        sub             x0,  x0,  x2, lsl #2        // rewind to row 0 for the stores
+        uxtl            v6.8h,  v2.8b               // rows 0-1: u8 -> 16 bit
+        uxtl2           v7.8h,  v2.16b              // rows 2-3: u8 -> 16 bit
+        sqadd           v0.8h,  v0.8h,  v6.8h
+        sqadd           v1.8h,  v1.8h,  v7.8h
+        sqxtun          v0.8b,  v0.8h               // narrow with unsigned saturation
+        sqxtun2         v0.16b, v1.8h
+        st1             {v0.s}[0], [x0], x2
+        st1             {v0.s}[1], [x0], x2
+        st1             {v0.s}[2], [x0], x2
+        st1             {v0.s}[3], [x0], x2
+        ret
+endfunc
+
+// Add a 4x4 block of 16-bit residuals to a 4x4 block of 16-bit pixels,
+// clamp every result to [0, 0x3ff], and write it back in place.
+//   x0  pixel pointer; four rows of 8 bytes are read (via x12) and rewritten
+//   x1  residual pointer; 16 contiguous 16-bit values are read
+//   x2  byte offset between consecutive pixel rows
+// NOTE(review): the C prototype is not visible in this hunk; presumably
+// (dst, coeffs, stride) - confirm against hevcdsp_init_aarch64.c.
+// Clobbers: x12, v0-v5. x0 is left pointing four rows past its entry value.
+function ff_hevc_add_residual_4x4_10_neon, export=1
+        mov             x12, x0                     // separate read pointer; x0 is kept for the stores
+        ld1             {v0.8h-v1.8h}, [x1]
+        ld1             {v2.d}[0], [x12], x2
+        ld1             {v2.d}[1], [x12], x2
+        ld1             {v3.d}[0], [x12], x2
+        sqadd           v0.8h, v0.8h, v2.8h
+        ld1             {v3.d}[1], [x12], x2
+        movi            v4.8h, #0                   // lower clamp bound
+        sqadd           v1.8h, v1.8h, v3.8h
+        mvni            v5.8h, #0xFC, lsl #8        // upper clamp bound: ~0xfc00 = 0x03ff
+        clip10          v0.8h, v1.8h, v4.8h, v5.8h
+        st1             {v0.d}[0], [x0], x2
+        st1             {v0.d}[1], [x0], x2
+        st1             {v1.d}[0], [x0], x2
+        st1             {v1.d}[1], [x0], x2
+        ret
+endfunc
+
+// Add an 8x8 block of 16-bit residuals to an 8x8 block of 8-bit pixels,
+// saturating the result to 0..255, and write it back in place.
+// Two pixel rows are processed per loop iteration (4 iterations).
+//   x0  pixel pointer; walks the even rows
+//   x1  residual pointer; advanced by 32 bytes per iteration
+//   x2  byte offset between consecutive pixel rows (doubled internally)
+// NOTE(review): the C prototype is not visible in this hunk; presumably
+// (dst, coeffs, stride) - confirm against hevcdsp_init_aarch64.c.
+// Clobbers: x0-x3, x12, v0-v3, condition flags.
+function ff_hevc_add_residual_8x8_8_neon, export=1
+        add             x12, x0,  x2                // x12 walks the odd rows
+        add             x2,  x2,  x2                // each pointer steps two rows at a time
+        mov             x3,  #8                     // rows remaining
+1:      subs            x3,  x3,  #2                // flags are consumed by the bne below
+        ld1             {v2.d}[0], [x0]
+        ld1             {v2.d}[1], [x12]
+        uxtl            v3.8h,  v2.8b               // even row: u8 -> 16 bit
+        ld1             {v0.8h-v1.8h}, [x1], #32
+        uxtl2           v2.8h,  v2.16b              // odd row: u8 -> 16 bit (overwrites v2)
+        sqadd           v0.8h,  v0.8h,  v3.8h
+        sqadd           v1.8h,  v1.8h,  v2.8h
+        sqxtun          v0.8b,  v0.8h               // narrow with unsigned saturation
+        sqxtun2         v0.16b, v1.8h
+        st1             {v0.d}[0], [x0],  x2
+        st1             {v0.d}[1], [x12], x2
+        bne             1b
+        ret
+endfunc
+
+// Add an 8x8 block of 16-bit residuals to an 8x8 block of 16-bit pixels,
+// clamp every result to [0, 0x3ff], and write it back in place.
+// Two pixel rows are processed per loop iteration (4 iterations).
+//   x0  pixel pointer; walks the even rows
+//   x1  residual pointer; advanced by 32 bytes per iteration
+//   x2  byte offset between consecutive pixel rows (doubled internally)
+// NOTE(review): the C prototype is not visible in this hunk; presumably
+// (dst, coeffs, stride) - confirm against hevcdsp_init_aarch64.c.
+// Clobbers: x0-x3, x12, v0-v5, condition flags.
+function ff_hevc_add_residual_8x8_10_neon, export=1
+        add             x12, x0,  x2                // x12 walks the odd rows
+        add             x2,  x2,  x2                // each pointer steps two rows at a time
+        mov             x3,  #8                     // rows remaining
+        movi            v4.8h, #0                   // lower clamp bound
+        mvni            v5.8h, #0xFC, lsl #8        // upper clamp bound: ~0xfc00 = 0x03ff
+1:      subs            x3,  x3,  #2                // flags are consumed by the bne below
+        ld1             {v0.8h-v1.8h}, [x1], #32
+        ld1             {v2.8h}, [x0]
+        sqadd           v0.8h, v0.8h, v2.8h
+        ld1             {v3.8h}, [x12]
+        sqadd           v1.8h, v1.8h, v3.8h
+        clip10          v0.8h, v1.8h, v4.8h, v5.8h
+        st1             {v0.8h}, [x0],  x2
+        st1             {v1.8h}, [x12], x2
+        bne             1b
+        ret
+endfunc
+
+// Add a 16x16 block of 16-bit residuals to a 16x16 block of 8-bit pixels,
+// saturating the result to 0..255, and write it back in place.
+// Two pixel rows are processed per loop iteration (8 iterations).
+//   x0  pixel pointer; walks the even rows
+//   x1  residual pointer; advanced by 64 bytes per iteration
+//   x2  byte offset between consecutive pixel rows (doubled internally)
+// NOTE(review): the C prototype is not visible in this hunk; presumably
+// (dst, coeffs, stride) - confirm against hevcdsp_init_aarch64.c.
+// Clobbers: x0-x3, x12, v0-v3, v16-v21, condition flags.
+function ff_hevc_add_residual_16x16_8_neon, export=1
+        mov             x3,  #16                    // rows remaining
+        add             x12, x0,  x2                // x12 walks the odd rows
+        add             x2,  x2,  x2                // each pointer steps two rows at a time
+1:      subs            x3,  x3,  #2                // flags are consumed by the bne below
+        ld1             {v16.16b}, [x0]
+        ld1             {v0.8h-v3.8h}, [x1], #64
+        ld1             {v19.16b}, [x12]
+        uxtl            v17.8h, v16.8b              // even row, pixels 0-7
+        uxtl2           v18.8h, v16.16b             // even row, pixels 8-15
+        uxtl            v20.8h, v19.8b              // odd row, pixels 0-7
+        uxtl2           v21.8h, v19.16b             // odd row, pixels 8-15
+        sqadd           v0.8h,  v0.8h,  v17.8h
+        sqadd           v1.8h,  v1.8h,  v18.8h
+        sqadd           v2.8h,  v2.8h,  v20.8h
+        sqadd           v3.8h,  v3.8h,  v21.8h
+        sqxtun          v0.8b,  v0.8h               // narrow with unsigned saturation
+        sqxtun2         v0.16b, v1.8h               // v1 is consumed here, so it can be reused below
+        sqxtun          v1.8b,  v2.8h
+        sqxtun2         v1.16b, v3.8h
+        st1             {v0.16b}, [x0],  x2
+        st1             {v1.16b}, [x12], x2
+        bne             1b
+        ret
+endfunc
+
+function ff_hevc_add_residual_16x16_10_neon, export=1
+mov x3,  #16
+moviv20.8h, #0
+mvniv21.8h, #0xFC, LSL #8 // movi #0x3FF
+add x12, x0, x2
+add