Re: [FFmpeg-devel] [PATCH v2 2/4] avcodec/aarch64/hevcdsp: port add_residual functions
On Thu, 4 Feb 2021, Josh Dekker wrote: From: Reimar Döffinger Speedup is fairly small, around 1.5%, but these are fairly simple. Signed-off-by: Josh Dekker --- libavcodec/aarch64/hevcdsp_idct_neon.S| 190 ++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 24 +++ 2 files changed, 214 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S index c70d6a906d..329038a958 100644 --- a/libavcodec/aarch64/hevcdsp_idct_neon.S +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -36,6 +36,196 @@ const trans, align=4 .short 31, 22, 13, 4 endconst +.macro clip10 in1, in2, c1, c2 +smax\in1, \in1, \c1 +smax\in2, \in2, \c1 +smin\in1, \in1, \c2 +smin\in2, \in2, \c2 +.endm + +function ff_hevc_add_residual_4x4_8_neon, export=1 +ld1 {v0.8h-v1.8h}, [x1] +ld1 {v2.s}[0], [x0], x2 +ld1 {v2.s}[1], [x0], x2 +ld1 {v2.s}[2], [x0], x2 +ld1 {v2.s}[3], [x0], x2 +sub x0, x0, x2, lsl #2 +uxtlv6.8h, v2.8B +uxtl2 v7.8h, v2.16B Personal preference: I prefer the non-shouty forms like v2.16b instead of v2.16B. +sqadd v0.8h, v0.8h, v6.8h +sqadd v1.8h, v1.8h, v7.8h Nit: Inconsistent alignment between columns 1-2 and 2-3. (And if one would want to make space for full sized operands like v16.16b, they'd all need another space.) 
+sqxtun v0.8B, v0.8h +sqxtun2 v0.16B, v1.8h +st1 {v0.s}[0], [x0], x2 +st1 {v0.s}[1], [x0], x2 +st1 {v0.s}[2], [x0], x2 +st1 {v0.s}[3], [x0], x2 +ret +endfunc + +function ff_hevc_add_residual_4x4_10_neon, export=1 +mov x12, x0 +ld1 {v0.8h-v1.8h}, [x1] +ld1 {v2.d}[0], [x12], x2 +ld1 {v2.d}[1], [x12], x2 +ld1 {v3.d}[0], [x12], x2 +sqadd v0.8h, v0.8h, v2.8h +ld1 {V3.d}[1], [x12], x2 +moviv4.8h, #0 +sqadd v1.8h, v1.8h, v3.8h +mvniv5.8h, #0xFC, LSL #8 // movi #0x3FF +clip10 v0.8h, v1.8h, v4.8h, v5.8h +st1 {v0.d}[0], [x0], x2 +st1 {v0.d}[1], [x0], x2 +st1 {v1.d}[0], [x0], x2 +st1 {v1.d}[1], [x0], x2 +ret +endfunc + +function ff_hevc_add_residual_8x8_8_neon, export=1 +add x12, x0, x2 +add x2, x2, x2 +mov x3, #8 +1: subsx3, x3, #2 Nit: Odd vertical alignment here? +ld1 {v2.d}[0], [x0] +ld1 {v2.d}[1], [x12] +uxtlv3.8h, v2.8B +ld1 {v0.8h-v1.8h}, [x1], #32 +uxtl2 v2.8h, v2.16B +sqadd v0.8h, v0.8h, v3.8h +sqadd v1.8h, v1.8h, v2.8h +sqxtun v0.8B, v0.8h +sqxtun2 v0.16B, v1.8h +st1 {v0.d}[0], [x0], x2 +st1 {v0.d}[1], [x12], x2 +bne 1b +ret +endfunc + +function ff_hevc_add_residual_8x8_10_neon, export=1 +add x12, x0, x2 +add x2, x2, x2 +mov x3, #8 +moviv4.8h, #0 +mvniv5.8h, #0xFC, LSL #8 // movi #0x3FF +1: subsx3, x3, #2 +ld1 {v0.8h-v1.8h}, [x1], #32 +ld1 {v2.8h},[x0] +sqadd v0.8h, v0.8h, v2.8h +ld1 {v3.8h},[x12] +sqadd v1.8h, v1.8h, v3.8h +clip10 v0.8h, v1.8h, v4.8h, v5.8h +st1 {v0.8h}, [x0], x2 +st1 {v1.8h}, [x12], x2 +bne 1b +ret +endfunc + +function ff_hevc_add_residual_16x16_8_neon, export=1 +mov x3, #16 +add x12, x0, x2 +add x2, x2, x2 +1: subsx3, x3, #2 +ld1 {v16.16B}, [x0] +ld1 {v0.8h-v3.8h}, [x1], #64 +ld1 {v19.16B},[x12] +uxtlv17.8h, v16.8B +uxtl2 v18.8h, v16.16B +uxtlv20.8h, v19.8B +uxtl2 v21.8h, v19.16B +sqadd v0.8h, v0.8h, v17.8h +sqadd v1.8h, v1.8h, v18.8h +sqadd v2.8h, v2.8h, v20.8h +sqadd v3.8h, v3.8h, v21.8h +sqxtun v0.8B, v0.8h +sqxtun2 v0.16B, v1.8h +sqxtun v1.8B, v2.8h +sqxtun2 v1.16B, v3.8h +st1 {v0.16B}, [x0], x2 +st1
[FFmpeg-devel] [PATCH v2 2/4] avcodec/aarch64/hevcdsp: port add_residual functions
From: Reimar Döffinger Speedup is fairly small, around 1.5%, but these are fairly simple. Signed-off-by: Josh Dekker --- libavcodec/aarch64/hevcdsp_idct_neon.S| 190 ++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 24 +++ 2 files changed, 214 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S index c70d6a906d..329038a958 100644 --- a/libavcodec/aarch64/hevcdsp_idct_neon.S +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -36,6 +36,196 @@ const trans, align=4 .short 31, 22, 13, 4 endconst +.macro clip10 in1, in2, c1, c2 +smax\in1, \in1, \c1 +smax\in2, \in2, \c1 +smin\in1, \in1, \c2 +smin\in2, \in2, \c2 +.endm + +function ff_hevc_add_residual_4x4_8_neon, export=1 +ld1 {v0.8h-v1.8h}, [x1] +ld1 {v2.s}[0], [x0], x2 +ld1 {v2.s}[1], [x0], x2 +ld1 {v2.s}[2], [x0], x2 +ld1 {v2.s}[3], [x0], x2 +sub x0, x0, x2, lsl #2 +uxtlv6.8h, v2.8B +uxtl2 v7.8h, v2.16B +sqadd v0.8h, v0.8h, v6.8h +sqadd v1.8h, v1.8h, v7.8h +sqxtun v0.8B, v0.8h +sqxtun2 v0.16B, v1.8h +st1 {v0.s}[0], [x0], x2 +st1 {v0.s}[1], [x0], x2 +st1 {v0.s}[2], [x0], x2 +st1 {v0.s}[3], [x0], x2 +ret +endfunc + +function ff_hevc_add_residual_4x4_10_neon, export=1 +mov x12, x0 +ld1 {v0.8h-v1.8h}, [x1] +ld1 {v2.d}[0], [x12], x2 +ld1 {v2.d}[1], [x12], x2 +ld1 {v3.d}[0], [x12], x2 +sqadd v0.8h, v0.8h, v2.8h +ld1 {V3.d}[1], [x12], x2 +moviv4.8h, #0 +sqadd v1.8h, v1.8h, v3.8h +mvniv5.8h, #0xFC, LSL #8 // movi #0x3FF +clip10 v0.8h, v1.8h, v4.8h, v5.8h +st1 {v0.d}[0], [x0], x2 +st1 {v0.d}[1], [x0], x2 +st1 {v1.d}[0], [x0], x2 +st1 {v1.d}[1], [x0], x2 +ret +endfunc + +function ff_hevc_add_residual_8x8_8_neon, export=1 +add x12, x0, x2 +add x2, x2, x2 +mov x3, #8 +1: subsx3, x3, #2 +ld1 {v2.d}[0], [x0] +ld1 {v2.d}[1], [x12] +uxtlv3.8h, v2.8B +ld1 {v0.8h-v1.8h}, [x1], #32 +uxtl2 v2.8h, v2.16B +sqadd v0.8h, v0.8h, v3.8h +sqadd v1.8h, v1.8h, v2.8h +sqxtun v0.8B, v0.8h +sqxtun2 v0.16B, v1.8h +st1 {v0.d}[0], [x0], x2 +st1 {v0.d}[1], [x12], x2 +bne 1b +ret +endfunc + +function 
ff_hevc_add_residual_8x8_10_neon, export=1 +add x12, x0, x2 +add x2, x2, x2 +mov x3, #8 +moviv4.8h, #0 +mvniv5.8h, #0xFC, LSL #8 // movi #0x3FF +1: subsx3, x3, #2 +ld1 {v0.8h-v1.8h}, [x1], #32 +ld1 {v2.8h},[x0] +sqadd v0.8h, v0.8h, v2.8h +ld1 {v3.8h},[x12] +sqadd v1.8h, v1.8h, v3.8h +clip10 v0.8h, v1.8h, v4.8h, v5.8h +st1 {v0.8h}, [x0], x2 +st1 {v1.8h}, [x12], x2 +bne 1b +ret +endfunc + +function ff_hevc_add_residual_16x16_8_neon, export=1 +mov x3, #16 +add x12, x0, x2 +add x2, x2, x2 +1: subsx3, x3, #2 +ld1 {v16.16B}, [x0] +ld1 {v0.8h-v3.8h}, [x1], #64 +ld1 {v19.16B},[x12] +uxtlv17.8h, v16.8B +uxtl2 v18.8h, v16.16B +uxtlv20.8h, v19.8B +uxtl2 v21.8h, v19.16B +sqadd v0.8h, v0.8h, v17.8h +sqadd v1.8h, v1.8h, v18.8h +sqadd v2.8h, v2.8h, v20.8h +sqadd v3.8h, v3.8h, v21.8h +sqxtun v0.8B, v0.8h +sqxtun2 v0.16B, v1.8h +sqxtun v1.8B, v2.8h +sqxtun2 v1.16B, v3.8h +st1 {v0.16B}, [x0], x2 +st1 {v1.16B}, [x12], x2 +bne 1b +ret +endfunc + +function ff_hevc_add_residual_16x16_10_neon, export=1 +mov x3, #16 +moviv20.8h, #0 +mvniv21.8h, #0xFC, LSL #8 // movi #0x3FF +add x12, x0, x2 +add