Hello Mahesh,
+x265-devel mailing list

Please find attached the latest patches from my local git development tree that are not yet part of the public x265 git repo. Could you please run smoke tests and integrate these patches into the public x265 repo? Please let me know if you want me to address any further issues.

Thanks,
Sebastian

________________________________
From: Mahesh Pittala <[email protected]>
Sent: Friday, October 28, 2022 2:41:51 AM
To: Pop, Sebastian
Cc: Swathi Gurumani; Santhoshini Sekar; Gopi Satykrishna Akisetty
Subject: [EXTERNAL] ARM patches

Hello Sebastian,

I have noticed a few ARM patches locally that are not yet pushed to the public x265 repo; we ran smoke tests on them and they passed. Can you please share them with x265 VideoLAN so that we can push them? If you have updated patches, please share them.

Thanks,
Mahesh
From 091439615265b2fb140b4e8220d597f013081fe6 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <[email protected]>
Date: Sat, 29 Oct 2022 18:32:24 +0000
Subject: [PATCH 7/7] [arm64] disable scanPosLast_neon on Apple processors

The code in scanPosLast_neon produces an error when executed on Apple
processors.  Disable the code on Apple processors until the error is fixed.
---
 source/common/aarch64/asm-primitives.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 9a95d2943..bb5d4ce42 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -679,7 +679,9 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
     p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon);
 
     p.weight_pp = PFX(weight_pp_neon);
+#if !defined(__APPLE__)
     p.scanPosLast = PFX(scanPosLast_neon);
+#endif
     p.costCoeffNxN = PFX(costCoeffNxN_neon);
 
 #endif
-- 
2.25.1
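A note on the guard in patch 7/7: the check is compile-time, so Apple silicon builds simply skip the NEON override for scanPosLast while every other arm64 target keeps it. Assuming the usual x265 setup order, in which the portable C primitives are installed before setupAssemblyPrimitives() runs, the Apple build falls back to the C implementation. A minimal C++ sketch of the same pattern (illustrative only, not part of the patch):

    // Sketch: EncoderPrimitives and PFX() are the existing x265 definitions.
    void registerScanPosLast(EncoderPrimitives &p)
    {
    #if defined(__APPLE__)
        (void)p;   // keep the previously installed portable C primitive
    #else
        p.scanPosLast = PFX(scanPosLast_neon);   // NEON override elsewhere
    #endif
    }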
From 7809dea12c4a0541fe05c752f5d2dd3f9bc47abc Mon Sep 17 00:00:00 2001
From: Sebastian Pop <[email protected]>
Date: Wed, 4 May 2022 21:02:09 +0000
Subject: [PATCH 6/7] [arm64] remove two fmov instructions

Before:  scanPosLast  5.78x  737.95  4261.98
After:   scanPosLast  5.45x  783.49  4272.30
---
 source/common/aarch64/pixel-util.S | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 67e1fd660..fba9a90d5 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -2293,20 +2293,18 @@ function PFX(scanPosLast_neon)
     // val - w13 = pmovmskb(v3)
     and         v3.16b, v3.16b, v28.16b
     mov         d4, v3.d[1]
-    addv        b13, v3.8b
-    addv        b14, v4.8b
-    fmov        w13, s13
-    fmov        w14, s14
-    orr         w13, w13, w14, lsl #8
+    addv        b23, v3.8b
+    addv        b24, v4.8b
+    mov         v23.b[1], v24.b[0]
+    fmov        w13, s23
 
     // mask - w15 = pmovmskb(v5)
     and         v5.16b, v5.16b, v28.16b
     mov         d6, v5.d[1]
     addv        b25, v5.8b
     addv        b26, v6.8b
+    mov         v25.b[1], v26.b[0]
     fmov        w15, s25
-    fmov        w14, s26
-    orr         w15, w15, w14, lsl #8
 
     // coeffFlag = reverse_bit(w15) in 16-bit
     rbit        w12, w15
-- 
2.25.1
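For context, the block touched by patch 6/7 emulates SSE's pmovmskb: v28 presumably holds the per-lane bit weights {1, 2, 4, ..., 128} repeated across both halves, the and keeps one weight per set lane, and addv over each 8-byte half sums those weights into a one-byte mask. The old code moved each half to a general register with its own fmov and merged them with orr; the new code glues the two mask bytes together inside a vector register (mov v23.b[1], v24.b[0]) so a single fmov suffices. A scalar model of the value being computed, as a sketch (the 16 input lanes are the 0xFF/0x00 comparison results):

    #include <stdint.h>

    /* Sketch: scalar equivalent of the and/addv/mov/fmov sequence. */
    static uint16_t pmovmskb16(const uint8_t lanes[16])
    {
        uint16_t mask = 0;
        for (int i = 0; i < 16; i++)
            if (lanes[i])                  /* lane is 0xFF when the predicate held */
                mask |= (uint16_t)1 << i;  /* low byte: lanes 0-7, high byte: lanes 8-15 */
        return mask;
    }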
From 1849b4f76a2faa375869ef58b785993937ab1af8 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <[email protected]>
Date: Wed, 4 May 2022 21:05:22 +0000
Subject: [PATCH 5/7] [arm64] do not use FP register v15

According to the Arm64 procedure call standard (AAPCS64), the bottom 64 bits
of v8-v15 (d8-d15) must be preserved by a callee across subroutine calls, so
v15 must not be clobbered here; use the caller-saved v25/v26 instead.
---
 source/common/aarch64/pixel-util.S | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index e04f684b4..67e1fd660 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -2302,10 +2302,10 @@ function PFX(scanPosLast_neon)
     // mask - w15 = pmovmskb(v5)
     and         v5.16b, v5.16b, v28.16b
     mov         d6, v5.d[1]
-    addv        b15, v5.8b
-    addv        b16, v6.8b
-    fmov        w15, s15
-    fmov        w14, s16
+    addv        b25, v5.8b
+    addv        b26, v6.8b
+    fmov        w15, s25
+    fmov        w14, s26
     orr         w15, w15, w14, lsl #8
 
     // coeffFlag = reverse_bit(w15) in 16-bit
-- 
2.25.1
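Why patch 5/7 matters: writing the scalar views b15/s15 zeroes the rest of v15, i.e. it clobbers d15, which the caller is entitled to keep live across the call. A hypothetical stand-alone check (not part of x265, shown only to illustrate the ABI rule) that parks a value in d15 around a call to the routine; with the old b15/s15 usage it would fail, with v25/v26 it passes:

    /* Hypothetical harness; the commented call stands for any invocation of
     * the NEON routine with valid buffers (signature as in fun-decls.h). */
    static int d15_survives_call(void)
    {
        double before = 3.25, after = 0.0;
        __asm__ volatile("fmov d15, %d0" : : "w"(before) : "v15");
        /* ... call PFX(scanPosLast_neon)(scan, coeff, sign, flag, num,
               numSig, scanCG4x4, trSize) here ... */
        __asm__ volatile("fmov %d0, d15" : "=w"(after));
        return before == after;   /* AAPCS64: d8-d15 are callee-saved */
    }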
From 4f9ca86d709a5261af86750d6ca2ae9cdc3298cd Mon Sep 17 00:00:00 2001 From: Sebastian Pop <[email protected]> Date: Fri, 25 Mar 2022 22:53:27 +0000 Subject: [PATCH 4/7] [arm64] use better addressing modes with ld1/st1 --- source/common/aarch64/blockcopy8.S | 159 +++++++++--------------- source/common/aarch64/ipfilter.S | 147 ++++++++-------------- source/common/aarch64/mc-a.S | 191 +++++++++++------------------ source/common/aarch64/p2s.S | 115 +++++++---------- source/common/aarch64/pixel-util.S | 175 ++++++++++---------------- source/common/aarch64/sad-a.S | 20 ++- source/common/aarch64/ssd-a.S | 53 +++----- 7 files changed, 316 insertions(+), 544 deletions(-) diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S index 9f0fb675a..36518435d 100644 --- a/source/common/aarch64/blockcopy8.S +++ b/source/common/aarch64/blockcopy8.S @@ -640,10 +640,9 @@ blockcopy_pp_16xN1_neon 64 function PFX(blockcopy_pp_12x16_neon) sub x1, x1, #8 .rept 16 - ldr q0, [x2] + ld1 {v0.16b}, [x2], x3 str d0, [x0], #8 st1 {v0.s}[2], [x0], x1 - add x2, x2, x3 .endr ret endfunc @@ -654,10 +653,9 @@ function PFX(blockcopy_pp_12x32_neon) .loop_pp_12x32: sub w12, w12, #1 .rept 8 - ldr q0, [x2] + ld1 {v0.16b}, [x2], x3 str d0, [x0], #8 st1 {v0.s}[2], [x0], x1 - add x2, x2, x3 .endr cbnz w12, .loop_pp_12x32 ret @@ -668,11 +666,8 @@ function PFX(blockcopy_pp_24x32_neon) .loop_24x32: sub w12, w12, #1 .rept 8 - ldp q0, q1, [x2] - str q0, [x0] - str d1, [x0, #16] - add x0, x0, x1 - add x2, x2, x3 + ld1 {v0.8b-v2.8b}, [x2], x3 + st1 {v0.8b-v2.8b}, [x0], x1 .endr cbnz w12, .loop_24x32 ret @@ -683,11 +678,8 @@ function PFX(blockcopy_pp_24x64_neon) .loop_24x64: sub w12, w12, #1 .rept 16 - ldp q0, q1, [x2] - str q0, [x0] - str d1, [x0, #16] - add x0, x0, x1 - add x2, x2, x3 + ld1 {v0.8b-v2.8b}, [x2], x3 + st1 {v0.8b-v2.8b}, [x0], x1 .endr cbnz w12, .loop_24x64 ret @@ -695,10 +687,8 @@ endfunc function PFX(blockcopy_pp_32x8_neon) .rept 8 - ldp q0, q1, [x2] - add x2, x2, x3 - stp q0, q1, [x0] - add x0, x0, x1 + ld1 {v0.16b-v1.16b}, [x2], x3 + st1 {v0.16b-v1.16b}, [x0], x1 .endr ret endfunc @@ -709,10 +699,8 @@ function PFX(blockcopy_pp_32x\h\()_neon) .loop_32x\h\(): sub w12, w12, #1 .rept 8 - ldp q0, q1, [x2] - add x2, x2, x3 - stp q0, q1, [x0] - add x0, x0, x1 + ld1 {v0.16b-v1.16b}, [x2], x3 + st1 {v0.16b-v1.16b}, [x0], x1 .endr cbnz w12, .loop_32x\h ret @@ -743,12 +731,8 @@ function PFX(blockcopy_pp_64x\h\()_neon) .loop_64x\h\(): sub w12, w12, #1 .rept 4 - ldp q0, q1, [x2] - ldp q2, q3, [x2, #32] - add x2, x2, x3 - stp q0, q1, [x0] - stp q2, q3, [x0, #32] - add x0, x0, x1 + ld1 {v0.16b-v3.16b}, [x2], x3 + st1 {v0.16b-v3.16b}, [x0], x1 .endr cbnz w12, .loop_64x\h ret @@ -819,12 +803,10 @@ endfunc // uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride) function PFX(copy_cnt_4_neon) lsl x2, x2, #1 - add x3, x2, x2 movi v4.8b, #0 .rept 2 - ldr d0, [x1] - ldr d1, [x1, x2] - add x1, x1, x3 + ld1 {v0.8b}, [x1], x2 + ld1 {v1.8b}, [x1], x2 stp d0, d1, [x0], #16 cmeq v0.4h, v0.4h, #0 cmeq v1.4h, v1.4h, #0 @@ -839,12 +821,10 @@ endfunc function PFX(copy_cnt_8_neon) lsl x2, x2, #1 - add x3, x2, x2 movi v4.8b, #0 .rept 4 - ldr q0, [x1] - ldr q1, [x1, x2] - add x1, x1, x3 + ld1 {v0.16b}, [x1], x2 + ld1 {v1.16b}, [x1], x2 stp q0, q1, [x0], #32 cmeq v0.8h, v0.8h, #0 cmeq v1.8h, v1.8h, #0 @@ -994,7 +974,7 @@ function PFX(cpy2Dto1D_shl_4x4_neon) ld1 {v3.d}[1], [x1], x2 sshl v2.8h, v2.8h, v0.8h sshl v3.8h, v3.8h, v0.8h - stp q2, q3, [x0] + st1 {v2.16b-v3.16b}, [x0] ret endfunc @@ -1005,7 
+985,7 @@ function PFX(cpy2Dto1D_shl_8x8_neon) ld1 {v3.16b}, [x1], x2 sshl v2.8h, v2.8h, v0.8h sshl v3.8h, v3.8h, v0.8h - stp q2, q3, [x0], #32 + st1 {v2.16b-v3.16b}, [x0], #32 .endr ret endfunc @@ -1016,11 +996,10 @@ function PFX(cpy2Dto1D_shl_16x16_neon) .loop_cpy2Dto1D_shl_16: sub w12, w12, #1 .rept 4 - ldp q2, q3, [x1] - add x1, x1, x2 + ld1 {v2.16b-v3.16b}, [x1], x2 sshl v2.8h, v2.8h, v0.8h sshl v3.8h, v3.8h, v0.8h - stp q2, q3, [x0], #32 + st1 {v2.16b-v3.16b}, [x0], #32 .endr cbnz w12, .loop_cpy2Dto1D_shl_16 ret @@ -1032,15 +1011,12 @@ function PFX(cpy2Dto1D_shl_32x32_neon) .loop_cpy2Dto1D_shl_32: sub w12, w12, #1 .rept 2 - ldp q2, q3, [x1] - ldp q4, q5, [x1, #32] - add x1, x1, x2 + ld1 {v2.16b-v5.16b}, [x1], x2 sshl v2.8h, v2.8h, v0.8h sshl v3.8h, v3.8h, v0.8h sshl v4.8h, v4.8h, v0.8h sshl v5.8h, v5.8h, v0.8h - stp q2, q3, [x0], #32 - stp q4, q5, [x0], #32 + st1 {v2.16b-v5.16b}, [x0], #64 .endr cbnz w12, .loop_cpy2Dto1D_shl_32 ret @@ -1049,26 +1025,22 @@ endfunc function PFX(cpy2Dto1D_shl_64x64_neon) cpy2Dto1D_shl_start mov w12, #32 + sub x2, x2, #64 .loop_cpy2Dto1D_shl_64: sub w12, w12, #1 .rept 2 - ldp q2, q3, [x1] - ldp q4, q5, [x1, #32] - ldp q6, q7, [x1, #64] - ldp q16, q17, [x1, #96] - add x1, x1, x2 + ld1 {v2.16b-v5.16b}, [x1], #64 + ld1 {v16.16b-v19.16b}, [x1], x2 sshl v2.8h, v2.8h, v0.8h sshl v3.8h, v3.8h, v0.8h sshl v4.8h, v4.8h, v0.8h sshl v5.8h, v5.8h, v0.8h - sshl v6.8h, v6.8h, v0.8h - sshl v7.8h, v7.8h, v0.8h sshl v16.8h, v16.8h, v0.8h sshl v17.8h, v17.8h, v0.8h - stp q2, q3, [x0], #32 - stp q4, q5, [x0], #32 - stp q6, q7, [x0], #32 - stp q16, q17, [x0], #32 + sshl v18.8h, v18.8h, v0.8h + sshl v19.8h, v19.8h, v0.8h + st1 {v2.16b-v5.16b}, [x0], #64 + st1 {v16.16b-v19.16b}, [x0], #64 .endr cbnz w12, .loop_cpy2Dto1D_shl_64 ret @@ -1158,7 +1130,7 @@ endfunc function PFX(cpy1Dto2D_shl_4x4_neon) cpy1Dto2D_shl_start - ldp q2, q3, [x1] + ld1 {v2.16b-v3.16b}, [x1] sshl v2.8h, v2.8h, v0.8h sshl v3.8h, v3.8h, v0.8h st1 {v2.d}[0], [x0], x2 @@ -1171,13 +1143,11 @@ endfunc function PFX(cpy1Dto2D_shl_8x8_neon) cpy1Dto2D_shl_start .rept 4 - ldp q2, q3, [x1], #32 + ld1 {v2.16b-v3.16b}, [x1], #32 sshl v2.8h, v2.8h, v0.8h sshl v3.8h, v3.8h, v0.8h - str q2, [x0] - add x0, x0, x2 - str q3, [x0] - add x0, x0, x2 + st1 {v2.16b}, [x0], x2 + st1 {v3.16b}, [x0], x2 .endr ret endfunc @@ -1188,11 +1158,10 @@ function PFX(cpy1Dto2D_shl_16x16_neon) .loop_cpy1Dto2D_shl_16: sub w12, w12, #1 .rept 4 - ldp q2, q3, [x1], #32 + ld1 {v2.16b-v3.16b}, [x1], #32 sshl v2.8h, v2.8h, v0.8h sshl v3.8h, v3.8h, v0.8h - stp q2, q3, [x0] - add x0, x0, x2 + st1 {v2.16b-v3.16b}, [x0], x2 .endr cbnz w12, .loop_cpy1Dto2D_shl_16 ret @@ -1204,15 +1173,12 @@ function PFX(cpy1Dto2D_shl_32x32_neon) .loop_cpy1Dto2D_shl_32: sub w12, w12, #1 .rept 2 - ldp q2, q3, [x1], #32 - ldp q4, q5, [x1], #32 + ld1 {v2.16b-v5.16b}, [x1], #64 sshl v2.8h, v2.8h, v0.8h sshl v3.8h, v3.8h, v0.8h sshl v4.8h, v4.8h, v0.8h sshl v5.8h, v5.8h, v0.8h - stp q2, q3, [x0] - stp q4, q5, [x0, #32] - add x0, x0, x2 + st1 {v2.16b-v5.16b}, [x0], x2 .endr cbnz w12, .loop_cpy1Dto2D_shl_32 ret @@ -1221,26 +1187,22 @@ endfunc function PFX(cpy1Dto2D_shl_64x64_neon) cpy1Dto2D_shl_start mov w12, #32 + sub x2, x2, #64 .loop_cpy1Dto2D_shl_64: sub w12, w12, #1 .rept 2 - ldp q2, q3, [x1], #32 - ldp q4, q5, [x1], #32 - ldp q6, q7, [x1], #32 - ldp q16, q17, [x1], #32 + ld1 {v2.16b-v5.16b}, [x1], #64 + ld1 {v16.16b-v19.16b}, [x1], #64 sshl v2.8h, v2.8h, v0.8h sshl v3.8h, v3.8h, v0.8h sshl v4.8h, v4.8h, v0.8h sshl v5.8h, v5.8h, v0.8h - sshl v6.8h, v6.8h, v0.8h - sshl v7.8h, 
v7.8h, v0.8h sshl v16.8h, v16.8h, v0.8h sshl v17.8h, v17.8h, v0.8h - stp q2, q3, [x0] - stp q4, q5, [x0, #32] - stp q6, q7, [x0, #64] - stp q16, q17, [x0, #96] - add x0, x0, x2 + sshl v18.8h, v18.8h, v0.8h + sshl v19.8h, v19.8h, v0.8h + st1 {v2.16b-v5.16b}, [x0], #64 + st1 {v16.16b-v19.16b}, [x0], x2 .endr cbnz w12, .loop_cpy1Dto2D_shl_64 ret @@ -1258,7 +1220,7 @@ endfunc function PFX(cpy1Dto2D_shr_4x4_neon) cpy1Dto2D_shr_start - ldp q2, q3, [x1] + ld1 {v2.16b-v3.16b}, [x1] sub v2.8h, v2.8h, v1.8h sub v3.8h, v3.8h, v1.8h sshl v2.8h, v2.8h, v0.8h @@ -1273,7 +1235,7 @@ endfunc function PFX(cpy1Dto2D_shr_8x8_neon) cpy1Dto2D_shr_start .rept 4 - ldp q2, q3, [x1], #32 + ld1 {v2.16b-v3.16b}, [x1], #32 sub v2.8h, v2.8h, v1.8h sub v3.8h, v3.8h, v1.8h sshl v2.8h, v2.8h, v0.8h @@ -1307,8 +1269,7 @@ function PFX(cpy1Dto2D_shr_32x32_neon) .loop_cpy1Dto2D_shr_32: sub w12, w12, #1 .rept 2 - ldp q2, q3, [x1], #32 - ldp q4, q5, [x1], #32 + ld1 {v2.16b-v5.16b}, [x1], #64 sub v2.8h, v2.8h, v1.8h sub v3.8h, v3.8h, v1.8h sub v4.8h, v4.8h, v1.8h @@ -1317,9 +1278,7 @@ function PFX(cpy1Dto2D_shr_32x32_neon) sshl v3.8h, v3.8h, v0.8h sshl v4.8h, v4.8h, v0.8h sshl v5.8h, v5.8h, v0.8h - stp q2, q3, [x0] - stp q4, q5, [x0, #32] - add x0, x0, x2 + st1 {v2.16b-v5.16b}, [x0], x2 .endr cbnz w12, .loop_cpy1Dto2D_shr_32 ret @@ -1328,34 +1287,30 @@ endfunc function PFX(cpy1Dto2D_shr_64x64_neon) cpy1Dto2D_shr_start mov w12, #32 + sub x2, x2, #64 .loop_cpy1Dto2D_shr_64: sub w12, w12, #1 .rept 2 - ldp q2, q3, [x1], #32 - ldp q4, q5, [x1], #32 - ldp q6, q7, [x1], #32 - ldp q16, q17, [x1], #32 + ld1 {v2.16b-v5.16b}, [x1], #64 + ld1 {v16.16b-v19.16b}, [x1], #64 sub v2.8h, v2.8h, v1.8h sub v3.8h, v3.8h, v1.8h sub v4.8h, v4.8h, v1.8h sub v5.8h, v5.8h, v1.8h - sub v6.8h, v6.8h, v1.8h - sub v7.8h, v7.8h, v1.8h sub v16.8h, v16.8h, v1.8h sub v17.8h, v17.8h, v1.8h + sub v18.8h, v18.8h, v1.8h + sub v19.8h, v19.8h, v1.8h sshl v2.8h, v2.8h, v0.8h sshl v3.8h, v3.8h, v0.8h sshl v4.8h, v4.8h, v0.8h sshl v5.8h, v5.8h, v0.8h - sshl v6.8h, v6.8h, v0.8h - sshl v7.8h, v7.8h, v0.8h sshl v16.8h, v16.8h, v0.8h sshl v17.8h, v17.8h, v0.8h - stp q2, q3, [x0] - stp q4, q5, [x0, #32] - stp q6, q7, [x0, #64] - stp q16, q17, [x0, #96] - add x0, x0, x2 + sshl v18.8h, v18.8h, v0.8h + sshl v19.8h, v19.8h, v0.8h + st1 {v2.16b-v5.16b}, [x0], #64 + st1 {v16.16b-v19.16b}, [x0], x2 .endr cbnz w12, .loop_cpy1Dto2D_shr_64 ret diff --git a/source/common/aarch64/ipfilter.S b/source/common/aarch64/ipfilter.S index 9e11f42e5..22e66a7a4 100644 --- a/source/common/aarch64/ipfilter.S +++ b/source/common/aarch64/ipfilter.S @@ -127,27 +127,23 @@ .macro qpel_load_32b v .if \v == 0 add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0 - ldr d3, [x6] - add x6, x6, x1 + ld1 {v3.8b}, [x6], x1 .elseif \v == 1 || \v == 2 || \v == 3 .if \v != 3 // not used in qpel_filter_3 - ldr d0, [x6] -.endif - add x6, x6, x1 - ldr d1, [x6] - add x6, x6, x1 - ldr d2, [x6] - add x6, x6, x1 - ldr d3, [x6] - add x6, x6, x1 - ldr d4, [x6] - add x6, x6, x1 - ldr d5, [x6] + ld1 {v0.8b}, [x6], x1 +.else add x6, x6, x1 - ldr d6, [x6] +.endif + ld1 {v1.8b}, [x6], x1 + ld1 {v2.8b}, [x6], x1 + ld1 {v3.8b}, [x6], x1 + ld1 {v4.8b}, [x6], x1 + ld1 {v5.8b}, [x6], x1 .if \v != 1 // not used in qpel_filter_1 - add x6, x6, x1 - ldr d7, [x6] + ld1 {v6.8b}, [x6], x1 + ld1 {v7.8b}, [x6] +.else + ld1 {v6.8b}, [x6] .endif .endif .endm @@ -155,27 +151,23 @@ .macro qpel_load_64b v .if \v == 0 add x6, x6, x11 // do not load 3 values that are not used in qpel_filter_0 - ldr q3, [x6] - add x6, x6, x1 + 
ld1 {v3.16b}, [x6], x1 .elseif \v == 1 || \v == 2 || \v == 3 .if \v != 3 // not used in qpel_filter_3 - ldr q0, [x6] -.endif - add x6, x6, x1 - ldr q1, [x6] - add x6, x6, x1 - ldr q2, [x6] - add x6, x6, x1 - ldr q3, [x6] - add x6, x6, x1 - ldr q4, [x6] - add x6, x6, x1 - ldr q5, [x6] + ld1 {v0.16b}, [x6], x1 +.else add x6, x6, x1 - ldr q6, [x6] +.endif + ld1 {v1.16b}, [x6], x1 + ld1 {v2.16b}, [x6], x1 + ld1 {v3.16b}, [x6], x1 + ld1 {v4.16b}, [x6], x1 + ld1 {v5.16b}, [x6], x1 .if \v != 1 // not used in qpel_filter_1 - add x6, x6, x1 - ldr q7, [x6] + ld1 {v6.16b}, [x6], x1 + ld1 {v7.16b}, [x6] +.else + ld1 {v6.16b}, [x6] .endif .endif .endm @@ -186,13 +178,10 @@ add x6, x6, x1 ldr d1, [x6] .else - ldr d0, [x6] - add x6, x6, x1 - ldr d1, [x6] - add x6, x6, x1 - ldr d2, [x6] - add x6, x6, x1 - ldr d3, [x6] + ld1 {v0.8b}, [x6], x1 + ld1 {v1.8b}, [x6], x1 + ld1 {v2.8b}, [x6], x1 + ld1 {v3.8b}, [x6] .endif .endm @@ -202,13 +191,10 @@ add x6, x6, x1 ldr q1, [x6] .else - ldr q0, [x6] - add x6, x6, x1 - ldr q1, [x6] - add x6, x6, x1 - ldr q2, [x6] - add x6, x6, x1 - ldr q3, [x6] + ld1 {v0.16b}, [x6], x1 + ld1 {v1.16b}, [x6], x1 + ld1 {v2.16b}, [x6], x1 + ld1 {v3.16b}, [x6] .endif .endm @@ -884,8 +870,7 @@ function x265_interp_8tap_vert_pp_4x\h\()_neon add v16.8h, v20.8h, v21.8h sqrshrun v16.8b, v16.8h, #6 - str s16, [x2] - add x2, x2, x3 + st1 {v16.s}[0], [x2], x3 st1 {v16.s}[1], [x2], x3 sub x9, x9, #2 @@ -1024,49 +1009,41 @@ function x265_interp_8tap_vert_ps_4x\h\()_neon .loop_vps_4x\h: mov x6, x0 - ldr s0, [x6] - add x6, x6, x1 + ld1 {v0.s}[0], [x6], x1 + ld1 {v1.s}[0], [x6], x1 + ld1 {v2.s}[0], [x6], x1 + ld1 {v3.s}[0], [x6], x1 + ld1 {v4.s}[0], [x6], x1 + ld1 {v5.s}[0], [x6], x1 + ld1 {v6.s}[0], [x6], x1 + ld1 {v7.s}[0], [x6], x1 uxtl v0.8h, v0.8b uxtl v0.4s, v0.4h - ldr s1, [x6] - add x6, x6, x1 uxtl v1.8h, v1.8b uxtl v1.4s, v1.4h mul v0.4s, v0.4s, v16.4s - ldr s2, [x6] - add x6, x6, x1 uxtl v2.8h, v2.8b uxtl v2.4s, v2.4h mla v0.4s, v1.4s, v17.4s - ldr s3, [x6] - add x6, x6, x1 uxtl v3.8h, v3.8b uxtl v3.4s, v3.4h mla v0.4s, v2.4s, v18.4s - ldr s4, [x6] - add x6, x6, x1 uxtl v4.8h, v4.8b uxtl v4.4s, v4.4h mla v0.4s, v3.4s, v19.4s - ldr s5, [x6] - add x6, x6, x1 uxtl v5.8h, v5.8b uxtl v5.4s, v5.4h mla v0.4s, v4.4s, v20.4s - ldr s6, [x6] - add x6, x6, x1 uxtl v6.8h, v6.8b uxtl v6.4s, v6.4h mla v0.4s, v5.4s, v21.4s - ldr s7, [x6] - add x6, x6, x1 uxtl v7.8h, v7.8b uxtl v7.4s, v7.4h mla v0.4s, v6.4s, v22.4s @@ -1075,8 +1052,7 @@ function x265_interp_8tap_vert_ps_4x\h\()_neon sub v0.4s, v0.4s, v28.4s sqxtn v0.4h, v0.4s - str d0, [x2] - add x2, x2, x3 + st1 {v0.8b}, [x2], x3 add x0, x0, x1 sub x4, x4, #1 @@ -1214,42 +1190,28 @@ function x265_interp_8tap_vert_sp_4x\h\()_neon .loop_vsp_4x\h: mov x6, x0 - ldr d0, [x6] - add x6, x6, x1 - sshll v0.4s, v0.4h, #0 + ld1 {v0.8b}, [x6], x1 + ld1 {v1.8b}, [x6], x1 + ld1 {v2.8b}, [x6], x1 + ld1 {v3.8b}, [x6], x1 + ld1 {v4.8b}, [x6], x1 + ld1 {v5.8b}, [x6], x1 + ld1 {v6.8b}, [x6], x1 + ld1 {v7.8b}, [x6], x1 - ldr d1, [x6] - add x6, x6, x1 + sshll v0.4s, v0.4h, #0 sshll v1.4s, v1.4h, #0 mul v0.4s, v0.4s, v16.4s - - ldr d2, [x6] - add x6, x6, x1 sshll v2.4s, v2.4h, #0 mla v0.4s, v1.4s, v17.4s - - ldr d3, [x6] - add x6, x6, x1 sshll v3.4s, v3.4h, #0 mla v0.4s, v2.4s, v18.4s - - ldr d4, [x6] - add x6, x6, x1 sshll v4.4s, v4.4h, #0 mla v0.4s, v3.4s, v19.4s - - ldr d5, [x6] - add x6, x6, x1 sshll v5.4s, v5.4h, #0 mla v0.4s, v4.4s, v20.4s - - ldr d6, [x6] - add x6, x6, x1 sshll v6.4s, v6.4h, #0 mla v0.4s, v5.4s, v21.4s - - ldr d7, [x6] - add x6, x6, x1 sshll v7.4s, 
v7.4h, #0 mla v0.4s, v6.4s, v22.4s @@ -1258,8 +1220,7 @@ function x265_interp_8tap_vert_sp_4x\h\()_neon add v0.4s, v0.4s, v24.4s sqshrun v0.4h, v0.4s, #12 sqxtun v0.8b, v0.8h - str s0, [x2] - add x2, x2, x3 + st1 {v0.s}[0], [x2], x3 add x0, x0, x1 sub x4, x4, #1 diff --git a/source/common/aarch64/mc-a.S b/source/common/aarch64/mc-a.S index 930d0cf4f..6ae167740 100644 --- a/source/common/aarch64/mc-a.S +++ b/source/common/aarch64/mc-a.S @@ -37,13 +37,10 @@ .macro pixel_avg_pp_4xN_neon h function PFX(pixel_avg_pp_4x\h\()_neon) .rept \h - ldr s0, [x2] - ldr s1, [x4] - add x2, x2, x3 - add x4, x4, x5 + ld1 {v0.s}[0], [x2], x3 + ld1 {v1.s}[0], [x4], x5 urhadd v2.8b, v0.8b, v1.8b - str s2, [x0] - add x0, x0, x1 + st1 {v2.s}[0], [x0], x1 .endr ret endfunc @@ -56,13 +53,10 @@ pixel_avg_pp_4xN_neon 16 .macro pixel_avg_pp_8xN_neon h function PFX(pixel_avg_pp_8x\h\()_neon) .rept \h - ldr d0, [x2] - ldr d1, [x4] - add x2, x2, x3 - add x4, x4, x5 + ld1 {v0.8b}, [x2], x3 + ld1 {v1.8b}, [x4], x5 urhadd v2.8b, v0.8b, v1.8b - str d2, [x0] - add x0, x0, x1 + st1 {v2.8b}, [x0], x1 .endr ret endfunc @@ -74,18 +68,18 @@ pixel_avg_pp_8xN_neon 16 pixel_avg_pp_8xN_neon 32 function PFX(pixel_avg_pp_12x16_neon) + sub x1, x1, #4 + sub x3, x3, #4 + sub x5, x5, #4 .rept 16 - ldr s0, [x2] - ldr d1, [x2, #4] - ldr s2, [x4] - ldr d3, [x4, #4] - add x2, x2, x3 - add x4, x4, x5 + ld1 {v0.s}[0], [x2], #4 + ld1 {v1.8b}, [x2], x3 + ld1 {v2.s}[0], [x4], #4 + ld1 {v3.8b}, [x4], x5 urhadd v4.8b, v0.8b, v2.8b urhadd v5.8b, v1.8b, v3.8b - str s4, [x0] - str d5, [x0, #4] - add x0, x0, x1 + st1 {v4.s}[0], [x0], #4 + st1 {v5.8b}, [x0], x1 .endr ret endfunc @@ -123,21 +117,21 @@ function PFX(pixel_avg_pp_16x64_neon) endfunc function PFX(pixel_avg_pp_24x32_neon) + sub x1, x1, #16 + sub x3, x3, #16 + sub x5, x5, #16 mov w12, #4 .lpavg_24x32: sub w12, w12, #1 .rept 8 - ldr q0, [x2] - ldr d1, [x2, #16] - ldr q2, [x4] - ldr d3, [x4, #16] - add x2, x2, x3 - add x4, x4, x5 + ld1 {v0.16b}, [x2], #16 + ld1 {v1.8b}, [x2], x3 + ld1 {v2.16b}, [x4], #16 + ld1 {v3.8b}, [x4], x5 urhadd v0.16b, v0.16b, v2.16b urhadd v1.8b, v1.8b, v3.8b - str q0, [x0] - str d1, [x0, #16] - add x0, x0, x1 + st1 {v0.16b}, [x0], #16 + st1 {v1.8b}, [x0], x1 .endr cbnz w12, .lpavg_24x32 ret @@ -146,14 +140,11 @@ endfunc .macro pixel_avg_pp_32xN_neon h function PFX(pixel_avg_pp_32x\h\()_neon) .rept \h - ldp q0, q1, [x2] - ldp q2, q3, [x4] - add x2, x2, x3 - add x4, x4, x5 + ld1 {v0.16b-v1.16b}, [x2], x3 + ld1 {v2.16b-v3.16b}, [x4], x5 urhadd v0.16b, v0.16b, v2.16b urhadd v1.16b, v1.16b, v3.16b - stp q0, q1, [x0] - add x0, x0, x1 + st1 {v0.16b-v1.16b}, [x0], x1 .endr ret endfunc @@ -169,14 +160,11 @@ function PFX(pixel_avg_pp_32x\h\()_neon) .lpavg_32x\h\(): sub w12, w12, #1 .rept 8 - ldp q0, q1, [x2] - ldp q2, q3, [x4] - add x2, x2, x3 - add x4, x4, x5 + ld1 {v0.16b-v1.16b}, [x2], x3 + ld1 {v2.16b-v3.16b}, [x4], x5 urhadd v0.16b, v0.16b, v2.16b urhadd v1.16b, v1.16b, v3.16b - stp q0, q1, [x0] - add x0, x0, x1 + st1 {v0.16b-v1.16b}, [x0], x1 .endr cbnz w12, .lpavg_32x\h ret @@ -191,18 +179,12 @@ function PFX(pixel_avg_pp_48x64_neon) .lpavg_48x64: sub w12, w12, #1 .rept 8 - ldr q0, [x2] - ldr q3, [x4] - ldp q1, q2, [x2, #16] - ldp q4, q5, [x4, #16] - add x2, x2, x3 - add x4, x4, x5 + ld1 {v0.16b-v2.16b}, [x2], x3 + ld1 {v3.16b-v5.16b}, [x4], x5 urhadd v0.16b, v0.16b, v3.16b urhadd v1.16b, v1.16b, v4.16b urhadd v2.16b, v2.16b, v5.16b - str q0, [x0] - stp q1, q2, [x0, #16] - add x0, x0, x1 + st1 {v0.16b-v2.16b}, [x0], x1 .endr cbnz w12, .lpavg_48x64 ret @@ -214,19 +196,13 @@ 
function PFX(pixel_avg_pp_64x\h\()_neon) .lpavg_64x\h\(): sub w12, w12, #1 .rept 4 - ldp q0, q1, [x2] - ldp q4, q5, [x4] - ldp q2, q3, [x2, #32] - ldp q6, q7, [x4, #32] - add x2, x2, x3 - add x4, x4, x5 + ld1 {v0.16b-v3.16b}, [x2], x3 + ld1 {v4.16b-v7.16b}, [x4], x5 urhadd v0.16b, v0.16b, v4.16b urhadd v1.16b, v1.16b, v5.16b urhadd v2.16b, v2.16b, v6.16b urhadd v3.16b, v3.16b, v7.16b - stp q0, q1, [x0] - stp q2, q3, [x0, #32] - add x0, x0, x1 + st1 {v0.16b-v3.16b}, [x0], x1 .endr cbnz w12, .lpavg_64x\h ret @@ -284,14 +260,10 @@ addAvg_2xN 16 function PFX(addAvg_4x\h\()_neon) addAvg_start .rept \h / 2 - ldr d0, [x0] - ldr d1, [x1] - add x0, x0, x3 - add x1, x1, x4 - ldr d2, [x0] - ldr d3, [x1] - add x0, x0, x3 - add x1, x1, x4 + ld1 {v0.8b}, [x0], x3 + ld1 {v1.8b}, [x1], x4 + ld1 {v2.8b}, [x0], x3 + ld1 {v3.8b}, [x1], x4 add v0.4h, v0.4h, v1.4h add v2.4h, v2.4h, v3.4h saddl v0.4s, v0.4h, v30.4h @@ -299,8 +271,7 @@ function PFX(addAvg_4x\h\()_neon) shrn v0.4h, v0.4s, #7 shrn2 v0.8h, v2.4s, #7 sqxtun v0.8b, v0.8h - str s0, [x2] - add x2, x2, x5 + st1 {v0.s}[0], [x2], x5 st1 {v0.s}[1], [x2], x5 .endr ret @@ -320,14 +291,10 @@ function PFX(addAvg_6x\h\()_neon) sub x5, x5, #4 .loop_addavg_6x\h: sub w12, w12, #1 - ldr q0, [x0] - ldr q1, [x1] - add x0, x0, x3 - add x1, x1, x4 - ldr q2, [x0] - ldr q3, [x1] - add x0, x0, x3 - add x1, x1, x4 + ld1 {v0.16b}, [x0], x3 + ld1 {v1.16b}, [x1], x4 + ld1 {v2.16b}, [x0], x3 + ld1 {v3.16b}, [x1], x4 add v0.8h, v0.8h, v1.8h add v2.8h, v2.8h, v3.8h saddl v16.4s, v0.4h, v30.4h @@ -356,14 +323,10 @@ addAvg_6xN 16 function PFX(addAvg_8x\h\()_neon) addAvg_start .rept \h / 2 - ldr q0, [x0] - ldr q1, [x1] - add x0, x0, x3 - add x1, x1, x4 - ldr q2, [x0] - ldr q3, [x1] - add x0, x0, x3 - add x1, x1, x4 + ld1 {v0.16b}, [x0], x3 + ld1 {v1.16b}, [x1], x4 + ld1 {v2.16b}, [x0], x3 + ld1 {v3.16b}, [x1], x4 add v0.8h, v0.8h, v1.8h add v2.8h, v2.8h, v3.8h saddl v16.4s, v0.4h, v30.4h @@ -376,10 +339,8 @@ function PFX(addAvg_8x\h\()_neon) shrn2 v1.8h, v19.4s, #7 sqxtun v0.8b, v0.8h sqxtun v1.8b, v1.8h - str d0, [x2] - add x2, x2, x5 - str d1, [x2] - add x2, x2, x5 + st1 {v0.8b}, [x2], x5 + st1 {v1.8b}, [x2], x5 .endr ret endfunc @@ -391,14 +352,10 @@ function PFX(addAvg_8x\h\()_neon) mov w12, #\h / 2 .loop_addavg_8x\h: sub w12, w12, #1 - ldr q0, [x0] - ldr q1, [x1] - add x0, x0, x3 - add x1, x1, x4 - ldr q2, [x0] - ldr q3, [x1] - add x0, x0, x3 - add x1, x1, x4 + ld1 {v0.16b}, [x0], x3 + ld1 {v1.16b}, [x1], x4 + ld1 {v2.16b}, [x0], x3 + ld1 {v3.16b}, [x1], x4 add v0.8h, v0.8h, v1.8h add v2.8h, v2.8h, v3.8h saddl v16.4s, v0.4h, v30.4h @@ -411,10 +368,8 @@ function PFX(addAvg_8x\h\()_neon) shrn2 v1.8h, v19.4s, #7 sqxtun v0.8b, v0.8h sqxtun v1.8b, v1.8h - str d0, [x2] - add x2, x2, x5 - str d1, [x2] - add x2, x2, x5 + st1 {v0.8b}, [x2], x5 + st1 {v1.8b}, [x2], x5 cbnz w12, .loop_addavg_8x\h ret endfunc @@ -432,15 +387,16 @@ addAvg_8xN1 64 .macro addAvg_12xN h function PFX(addAvg_12x\h\()_neon) addAvg_start + sub x3, x3, #16 + sub x4, x4, #16 + sub x5, x5, #8 mov w12, #\h .loop_addAvg_12X\h\(): sub w12, w12, #1 - ldr q0, [x0] - ldr q1, [x1] - ldr d2, [x0, #16] - ldr d3, [x1, #16] - add x0, x0, x3 - add x1, x1, x4 + ld1 {v0.16b}, [x0], #16 + ld1 {v1.16b}, [x1], #16 + ld1 {v2.8b}, [x0], x3 + ld1 {v3.8b}, [x1], x4 add v0.8h, v0.8h, v1.8h add v2.4h, v2.4h, v3.4h saddl v16.4s, v0.4h, v30.4h @@ -451,9 +407,8 @@ function PFX(addAvg_12x\h\()_neon) shrn v1.4h, v18.4s, #7 sqxtun v0.8b, v0.8h sqxtun v1.8b, v1.8h - str d0, [x2] - str s1, [x2, #8] - add x2, x2, x5 + st1 {v0.8b}, [x2], #8 + st1 
{v1.s}[0], [x2], x5 cbnz w12, .loop_addAvg_12X\h ret endfunc @@ -502,21 +457,15 @@ function PFX(addAvg_24x\h\()_neon) mov w12, #\h .loop_addavg_24x\h\(): sub w12, w12, #1 - ldr q0, [x0] - ldr q3, [x1] - ldp q1, q2, [x0, #16] - ldp q4, q5, [x1, #16] - add x0, x0, x3 - add x1, x1, x4 + ld1 {v0.16b-v2.16b}, [x0], x3 + ld1 {v3.16b-v5.16b}, [x1], x4 addavg_1 v0, v3 addavg_1 v1, v4 addavg_1 v2, v5 sqxtun v0.8b, v0.8h sqxtun v1.8b, v1.8h - sqxtun2 v1.16b, v2.8h - str d0, [x2] - str q1, [x2, #8] - add x2, x2, x5 + sqxtun v2.8b, v2.8h + st1 {v0.8b-v2.8b}, [x2], x5 cbnz w12, .loop_addavg_24x\h ret endfunc diff --git a/source/common/aarch64/p2s.S b/source/common/aarch64/p2s.S index 8f7b999bc..8d6b3dce3 100644 --- a/source/common/aarch64/p2s.S +++ b/source/common/aarch64/p2s.S @@ -55,8 +55,7 @@ .macro p2s_2x2 #if HIGH_BIT_DEPTH - ldr s0, [x0] - add x0, x0, x1 + ld1 {v0.s}[0], [x0], x1 ld1 {v0.s}[1], [x0], x1 shl v3.8h, v0.8h, #P2S_SHIFT #else @@ -90,13 +89,11 @@ p2s_2xN 16 .macro p2s_6x2 #if HIGH_BIT_DEPTH - ldr d0, [x0] - add x0, x0, #8 - ldr s1, [x0] - add x0, x0, x1 + ld1 {v0.d}[0], [x0], #8 + ld1 {v1.s}[0], [x0], x1 ld1 {v0.d}[1], [x0], #8 - shl v3.8h, v0.8h, #P2S_SHIFT ld1 {v1.s}[1], [x0], x1 + shl v3.8h, v0.8h, #P2S_SHIFT shl v4.8h, v1.8h, #P2S_SHIFT #else ldr s0, [x0] @@ -138,13 +135,11 @@ p2s_6xN 16 function PFX(filterPixelToShort_4x2_neon) p2s_start #if HIGH_BIT_DEPTH - ldr d0, [x0] - add x0, x0, x1 + ld1 {v0.d}[0], [x0], x1 ld1 {v0.d}[1], [x0], x1 shl v3.8h, v0.8h, #P2S_SHIFT #else - ldr s0, [x0] - add x0, x0, x1 + ld1 {v0.s}[0], [x0], x1 ld1 {v0.s}[1], [x0], x1 ushll v3.8h, v0.8b, #P2S_SHIFT #endif @@ -157,13 +152,11 @@ endfunc function PFX(filterPixelToShort_4x4_neon) p2s_start #if HIGH_BIT_DEPTH - ldr d0, [x0] - add x0, x0, x1 + ld1 {v0.d}[0], [x0], x1 ld1 {v0.d}[1], [x0], x1 shl v3.8h, v0.8h, #P2S_SHIFT #else - ldr s0, [x0] - add x0, x0, x1 + ld1 {v0.s}[0], [x0], x1 ld1 {v0.s}[1], [x0], x1 ushll v3.8h, v0.8b, #P2S_SHIFT #endif @@ -171,13 +164,11 @@ function PFX(filterPixelToShort_4x4_neon) st1 {v3.d}[0], [x2], x3 st1 {v3.d}[1], [x2], x3 #if HIGH_BIT_DEPTH - ldr d1, [x0] - add x0, x0, x1 + ld1 {v1.d}[0], [x0], x1 ld1 {v1.d}[1], [x0], x1 shl v4.8h, v1.8h, #P2S_SHIFT #else - ldr s1, [x0] - add x0, x0, x1 + ld1 {v1.s}[0], [x0], x1 ld1 {v1.s}[1], [x0], x1 ushll v4.8h, v1.8b, #P2S_SHIFT #endif @@ -192,23 +183,19 @@ function PFX(filterPixelToShort_4x\h\()_neon) p2s_start .rept \h / 2 #if HIGH_BIT_DEPTH - ldr q0, [x0] - add x0, x0, x1 + ld1 {v0.16b}, [x0], x1 shl v0.8h, v0.8h, #P2S_SHIFT #else - ldr d0, [x0] - add x0, x0, x1 + ld1 {v0.8b}, [x0], x1 ushll v0.8h, v0.8b, #P2S_SHIFT #endif add v2.4h, v0.4h, v31.4h st1 {v2.4h}, [x2], x3 #if HIGH_BIT_DEPTH - ldr q1, [x0] - add x0, x0, x1 + ld1 {v1.16b}, [x0], x1 shl v1.8h, v1.8h, #P2S_SHIFT #else - ldr d1, [x0] - add x0, x0, x1 + ld1 {v1.8b}, [x0], x1 ushll v1.8h, v1.8b, #P2S_SHIFT #endif add v3.4h, v1.4h, v31.4h @@ -227,18 +214,14 @@ function PFX(filterPixelToShort_8x\h\()_neon) p2s_start .rept \h / 2 #if HIGH_BIT_DEPTH - ldr q0, [x0] - add x0, x0, x1 + ld1 {v0.16b}, [x0], x1 + ld1 {v1.16b}, [x0], x1 shl v0.8h, v0.8h, #P2S_SHIFT - ldr q1, [x0] - add x0, x0, x1 shl v1.8h, v1.8h, #P2S_SHIFT #else - ldr d0, [x0] - add x0, x0, x1 + ld1 {v0.8b}, [x0], x1 + ld1 {v1.8b}, [x0], x1 ushll v0.8h, v0.8b, #P2S_SHIFT - ldr d1, [x0] - add x0, x0, x1 ushll v1.8h, v1.8b, #P2S_SHIFT #endif add v2.8h, v0.8h, v31.8h @@ -265,13 +248,11 @@ function PFX(filterPixelToShort_12x\h\()_neon) sub x3, x3, #16 .rept \h #if HIGH_BIT_DEPTH - ldp q0, q1, [x0] - add x0, x0, x1 + ld1 
{v0.16b-v1.16b}, [x0], x1 shl v2.8h, v0.8h, #P2S_SHIFT shl v3.8h, v1.8h, #P2S_SHIFT #else - ldr q0, [x0] - add x0, x0, x1 + ld1 {v0.16b}, [x0], x1 ushll v2.8h, v0.8b, #P2S_SHIFT ushll2 v3.8h, v0.16b, #P2S_SHIFT #endif @@ -292,13 +273,11 @@ function PFX(filterPixelToShort_16x\h\()_neon) p2s_start .rept \h #if HIGH_BIT_DEPTH - ldp q0, q1, [x0] - add x0, x0, x1 + ld1 {v0.16b-v1.16b}, [x0], x1 shl v2.8h, v0.8h, #P2S_SHIFT shl v3.8h, v1.8h, #P2S_SHIFT #else - ldr q0, [x0] - add x0, x0, x1 + ld1 {v0.16b}, [x0], x1 ushll v2.8h, v0.8b, #P2S_SHIFT ushll2 v3.8h, v0.16b, #P2S_SHIFT #endif @@ -323,18 +302,14 @@ function PFX(filterPixelToShort_24x\h\()_neon) p2s_start .rept \h #if HIGH_BIT_DEPTH - ldp q0, q1, [x0] + ld1 {v0.16b-v2.16b}, [x0], x1 shl v3.8h, v0.8h, #P2S_SHIFT shl v4.8h, v1.8h, #P2S_SHIFT - ldr q2, [x0, #32] - add x0, x0, x1 shl v5.8h, v2.8h, #P2S_SHIFT #else - ldp d0, d1, [x0] + ld1 {v0.8b-v2.8b}, [x0], x1 ushll v3.8h, v0.8b, #P2S_SHIFT ushll v4.8h, v1.8b, #P2S_SHIFT - ldr d2, [x0, #16] - add x0, x0, x1 ushll v5.8h, v2.8b, #P2S_SHIFT #endif add v3.8h, v3.8h, v31.8h @@ -356,16 +331,13 @@ function PFX(filterPixelToShort_32x\h\()_neon) .loop_filterP2S_32x\h: sub x9, x9, #1 #if HIGH_BIT_DEPTH - ldp q0, q1, [x0] + ld1 {v0.16b-v3.16b}, [x0], x1 shl v22.8h, v0.8h, #P2S_SHIFT shl v23.8h, v1.8h, #P2S_SHIFT - ldp q2, q3, [x0, #32] - add x0, x0, x1 shl v24.8h, v2.8h, #P2S_SHIFT shl v25.8h, v3.8h, #P2S_SHIFT #else - ldp q0, q1, [x0] - add x0, x0, x1 + ld1 {v0.16b-v1.16b}, [x0], x1 ushll v22.8h, v0.8b, #P2S_SHIFT ushll2 v23.8h, v0.16b, #P2S_SHIFT ushll v24.8h, v1.8b, #P2S_SHIFT @@ -391,32 +363,30 @@ p2s_32xN 64 .macro p2s_64xN h function PFX(filterPixelToShort_64x\h\()_neon) p2s_start - sub x3, x3, #0x40 +#if HIGH_BIT_DEPTH + sub x1, x1, #64 +#endif + sub x3, x3, #64 mov x9, #\h .loop_filterP2S_64x\h: sub x9, x9, #1 #if HIGH_BIT_DEPTH - ldp q0, q1, [x0] + ld1 {v0.16b-v3.16b}, [x0], #64 + ld1 {v4.16b-v7.16b}, [x0], x1 shl v16.8h, v0.8h, #P2S_SHIFT shl v17.8h, v1.8h, #P2S_SHIFT - ldp q2, q3, [x0, #0x20] shl v18.8h, v2.8h, #P2S_SHIFT shl v19.8h, v3.8h, #P2S_SHIFT - ldp q4, q5, [x0, #0x40] shl v20.8h, v4.8h, #P2S_SHIFT shl v21.8h, v5.8h, #P2S_SHIFT - ldp q6, q7, [x0, #0x60] - add x0, x0, x1 shl v22.8h, v6.8h, #P2S_SHIFT shl v23.8h, v7.8h, #P2S_SHIFT #else - ldp q0, q1, [x0] + ld1 {v0.16b-v3.16b}, [x0], x1 ushll v16.8h, v0.8b, #P2S_SHIFT ushll2 v17.8h, v0.16b, #P2S_SHIFT ushll v18.8h, v1.8b, #P2S_SHIFT ushll2 v19.8h, v1.16b, #P2S_SHIFT - ldp q2, q3, [x0, #0x20] - add x0, x0, x1 ushll v20.8h, v2.8b, #P2S_SHIFT ushll2 v21.8h, v2.16b, #P2S_SHIFT ushll v22.8h, v3.8b, #P2S_SHIFT @@ -430,7 +400,7 @@ function PFX(filterPixelToShort_64x\h\()_neon) add v21.8h, v21.8h, v31.8h add v22.8h, v22.8h, v31.8h add v23.8h, v23.8h, v31.8h - st1 {v16.16b-v19.16b}, [x2], #0x40 + st1 {v16.16b-v19.16b}, [x2], #64 st1 {v20.16b-v23.16b}, [x2], x3 cbnz x9, .loop_filterP2S_64x\h ret @@ -444,29 +414,28 @@ p2s_64xN 64 function PFX(filterPixelToShort_48x64_neon) p2s_start - sub x3, x3, #0x40 +#if HIGH_BIT_DEPTH + sub x1, x1, #64 +#endif + sub x3, x3, #64 mov x9, #64 .loop_filterP2S_48x64: sub x9, x9, #1 #if HIGH_BIT_DEPTH - ldp q0, q1, [x0] + ld1 {v0.16b-v3.16b}, [x0], #64 + ld1 {v4.16b-v5.16b}, [x0], x1 shl v16.8h, v0.8h, #P2S_SHIFT shl v17.8h, v1.8h, #P2S_SHIFT - ldp q2, q3, [x0, #0x20] shl v18.8h, v2.8h, #P2S_SHIFT shl v19.8h, v3.8h, #P2S_SHIFT - ldp q4, q5, [x0, #0x40] - add x0, x0, x1 shl v20.8h, v4.8h, #P2S_SHIFT shl v21.8h, v5.8h, #P2S_SHIFT #else - ldp q0, q1, [x0] + ld1 {v0.16b-v2.16b}, [x0], x1 ushll v16.8h, v0.8b, #P2S_SHIFT 
ushll2 v17.8h, v0.16b, #P2S_SHIFT ushll v18.8h, v1.8b, #P2S_SHIFT ushll2 v19.8h, v1.16b, #P2S_SHIFT - ldr q2, [x0, #0x20] - add x0, x0, x1 ushll v20.8h, v2.8b, #P2S_SHIFT ushll2 v21.8h, v2.16b, #P2S_SHIFT #endif @@ -476,7 +445,7 @@ function PFX(filterPixelToShort_48x64_neon) add v19.8h, v19.8h, v31.8h add v20.8h, v20.8h, v31.8h add v21.8h, v21.8h, v31.8h - st1 {v16.16b-v19.16b}, [x2], #0x40 + st1 {v16.16b-v19.16b}, [x2], #64 st1 {v20.16b-v21.16b}, [x2], x3 cbnz x9, .loop_filterP2S_48x64 ret diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S index 25af1dc02..e04f684b4 100644 --- a/source/common/aarch64/pixel-util.S +++ b/source/common/aarch64/pixel-util.S @@ -37,15 +37,13 @@ // uint64_t pixel_var(const pixel* pix, intptr_t i_stride) function PFX(pixel_var_8x8_neon) - ldr d4, [x0] // pix[x] - add x0, x0, x1 + ld1 {v4.8b}, [x0], x1 // pix[x] uxtl v0.8h, v4.8b // sum = pix[x] umull v1.8h, v4.8b, v4.8b uaddlp v1.4s, v1.8h // sqr = pix[x] * pix[x] .rept 7 - ldr d4, [x0] // pix[x] - add x0, x0, x1 + ld1 {v4.8b}, [x0], x1 // pix[x] umull v31.8h, v4.8b, v4.8b uaddw v0.8h, v0.8h, v4.8b // sum += pix[x] uadalp v1.4s, v31.8h // sqr += pix[x] * pix[x] @@ -90,8 +88,7 @@ function PFX(pixel_var_16x16_neon) mov w12, #16 .loop_var_16: sub w12, w12, #1 - ldr q4, [x0] - add x0, x0, x1 + ld1 {v4.16b}, [x0], x1 pixel_var_1 v4 cbnz w12, .loop_var_16 pixel_var_end @@ -130,20 +127,14 @@ endfunc function PFX(getResidual4_neon) lsl x4, x3, #1 .rept 2 - ldr d0, [x0] - ldr d1, [x1] - add x0, x0, x3 - add x1, x1, x3 - ldr d2, [x0] - ldr d3, [x1] - add x0, x0, x3 - add x1, x1, x3 + ld1 {v0.8b}, [x0], x3 + ld1 {v1.8b}, [x1], x3 + ld1 {v2.8b}, [x0], x3 + ld1 {v3.8b}, [x1], x3 usubl v4.8h, v0.8b, v1.8b usubl v5.8h, v2.8b, v3.8b - str d4, [x2] - add x2, x2, x4 - str d5, [x2] - add x2, x2, x4 + st1 {v4.8b}, [x2], x4 + st1 {v5.8b}, [x2], x4 .endr ret endfunc @@ -151,20 +142,14 @@ endfunc function PFX(getResidual8_neon) lsl x4, x3, #1 .rept 4 - ldr d0, [x0] - ldr d1, [x1] - add x0, x0, x3 - add x1, x1, x3 - ldr d2, [x0] - ldr d3, [x1] - add x0, x0, x3 - add x1, x1, x3 + ld1 {v0.8b}, [x0], x3 + ld1 {v1.8b}, [x1], x3 + ld1 {v2.8b}, [x0], x3 + ld1 {v3.8b}, [x1], x3 usubl v4.8h, v0.8b, v1.8b usubl v5.8h, v2.8b, v3.8b - str q4, [x2] - add x2, x2, x4 - str q5, [x2] - add x2, x2, x4 + st1 {v4.16b}, [x2], x4 + st1 {v5.16b}, [x2], x4 .endr ret endfunc @@ -650,19 +635,14 @@ endfunc //******* satd ******* .macro satd_4x4_neon - ldr s1, [x2] - ldr s0, [x0] - add x2, x2, x3 - add x0, x0, x1 - ldr s3, [x2] - ldr s2, [x0] - add x2, x2, x3 - add x0, x0, x1 - - ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[0], [x0], x1 ld1 {v0.s}[1], [x0], x1 - ld1 {v3.s}[1], [x2], x3 + ld1 {v1.s}[0], [x2], x3 + ld1 {v1.s}[1], [x2], x3 + ld1 {v2.s}[0], [x0], x1 ld1 {v2.s}[1], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + ld1 {v3.s}[1], [x2], x3 usubl v4.8h, v0.8b, v1.8b usubl v5.8h, v2.8b, v3.8b @@ -1965,10 +1945,8 @@ endfunc function PFX(ssimDist4_neon) ssimDist_start .rept 4 - ldr s4, [x0] - ldr s5, [x2] - add x0, x0, x1 - add x2, x2, x3 + ld1 {v4.s}[0], [x0], x1 + ld1 {v5.s}[0], [x2], x3 uxtl v4.8h, v4.8b uxtl v5.8h, v5.8b sub v2.4h, v4.4h, v5.4h @@ -1984,10 +1962,8 @@ endfunc function PFX(ssimDist8_neon) ssimDist_start .rept 8 - ldr d4, [x0] - ldr d5, [x2] - add x0, x0, x1 - add x2, x2, x3 + ld1 {v4.8b}, [x0], x1 + ld1 {v5.8b}, [x2], x3 uxtl v4.8h, v4.8b uxtl v5.8h, v5.8b ssimDist_1 v4, v5 @@ -2001,10 +1977,8 @@ function PFX(ssimDist16_neon) ssimDist_start .loop_ssimDist16: sub w12, w12, #1 - ldr q4, [x0] - ldr q5, [x2] - add x0, x0, x1 - 
add x2, x2, x3 + ld1 {v4.16b}, [x0], x1 + ld1 {v5.16b}, [x2], x3 uxtl v6.8h, v4.8b uxtl v7.8h, v5.8b uxtl2 v4.8h, v4.16b @@ -2021,10 +1995,8 @@ function PFX(ssimDist32_neon) ssimDist_start .loop_ssimDist32: sub w12, w12, #1 - ldp q4, q5, [x0] - ldp q6, q7, [x2] - add x0, x0, x1 - add x2, x2, x3 + ld1 {v4.16b-v5.16b}, [x0], x1 + ld1 {v6.16b-v7.16b}, [x2], x3 uxtl v21.8h, v4.8b uxtl v22.8h, v6.8b uxtl v23.8h, v5.8b @@ -2047,32 +2019,28 @@ function PFX(ssimDist64_neon) ssimDist_start .loop_ssimDist64: sub w12, w12, #1 - ldp q4, q5, [x0] - ldp q6, q7, [x2] + ld1 {v4.16b-v7.16b}, [x0], x1 + ld1 {v16.16b-v19.16b}, [x2], x3 uxtl v21.8h, v4.8b - uxtl v22.8h, v6.8b + uxtl v22.8h, v16.8b uxtl v23.8h, v5.8b - uxtl v24.8h, v7.8b + uxtl v24.8h, v17.8b uxtl2 v25.8h, v4.16b - uxtl2 v26.8h, v6.16b + uxtl2 v26.8h, v16.16b uxtl2 v27.8h, v5.16b - uxtl2 v28.8h, v7.16b + uxtl2 v28.8h, v17.16b ssimDist_1 v21, v22 ssimDist_1 v23, v24 ssimDist_1 v25, v26 ssimDist_1 v27, v28 - ldp q4, q5, [x0, #32] - ldp q6, q7, [x2, #32] - add x0, x0, x1 - add x2, x2, x3 - uxtl v21.8h, v4.8b - uxtl v22.8h, v6.8b - uxtl v23.8h, v5.8b - uxtl v24.8h, v7.8b - uxtl2 v25.8h, v4.16b - uxtl2 v26.8h, v6.16b - uxtl2 v27.8h, v5.16b - uxtl2 v28.8h, v7.16b + uxtl v21.8h, v6.8b + uxtl v22.8h, v18.8b + uxtl v23.8h, v7.8b + uxtl v24.8h, v19.8b + uxtl2 v25.8h, v6.16b + uxtl2 v26.8h, v18.16b + uxtl2 v27.8h, v7.16b + uxtl2 v28.8h, v19.16b ssimDist_1 v21, v22 ssimDist_1 v23, v24 ssimDist_1 v25, v26 @@ -2104,8 +2072,7 @@ endfunc function PFX(normFact8_neon) normFact_start .rept 8 - ldr d4, [x0] - add x0, x0, x1 + ld1 {v4.8b}, [x0], x1 uxtl v4.8h, v4.8b normFact_1 v4 .endr @@ -2118,8 +2085,7 @@ function PFX(normFact16_neon) normFact_start .loop_normFact16: sub w12, w12, #1 - ldr q4, [x0] - add x0, x0, x1 + ld1 {v4.16b}, [x0], x1 uxtl v5.8h, v4.8b uxtl2 v4.8h, v4.16b normFact_1 v5 @@ -2134,8 +2100,7 @@ function PFX(normFact32_neon) normFact_start .loop_normFact32: sub w12, w12, #1 - ldp q4, q5, [x0] - add x0, x0, x1 + ld1 {v4.16b-v5.16b}, [x0], x1 uxtl v6.8h, v4.8b uxtl2 v4.8h, v4.16b uxtl v7.8h, v5.8b @@ -2154,25 +2119,23 @@ function PFX(normFact64_neon) normFact_start .loop_normFact64: sub w12, w12, #1 - ldp q4, q5, [x0] - uxtl v6.8h, v4.8b - uxtl2 v4.8h, v4.16b - uxtl v7.8h, v5.8b - uxtl2 v5.8h, v5.16b - normFact_1 v4 - normFact_1 v5 - normFact_1 v6 - normFact_1 v7 - ldp q4, q5, [x0, #32] - add x0, x0, x1 - uxtl v6.8h, v4.8b - uxtl2 v4.8h, v4.16b - uxtl v7.8h, v5.8b - uxtl2 v5.8h, v5.16b - normFact_1 v4 - normFact_1 v5 - normFact_1 v6 - normFact_1 v7 + ld1 {v4.16b-v7.16b}, [x0], x1 + uxtl v26.8h, v4.8b + uxtl2 v24.8h, v4.16b + uxtl v27.8h, v5.8b + uxtl2 v25.8h, v5.16b + normFact_1 v24 + normFact_1 v25 + normFact_1 v26 + normFact_1 v27 + uxtl v26.8h, v6.8b + uxtl2 v24.8h, v6.16b + uxtl v27.8h, v7.8b + uxtl2 v25.8h, v7.16b + normFact_1 v24 + normFact_1 v25 + normFact_1 v26 + normFact_1 v27 cbnz w12, .loop_normFact64 normFact_end ret @@ -2402,14 +2365,10 @@ endfunc function PFX(costCoeffNxN_neon) // abs(coeff) add x2, x2, x2 - ldr d1, [x1] - ldr x9, [x1, x2] - add x1, x1, x2 - ldr d2, [x1, x2] - add x1, x1, x2 - ldr x10, [x1, x2] - mov v1.d[1], x9 - mov v2.d[1], x10 + ld1 {v1.d}[0], [x1], x2 + ld1 {v1.d}[1], [x1], x2 + ld1 {v2.d}[0], [x1], x2 + ld1 {v2.d}[1], [x1], x2 abs v1.8h, v1.8h abs v2.8h, v2.8h diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S index b96ac9efe..31ebd25bc 100644 --- a/source/common/aarch64/sad-a.S +++ b/source/common/aarch64/sad-a.S @@ -35,12 +35,10 @@ .text .macro SAD_START_4 f - ldr s0, [x0] - add x0, 
x0, x1 - ldr s1, [x2] - add x2, x2, x3 - ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[0], [x0], x1 ld1 {v0.s}[1], [x0], x1 + ld1 {v1.s}[0], [x2], x3 + ld1 {v1.s}[1], [x2], x3 \f v16.8h, v0.8b, v1.8b .endm @@ -328,17 +326,13 @@ SAD_FUNC_LOOP 48, 64 // SAD_X3 and SAD_X4 code start .macro SAD_X_START_4 h, x, f - ldr s0, [x0] - add x0, x0, x9 + ld1 {v0.s}[0], [x0], x9 ld1 {v0.s}[1], [x0], x9 - ldr s1, [x1] - add x1, x1, x5 + ld1 {v1.s}[0], [x1], x5 ld1 {v1.s}[1], [x1], x5 - ldr s2, [x2] - add x2, x2, x5 + ld1 {v2.s}[0], [x2], x5 ld1 {v2.s}[1], [x2], x5 - ldr s3, [x3] - add x3, x3, x5 + ld1 {v3.s}[0], [x3], x5 ld1 {v3.s}[1], [x3], x5 \f v16.8h, v0.8b, v1.8b \f v17.8h, v0.8b, v2.8b diff --git a/source/common/aarch64/ssd-a.S b/source/common/aarch64/ssd-a.S index b9758af12..c05be45c0 100644 --- a/source/common/aarch64/ssd-a.S +++ b/source/common/aarch64/ssd-a.S @@ -42,54 +42,39 @@ .endm function PFX(pixel_sse_pp_4x4_neon) - ldr s16, [x0] - add x0, x0, x1 - ldr s17, [x2] - add x2, x2, x3 - usubl v1.8h, v16.8b, v17.8b - ldr s18, [x0] - add x0, x0, x1 - smull v0.4s, v1.4h, v1.4h - ldr s19, [x2] - add x2, x2, x3 + ld1 {v16.s}[0], [x0], x1 + ld1 {v17.s}[0], [x2], x3 + ld1 {v18.s}[0], [x0], x1 + ld1 {v19.s}[0], [x2], x3 + ld1 {v20.s}[0], [x0], x1 + ld1 {v21.s}[0], [x2], x3 + ld1 {v22.s}[0], [x0], x1 + ld1 {v23.s}[0], [x2], x3 + usubl v1.8h, v16.8b, v17.8b usubl v2.8h, v18.8b, v19.8b - ldr s20, [x0] - add x0, x0, x1 - smlal v0.4s, v2.4h, v2.4h - ldr s21, [x2] - add x2, x2, x3 - usubl v3.8h, v20.8b, v21.8b - ldr s22, [x0] - add x0, x0, x1 - smlal v0.4s, v3.4h, v3.4h - ldr s23, [x2] - add x2, x2, x3 - usubl v4.8h, v22.8b, v23.8b + + smull v0.4s, v1.4h, v1.4h + smlal v0.4s, v2.4h, v2.4h + smlal v0.4s, v3.4h, v3.4h smlal v0.4s, v4.4h, v4.4h ret_v0_w0 endfunc function PFX(pixel_sse_pp_4x8_neon) - ldr s16, [x0] - add x0, x0, x1 - ldr s17, [x2] - add x2, x2, x3 + ld1 {v16.s}[0], [x0], x1 + ld1 {v17.s}[0], [x2], x3 usubl v1.8h, v16.8b, v17.8b - ldr s16, [x0] - add x0, x0, x1 + ld1 {v16.s}[0], [x0], x1 + ld1 {v17.s}[0], [x2], x3 smull v0.4s, v1.4h, v1.4h - ldr s17, [x2] - add x2, x2, x3 .rept 6 usubl v1.8h, v16.8b, v17.8b - ldr s16, [x0] - add x0, x0, x1 + ld1 {v16.s}[0], [x0], x1 smlal v0.4s, v1.4h, v1.4h - ldr s17, [x2] - add x2, x2, x3 + ld1 {v17.s}[0], [x2], x3 .endr usubl v1.8h, v16.8b, v17.8b smlal v0.4s, v1.4h, v1.4h -- 2.25.1
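The pattern throughout patch 4/7 is the same transformation: fold the stride increment into the memory access with post-indexed ld1/st1, and use the consecutive-register forms (e.g. v0.16b-v3.16b) to move up to 64 bytes per instruction instead of pairing ldp/stp with explicit adds. As a sketch of the general shape (not a line taken from the patch), a 16-byte row copy with the source and destination strides in x3 and x1:

    // before: an extra integer add per pointer per row
    ldr     q0, [x2]
    add     x2, x2, x3
    str     q0, [x0]
    add     x0, x0, x1

    // after: the address update rides on the load/store (post-indexed form)
    ld1     {v0.16b}, [x2], x3
    st1     {v0.16b}, [x0], x1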
From 6e7669851cf40ede55b96c06fbb05590dd6507f2 Mon Sep 17 00:00:00 2001 From: Sebastian Pop <[email protected]> Date: Wed, 9 Mar 2022 21:15:24 +0000 Subject: [PATCH 3/7] [arm64] register several ASM routines addAvg, blockcopy_pp, blockfill, interp_vert, cpy2Dto1D_shl, cpy1Dto2D_shl, and cpy1Dto2D_shr. --- source/common/aarch64/asm-primitives.cpp | 25 +++++- source/common/aarch64/blockcopy8.S | 106 +++++++++++++++++++++++ 2 files changed, 130 insertions(+), 1 deletion(-) diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp index 79c3102fe..9a95d2943 100644 --- a/source/common/aarch64/asm-primitives.cpp +++ b/source/common/aarch64/asm-primitives.cpp @@ -37,7 +37,8 @@ extern "C" { p.cu[BLOCK_4x4].prim = fncdef PFX(fname ## _4x4_ ## cpu); \ p.cu[BLOCK_8x8].prim = fncdef PFX(fname ## _8x8_ ## cpu); \ p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \ - p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu) + p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \ + p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu) #define ALL_LUMA_TU(prim, fname, cpu) ALL_LUMA_TU_TYPED(prim, , fname, cpu) @@ -125,10 +126,12 @@ extern "C" { #define ALL_CHROMA_420_4x4_PU_TYPED(prim, fncdef, fname, cpu) \ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].prim = fncdef PFX(fname ## _4x4_ ## cpu); \ + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].prim = fncdef PFX(fname ## _8x2_ ## cpu); \ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].prim = fncdef PFX(fname ## _8x8_ ## cpu); \ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].prim = fncdef PFX(fname ## _8x4_ ## cpu); \ + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].prim = fncdef PFX(fname ## _8x6_ ## cpu); \ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].prim = fncdef PFX(fname ## _4x8_ ## cpu); \ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].prim = fncdef PFX(fname ## _16x8_ ## cpu); \ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].prim = fncdef PFX(fname ## _8x16_ ## cpu); \ @@ -350,6 +353,20 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) ALL_LUMA_PU(copy_pp, blockcopy_pp, neon); ALL_CHROMA_420_PU(copy_pp, blockcopy_pp, neon); ALL_CHROMA_422_PU(copy_pp, blockcopy_pp, neon); + p.cu[BLOCK_4x4].copy_pp = PFX(blockcopy_pp_4x4_neon); + p.cu[BLOCK_8x8].copy_pp = PFX(blockcopy_pp_8x8_neon); + p.cu[BLOCK_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon); + p.cu[BLOCK_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon); + p.cu[BLOCK_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon); + p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_pp = PFX(blockcopy_pp_4x4_neon); + p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_pp = PFX(blockcopy_pp_8x8_neon); + p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon); + p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon); + p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_pp = PFX(blockcopy_pp_4x8_neon); + p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_pp = PFX(blockcopy_pp_8x16_neon); + p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon); + p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_neon); + #endif // !HIGH_BIT_DEPTH // Blockcopy_ss @@ -424,6 +441,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) p.cu[BLOCK_8x8].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_8x8_neon); 
p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16x16_neon); p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32x32_neon); + p.cu[BLOCK_64x64].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_64x64_neon); // cpy2Dto1D_shr p.cu[BLOCK_4x4].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_4x4_neon); @@ -436,17 +454,20 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) p.cu[BLOCK_8x8].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_8x8_neon); p.cu[BLOCK_16x16].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_16x16_neon); p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_32x32_neon); + p.cu[BLOCK_64x64].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_64x64_neon); p.cu[BLOCK_4x4].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_4x4_neon); p.cu[BLOCK_8x8].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_8x8_neon); p.cu[BLOCK_16x16].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_16x16_neon); p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32x32_neon); + p.cu[BLOCK_64x64].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_64x64_neon); // cpy1Dto2D_shr p.cu[BLOCK_4x4].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_4x4_neon); p.cu[BLOCK_8x8].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_8x8_neon); p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_neon); p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_neon); + p.cu[BLOCK_64x64].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_neon); #if !HIGH_BIT_DEPTH // pixel_avg_pp @@ -454,6 +475,8 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) ALL_LUMA_PU(pixelavg_pp[ALIGNED], pixel_avg_pp, neon); // addAvg + ALL_LUMA_PU(addAvg[NONALIGNED], addAvg, neon); + ALL_LUMA_PU(addAvg[ALIGNED], addAvg, neon); ALL_CHROMA_420_PU(addAvg[NONALIGNED], addAvg, neon); ALL_CHROMA_422_PU(addAvg[NONALIGNED], addAvg, neon); ALL_CHROMA_420_PU(addAvg[ALIGNED], addAvg, neon); diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S index bdcb5d432..9f0fb675a 100644 --- a/source/common/aarch64/blockcopy8.S +++ b/source/common/aarch64/blockcopy8.S @@ -802,6 +802,20 @@ function PFX(blockfill_s_32x32_neon) ret endfunc +function PFX(blockfill_s_64x64_neon) + dup v0.8h, w2 + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b + lsl x1, x1, #1 + sub x1, x1, #64 +.rept 64 + st1 {v0.8h-v3.8h}, [x0], #64 + st1 {v0.8h-v3.8h}, [x0], x1 +.endr + ret +endfunc + // uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride) function PFX(copy_cnt_4_neon) lsl x2, x2, #1 @@ -1032,6 +1046,34 @@ function PFX(cpy2Dto1D_shl_32x32_neon) ret endfunc +function PFX(cpy2Dto1D_shl_64x64_neon) + cpy2Dto1D_shl_start + mov w12, #32 +.loop_cpy2Dto1D_shl_64: + sub w12, w12, #1 +.rept 2 + ldp q2, q3, [x1] + ldp q4, q5, [x1, #32] + ldp q6, q7, [x1, #64] + ldp q16, q17, [x1, #96] + add x1, x1, x2 + sshl v2.8h, v2.8h, v0.8h + sshl v3.8h, v3.8h, v0.8h + sshl v4.8h, v4.8h, v0.8h + sshl v5.8h, v5.8h, v0.8h + sshl v6.8h, v6.8h, v0.8h + sshl v7.8h, v7.8h, v0.8h + sshl v16.8h, v16.8h, v0.8h + sshl v17.8h, v17.8h, v0.8h + stp q2, q3, [x0], #32 + stp q4, q5, [x0], #32 + stp q6, q7, [x0], #32 + stp q16, q17, [x0], #32 +.endr + cbnz w12, .loop_cpy2Dto1D_shl_64 + ret +endfunc + // void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift) .macro cpy2Dto1D_shr_start add x2, x2, x2 @@ -1176,6 +1218,34 @@ function PFX(cpy1Dto2D_shl_32x32_neon) ret endfunc +function PFX(cpy1Dto2D_shl_64x64_neon) + cpy1Dto2D_shl_start + mov w12, #32 +.loop_cpy1Dto2D_shl_64: + sub w12, w12, #1 +.rept 2 + ldp q2, q3, [x1], #32 + ldp q4, q5, [x1], #32 + ldp q6, q7, [x1], #32 + 
ldp q16, q17, [x1], #32 + sshl v2.8h, v2.8h, v0.8h + sshl v3.8h, v3.8h, v0.8h + sshl v4.8h, v4.8h, v0.8h + sshl v5.8h, v5.8h, v0.8h + sshl v6.8h, v6.8h, v0.8h + sshl v7.8h, v7.8h, v0.8h + sshl v16.8h, v16.8h, v0.8h + sshl v17.8h, v17.8h, v0.8h + stp q2, q3, [x0] + stp q4, q5, [x0, #32] + stp q6, q7, [x0, #64] + stp q16, q17, [x0, #96] + add x0, x0, x2 +.endr + cbnz w12, .loop_cpy1Dto2D_shl_64 + ret +endfunc + // void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) .macro cpy1Dto2D_shr_start add x2, x2, x2 @@ -1255,6 +1325,42 @@ function PFX(cpy1Dto2D_shr_32x32_neon) ret endfunc +function PFX(cpy1Dto2D_shr_64x64_neon) + cpy1Dto2D_shr_start + mov w12, #32 +.loop_cpy1Dto2D_shr_64: + sub w12, w12, #1 +.rept 2 + ldp q2, q3, [x1], #32 + ldp q4, q5, [x1], #32 + ldp q6, q7, [x1], #32 + ldp q16, q17, [x1], #32 + sub v2.8h, v2.8h, v1.8h + sub v3.8h, v3.8h, v1.8h + sub v4.8h, v4.8h, v1.8h + sub v5.8h, v5.8h, v1.8h + sub v6.8h, v6.8h, v1.8h + sub v7.8h, v7.8h, v1.8h + sub v16.8h, v16.8h, v1.8h + sub v17.8h, v17.8h, v1.8h + sshl v2.8h, v2.8h, v0.8h + sshl v3.8h, v3.8h, v0.8h + sshl v4.8h, v4.8h, v0.8h + sshl v5.8h, v5.8h, v0.8h + sshl v6.8h, v6.8h, v0.8h + sshl v7.8h, v7.8h, v0.8h + sshl v16.8h, v16.8h, v0.8h + sshl v17.8h, v17.8h, v0.8h + stp q2, q3, [x0] + stp q4, q5, [x0, #32] + stp q6, q7, [x0, #64] + stp q16, q17, [x0, #96] + add x0, x0, x2 +.endr + cbnz w12, .loop_cpy1Dto2D_shr_64 + ret +endfunc + const xtn_xtn2_table, align=4 .byte 0, 2, 4, 6, 8, 10, 12, 14 .byte 16, 18, 20, 22, 24, 26, 28, 30 -- 2.25.1
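One detail of the new blockfill_s_64x64_neon added in patch 3/7: the fill value is splatted into four vector registers and each 64-element row is written as two 64-byte st1 stores, which is why the stride is pre-reduced by 64 (the first store post-increments the pointer). The C-level behaviour it implements, as a reference sketch (assuming, as the lsl x1, x1, #1 suggests, that the stride is given in int16_t elements):

    #include <stdint.h>
    #include <stddef.h>

    /* Sketch: portable equivalent of blockfill_s_64x64_neon. */
    static void blockfill_s_64x64_ref(int16_t *dst, ptrdiff_t dstride, int16_t val)
    {
        for (int y = 0; y < 64; y++)
            for (int x = 0; x < 64; x++)
                dst[y * dstride + x] = val;
    }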
From 074e5219edcd3ca801ccb05cdc374b4f54e009b6 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <[email protected]>
Date: Wed, 9 Mar 2022 18:39:29 +0000
Subject: [PATCH 2/7] [arm64] Register the assembly routines
 `x265_satd_*_neon` in place of the VideoLAN intrinsics implementation.

Overall performance is improved by 2% on c6g.2xl:

before:
c6g.2xl: encoded 261 frames in 105.36s (2.48 fps), 452.66 kb/s, Avg QP:37.28, Global PSNR: 35.101, SSIM Mean Y: 0.9613134 (14.124 dB)

after:
c6g.2xl: encoded 261 frames in 103.18s (2.53 fps), 452.66 kb/s, Avg QP:37.28, Global PSNR: 35.101, SSIM Mean Y: 0.9613134 (14.124 dB)

On the top profile, the patch replaces:
4.31% (anonymous namespace)::_satd_16x4_neon
with
1.48% x265_satd_16x4_neon
---
 source/common/aarch64/asm-primitives.cpp | 99 +++++++++++-------
 1 file changed, 46 insertions(+), 53 deletions(-)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 97d69c83c..79c3102fe 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -574,67 +574,60 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
 // planecopy
 p.planecopy_cp = PFX(pixel_planecopy_cp_neon);

-#if 0
 // satd
- ALL_CHROMA_420_PU(satd, pixel_satd, neon);
- ALL_CHROMA_422_PU(satd, pixel_satd, neon);
-#else
-#define ALL_CHROMA_420_PU_TYPED_1(prim, fncdef, fname, cpu) \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].prim = fncdef PFX(fname ## _4x4_ ## cpu); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].prim = fncdef PFX(fname ## _8x8_ ## cpu); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].prim = fncdef PFX(fname ## _8x4_ ## cpu); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].prim = fncdef PFX(fname ## _4x8_ ## cpu); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].prim = fncdef PFX(fname ## _16x8_ ## cpu); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].prim = fncdef PFX(fname ## _8x16_ ## cpu); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].prim = fncdef PFX(fname ## _12x16_ ## cpu); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].prim = fncdef PFX(fname ## _16x4_ ## cpu); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].prim = fncdef PFX(fname ## _4x16_ ## cpu); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].prim = fncdef PFX(fname ## _32x8_ ## cpu); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim = fncdef PFX(fname ## _8x32_ ## cpu)
-#define ALL_CHROMA_420_PU_1(prim, fname, cpu) ALL_CHROMA_420_PU_TYPED_1(prim, , fname, cpu)
+ ALL_LUMA_PU(satd, pixel_satd, neon);
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = PFX(pixel_satd_8x16_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = PFX(pixel_satd_16x12_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = PFX(pixel_satd_16x4_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = PFX(pixel_satd_8x32_neon);
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = PFX(pixel_satd_16x32_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = PFX(pixel_satd_8x12_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = PFX(pixel_satd_16x24_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = PFX(pixel_satd_12x32_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = PFX(pixel_satd_4x32_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = PFX(pixel_satd_24x64_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = PFX(pixel_satd_8x64_neon);

-#define ALL_CHROMA_422_PU_TYPED_1(prim, fncdef, fname, cpu) \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].prim = fncdef PFX(fname ## _4x8_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].prim = fncdef PFX(fname ## _8x16_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].prim = fncdef PFX(fname ## _4x4_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].prim = fncdef PFX(fname ## _8x8_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].prim = fncdef PFX(fname ## _4x16_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].prim = fncdef PFX(fname ## _8x32_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].prim = fncdef PFX(fname ## _16x64_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].prim = fncdef PFX(fname ## _8x12_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].prim = fncdef PFX(fname ## _8x4_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].prim = fncdef PFX(fname ## _16x24_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].prim = fncdef PFX(fname ## _12x32_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].prim = fncdef PFX(fname ## _16x8_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].prim = fncdef PFX(fname ## _4x32_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].prim = fncdef PFX(fname ## _32x48_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].prim = fncdef PFX(fname ## _24x64_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].prim = fncdef PFX(fname ## _8x64_ ## cpu)
-#define ALL_CHROMA_422_PU_1(prim, fname, cpu) ALL_CHROMA_422_PU_TYPED_1(prim, , fname, cpu)
-
- ALL_CHROMA_420_PU_1(satd, pixel_satd, neon);
- ALL_CHROMA_422_PU_1(satd, pixel_satd, neon);
-
-#endif
 // sa8d
 p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_neon);
- p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_neon);
 p.cu[BLOCK_8x8].sa8d = PFX(pixel_sa8d_8x8_neon);
 p.cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
 p.cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
 p.cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
+ p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = PFX(pixel_satd_4x4_neon);
+ p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
+ p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
+ p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
 p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = PFX(pixel_sa8d_8x16_neon);
 p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = PFX(pixel_sa8d_16x32_neon);
 p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = PFX(pixel_sa8d_32x64_neon);
-- 
2.25.1
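A note on how the registrations above take effect: the encoder first fills its primitives table with the portable implementations, and setupAssemblyPrimitives then overwrites individual function-pointer entries for the detected CPU, which is why assigning PFX(pixel_satd_*_neon) is enough to retire the (anonymous namespace)::_satd_*_neon symbols seen in the profile. The sketch below only illustrates that override pattern; it is not x265 code. Primitives, satd_4x4_c, and satd_4x4_asm are invented names, and the "assembly" stand-in even computes a plain SAD so the example stays short.

// sketch.cpp -- illustrative only; names and layout are not x265's.
#include <cstdint>
#include <cstdlib>
#include <iostream>

typedef uint8_t pixel;
// Shape of a satd-like primitive: two blocks and their strides.
typedef int (*satd_t)(const pixel*, intptr_t, const pixel*, intptr_t);

// Portable fallback, standing in for the intrinsics implementation.
static int satd_4x4_c(const pixel* a, intptr_t sa, const pixel* b, intptr_t sb)
{
    int sum = 0; // plain SAD here, to keep the sketch short
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            sum += std::abs(int(a[y * sa + x]) - int(b[y * sb + x]));
    return sum;
}

// Stand-in for an assembly routine such as x265_pixel_satd_4x4_neon.
static int satd_4x4_asm(const pixel* a, intptr_t sa, const pixel* b, intptr_t sb)
{
    return satd_4x4_c(a, sa, b, sb); // same answer, presumably faster in real asm
}

struct PuPrimitives { satd_t satd; };
struct Primitives { PuPrimitives pu[1]; }; // one partition size is enough here

int main()
{
    Primitives p;
    p.pu[0].satd = satd_4x4_c;   // portable setup runs first...
    p.pu[0].satd = satd_4x4_asm; // ...then the asm setup overrides the entry

    pixel a[16] = { 0 }, b[16] = { 10 };
    std::cout << p.pu[0].satd(a, 4, b, 4) << "\n"; // prints 10
    return 0;
}

Only entries that have an assembly counterpart get reassigned; everything else keeps the portable routine.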
From 08075546542b65c915309d05e38de13e7306608c Mon Sep 17 00:00:00 2001
From: Sebastian Pop <[email protected]>
Date: Fri, 21 Jan 2022 19:34:58 +0000
Subject: [PATCH 1/7] [arm64] port costCoeffNxN

costCoeffNxN  1.26x    63.60    80.17
---
 source/common/aarch64/asm-primitives.cpp | 1 +
 source/common/aarch64/fun-decls.h | 1 +
 source/common/aarch64/pixel-util.S | 140 +++++++++++++++++++++++
 3 files changed, 142 insertions(+)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 0e10dae2f..97d69c83c 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -664,6 +664,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
 p.weight_pp = PFX(weight_pp_neon);

 p.scanPosLast = PFX(scanPosLast_neon);
+ p.costCoeffNxN = PFX(costCoeffNxN_neon);
 #endif

 // quant
diff --git a/source/common/aarch64/fun-decls.h b/source/common/aarch64/fun-decls.h
index 58cdac4cf..59679f3c4 100644
--- a/source/common/aarch64/fun-decls.h
+++ b/source/common/aarch64/fun-decls.h
@@ -221,3 +221,4 @@ int PFX(psyCost_8x8_neon)(const pixel* source, intptr_t sstride, const pixel* re
 void PFX(weight_pp_neon)(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
 void PFX(weight_sp_neon)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
 int PFX(scanPosLast_neon)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
+uint32_t PFX(costCoeffNxN_neon)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 369735c80..25af1dc02 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -2388,6 +2388,146 @@ function PFX(scanPosLast_neon)
 ret
 endfunc

+// uint32_t costCoeffNxN(
+// uint16_t *scan,        // x0
+// coeff_t *coeff,        // x1
+// intptr_t trSize,       // x2
+// uint16_t *absCoeff,    // x3
+// uint8_t *tabSigCtx,    // x4
+// uint16_t scanFlagMask, // x5
+// uint8_t *baseCtx,      // x6
+// int offset,            // x7
+// int scanPosSigOff,     // sp
+// int subPosBase)        // sp + 8
+function PFX(costCoeffNxN_neon)
+ // abs(coeff)
+ add x2, x2, x2
+ ldr d1, [x1]
+ ldr x9, [x1, x2]
+ add x1, x1, x2
+ ldr d2, [x1, x2]
+ add x1, x1, x2
+ ldr x10, [x1, x2]
+ mov v1.d[1], x9
+ mov v2.d[1], x10
+ abs v1.8h, v1.8h
+ abs v2.8h, v2.8h
+
+ // WARNING: beyond-bound read here!
+ // loading scan table + ldr w2, [sp] + eor w15, w2, #15 + add x1, x0, x15, lsl #1 + ldp q20, q21, [x1] + uzp1 v20.16b, v20.16b, v21.16b + movi v21.16b, #15 + eor v0.16b, v20.16b, v21.16b + + // reorder coeff + uzp1 v22.16b, v1.16b, v2.16b + uzp2 v23.16b, v1.16b, v2.16b + tbl v24.16b, {v22.16b}, v0.16b + tbl v25.16b, {v23.16b}, v0.16b + zip1 v2.16b, v24.16b, v25.16b + zip2 v3.16b, v24.16b, v25.16b + + // loading tabSigCtx (+offset) + ldr q1, [x4] + tbl v1.16b, {v1.16b}, v0.16b + dup v4.16b, w7 + movi v5.16b, #0 + tbl v4.16b, {v4.16b}, v5.16b + add v1.16b, v1.16b, v4.16b + + // register mapping + // x0 - sum + // x1 - entropyStateBits + // v1 - sigCtx + // {v3,v2} - abs(coeff) + // x2 - scanPosSigOff + // x3 - absCoeff + // x4 - numNonZero + // x5 - scanFlagMask + // x6 - baseCtx + mov x0, #0 + movrel x1, x265_entropyStateBits + mov x4, #0 + mov x11, #0 + movi v31.16b, #0 + cbz x2, .idx_zero +.loop_ccnn: +// { +// const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset; +// ctxSig = cnt & posZeroMask; +// const uint32_t mstate = baseCtx[ctxSig]; +// const uint32_t mps = mstate & 1; +// const uint32_t stateBits = x265_entropyStateBits[mstate ^ sig]; +// uint32_t nextState = (stateBits >> 24) + mps; +// if ((mstate ^ sig) == 1) +// nextState = sig; +// baseCtx[ctxSig] = (uint8_t)nextState; +// sum += stateBits; +// } +// absCoeff[numNonZero] = tmpCoeff[blkPos]; +// numNonZero += sig; +// scanPosSigOff--; + + add x13, x3, x4, lsl #1 + sub x2, x2, #1 + str h2, [x13] // absCoeff[numNonZero] = tmpCoeff[blkPos] + fmov w14, s1 // x14 = ctxSig + uxtb w14, w14 + ubfx w11, w5, #0, #1 // x11 = sig + lsr x5, x5, #1 + add x4, x4, x11 // numNonZero += sig + ext v1.16b, v1.16b, v31.16b, #1 + ext v2.16b, v2.16b, v3.16b, #2 + ext v3.16b, v3.16b, v31.16b, #2 + ldrb w9, [x6, x14] // mstate = baseCtx[ctxSig] + and w10, w9, #1 // mps = mstate & 1 + eor w9, w9, w11 // x9 = mstate ^ sig + add x12, x1, x9, lsl #2 + ldr w13, [x12] + add w0, w0, w13 // sum += x265_entropyStateBits[mstate ^ sig] + ldrb w13, [x12, #3] + add w10, w10, w13 // nextState = (stateBits >> 24) + mps + cmp w9, #1 + csel w10, w11, w10, eq + strb w10, [x6, x14] + cbnz x2, .loop_ccnn +.idx_zero: + + add x13, x3, x4, lsl #1 + add x4, x4, x15 + str h2, [x13] // absCoeff[numNonZero] = tmpCoeff[blkPos] + + ldr x9, [sp, #8] // subPosBase + uxth w9, w9 + cmp w9, #0 + cset x2, eq + add x4, x4, x2 + cbz x4, .exit_ccnn + + sub w2, w2, #1 + uxtb w2, w2 + fmov w3, s1 + and w2, w2, w3 + + ldrb w3, [x6, x2] // mstate = baseCtx[ctxSig] + eor w4, w5, w3 // x5 = mstate ^ sig + and w3, w3, #1 // mps = mstate & 1 + add x1, x1, x4, lsl #2 + ldr w11, [x1] + ldrb w12, [x1, #3] + add w0, w0, w11 // sum += x265_entropyStateBits[mstate ^ sig] + add w3, w3, w12 // nextState = (stateBits >> 24) + mps + cmp w4, #1 + csel w3, w5, w3, eq + strb w3, [x6, x2] +.exit_ccnn: + ubfx w0, w0, #0, #24 + ret +endfunc const g_SPL_and_mask, align=8 .byte 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 -- 2.25.1
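For anyone tracing the NEON code above, the block comment inside the loop quotes the scalar reference it implements. Below is a minimal, self-contained C++ paraphrase of that per-coefficient loop, with several stated simplifications: it is not the x265 C primitive, the scan order is the identity, the context derivation drops the posOffset/posZeroMask terms from the quoted comment, all 16 positions are costed in one loop instead of splitting off the last position as the assembly does, and the state table holds dummy values purely so the sketch runs. The real table is x265_entropyStateBits, which (as the quoted reference suggests) packs the fractional bit cost in the low bits and the next context state in the top byte; the final masking to 24 bits mirrors the `ubfx w0, w0, #0, #24` before the return.

// cost_sketch.cpp -- paraphrase of the reference loop quoted in the asm comments; not x265 code.
#include <cstdint>
#include <cstdio>

int main()
{
    // Hypothetical inputs for a single 4x4 coefficient group.
    uint16_t tmpCoeff[16]  = { 3, 0, 1, 0,  2, 0, 0, 0,  1, 0, 0, 0,  0, 0, 0, 0 };
    uint8_t  tabSigCtx[16] = { 0 };   // significance-context template (dummy)
    uint8_t  baseCtx[16]   = { 0 };   // CABAC context states being updated in place
    uint16_t absCoeff[16]  = { 0 };   // compacted list of coefficient magnitudes
    uint32_t entropyStateBits[128];   // dummy stand-in for x265_entropyStateBits
    for (int i = 0; i < 128; i++)     // low bits: fake cost, top byte: fake next state
        entropyStateBits[i] = (uint32_t(i) << 24) | uint32_t(i * 3);

    uint32_t scanFlagMask = 0;        // bit i set if scan position i holds a nonzero coeff
    for (int i = 0; i < 16; i++)
        scanFlagMask |= uint32_t(tmpCoeff[i] != 0) << i;

    int offset = 0, numNonZero = 0;
    uint32_t sum = 0;
    for (int scanPos = 0; scanPos < 16; scanPos++)
    {
        int blkPos = scanPos;                          // identity scan order for the sketch
        uint32_t sig = (scanFlagMask >> scanPos) & 1;  // the asm shifts the mask right each step
        uint32_t ctxSig = uint32_t(tabSigCtx[blkPos] + offset); // posOffset/posZeroMask omitted

        uint32_t mstate = baseCtx[ctxSig];
        uint32_t mps = mstate & 1;
        uint32_t stateBits = entropyStateBits[mstate ^ sig];
        uint32_t nextState = (stateBits >> 24) + mps;
        if ((mstate ^ sig) == 1)                       // special-cased transition from the quoted reference
            nextState = sig;
        baseCtx[ctxSig] = uint8_t(nextState);
        sum += stateBits;                              // running fractional-bit cost

        absCoeff[numNonZero] = tmpCoeff[blkPos];       // stored unconditionally, kept only when sig == 1
        numNonZero += int(sig);
    }
    printf("cost=%u nonzero=%d first=%u\n",
           unsigned(sum & 0xFFFFFF), numNonZero, unsigned(absCoeff[0]));
    return 0;
}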
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel
