Hi,

We're getting conflicts when applying this patch due to recent changes. Could you please rebase it on the latest tip and resend?
Thanks, Pavan Tarun ________________________________ From: x265-devel <[email protected]> on behalf of chen <[email protected]> Sent: 20 June 2025 11:06 AM To: Development for x265 <[email protected]> Cc: [email protected] <[email protected]> Subject: Re: [x265] [PATCH] AArch64: Optimize and clean up addAvg Neon and SVE2 functions Looks good to me, thanks. At 2025-06-20 02:48:38, "Li Zhang" <[email protected]> wrote: >Extend the neon intrinsics implementation to support all block sizes and >optimize it to use rounding-shift-and-accumulate instead of separate >widening, add, and shift steps. Also unroll the loops for larger block >sizes to enable the compiler to emit LDP and STP instructions. > >Delete the Neon and SVE2 assembly implementations as they are 1-2x >slower than Neon intrinsics implementation. >--- > source/common/aarch64/asm-primitives.cpp | 16 - > source/common/aarch64/mc-a-sve2.S | 606 ----------------------- > source/common/aarch64/mc-a.S | 341 ------------- > source/common/aarch64/mem-neon.h | 20 + > source/common/aarch64/pixel-prim.cpp | 207 +++++--- > 5 files changed, 169 insertions(+), 1021 deletions(-) > >diff --git a/source/common/aarch64/asm-primitives.cpp >b/source/common/aarch64/asm-primitives.cpp >index 5ce9352bd..e1fc8e82a 100644 >--- a/source/common/aarch64/asm-primitives.cpp >+++ b/source/common/aarch64/asm-primitives.cpp >@@ -462,14 +462,6 @@ void setupNeonPrimitives(EncoderPrimitives &p) > ALL_LUMA_PU(pixelavg_pp[NONALIGNED], pixel_avg_pp, neon); > ALL_LUMA_PU(pixelavg_pp[ALIGNED], pixel_avg_pp, neon); > >- // addAvg >- ALL_LUMA_PU(addAvg[NONALIGNED], addAvg, neon); >- ALL_LUMA_PU(addAvg[ALIGNED], addAvg, neon); >- ALL_CHROMA_420_PU(addAvg[NONALIGNED], addAvg, neon); >- ALL_CHROMA_422_PU(addAvg[NONALIGNED], addAvg, neon); >- ALL_CHROMA_420_PU(addAvg[ALIGNED], addAvg, neon); >- ALL_CHROMA_422_PU(addAvg[ALIGNED], addAvg, neon); >- > // pixel_var > p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_neon); > p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_neon); >@@ -635,14 +627,6 @@ void setupSve2Primitives(EncoderPrimitives &p) > LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_pp[NONALIGNED], pixel_avg_pp, sve2); > LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_pp[ALIGNED], pixel_avg_pp, sve2); > >- // addAvg >- LUMA_PU_CAN_USE_SVE2(addAvg[NONALIGNED], addAvg); >- LUMA_PU_CAN_USE_SVE2(addAvg[ALIGNED], addAvg); >- CHROMA_420_PU_MULTIPLE_ARCHS(addAvg[NONALIGNED], addAvg, sve2); >- CHROMA_420_PU_MULTIPLE_ARCHS(addAvg[ALIGNED], addAvg, sve2); >- CHROMA_422_PU_CAN_USE_SVE2(addAvg[NONALIGNED], addAvg); >- CHROMA_422_PU_CAN_USE_SVE2(addAvg[ALIGNED], addAvg); >- > // pixel_var > p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_sve2); > p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_sve2); >diff --git a/source/common/aarch64/mc-a-sve2.S >b/source/common/aarch64/mc-a-sve2.S >index 00fb0048f..fc0a6f3e8 100644 >--- a/source/common/aarch64/mc-a-sve2.S >+++ b/source/common/aarch64/mc-a-sve2.S >@@ -298,609 +298,3 @@ pixel_avg_pp_64xN_sve2 16 > pixel_avg_pp_64xN_sve2 32 > pixel_avg_pp_64xN_sve2 48 > pixel_avg_pp_64xN_sve2 64 >- >-// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t >src0Stride, intptr_t src1Stride, intptr_t dstStride) >- >-.macro addAvg_2xN_sve2 h >-function PFX(addAvg_2x\h\()_sve2) >- ptrue p0.s, vl2 >- ptrue p1.h, vl4 >- ptrue p2.h, vl2 >-.rept \h / 2 >- ld1rw {z0.s}, p0/z, [x0] >- ld1rw {z1.s}, p0/z, [x1] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- ld1rw {z2.s}, p0/z, [x0] >- ld1rw {z3.s}, p0/z, [x1] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p1/m, z0.h, 
z1.h >- add z2.h, p1/m, z2.h, z3.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- sqrshrnb z2.b, z2.h, #7 >- add z2.b, z2.b, #0x80 >- st1b {z0.h}, p2, [x2] >- add x2, x2, x5 >- st1b {z2.h}, p2, [x2] >- add x2, x2, x5 >-.endr >- ret >-endfunc >-.endm >- >-addAvg_2xN_sve2 4 >-addAvg_2xN_sve2 8 >-addAvg_2xN_sve2 16 >- >-.macro addAvg_6xN_sve2 h >-function PFX(addAvg_6x\h\()_sve2) >- mov w12, #\h / 2 >- ptrue p0.b, vl16 >- ptrue p2.h, vl6 >-.Loop_sve2_addavg_6x\h\(): >- sub w12, w12, #1 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z1.b}, p0/z, [x1] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- ld1b {z2.b}, p0/z, [x0] >- ld1b {z3.b}, p0/z, [x1] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p0/m, z0.h, z1.h >- add z2.h, p0/m, z2.h, z3.h >- sqrshrnb z0.b, z0.h, #7 >- sqrshrnb z2.b, z2.h, #7 >- add z0.b, z0.b, #0x80 >- add z2.b, z2.b, #0x80 >- st1b {z0.h}, p2, [x2] >- add x2, x2, x5 >- st1b {z2.h}, p2, [x2] >- add x2, x2, x5 >- cbnz w12, .Loop_sve2_addavg_6x\h >- ret >-endfunc >-.endm >- >-addAvg_6xN_sve2 8 >-addAvg_6xN_sve2 16 >- >-.macro addAvg_8xN_sve2 h >-function PFX(addAvg_8x\h\()_sve2) >- ptrue p0.b, vl16 >-.rept \h / 2 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z1.b}, p0/z, [x1] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- ld1b {z2.b}, p0/z, [x0] >- ld1b {z3.b}, p0/z, [x1] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p0/m, z0.h, z1.h >- add z2.h, p0/m, z2.h, z3.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- sqrshrnb z2.b, z2.h, #7 >- add z2.b, z2.b, #0x80 >- st1b {z0.h}, p0, [x2] >- add x2, x2, x5 >- st1b {z2.h}, p0, [x2] >- add x2, x2, x5 >-.endr >- ret >-endfunc >-.endm >- >-.macro addAvg_8xN1_sve2 h >-function PFX(addAvg_8x\h\()_sve2) >- mov w12, #\h / 2 >- ptrue p0.b, vl16 >-.Loop_sve2_addavg_8x\h\(): >- sub w12, w12, #1 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z1.b}, p0/z, [x1] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- ld1b {z2.b}, p0/z, [x0] >- ld1b {z3.b}, p0/z, [x1] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p0/m, z0.h, z1.h >- add z2.h, p0/m, z2.h, z3.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- sqrshrnb z2.b, z2.h, #7 >- add z2.b, z2.b, #0x80 >- st1b {z0.h}, p0, [x2] >- add x2, x2, x5 >- st1b {z2.h}, p0, [x2] >- add x2, x2, x5 >- cbnz w12, .Loop_sve2_addavg_8x\h >- ret >-endfunc >-.endm >- >-addAvg_8xN_sve2 2 >-addAvg_8xN_sve2 4 >-addAvg_8xN_sve2 6 >-addAvg_8xN_sve2 8 >-addAvg_8xN_sve2 12 >-addAvg_8xN_sve2 16 >-addAvg_8xN1_sve2 32 >-addAvg_8xN1_sve2 64 >- >-.macro addAvg_12xN_sve2 h >-function PFX(addAvg_12x\h\()_sve2) >- mov w12, #\h >- rdvl x9, #1 >- cmp x9, #16 >- bgt .vl_gt_16_addAvg_12x\h >- ptrue p0.b, vl16 >- ptrue p1.b, vl8 >-.Loop_sve2_addavg_12x\h\(): >- sub w12, w12, #1 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z1.b}, p0/z, [x1] >- ld1b {z2.b}, p1/z, [x0, #1, mul vl] >- ld1b {z3.b}, p1/z, [x1, #1, mul vl] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p0/m, z0.h, z1.h >- add z2.h, p1/m, z2.h, z3.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- sqrshrnb z2.b, z2.h, #7 >- add z2.b, z2.b, #0x80 >- st1b {z0.h}, p0, [x2] >- st1b {z2.h}, p1, [x2, #1, mul vl] >- add x2, x2, x5 >- cbnz w12, .Loop_sve2_addavg_12x\h >- ret >-.vl_gt_16_addAvg_12x\h\(): >- mov x10, #24 >- mov x11, #0 >- whilelt p0.b, x11, x10 >-.Loop_sve2_gt_16_addavg_12x\h\(): >- sub w12, w12, #1 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z1.b}, p0/z, [x1] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p0/m, z0.h, z1.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- sqrshrnb 
z2.b, z2.h, #7 >- add z2.b, z2.b, #0x80 >- st1b {z0.h}, p0, [x2] >- add x2, x2, x5 >- cbnz w12, .Loop_sve2_gt_16_addavg_12x\h >- ret >-endfunc >-.endm >- >-addAvg_12xN_sve2 16 >-addAvg_12xN_sve2 32 >- >-.macro addAvg_16xN_sve2 h >-function PFX(addAvg_16x\h\()_sve2) >- mov w12, #\h >- rdvl x9, #1 >- cmp x9, #16 >- bgt .vl_gt_16_addAvg_16x\h >- ptrue p0.b, vl16 >-.Loop_eq_16_sve2_addavg_16x\h\(): >- sub w12, w12, #1 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z1.b}, p0/z, [x1] >- ld1b {z2.b}, p0/z, [x0, #1, mul vl] >- ld1b {z3.b}, p0/z, [x1, #1, mul vl] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p0/m, z0.h, z1.h >- add z2.h, p0/m, z2.h, z3.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- sqrshrnb z2.b, z2.h, #7 >- add z2.b, z2.b, #0x80 >- st1b {z0.h}, p0, [x2] >- st1b {z2.h}, p0, [x2, #1, mul vl] >- add x2, x2, x5 >- cbnz w12, .Loop_eq_16_sve2_addavg_16x\h >- ret >-.vl_gt_16_addAvg_16x\h\(): >- ptrue p0.b, vl32 >-.Loop_gt_16_sve2_addavg_16x\h\(): >- sub w12, w12, #1 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z1.b}, p0/z, [x1] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p0/m, z0.h, z1.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- st1b {z0.h}, p0, [x2] >- add x2, x2, x5 >- cbnz w12, .Loop_gt_16_sve2_addavg_16x\h >- ret >-endfunc >-.endm >- >-addAvg_16xN_sve2 4 >-addAvg_16xN_sve2 8 >-addAvg_16xN_sve2 12 >-addAvg_16xN_sve2 16 >-addAvg_16xN_sve2 24 >-addAvg_16xN_sve2 32 >-addAvg_16xN_sve2 64 >- >-.macro addAvg_24xN_sve2 h >-function PFX(addAvg_24x\h\()_sve2) >- mov w12, #\h >- rdvl x9, #1 >- cmp x9, #16 >- bgt .vl_gt_16_addAvg_24x\h >- addAvg_start >-.Loop_eq_16_sve2_addavg_24x\h\(): >- sub w12, w12, #1 >- ld1 {v0.16b-v2.16b}, [x0], x3 >- ld1 {v3.16b-v5.16b}, [x1], x4 >- addavg_1 v0, v3 >- addavg_1 v1, v4 >- addavg_1 v2, v5 >- sqxtun v0.8b, v0.8h >- sqxtun v1.8b, v1.8h >- sqxtun v2.8b, v2.8h >- st1 {v0.8b-v2.8b}, [x2], x5 >- cbnz w12, .Loop_eq_16_sve2_addavg_24x\h >- ret >-.vl_gt_16_addAvg_24x\h\(): >- cmp x9, #48 >- bgt .vl_gt_48_addAvg_24x\h >- ptrue p0.b, vl32 >- ptrue p1.b, vl16 >-.Loop_gt_16_sve2_addavg_24x\h\(): >- sub w12, w12, #1 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z1.b}, p1/z, [x0, #1, mul vl] >- ld1b {z2.b}, p0/z, [x1] >- ld1b {z3.b}, p1/z, [x1, #1, mul vl] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p0/m, z0.h, z2.h >- add z1.h, p1/m, z1.h, z3.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- sqrshrnb z1.b, z1.h, #7 >- add z1.b, z1.b, #0x80 >- st1b {z0.h}, p0, [x2] >- st1b {z1.h}, p1, [x2, #1, mul vl] >- add x2, x2, x5 >- cbnz w12, .Loop_gt_16_sve2_addavg_24x\h >- ret >-.vl_gt_48_addAvg_24x\h\(): >- mov x10, #48 >- mov x11, #0 >- whilelt p0.b, x11, x10 >-.Loop_gt_48_sve2_addavg_24x\h\(): >- sub w12, w12, #1 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z2.b}, p0/z, [x1] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p0/m, z0.h, z2.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- st1b {z0.h}, p0, [x2] >- add x2, x2, x5 >- cbnz w12, .Loop_gt_48_sve2_addavg_24x\h >- ret >-endfunc >-.endm >- >-addAvg_24xN_sve2 32 >-addAvg_24xN_sve2 64 >- >-.macro addAvg_32xN_sve2 h >-function PFX(addAvg_32x\h\()_sve2) >- mov w12, #\h >- rdvl x9, #1 >- cmp x9, #16 >- bgt .vl_gt_16_addAvg_32x\h >- ptrue p0.b, vl16 >-.Loop_eq_16_sve2_addavg_32x\h\(): >- sub w12, w12, #1 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z1.b}, p0/z, [x0, #1, mul vl] >- ld1b {z2.b}, p0/z, [x0, #2, mul vl] >- ld1b {z3.b}, p0/z, [x0, #3, mul vl] >- ld1b {z4.b}, p0/z, [x1] >- ld1b {z5.b}, p0/z, [x1, #1, mul vl] >- ld1b {z6.b}, p0/z, [x1, #2, mul vl] >- 
ld1b {z7.b}, p0/z, [x1, #3, mul vl] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p0/m, z0.h, z4.h >- add z1.h, p0/m, z1.h, z5.h >- add z2.h, p0/m, z2.h, z6.h >- add z3.h, p0/m, z3.h, z7.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- sqrshrnb z1.b, z1.h, #7 >- add z1.b, z1.b, #0x80 >- sqrshrnb z2.b, z2.h, #7 >- add z2.b, z2.b, #0x80 >- sqrshrnb z3.b, z3.h, #7 >- add z3.b, z3.b, #0x80 >- st1b {z0.h}, p0, [x2] >- st1b {z1.h}, p0, [x2, #1, mul vl] >- st1b {z2.h}, p0, [x2, #2, mul vl] >- st1b {z3.h}, p0, [x2, #3, mul vl] >- add x2, x2, x5 >- cbnz w12, .Loop_eq_16_sve2_addavg_32x\h >- ret >-.vl_gt_16_addAvg_32x\h\(): >- cmp x9, #48 >- bgt .vl_gt_48_addAvg_32x\h >- ptrue p0.b, vl32 >-.Loop_gt_eq_32_sve2_addavg_32x\h\(): >- sub w12, w12, #1 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z1.b}, p0/z, [x0, #1, mul vl] >- ld1b {z2.b}, p0/z, [x1] >- ld1b {z3.b}, p0/z, [x1, #1, mul vl] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p0/m, z0.h, z2.h >- add z1.h, p0/m, z1.h, z3.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- sqrshrnb z1.b, z1.h, #7 >- add z1.b, z1.b, #0x80 >- st1b {z0.h}, p0, [x2] >- st1b {z1.h}, p0, [x2, #1, mul vl] >- add x2, x2, x5 >- cbnz w12, .Loop_gt_eq_32_sve2_addavg_32x\h >- ret >-.vl_gt_48_addAvg_32x\h\(): >- ptrue p0.b, vl64 >-.Loop_eq_64_sve2_addavg_32x\h\(): >- sub w12, w12, #1 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z1.b}, p0/z, [x1] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p0/m, z0.h, z1.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- st1b {z0.h}, p0, [x2] >- add x2, x2, x5 >- cbnz w12, .Loop_eq_64_sve2_addavg_32x\h >- ret >-endfunc >-.endm >- >-addAvg_32xN_sve2 8 >-addAvg_32xN_sve2 16 >-addAvg_32xN_sve2 24 >-addAvg_32xN_sve2 32 >-addAvg_32xN_sve2 48 >-addAvg_32xN_sve2 64 >- >-function PFX(addAvg_48x64_sve2) >- mov w12, #64 >- rdvl x9, #1 >- cmp x9, #16 >- bgt .vl_gt_16_addAvg_48x64 >- addAvg_start >- sub x3, x3, #64 >- sub x4, x4, #64 >-.Loop_eq_16_sve2_addavg_48x64: >- sub w12, w12, #1 >- ld1 {v0.8h-v3.8h}, [x0], #64 >- ld1 {v4.8h-v7.8h}, [x1], #64 >- ld1 {v20.8h-v21.8h}, [x0], x3 >- ld1 {v22.8h-v23.8h}, [x1], x4 >- addavg_1 v0, v4 >- addavg_1 v1, v5 >- addavg_1 v2, v6 >- addavg_1 v3, v7 >- addavg_1 v20, v22 >- addavg_1 v21, v23 >- sqxtun v0.8b, v0.8h >- sqxtun2 v0.16b, v1.8h >- sqxtun v1.8b, v2.8h >- sqxtun2 v1.16b, v3.8h >- sqxtun v2.8b, v20.8h >- sqxtun2 v2.16b, v21.8h >- st1 {v0.16b-v2.16b}, [x2], x5 >- cbnz w12, .Loop_eq_16_sve2_addavg_48x64 >- ret >-.vl_gt_16_addAvg_48x64: >- cmp x9, #48 >- bgt .vl_gt_48_addAvg_48x64 >- ptrue p0.b, vl32 >-.Loop_gt_eq_32_sve2_addavg_48x64: >- sub w12, w12, #1 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z1.b}, p0/z, [x0, #1, mul vl] >- ld1b {z2.b}, p0/z, [x0, #2, mul vl] >- ld1b {z4.b}, p0/z, [x1] >- ld1b {z5.b}, p0/z, [x1, #1, mul vl] >- ld1b {z6.b}, p0/z, [x1, #2, mul vl] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p0/m, z0.h, z4.h >- add z1.h, p0/m, z1.h, z5.h >- add z2.h, p0/m, z2.h, z6.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- sqrshrnb z1.b, z1.h, #7 >- add z1.b, z1.b, #0x80 >- sqrshrnb z2.b, z2.h, #7 >- add z2.b, z2.b, #0x80 >- st1b {z0.h}, p0, [x2] >- st1b {z1.h}, p0, [x2, #1, mul vl] >- st1b {z2.h}, p0, [x2, #2, mul vl] >- add x2, x2, x5 >- cbnz w12, .Loop_gt_eq_32_sve2_addavg_48x64 >- ret >-.vl_gt_48_addAvg_48x64: >- cmp x9, #112 >- bgt .vl_gt_112_addAvg_48x64 >- ptrue p0.b, vl64 >- ptrue p1.b, vl32 >-.Loop_gt_48_sve2_addavg_48x64: >- sub w12, w12, #1 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z1.b}, p1/z, [x0, #1, mul vl] >- ld1b 
{z4.b}, p0/z, [x1] >- ld1b {z5.b}, p1/z, [x1, #1, mul vl] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p0/m, z0.h, z4.h >- add z1.h, p1/m, z1.h, z5.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- sqrshrnb z1.b, z1.h, #7 >- add z1.b, z1.b, #0x80 >- st1b {z0.h}, p0, [x2] >- st1b {z1.h}, p1, [x2, #1, mul vl] >- add x2, x2, x5 >- cbnz w12, .Loop_gt_48_sve2_addavg_48x64 >- ret >-.vl_gt_112_addAvg_48x64: >- mov x10, #96 >- mov x11, #0 >- whilelt p0.b, x11, x10 >-.Loop_gt_112_sve2_addavg_48x64: >- sub w12, w12, #1 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z4.b}, p0/z, [x1] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p0/m, z0.h, z4.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- st1b {z0.h}, p0, [x2] >- add x2, x2, x5 >- cbnz w12, .Loop_gt_112_sve2_addavg_48x64 >- ret >-endfunc >- >-.macro addAvg_64xN_sve2 h >-function PFX(addAvg_64x\h\()_sve2) >- mov w12, #\h >- rdvl x9, #1 >- cmp x9, #16 >- bgt .vl_gt_16_addAvg_64x\h >- addAvg_start >- sub x3, x3, #64 >- sub x4, x4, #64 >-.Loop_eq_16_sve2_addavg_64x\h\(): >- sub w12, w12, #1 >- ld1 {v0.8h-v3.8h}, [x0], #64 >- ld1 {v4.8h-v7.8h}, [x1], #64 >- ld1 {v20.8h-v23.8h}, [x0], x3 >- ld1 {v24.8h-v27.8h}, [x1], x4 >- addavg_1 v0, v4 >- addavg_1 v1, v5 >- addavg_1 v2, v6 >- addavg_1 v3, v7 >- addavg_1 v20, v24 >- addavg_1 v21, v25 >- addavg_1 v22, v26 >- addavg_1 v23, v27 >- sqxtun v0.8b, v0.8h >- sqxtun2 v0.16b, v1.8h >- sqxtun v1.8b, v2.8h >- sqxtun2 v1.16b, v3.8h >- sqxtun v2.8b, v20.8h >- sqxtun2 v2.16b, v21.8h >- sqxtun v3.8b, v22.8h >- sqxtun2 v3.16b, v23.8h >- st1 {v0.16b-v3.16b}, [x2], x5 >- cbnz w12, .Loop_eq_16_sve2_addavg_64x\h >- ret >-.vl_gt_16_addAvg_64x\h\(): >- cmp x9, #48 >- bgt .vl_gt_48_addAvg_64x\h >- ptrue p0.b, vl32 >-.Loop_gt_eq_32_sve2_addavg_64x\h\(): >- sub w12, w12, #1 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z1.b}, p0/z, [x0, #1, mul vl] >- ld1b {z2.b}, p0/z, [x0, #2, mul vl] >- ld1b {z3.b}, p0/z, [x0, #3, mul vl] >- ld1b {z4.b}, p0/z, [x1] >- ld1b {z5.b}, p0/z, [x1, #1, mul vl] >- ld1b {z6.b}, p0/z, [x1, #2, mul vl] >- ld1b {z7.b}, p0/z, [x1, #3, mul vl] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p0/m, z0.h, z4.h >- add z1.h, p0/m, z1.h, z5.h >- add z2.h, p0/m, z2.h, z6.h >- add z3.h, p0/m, z3.h, z7.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- sqrshrnb z1.b, z1.h, #7 >- add z1.b, z1.b, #0x80 >- sqrshrnb z2.b, z2.h, #7 >- add z2.b, z2.b, #0x80 >- sqrshrnb z3.b, z3.h, #7 >- add z3.b, z3.b, #0x80 >- st1b {z0.h}, p0, [x2] >- st1b {z1.h}, p0, [x2, #1, mul vl] >- st1b {z2.h}, p0, [x2, #2, mul vl] >- st1b {z3.h}, p0, [x2, #3, mul vl] >- add x2, x2, x5 >- cbnz w12, .Loop_gt_eq_32_sve2_addavg_64x\h >- ret >-.vl_gt_48_addAvg_64x\h\(): >- cmp x9, #112 >- bgt .vl_gt_112_addAvg_64x\h >- ptrue p0.b, vl64 >-.Loop_gt_eq_48_sve2_addavg_64x\h\(): >- sub w12, w12, #1 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z1.b}, p0/z, [x0, #1, mul vl] >- ld1b {z4.b}, p0/z, [x1] >- ld1b {z5.b}, p0/z, [x1, #1, mul vl] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, p0/m, z0.h, z4.h >- add z1.h, p0/m, z1.h, z5.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- sqrshrnb z1.b, z1.h, #7 >- add z1.b, z1.b, #0x80 >- st1b {z0.h}, p0, [x2] >- st1b {z1.h}, p0, [x2, #1, mul vl] >- add x2, x2, x5 >- cbnz w12, .Loop_gt_eq_48_sve2_addavg_64x\h >- ret >-.vl_gt_112_addAvg_64x\h\(): >- ptrue p0.b, vl128 >-.Loop_gt_eq_128_sve2_addavg_64x\h\(): >- sub w12, w12, #1 >- ld1b {z0.b}, p0/z, [x0] >- ld1b {z4.b}, p0/z, [x1] >- add x0, x0, x3, lsl #1 >- add x1, x1, x4, lsl #1 >- add z0.h, 
p0/m, z0.h, z4.h >- sqrshrnb z0.b, z0.h, #7 >- add z0.b, z0.b, #0x80 >- st1b {z0.h}, p0, [x2] >- add x2, x2, x5 >- cbnz w12, .Loop_gt_eq_128_sve2_addavg_64x\h >- ret >-endfunc >-.endm >- >-addAvg_64xN_sve2 16 >-addAvg_64xN_sve2 32 >-addAvg_64xN_sve2 48 >-addAvg_64xN_sve2 64 >diff --git a/source/common/aarch64/mc-a.S b/source/common/aarch64/mc-a.S >index 130bf1a4a..876228473 100644 >--- a/source/common/aarch64/mc-a.S >+++ b/source/common/aarch64/mc-a.S >@@ -214,344 +214,3 @@ pixel_avg_pp_64xN_neon 16 > pixel_avg_pp_64xN_neon 32 > pixel_avg_pp_64xN_neon 48 > pixel_avg_pp_64xN_neon 64 >- >-// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t >src0Stride, intptr_t src1Stride, intptr_t dstStride) >-.macro addAvg_2xN h >-function PFX(addAvg_2x\h\()_neon) >- addAvg_start >-.rept \h / 2 >- ldr w10, [x0] >- ldr w11, [x1] >- add x0, x0, x3 >- add x1, x1, x4 >- ldr w12, [x0] >- ldr w13, [x1] >- add x0, x0, x3 >- add x1, x1, x4 >- dup v0.2s, w10 >- dup v1.2s, w11 >- dup v2.2s, w12 >- dup v3.2s, w13 >- add v0.4h, v0.4h, v1.4h >- add v2.4h, v2.4h, v3.4h >- saddl v0.4s, v0.4h, v30.4h >- saddl v2.4s, v2.4h, v30.4h >- shrn v0.4h, v0.4s, #7 >- shrn2 v0.8h, v2.4s, #7 >- sqxtun v0.8b, v0.8h >- st1 {v0.h}[0], [x2], x5 >- st1 {v0.h}[2], [x2], x5 >-.endr >- ret >-endfunc >-.endm >- >-addAvg_2xN 4 >-addAvg_2xN 8 >-addAvg_2xN 16 >- >-.macro addAvg_4xN h >-function PFX(addAvg_4x\h\()_neon) >- addAvg_start >-.rept \h / 2 >- ld1 {v0.8b}, [x0], x3 >- ld1 {v1.8b}, [x1], x4 >- ld1 {v2.8b}, [x0], x3 >- ld1 {v3.8b}, [x1], x4 >- add v0.4h, v0.4h, v1.4h >- add v2.4h, v2.4h, v3.4h >- saddl v0.4s, v0.4h, v30.4h >- saddl v2.4s, v2.4h, v30.4h >- shrn v0.4h, v0.4s, #7 >- shrn2 v0.8h, v2.4s, #7 >- sqxtun v0.8b, v0.8h >- st1 {v0.s}[0], [x2], x5 >- st1 {v0.s}[1], [x2], x5 >-.endr >- ret >-endfunc >-.endm >- >-addAvg_4xN 2 >-addAvg_4xN 4 >-addAvg_4xN 8 >-addAvg_4xN 16 >-addAvg_4xN 32 >- >-.macro addAvg_6xN h >-function PFX(addAvg_6x\h\()_neon) >- addAvg_start >- mov w12, #\h / 2 >- sub x5, x5, #4 >-.Loop_addavg_6x\h: >- sub w12, w12, #1 >- ld1 {v0.16b}, [x0], x3 >- ld1 {v1.16b}, [x1], x4 >- ld1 {v2.16b}, [x0], x3 >- ld1 {v3.16b}, [x1], x4 >- add v0.8h, v0.8h, v1.8h >- add v2.8h, v2.8h, v3.8h >- saddl v16.4s, v0.4h, v30.4h >- saddl2 v17.4s, v0.8h, v30.8h >- saddl v18.4s, v2.4h, v30.4h >- saddl2 v19.4s, v2.8h, v30.8h >- shrn v0.4h, v16.4s, #7 >- shrn2 v0.8h, v17.4s, #7 >- shrn v1.4h, v18.4s, #7 >- shrn2 v1.8h, v19.4s, #7 >- sqxtun v0.8b, v0.8h >- sqxtun v1.8b, v1.8h >- str s0, [x2], #4 >- st1 {v0.h}[2], [x2], x5 >- str s1, [x2], #4 >- st1 {v1.h}[2], [x2], x5 >- cbnz w12, .Loop_addavg_6x\h >- ret >-endfunc >-.endm >- >-addAvg_6xN 8 >-addAvg_6xN 16 >- >-.macro addAvg_8xN h >-function PFX(addAvg_8x\h\()_neon) >- addAvg_start >-.rept \h / 2 >- ld1 {v0.16b}, [x0], x3 >- ld1 {v1.16b}, [x1], x4 >- ld1 {v2.16b}, [x0], x3 >- ld1 {v3.16b}, [x1], x4 >- add v0.8h, v0.8h, v1.8h >- add v2.8h, v2.8h, v3.8h >- saddl v16.4s, v0.4h, v30.4h >- saddl2 v17.4s, v0.8h, v30.8h >- saddl v18.4s, v2.4h, v30.4h >- saddl2 v19.4s, v2.8h, v30.8h >- shrn v0.4h, v16.4s, #7 >- shrn2 v0.8h, v17.4s, #7 >- shrn v1.4h, v18.4s, #7 >- shrn2 v1.8h, v19.4s, #7 >- sqxtun v0.8b, v0.8h >- sqxtun v1.8b, v1.8h >- st1 {v0.8b}, [x2], x5 >- st1 {v1.8b}, [x2], x5 >-.endr >- ret >-endfunc >-.endm >- >-.macro addAvg_8xN1 h >-function PFX(addAvg_8x\h\()_neon) >- addAvg_start >- mov w12, #\h / 2 >-.Loop_addavg_8x\h: >- sub w12, w12, #1 >- ld1 {v0.16b}, [x0], x3 >- ld1 {v1.16b}, [x1], x4 >- ld1 {v2.16b}, [x0], x3 >- ld1 {v3.16b}, [x1], x4 >- add v0.8h, v0.8h, v1.8h 
>- add v2.8h, v2.8h, v3.8h >- saddl v16.4s, v0.4h, v30.4h >- saddl2 v17.4s, v0.8h, v30.8h >- saddl v18.4s, v2.4h, v30.4h >- saddl2 v19.4s, v2.8h, v30.8h >- shrn v0.4h, v16.4s, #7 >- shrn2 v0.8h, v17.4s, #7 >- shrn v1.4h, v18.4s, #7 >- shrn2 v1.8h, v19.4s, #7 >- sqxtun v0.8b, v0.8h >- sqxtun v1.8b, v1.8h >- st1 {v0.8b}, [x2], x5 >- st1 {v1.8b}, [x2], x5 >- cbnz w12, .Loop_addavg_8x\h >- ret >-endfunc >-.endm >- >-addAvg_8xN 2 >-addAvg_8xN 4 >-addAvg_8xN 6 >-addAvg_8xN 8 >-addAvg_8xN 12 >-addAvg_8xN 16 >-addAvg_8xN1 32 >-addAvg_8xN1 64 >- >-.macro addAvg_12xN h >-function PFX(addAvg_12x\h\()_neon) >- addAvg_start >- sub x3, x3, #16 >- sub x4, x4, #16 >- sub x5, x5, #8 >- mov w12, #\h >-.Loop_addAvg_12X\h\(): >- sub w12, w12, #1 >- ld1 {v0.16b}, [x0], #16 >- ld1 {v1.16b}, [x1], #16 >- ld1 {v2.8b}, [x0], x3 >- ld1 {v3.8b}, [x1], x4 >- add v0.8h, v0.8h, v1.8h >- add v2.4h, v2.4h, v3.4h >- saddl v16.4s, v0.4h, v30.4h >- saddl2 v17.4s, v0.8h, v30.8h >- saddl v18.4s, v2.4h, v30.4h >- shrn v0.4h, v16.4s, #7 >- shrn2 v0.8h, v17.4s, #7 >- shrn v1.4h, v18.4s, #7 >- sqxtun v0.8b, v0.8h >- sqxtun v1.8b, v1.8h >- st1 {v0.8b}, [x2], #8 >- st1 {v1.s}[0], [x2], x5 >- cbnz w12, .Loop_addAvg_12X\h >- ret >-endfunc >-.endm >- >-addAvg_12xN 16 >-addAvg_12xN 32 >- >-.macro addAvg_16xN h >-function PFX(addAvg_16x\h\()_neon) >- addAvg_start >- mov w12, #\h >-.Loop_addavg_16x\h: >- sub w12, w12, #1 >- ld1 {v0.8h-v1.8h}, [x0], x3 >- ld1 {v2.8h-v3.8h}, [x1], x4 >- addavg_1 v0, v2 >- addavg_1 v1, v3 >- sqxtun v0.8b, v0.8h >- sqxtun2 v0.16b, v1.8h >- st1 {v0.16b}, [x2], x5 >- cbnz w12, .Loop_addavg_16x\h >- ret >-endfunc >-.endm >- >-addAvg_16xN 4 >-addAvg_16xN 8 >-addAvg_16xN 12 >-addAvg_16xN 16 >-addAvg_16xN 24 >-addAvg_16xN 32 >-addAvg_16xN 64 >- >-.macro addAvg_24xN h >-function PFX(addAvg_24x\h\()_neon) >- addAvg_start >- mov w12, #\h >-.Loop_addavg_24x\h\(): >- sub w12, w12, #1 >- ld1 {v0.16b-v2.16b}, [x0], x3 >- ld1 {v3.16b-v5.16b}, [x1], x4 >- addavg_1 v0, v3 >- addavg_1 v1, v4 >- addavg_1 v2, v5 >- sqxtun v0.8b, v0.8h >- sqxtun v1.8b, v1.8h >- sqxtun v2.8b, v2.8h >- st1 {v0.8b-v2.8b}, [x2], x5 >- cbnz w12, .Loop_addavg_24x\h >- ret >-endfunc >-.endm >- >-addAvg_24xN 32 >-addAvg_24xN 64 >- >-.macro addAvg_32xN h >-function PFX(addAvg_32x\h\()_neon) >- addAvg_start >- mov w12, #\h >-.Loop_addavg_32x\h\(): >- sub w12, w12, #1 >- ld1 {v0.8h-v3.8h}, [x0], x3 >- ld1 {v4.8h-v7.8h}, [x1], x4 >- addavg_1 v0, v4 >- addavg_1 v1, v5 >- addavg_1 v2, v6 >- addavg_1 v3, v7 >- sqxtun v0.8b, v0.8h >- sqxtun v1.8b, v1.8h >- sqxtun v2.8b, v2.8h >- sqxtun v3.8b, v3.8h >- st1 {v0.8b-v3.8b}, [x2], x5 >- cbnz w12, .Loop_addavg_32x\h >- ret >-endfunc >-.endm >- >-addAvg_32xN 8 >-addAvg_32xN 16 >-addAvg_32xN 24 >-addAvg_32xN 32 >-addAvg_32xN 48 >-addAvg_32xN 64 >- >-function PFX(addAvg_48x64_neon) >- addAvg_start >- sub x3, x3, #64 >- sub x4, x4, #64 >- mov w12, #64 >-.Loop_addavg_48x64: >- sub w12, w12, #1 >- ld1 {v0.8h-v3.8h}, [x0], #64 >- ld1 {v4.8h-v7.8h}, [x1], #64 >- ld1 {v20.8h-v21.8h}, [x0], x3 >- ld1 {v22.8h-v23.8h}, [x1], x4 >- addavg_1 v0, v4 >- addavg_1 v1, v5 >- addavg_1 v2, v6 >- addavg_1 v3, v7 >- addavg_1 v20, v22 >- addavg_1 v21, v23 >- sqxtun v0.8b, v0.8h >- sqxtun2 v0.16b, v1.8h >- sqxtun v1.8b, v2.8h >- sqxtun2 v1.16b, v3.8h >- sqxtun v2.8b, v20.8h >- sqxtun2 v2.16b, v21.8h >- st1 {v0.16b-v2.16b}, [x2], x5 >- cbnz w12, .Loop_addavg_48x64 >- ret >-endfunc >- >-.macro addAvg_64xN h >-function PFX(addAvg_64x\h\()_neon) >- addAvg_start >- mov w12, #\h >- sub x3, x3, #64 >- sub x4, x4, #64 >-.Loop_addavg_64x\h\(): >- 
sub w12, w12, #1 >- ld1 {v0.8h-v3.8h}, [x0], #64 >- ld1 {v4.8h-v7.8h}, [x1], #64 >- ld1 {v20.8h-v23.8h}, [x0], x3 >- ld1 {v24.8h-v27.8h}, [x1], x4 >- addavg_1 v0, v4 >- addavg_1 v1, v5 >- addavg_1 v2, v6 >- addavg_1 v3, v7 >- addavg_1 v20, v24 >- addavg_1 v21, v25 >- addavg_1 v22, v26 >- addavg_1 v23, v27 >- sqxtun v0.8b, v0.8h >- sqxtun2 v0.16b, v1.8h >- sqxtun v1.8b, v2.8h >- sqxtun2 v1.16b, v3.8h >- sqxtun v2.8b, v20.8h >- sqxtun2 v2.16b, v21.8h >- sqxtun v3.8b, v22.8h >- sqxtun2 v3.16b, v23.8h >- st1 {v0.16b-v3.16b}, [x2], x5 >- cbnz w12, .Loop_addavg_64x\h >- ret >-endfunc >-.endm >- >-addAvg_64xN 16 >-addAvg_64xN 32 >-addAvg_64xN 48 >-addAvg_64xN 64 >diff --git a/source/common/aarch64/mem-neon.h >b/source/common/aarch64/mem-neon.h >index 8bd5fbee9..27d72b70c 100644 >--- a/source/common/aarch64/mem-neon.h >+++ b/source/common/aarch64/mem-neon.h >@@ -74,6 +74,26 @@ static void inline store_u8x4x1(uint8_t *d, const uint8x8_t >s) > vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s), 0); > } > >+// Store 2 bytes from the low half of a uint8x8_t. >+static void inline store_u8x2x1(uint8_t *d, const uint8x8_t s) >+{ >+ vst1_lane_u16((uint16_t *)d, vreinterpret_u16_u8(s), 0); >+} >+ >+// Load 2 int16_t into a int16x8_t. >+static inline int16x8_t load_s16x2x1(const int16_t *p) >+{ >+ int32x4_t ret = vld1q_lane_s32((const int32_t *)p, vdupq_n_s32(0), 0); >+ >+ return vreinterpretq_s16_s32(ret); >+} >+ >+// Store 2 uint16_t from the low half of a uint16x8_t. >+static inline void store_u16x2x1(const uint16_t *d, const uint16x8_t s) >+{ >+ vst1q_lane_u32((uint32_t *)d, vreinterpretq_u32_u16(s), 0); >+} >+ > // Store N blocks of 32-bits from (N / 2) D-Registers. > template<int N> > static void inline store_u8x4_strided_xN(uint8_t *d, intptr_t stride, >diff --git a/source/common/aarch64/pixel-prim.cpp >b/source/common/aarch64/pixel-prim.cpp >index f4df6786e..ef7861284 100644 >--- a/source/common/aarch64/pixel-prim.cpp >+++ b/source/common/aarch64/pixel-prim.cpp >@@ -1145,49 +1145,138 @@ void pixel_add_ps_neon(pixel *a, intptr_t dstride, >const pixel *b0, const int16_ > } > } > >-template<int bx, int by> >-void addAvg_neon(const int16_t *src0, const int16_t *src1, pixel *dst, >intptr_t src0Stride, intptr_t src1Stride, >- intptr_t dstStride) >+template<int width, int height> >+void addAvg_neon(const int16_t *src0, const int16_t *src1, pixel *dst, >+ intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) > { >- > const int shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH; >- const int offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS; >+ const int offset = 2 * IF_INTERNAL_OFFS; > >- const int32x4_t addon = vdupq_n_s32(offset); >- for (int y = 0; y < by; y++) >+#if HIGH_BIT_DEPTH >+ const int16x8_t addon = vdupq_n_s16(offset >> shiftNum); >+ >+ for (int h = 0; h < height; h++) > { >- int x = 0; >+ int w = 0; >+ for (; w + 16 <= width; w += 16) >+ { >+ int16x8_t s0[2], s1[2]; >+ load_s16x8xn<2>(src0 + w, 8, s0); >+ load_s16x8xn<2>(src1 + w, 8, s1); > >- for (; (x + 8) <= bx; x += 8) >+ int16x8_t d0_lo = vrsraq_n_s16(addon, vaddq_s16(s0[0], s1[0]), >shiftNum); >+ int16x8_t d0_hi = vrsraq_n_s16(addon, vaddq_s16(s0[1], s1[1]), >shiftNum); >+ >+ d0_lo = vminq_s16(d0_lo, vdupq_n_s16((1 << X265_DEPTH) - 1)); >+ d0_lo = vmaxq_s16(d0_lo, vdupq_n_s16(0)); >+ d0_hi = vminq_s16(d0_hi, vdupq_n_s16((1 << X265_DEPTH) - 1)); >+ d0_hi = vmaxq_s16(d0_hi, vdupq_n_s16(0)); >+ >+ vst1q_u16(dst + w, vreinterpretq_u16_s16(d0_lo)); >+ vst1q_u16(dst + w + 8, vreinterpretq_u16_s16(d0_hi)); >+ } >+ if (width & 8) > { >- 
int16x8_t in0 = vld1q_s16(src0 + x); >- int16x8_t in1 = vld1q_s16(src1 + x); >- int32x4_t t1 = vaddl_s16(vget_low_s16(in0), vget_low_s16(in1)); >- int32x4_t t2 = vaddl_high_s16(in0, in1); >- t1 = vaddq_s32(t1, addon); >- t2 = vaddq_s32(t2, addon); >- t1 = vshrq_n_s32(t1, shiftNum); >- t2 = vshrq_n_s32(t2, shiftNum); >- int16x8_t t = vuzp1q_s16(vreinterpretq_s16_s32(t1), >- vreinterpretq_s16_s32(t2)); >-#if HIGH_BIT_DEPTH >- t = vminq_s16(t, vdupq_n_s16((1 << X265_DEPTH) - 1)); >- t = vmaxq_s16(t, vdupq_n_s16(0)); >- vst1q_u16(dst + x, vreinterpretq_u16_s16(t)); >-#else >- vst1_u8(dst + x, vqmovun_s16(t)); >-#endif >+ int16x8_t s0 = vld1q_s16(src0 + w); >+ int16x8_t s1 = vld1q_s16(src1 + w); >+ >+ int16x8_t d0 = vrsraq_n_s16(addon, vaddq_s16(s0, s1), shiftNum); >+ d0 = vminq_s16(d0, vdupq_n_s16((1 << X265_DEPTH) - 1)); >+ d0 = vmaxq_s16(d0, vdupq_n_s16(0)); >+ >+ vst1q_u16(dst + w, vreinterpretq_u16_s16(d0)); >+ >+ w += 8; >+ } >+ if (width & 4) >+ { >+ int16x4_t s0 = vld1_s16(src0 + w); >+ int16x4_t s1 = vld1_s16(src1 + w); >+ >+ int16x4_t d0 = vrsra_n_s16(vget_low_s16(addon), vadd_s16(s0, s1), >shiftNum); >+ d0 = vmin_s16(d0, vdup_n_s16((1 << X265_DEPTH) - 1)); >+ d0 = vmax_s16(d0, vdup_n_s16(0)); >+ >+ vst1_u16(dst + w, vreinterpret_u16_s16(d0)); >+ >+ w += 4; >+ } >+ if (width & 2) >+ { >+ int16x8_t s0 = load_s16x2x1(src0 + w); >+ int16x8_t s1 = load_s16x2x1(src1 + w); >+ >+ int16x8_t d0 = vrsraq_n_s16(addon, vaddq_s16(s0, s1), shiftNum); >+ d0 = vminq_s16(d0, vdupq_n_s16((1 << X265_DEPTH) - 1)); >+ d0 = vmaxq_s16(d0, vdupq_n_s16(0)); >+ >+ store_u16x2x1(dst + w, vreinterpretq_u16_s16(d0)); >+ } >+ >+ src0 += src0Stride; >+ src1 += src1Stride; >+ dst += dstStride; >+ } >+#else // !HIGH_BIT_DEPTH >+ const uint8x8_t addon = vdup_n_u8(offset >> shiftNum); >+ >+ for (int h = 0; h < height; h++) >+ { >+ int w = 0; >+ for (; w + 16 <= width; w += 16) >+ { >+ int16x8_t s0[2], s1[2]; >+ load_s16x8xn<2>(src0 + w, 8, s0); >+ load_s16x8xn<2>(src1 + w, 8, s1); >+ >+ int8x8_t sum01_s8_lo = vqrshrn_n_s16(vaddq_s16(s0[0], s1[0]), >shiftNum); >+ int8x8_t sum01_s8_hi = vqrshrn_n_s16(vaddq_s16(s0[1], s1[1]), >shiftNum); >+ uint8x8_t d0_lo = vadd_u8(vreinterpret_u8_s8(sum01_s8_lo), addon); >+ uint8x8_t d0_hi = vadd_u8(vreinterpret_u8_s8(sum01_s8_hi), addon); >+ >+ vst1_u8(dst + w, d0_lo); >+ vst1_u8(dst + w + 8, d0_hi); >+ } >+ if (width & 8) >+ { >+ int16x8_t s0 = vld1q_s16(src0 + w); >+ int16x8_t s1 = vld1q_s16(src1 + w); >+ >+ int8x8_t sum01_s8 = vqrshrn_n_s16(vaddq_s16(s0, s1), shiftNum); >+ uint8x8_t d0 = vadd_u8(vreinterpret_u8_s8(sum01_s8), addon); >+ >+ vst1_u8(dst + w, d0); >+ >+ w += 8; > } >- for (; x < bx; x += 2) >+ if (width & 4) > { >- dst[x + 0] = x265_clip((src0[x + 0] + src1[x + 0] + offset) >> >shiftNum); >- dst[x + 1] = x265_clip((src0[x + 1] + src1[x + 1] + offset) >> >shiftNum); >+ int16x8_t s0 = vcombine_s16(vld1_s16(src0 + w), vdup_n_s16(0)); >+ int16x8_t s1 = vcombine_s16(vld1_s16(src1 + w), vdup_n_s16(0)); >+ >+ int8x8_t sum01_s8 = vqrshrn_n_s16(vaddq_s16(s0, s1), shiftNum); >+ uint8x8_t d0 = vadd_u8(vreinterpret_u8_s8(sum01_s8), addon); >+ >+ store_u8x4x1(dst + w, d0); >+ >+ w += 4; >+ } >+ if (width & 2) >+ { >+ int16x8_t s0 = load_s16x2x1(src0 + w); >+ int16x8_t s1 = load_s16x2x1(src1 + w); >+ >+ int8x8_t sum01_s8 = vqrshrn_n_s16(vaddq_s16(s0, s1), shiftNum); >+ uint8x8_t d0 = vadd_u8(vreinterpret_u8_s8(sum01_s8), addon); >+ >+ store_u8x2x1(dst + w, d0); > } > > src0 += src0Stride; > src1 += src1Stride; > dst += dstStride; > } >+#endif > } > > void planecopy_cp_neon(const 
uint8_t *src, intptr_t srcStride, pixel *dst, >@@ -2057,29 +2146,30 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p) > p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].copy_pp = > blockcopy_pp_neon<W, H>; \ > > >- CHROMA_PU_420(4, 4); >- CHROMA_PU_420(8, 8); >- CHROMA_PU_420(16, 16); >- CHROMA_PU_420(32, 32); >+ CHROMA_PU_420(2, 4); >+ CHROMA_PU_420(2, 8); > CHROMA_PU_420(4, 2); >- CHROMA_PU_420(8, 4); >+ CHROMA_PU_420(4, 4); > CHROMA_PU_420(4, 8); >- CHROMA_PU_420(8, 6); > CHROMA_PU_420(6, 8); >+ CHROMA_PU_420(4, 16); > CHROMA_PU_420(8, 2); >- CHROMA_PU_420(2, 8); >- CHROMA_PU_420(16, 8); >- CHROMA_PU_420(8, 16); >- CHROMA_PU_420(16, 12); >+ CHROMA_PU_420(8, 4); >+ CHROMA_PU_420(8, 6); >+ CHROMA_PU_420(8, 8); >+ CHROMA_PU_420(8, 16); >+ CHROMA_PU_420(8, 32); > CHROMA_PU_420(12, 16); > CHROMA_PU_420(16, 4); >- CHROMA_PU_420(4, 16); >- CHROMA_PU_420(32, 16); >+ CHROMA_PU_420(16, 8); >+ CHROMA_PU_420(16, 12); >+ CHROMA_PU_420(16, 16); > CHROMA_PU_420(16, 32); >- CHROMA_PU_420(32, 24); > CHROMA_PU_420(24, 32); > CHROMA_PU_420(32, 8); >- CHROMA_PU_420(8, 32); >+ CHROMA_PU_420(32, 16); >+ CHROMA_PU_420(32, 24); >+ CHROMA_PU_420(32, 32); > > > >@@ -2161,30 +2251,31 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p) > p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].copy_pp = > blockcopy_pp_neon<W, H>; \ > > >- CHROMA_PU_422(4, 8); >- CHROMA_PU_422(8, 16); >- CHROMA_PU_422(16, 32); >- CHROMA_PU_422(32, 64); >- CHROMA_PU_422(4, 4); >+ CHROMA_PU_422(2, 4); > CHROMA_PU_422(2, 8); >- CHROMA_PU_422(8, 8); >+ CHROMA_PU_422(2, 16); >+ CHROMA_PU_422(4, 4); >+ CHROMA_PU_422(4, 8); > CHROMA_PU_422(4, 16); >- CHROMA_PU_422(8, 12); >- CHROMA_PU_422(6, 16); >+ CHROMA_PU_422(4, 32); > CHROMA_PU_422(8, 4); >- CHROMA_PU_422(2, 16); >- CHROMA_PU_422(16, 16); >+ CHROMA_PU_422(8, 8); >+ CHROMA_PU_422(8, 12); >+ CHROMA_PU_422(8, 16); > CHROMA_PU_422(8, 32); >- CHROMA_PU_422(16, 24); >+ CHROMA_PU_422(8, 64); >+ CHROMA_PU_422(6, 16); > CHROMA_PU_422(12, 32); > CHROMA_PU_422(16, 8); >- CHROMA_PU_422(4, 32); >- CHROMA_PU_422(32, 32); >+ CHROMA_PU_422(16, 16); >+ CHROMA_PU_422(16, 24); >+ CHROMA_PU_422(16, 32); > CHROMA_PU_422(16, 64); >- CHROMA_PU_422(32, 48); > CHROMA_PU_422(24, 64); > CHROMA_PU_422(32, 16); >- CHROMA_PU_422(8, 64); >+ CHROMA_PU_422(32, 32); >+ CHROMA_PU_422(32, 48); >+ CHROMA_PU_422(32, 64); > > > p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].satd = NULL; >-- >2.39.5 (Apple Git-154) >
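For readers skimming the patch above, the core change in the 8-bit path is that the old widen/add/shift sequence is collapsed into a single saturating rounding shift-right-narrow, with the 2 * IF_INTERNAL_OFFS bias folded into a +128 correction after the shift. Below is only an illustrative, standalone sketch of that idea, not code from the patch (the function name is made up for the example), assuming x265's 8-bit constants (IF_INTERNAL_PREC = 14, X265_DEPTH = 8, hence shiftNum = 7):

#include <arm_neon.h>
#include <stdint.h>

// Average one row of 8 pre-interpolation int16_t samples into 8-bit pixels.
static inline void addavg_u8x8_sketch(const int16_t *src0, const int16_t *src1,
                                      uint8_t *dst)
{
    int16x8_t s0 = vld1q_s16(src0);
    int16x8_t s1 = vld1q_s16(src1);

    // Old approach: widen to 32 bits, add the rounding offset, shift right,
    // then narrow back down:
    //   int32x4_t lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s1));
    //   int32x4_t hi = vaddl_high_s16(s0, s1);
    //   ... add offset, vshrq_n_s32(..., 7), vuzp1q_s16, vqmovun_s16 ...

    // New approach: SQRSHRN performs the +64 rounding and the >> 7 in one
    // instruction; the remaining 2 * IF_INTERNAL_OFFS bias reduces to +128.
    int8x8_t  sum = vqrshrn_n_s16(vaddq_s16(s0, s1), 7);
    uint8x8_t d   = vadd_u8(vreinterpret_u8_s8(sum), vdup_n_u8(128));

    vst1_u8(dst, d);
}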
