From 3c23c6b9eb2721f2b4437d81255ad33f402ac06f Mon Sep 17 00:00:00 2001
Message-Id: <3c23c6b9eb2721f2b4437d81255ad33f402ac06f.1747668338.git.li.zha...@arm.com>
In-Reply-To: <[email protected]>
References: <[email protected]>
From: Li Zhang <[email protected]>
Date: Thu, 8 May 2025 19:17:00 +0200
Subject: [PATCH 3/8] AArch64: Implement blockcopy_ss primitives using Neon intrinsics

Delete the Neon and SVE assembly implementations of these kernels as
they are no faster, and only serve to increase binary size.

Co-authored by: Jonathan Wright <[email protected]>
---
 source/common/aarch64/asm-primitives.cpp |  28 ---
 source/common/aarch64/blockcopy8-sve.S   | 207 -----------------------
 source/common/aarch64/blockcopy8.S       | 120 -------------
 source/common/aarch64/pixel-prim.cpp     |  40 ++++-
 4 files changed, 39 insertions(+), 356 deletions(-)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 463da8319..981c6352a 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -404,13 +404,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
     ALL_CHROMA_444_PU(p2s[NONALIGNED], filterPixelToShort, neon);
     ALL_LUMA_PU(convert_p2s[NONALIGNED], filterPixelToShort, neon);
 
-    // Blockcopy_ss
-    p.cu[BLOCK_4x4].copy_ss = PFX(blockcopy_ss_4x4_neon);
-    p.cu[BLOCK_8x8].copy_ss = PFX(blockcopy_ss_8x8_neon);
-    p.cu[BLOCK_16x16].copy_ss = PFX(blockcopy_ss_16x16_neon);
-    p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_neon);
-    p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_neon);
-
     // Blockcopy_sp
     p.cu[BLOCK_4x4].copy_sp = PFX(blockcopy_sp_4x4_neon);
     p.cu[BLOCK_8x8].copy_sp = PFX(blockcopy_sp_8x8_neon);
@@ -418,16 +411,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
     p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
     p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_neon);
 
-    // chroma blockcopy_ss
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_ss = PFX(blockcopy_ss_4x4_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_ss = PFX(blockcopy_ss_8x8_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ss = PFX(blockcopy_ss_16x16_neon);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_ss = PFX(blockcopy_ss_4x8_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_ss = PFX(blockcopy_ss_8x16_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ss = PFX(blockcopy_ss_16x32_neon);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_neon);
-
     // chroma blockcopy_sp
     p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_sp = PFX(blockcopy_sp_4x4_neon);
     p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_sp = PFX(blockcopy_sp_8x8_neon);
@@ -656,23 +639,12 @@ void setupSvePrimitives(EncoderPrimitives &p)
     CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
     LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2s[NONALIGNED]);
 
-    // Blockcopy_ss
-    p.cu[BLOCK_16x16].copy_ss = PFX(blockcopy_ss_16x16_sve);
-    p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_sve);
-    p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_sve);
-
     // Blockcopy_sp
     p.cu[BLOCK_4x4].copy_sp = PFX(blockcopy_sp_4x4_sve);
     p.cu[BLOCK_8x8].copy_sp = PFX(blockcopy_sp_8x8_sve);
     p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_sve);
     p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_sve);
 
-    // chroma blockcopy_ss
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ss = PFX(blockcopy_ss_16x16_sve);
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_sve);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ss = PFX(blockcopy_ss_16x32_sve);
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_sve);
-
     // chroma blockcopy_sp
     p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_sp = PFX(blockcopy_sp_4x4_sve);
     p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_sp = PFX(blockcopy_sp_8x8_sve);
diff --git a/source/common/aarch64/blockcopy8-sve.S b/source/common/aarch64/blockcopy8-sve.S
index 9f9406e6e..976d80dd1 100644
--- a/source/common/aarch64/blockcopy8-sve.S
+++ b/source/common/aarch64/blockcopy8-sve.S
@@ -166,213 +166,6 @@ function PFX(blockcopy_sp_32x32_sve)
     ret
 endfunc
 
-function PFX(blockcopy_ss_16x16_sve)
-    rdvl x9, #1
-    cmp x9, #16
-    bgt .vl_gt_16_blockcopy_ss_16_16
-    lsl x1, x1, #1
-    lsl x3, x3, #1
-.rept 8
-    ld1 {v0.8h-v1.8h}, [x2], x3
-    ld1 {v2.8h-v3.8h}, [x2], x3
-    st1 {v0.8h-v1.8h}, [x0], x1
-    st1 {v2.8h-v3.8h}, [x0], x1
-.endr
-    ret
-.vl_gt_16_blockcopy_ss_16_16:
-    ptrue p0.h, vl16
-.rept 16
-    ld1h {z0.h}, p0/z, [x2]
-    st1h {z0.h}, p0, [x0]
-    add x2, x2, x3, lsl #1
-    add x0, x0, x1, lsl #1
-.endr
-    ret
-endfunc
-
-function PFX(blockcopy_ss_32x32_sve)
-    rdvl x9, #1
-    cmp x9, #16
-    bgt .vl_gt_16_blockcopy_ss_32_32
-    lsl x1, x1, #1
-    lsl x3, x3, #1
-    mov w12, #4
-.Loop_css32_sve:
-    sub w12, w12, #1
-.rept 8
-    ld1 {v0.8h-v3.8h}, [x2], x3
-    st1 {v0.8h-v3.8h}, [x0], x1
-.endr
-    cbnz w12, .Loop_css32_sve
-    ret
-.vl_gt_16_blockcopy_ss_32_32:
-    cmp x9, #48
-    bgt .vl_gt_48_blockcopy_ss_32_32
-    ptrue p0.h, vl16
-.rept 32
-    ld1h {z0.h}, p0/z, [x2]
-    ld1h {z1.h}, p0/z, [x2, #1, mul vl]
-    st1h {z0.h}, p0, [x0]
-    st1h {z1.h}, p0, [x0, #1, mul vl]
-    add x2, x2, x3, lsl #1
-    add x0, x0, x1, lsl #1
-.endr
-    ret
-.vl_gt_48_blockcopy_ss_32_32:
-    ptrue p0.h, vl32
-.rept 32
-    ld1h {z0.h}, p0/z, [x2]
-    st1h {z0.h}, p0, [x0]
-    add x2, x2, x3, lsl #1
-    add x0, x0, x1, lsl #1
-.endr
-    ret
-endfunc
-
-function PFX(blockcopy_ss_64x64_sve)
-    rdvl x9, #1
-    cmp x9, #16
-    bgt .vl_gt_16_blockcopy_ss_64_64
-    lsl x1, x1, #1
-    sub x1, x1, #64
-    lsl x3, x3, #1
-    sub x3, x3, #64
-    mov w12, #8
-.Loop_css64_sve:
-    sub w12, w12, #1
-.rept 8
-    ld1 {v0.8h-v3.8h}, [x2], #64
-    ld1 {v4.8h-v7.8h}, [x2], x3
-    st1 {v0.8h-v3.8h}, [x0], #64
-    st1 {v4.8h-v7.8h}, [x0], x1
-.endr
-    cbnz w12, .Loop_css64_sve
-    ret
-.vl_gt_16_blockcopy_ss_64_64:
-    cmp x9, #48
-    bgt .vl_gt_48_blockcopy_ss_64_64
-    mov w12, #8
-    ptrue p0.b, vl32
-.vl_gt_16_loop_css64_sve:
-    sub w12, w12, #1
-.rept 8
-    ld1b {z0.b}, p0/z, [x2]
-    ld1b {z1.b}, p0/z, [x2, #1, mul vl]
-    ld1b {z2.b}, p0/z, [x2, #2, mul vl]
-    ld1b {z3.b}, p0/z, [x2, #3, mul vl]
-    st1b {z0.b}, p0, [x0]
-    st1b {z1.b}, p0, [x0, #1, mul vl]
-    st1b {z2.b}, p0, [x0, #2, mul vl]
-    st1b {z3.b}, p0, [x0, #3, mul vl]
-    add x2, x2, x3, lsl #1
-    add x0, x0, x1, lsl #1
-.endr
-    cbnz w12, .vl_gt_16_loop_css64_sve
-    ret
-.vl_gt_48_blockcopy_ss_64_64:
-    cmp x9, #112
-    bgt .vl_gt_112_blockcopy_ss_64_64
-    mov w12, #8
-    ptrue p0.b, vl64
-.vl_gt_48_loop_css64_sve:
-    sub w12, w12, #1
-.rept 8
-    ld1b {z0.b}, p0/z, [x2]
-    ld1b {z1.b}, p0/z, [x2, #1, mul vl]
-    st1b {z0.b}, p0, [x0]
-    st1b {z1.b}, p0, [x0, #1, mul vl]
-    add x2, x2, x3, lsl #1
-    add x0, x0, x1, lsl #1
-.endr
-    cbnz w12, .vl_gt_48_loop_css64_sve
-    ret
-.vl_gt_112_blockcopy_ss_64_64:
-    mov w12, #8
-    ptrue p0.b, vl128
-.vl_gt_112_loop_css64_sve:
-    sub w12, w12, #1
-.rept 8
-    ld1b {z0.b}, p0/z, [x2]
-    st1b {z0.b}, p0, [x0]
-    add x2, x2, x3, lsl #1
-    add x0, x0, x1, lsl #1
-.endr
-    cbnz w12, .vl_gt_112_loop_css64_sve
-    ret
-endfunc
-
-/******** Chroma blockcopy********/
-function PFX(blockcopy_ss_16x32_sve)
-    rdvl x9, #1
-    cmp x9, #16
-    bgt .vl_gt_16_blockcopy_ss_16_32
-    lsl x1, x1, #1
-    lsl x3, x3, #1
-.rept 16
-    ld1 {v0.8h-v1.8h}, [x2], x3
-    ld1 {v2.8h-v3.8h}, [x2], x3
-    st1 {v0.8h-v1.8h}, [x0], x1
-    st1 {v2.8h-v3.8h}, [x0], x1
-.endr
-    ret
-.vl_gt_16_blockcopy_ss_16_32:
-    ptrue p0.h, vl16
-.rept 32
-    ld1h {z0.h}, p0/z, [x2]
-    st1h {z0.h}, p0, [x0]
-    add x2, x2, x3, lsl #1
-    add x0, x0, x1, lsl #1
-.endr
-    ret
-endfunc
-
-function PFX(blockcopy_ss_32x64_sve)
-    rdvl x9, #1
-    cmp x9, #16
-    bgt .vl_gt_16_blockcopy_ss_32_64
-    lsl x1, x1, #1
-    lsl x3, x3, #1
-    mov w12, #8
-.Loop_css32x64_sve:
-    sub w12, w12, #1
-.rept 8
-    ld1 {v0.8h-v3.8h}, [x2], x3
-    st1 {v0.8h-v3.8h}, [x0], x1
-.endr
-    cbnz w12, .Loop_css32x64_sve
-    ret
-.vl_gt_16_blockcopy_ss_32_64:
-    cmp x9, #48
-    bgt .vl_gt_48_blockcopy_ss_32_64
-    mov w12, #8
-    ptrue p0.b, vl32
-.vl_gt_32_loop_css32x64_sve:
-    sub w12, w12, #1
-.rept 8
-    ld1b {z0.b}, p0/z, [x2]
-    ld1b {z1.b}, p0/z, [x2, #1, mul vl]
-    st1b {z0.b}, p0, [x0]
-    st1b {z1.b}, p0, [x0, #1, mul vl]
-    add x2, x2, x3, lsl #1
-    add x0, x0, x1, lsl #1
-.endr
-    cbnz w12, .vl_gt_32_loop_css32x64_sve
-    ret
-.vl_gt_48_blockcopy_ss_32_64:
-    mov w12, #8
-    ptrue p0.b, vl64
-.vl_gt_48_loop_css32x64_sve:
-    sub w12, w12, #1
-.rept 8
-    ld1b {z0.b}, p0/z, [x2]
-    st1b {z0.b}, p0, [x0]
-    add x2, x2, x3, lsl #1
-    add x0, x0, x1, lsl #1
-.endr
-    cbnz w12, .vl_gt_48_loop_css32x64_sve
-    ret
-endfunc
-
 // chroma blockcopy_sp
 function PFX(blockcopy_sp_4x8_sve)
     ptrue p0.h, vl4
diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S
index 11685d254..8ac54a1e1 100644
--- a/source/common/aarch64/blockcopy8.S
+++ b/source/common/aarch64/blockcopy8.S
@@ -123,126 +123,6 @@ function PFX(blockcopy_sp_64x64_neon)
     ret
 endfunc
 
-// void x265_blockcopy_ss(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
-function PFX(blockcopy_ss_4x4_neon)
-    lsl x1, x1, #1
-    lsl x3, x3, #1
-.rept 2
-    ld1 {v0.8b}, [x2], x3
-    ld1 {v1.8b}, [x2], x3
-    st1 {v0.8b}, [x0], x1
-    st1 {v1.8b}, [x0], x1
-.endr
-    ret
-endfunc
-
-function PFX(blockcopy_ss_8x8_neon)
-    lsl x1, x1, #1
-    lsl x3, x3, #1
-.rept 4
-    ld1 {v0.8h}, [x2], x3
-    ld1 {v1.8h}, [x2], x3
-    st1 {v0.8h}, [x0], x1
-    st1 {v1.8h}, [x0], x1
-.endr
-    ret
-endfunc
-
-function PFX(blockcopy_ss_16x16_neon)
-    lsl x1, x1, #1
-    lsl x3, x3, #1
-.rept 8
-    ld1 {v0.8h-v1.8h}, [x2], x3
-    ld1 {v2.8h-v3.8h}, [x2], x3
-    st1 {v0.8h-v1.8h}, [x0], x1
-    st1 {v2.8h-v3.8h}, [x0], x1
-.endr
-    ret
-endfunc
-
-function PFX(blockcopy_ss_32x32_neon)
-    lsl x1, x1, #1
-    lsl x3, x3, #1
-    mov w12, #4
-.Loop_css32:
-    sub w12, w12, #1
-.rept 8
-    ld1 {v0.8h-v3.8h}, [x2], x3
-    st1 {v0.8h-v3.8h}, [x0], x1
-.endr
-    cbnz w12, .Loop_css32
-    ret
-endfunc
-
-function PFX(blockcopy_ss_64x64_neon)
-    lsl x1, x1, #1
-    sub x1, x1, #64
-    lsl x3, x3, #1
-    sub x3, x3, #64
-    mov w12, #8
-.Loop_css64:
-    sub w12, w12, #1
-.rept 8
-    ld1 {v0.8h-v3.8h}, [x2], #64
-    ld1 {v4.8h-v7.8h}, [x2], x3
-    st1 {v0.8h-v3.8h}, [x0], #64
-    st1 {v4.8h-v7.8h}, [x0], x1
-.endr
-    cbnz w12, .Loop_css64
-    ret
-endfunc
-
-/******** Chroma blockcopy********/
-function PFX(blockcopy_ss_4x8_neon)
-    lsl x1, x1, #1
-    lsl x3, x3, #1
-.rept 4
-    ld1 {v0.8b}, [x2], x3
-    ld1 {v1.8b}, [x2], x3
-    st1 {v0.8b}, [x0], x1
-    st1 {v1.8b}, [x0], x1
-.endr
-    ret
-endfunc
-
-function PFX(blockcopy_ss_8x16_neon)
-    lsl x1, x1, #1
-    lsl x3, x3, #1
-.rept 8
-    ld1 {v0.8h}, [x2], x3
-    ld1 {v1.8h}, [x2], x3
-    st1 {v0.8h}, [x0], x1
-    st1 {v1.8h}, [x0], x1
-.endr
-    ret
-endfunc
-
-function PFX(blockcopy_ss_16x32_neon)
-    lsl x1, x1, #1
-    lsl x3, x3, #1
-.rept 16
-    ld1 {v0.8h-v1.8h}, [x2], x3
-    ld1 {v2.8h-v3.8h}, [x2], x3
-    st1 {v0.8h-v1.8h}, [x0], x1
-    st1 {v2.8h-v3.8h}, [x0], x1
-.endr
-    ret
-endfunc
-
-function PFX(blockcopy_ss_32x64_neon)
-    lsl x1, x1, #1
-    lsl x3, x3, #1
-    mov w12, #8
-.Loop_css32x64:
-    sub w12, w12, #1
-.rept 8
-    ld1 {v0.8h-v3.8h}, [x2], x3
-    st1 {v0.8h-v3.8h}, [x0], x1
-.endr
-    cbnz w12, .Loop_css32x64
-    ret
-endfunc
-
 // chroma blockcopy_sp
 function PFX(blockcopy_sp_4x8_neon)
     lsl x3, x3, #1
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 80678a827..4be409ab1 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1017,6 +1017,35 @@ void blockcopy_pp_neon(pixel *dst, intptr_t dst_stride, const pixel *src,
     }
 }
 
+template<int width, int height>
+void blockcopy_ss_neon(int16_t *dst, intptr_t dst_stride, const int16_t *src,
+                       intptr_t src_stride)
+{
+    for (int h = 0; h < height; h++)
+    {
+        int w = 0;
+        for (; w + 16 <= width; w += 16)
+        {
+            int16x8_t a0 = vld1q_s16(src + w + 0);
+            int16x8_t a1 = vld1q_s16(src + w + 8);
+            vst1q_s16(dst + w + 0, a0);
+            vst1q_s16(dst + w + 8, a1);
+        }
+        if (width & 8)
+        {
+            vst1q_s16(dst + w, vld1q_s16(src + w));
+            w += 8;
+        }
+        if (width & 4)
+        {
+            vst1_s16(dst + w, vld1_s16(src + w));
+        }
+
+        dst += dst_stride;
+        src += src_stride;
+    }
+}
+
 template<int bx, int by>
 void pixel_sub_ps_neon(int16_t *a, intptr_t dstride, const pixel *b0,
                        const pixel *b1, intptr_t sstride0,
@@ -1775,6 +1804,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     p.cu[BLOCK_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
+    p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl_neon<W>; \
     p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl_neon<W>; \
     p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl_neon<W>; \
@@ -1787,7 +1817,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     p.cu[BLOCK_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
-    p.cu[BLOCK_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
+    p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl_neon<W>; \
     p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl_neon<W>; \
     p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl_neon<W>; \
@@ -1946,12 +1976,14 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
 #if HIGH_BIT_DEPTH
 #define CHROMA_CU_420(W, H) \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
 
 #define CHROMA_CU_S_420(W, H) \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
@@ -1959,6 +1991,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
 #define CHROMA_CU_420(W, H) \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
@@ -1966,6 +1999,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
 #define CHROMA_CU_S_420(W, H) \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
@@ -2046,12 +2080,14 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
 #if HIGH_BIT_DEPTH
 #define CHROMA_CU_422(W, H) \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
 
 #define CHROMA_CU_S_422(W, H) \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
@@ -2059,6 +2095,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
 #define CHROMA_CU_422(W, H) \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
@@ -2066,6 +2103,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
 #define CHROMA_CU_S_422(W, H) \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
-- 
2.39.5 (Apple Git-154)
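For reviewers who want to sanity-check the new primitive outside the encoder: the kernel treats the copy_ss strides as int16_t element counts and splits each row into 16-, 8- and 4-lane Neon chunks. The minimal standalone sketch below restates that logic and compares it against a plain scalar copy for a few of the block sizes wired up by the patch; the stride value, test pattern and check_blockcopy_ss() helper are illustrative assumptions only, not x265 code.

// Standalone sketch (illustrative only): restates the Neon blockcopy_ss
// kernel from the patch and checks it against a trivial scalar copy.
// Build on an AArch64 toolchain, e.g.: g++ -O2 check_blockcopy_ss.cpp
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Same logic as the blockcopy_ss_neon template added in pixel-prim.cpp:
// strides are in int16_t elements; rows are copied in 16/8/4-lane chunks.
template<int width, int height>
void blockcopy_ss_neon(int16_t *dst, intptr_t dst_stride,
                       const int16_t *src, intptr_t src_stride)
{
    for (int h = 0; h < height; h++)
    {
        int w = 0;
        for (; w + 16 <= width; w += 16)
        {
            vst1q_s16(dst + w + 0, vld1q_s16(src + w + 0));
            vst1q_s16(dst + w + 8, vld1q_s16(src + w + 8));
        }
        if (width & 8)
        {
            vst1q_s16(dst + w, vld1q_s16(src + w));
            w += 8;
        }
        if (width & 4)
            vst1_s16(dst + w, vld1_s16(src + w));
        dst += dst_stride;
        src += src_stride;
    }
}

// Hypothetical helper: scalar reference copy vs. the Neon kernel.
template<int W, int H>
bool check_blockcopy_ss()
{
    const int stride = 96;                       // arbitrary stride >= W
    static int16_t src[H * stride], ref[H * stride], out[H * stride];
    for (int i = 0; i < H * stride; i++)
        src[i] = (int16_t)(i * 7 - 1024);        // arbitrary test pattern
    memset(ref, 0, sizeof(ref));
    memset(out, 0, sizeof(out));
    for (int y = 0; y < H; y++)                  // scalar reference copy
        for (int x = 0; x < W; x++)
            ref[y * stride + x] = src[y * stride + x];
    blockcopy_ss_neon<W, H>(out, stride, src, stride);
    return memcmp(ref, out, sizeof(ref)) == 0;
}

int main()
{
    bool ok = check_blockcopy_ss<4, 8>() &&
              check_blockcopy_ss<16, 16>() &&
              check_blockcopy_ss<32, 64>();
    printf("blockcopy_ss_neon: %s\n", ok ? "match" : "MISMATCH");
    return ok ? 0 : 1;
}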
