Hi Chen, Sorry, I did not notice that warning! I have sent a new v2 patch series with the fixes.
Thanks, Li From: chen <[email protected]> Date: Friday, 2025. June 20. at 6:25 To: Development for x265 <[email protected]> Cc: nd <[email protected]>, Li Zhang <[email protected]> Subject: Re:[x265] [PATCH 3/4] AArch64: Add SBD pixel_var Neon DotProd intrinsics implementations Hi Li, We found some warning when HIGH_BIT_DEPTH=1, could you please fix it? +void setupPixelPrimitives_neon_dotprod(EncoderPrimitives &p) +{ +#if !HIGH_BIT_DEPTH + p.cu[BLOCK_4x4].var = pixel_var_neon_dotprod<4>; + p.cu[BLOCK_8x8].var = pixel_var_neon_dotprod<8>; + p.cu[BLOCK_16x16].var = pixel_var_neon_dotprod<16>; + p.cu[BLOCK_32x32].var = pixel_var_neon_dotprod<32>; + p.cu[BLOCK_64x64].var = pixel_var_neon_dotprod<64>; +#endif // !HIGH_BIT_DEPTH +} At 2025-06-18 02:23:14, "Li Zhang" <[email protected]> wrote: >Add Neon DotProd intrinsics implementation for the standard bit-depth >pixel_var functions. > >This implementation is 1.2x-2.4x faster than the existing Armv8.0 Neon >implementation. >--- > source/common/CMakeLists.txt | 2 +- > source/common/aarch64/asm-primitives.cpp | 1 + > .../aarch64/pixel-prim-neon-dotprod.cpp | 111 ++++++++++++++++++ > source/common/aarch64/pixel-prim.h | 3 + > 4 files changed, 116 insertions(+), 1 deletion(-) > create mode 100644 source/common/aarch64/pixel-prim-neon-dotprod.cpp > >diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt >index 405ec0b2d..14a837429 100644 >--- a/source/common/CMakeLists.txt >+++ b/source/common/CMakeLists.txt >@@ -105,7 +105,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64)) > > # Add Arm intrinsics files here. 
> set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp > filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp > loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h > fun-decls.h sao-prim.cpp mem-neon.h) >- set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp) >+ set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp >pixel-prim-neon-dotprod.cpp) > set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp) > set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp filter-prim-sve.h > filter-prim-sve.cpp) > set(C_SRCS_SVE2 sao-prim-sve2.cpp) >diff --git a/source/common/aarch64/asm-primitives.cpp >b/source/common/aarch64/asm-primitives.cpp >index d8b0beb8f..e3f8788dd 100644 >--- a/source/common/aarch64/asm-primitives.cpp >+++ b/source/common/aarch64/asm-primitives.cpp >@@ -776,6 +776,7 @@ void setupIntrinsicPrimitives(EncoderPrimitives &p, int >cpuMask) > if (cpuMask & X265_CPU_NEON_DOTPROD) > { > setupFilterPrimitives_neon_dotprod(p); >+ setupPixelPrimitives_neon_dotprod(p); > } > #endif > #ifdef HAVE_NEON_I8MM >diff --git a/source/common/aarch64/pixel-prim-neon-dotprod.cpp >b/source/common/aarch64/pixel-prim-neon-dotprod.cpp >new file mode 100644 >index 000000000..16e9fb2c2 >--- /dev/null >+++ b/source/common/aarch64/pixel-prim-neon-dotprod.cpp >@@ -0,0 +1,111 @@ >+/***************************************************************************** >+ * Copyright (C) 2025 MulticoreWare, Inc >+ * >+ * Authors: Li Zhang <[email protected]> >+ * >+ * This program is free software; you can redistribute it and/or modify >+ * it under the terms of the GNU General Public License as published by >+ * the Free Software Foundation; either version 2 of the License, or >+ * (at your option) any later version. >+ * >+ * This program is distributed in the hope that it will be useful, >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >+ * GNU General Public License for more details. 
>+ * >+ * You should have received a copy of the GNU General Public License >+ * along with this program; if not, write to the Free Software >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. >+ * >+ * This program is also available under a commercial proprietary license. >+ * For more information, contact us at license @ x265.com. >+ >*****************************************************************************/ >+ >+#include "pixel-prim.h" >+#include "mem-neon.h" >+ >+#include <arm_neon.h> >+ >+namespace >+{ >+#if !HIGH_BIT_DEPTH >+template<int size> >+uint64_t pixel_var_neon_dotprod(const uint8_t *pix, intptr_t i_stride) >+{ >+ if (size >= 16) >+ { >+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; >+ uint32x4_t sqr[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; >+ >+ for (int h = 0; h < size; h += 2) >+ { >+ for (int w = 0; w + 16 <= size; w += 16) >+ { >+ uint8x16_t s[2]; >+ load_u8x16xn<2>(pix + w, i_stride, s); >+ >+ sum[0] = vdotq_u32(sum[0], s[0], vdupq_n_u8(1)); >+ sum[1] = vdotq_u32(sum[1], s[1], vdupq_n_u8(1)); >+ >+ sqr[0] = vdotq_u32(sqr[0], s[0], s[0]); >+ sqr[1] = vdotq_u32(sqr[1], s[1], s[1]); >+ } >+ >+ pix += 2 * i_stride; >+ } >+ >+ sum[0] = vaddq_u32(sum[0], sum[1]); >+ sqr[0] = vaddq_u32(sqr[0], sqr[1]); >+ >+ return vaddvq_u32(sum[0]) + (vaddlvq_u32(sqr[0]) << 32); >+ } >+ if (size == 8) >+ { >+ uint16x8_t sum = vdupq_n_u16(0); >+ uint32x2_t sqr = vdup_n_u32(0); >+ >+ for (int h = 0; h < size; ++h) >+ { >+ uint8x8_t s = vld1_u8(pix); >+ >+ sum = vaddw_u8(sum, s); >+ sqr = vdot_u32(sqr, s, s); >+ >+ pix += i_stride; >+ } >+ >+ return vaddvq_u16(sum) + (vaddlv_u32(sqr) << 32); >+ } >+ if (size == 4) { >+ uint16x8_t sum = vdupq_n_u16(0); >+ uint32x2_t sqr = vdup_n_u32(0); >+ >+ for (int h = 0; h < size; h += 2) >+ { >+ uint8x8_t s = load_u8x4x2(pix, i_stride); >+ >+ sum = vaddw_u8(sum, s); >+ sqr = vdot_u32(sqr, s, s); >+ >+ pix += 2 * i_stride; >+ } >+ >+ return vaddvq_u16(sum) + (vaddlv_u32(sqr) << 32); >+ } >+} 
>+#endif // !HIGH_BIT_DEPTH >+} >+ >+namespace X265_NS >+{ >+void setupPixelPrimitives_neon_dotprod(EncoderPrimitives &p) >+{ >+#if !HIGH_BIT_DEPTH >+ p.cu[BLOCK_4x4].var = pixel_var_neon_dotprod<4>; >+ p.cu[BLOCK_8x8].var = pixel_var_neon_dotprod<8>; >+ p.cu[BLOCK_16x16].var = pixel_var_neon_dotprod<16>; >+ p.cu[BLOCK_32x32].var = pixel_var_neon_dotprod<32>; >+ p.cu[BLOCK_64x64].var = pixel_var_neon_dotprod<64>; >+#endif // !HIGH_BIT_DEPTH >+} >+} >diff --git a/source/common/aarch64/pixel-prim.h >b/source/common/aarch64/pixel-prim.h >index da9f1822b..74271b10c 100644 >--- a/source/common/aarch64/pixel-prim.h >+++ b/source/common/aarch64/pixel-prim.h >@@ -15,6 +15,9 @@ namespace X265_NS > > void setupPixelPrimitives_neon(EncoderPrimitives &p); > >+#if defined(HAVE_NEON_DOTPROD) >+void setupPixelPrimitives_neon_dotprod(EncoderPrimitives &p); >+#endif > > } > >-- >2.39.5 (Apple Git-154) >
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
