Hi Gerda,
Thanks for the patches; I have some comments. At 2025-04-24 18:01:02, "Gerda Zsejke More" <[email protected]> wrote: >Add SVE implementation of HBD interp_horiz_pp for LUMA filtering. >An implementation was added for block sizes with width equal to 4 for >both 10-bit and 12-bit build, but for bigger block sizes the SVE >implementation was only enabled for 12-bit build. > >This implementation gives up to 9% uplift compared to the existing >Neon implementation. >--- > source/common/CMakeLists.txt | 2 +- > source/common/aarch64/asm-primitives.cpp | 2 + > source/common/aarch64/filter-prim-sve.cpp | 314 ++++++++++++++++++++++ > source/common/aarch64/filter-prim-sve.h | 37 +++ > source/common/aarch64/neon-sve-bridge.h | 12 + > 5 files changed, 366 insertions(+), 1 deletion(-) > create mode 100644 source/common/aarch64/filter-prim-sve.cpp > create mode 100644 source/common/aarch64/filter-prim-sve.h > >+#if HIGH_BIT_DEPTH >+static const uint16_t dotprod_h_permute_tbl[32] = { >+ // clang-format off >+ 0, 1, 2, 3, 1, 2, 3, 4, >+ 2, 3, 4, 5, 3, 4, 5, 6, >+ 3, 2, 1, 0, 4, 3, 2, 1, Is this result obtained from dotprod_h_permute_tbl[0] with "REV64 V.8H"? 
>+ 5, 4, 3, 2, 6, 5, 4, 3, >+ // clang-format on >+}; >+ >+template<bool coeff2> >+void inline setup_s_hpp_x4(uint16x8_t *d, uint16x8_t s0, uint16x8_t s1, >uint16x8_t *idx) >+{ >+ if (coeff2) >+ { >+ d[0] = x265_tblq_u16(s0, idx[0]); >+ d[1] = x265_tblq_u16(s1, idx[2]); >+ d[2] = x265_tblq_u16(s0, idx[1]); >+ d[3] = x265_tblq_u16(s1, idx[3]); >+ } >+ else >+ { >+ d[0] = x265_tblq_u16(s0, idx[0]); >+ d[1] = x265_tblq_u16(s1, idx[0]); >+ d[2] = x265_tblq_u16(s0, idx[1]); >+ d[3] = x265_tblq_u16(s1, idx[1]); >+ } >+} >+ >+template<bool coeff2> >+void inline setup_s_hpp_x8(uint16x8_t *d, uint16x8_t s0, uint16x8_t s1, >uint16x8_t s2, >+ uint16x8_t *idx) >+{ >+ if (coeff2) >+ { >+ d[0] = x265_tblq_u16(s0, idx[0]); >+ d[1] = x265_tblq_u16(s1, idx[2]); >+ d[2] = x265_tblq_u16(s0, idx[1]); >+ d[3] = x265_tblq_u16(s1, idx[3]); >+ d[4] = x265_tblq_u16(s1, idx[0]); Same question as above: could "REV64 V.8H" be used here? >+ d[5] = x265_tblq_u16(s2, idx[2]); >+ d[6] = x265_tblq_u16(s1, idx[1]); >+ d[7] = x265_tblq_u16(s2, idx[3]); >+ } >+ else >+ { >+ d[0] = x265_tblq_u16(s0, idx[0]); >+ d[1] = x265_tblq_u16(s1, idx[0]); >+ d[2] = x265_tblq_u16(s0, idx[1]); >+ d[3] = x265_tblq_u16(s1, idx[1]); >+ d[4] = d[1]; >+ d[5] = x265_tblq_u16(s2, idx[0]); >+ d[6] = d[3]; >+ d[7] = x265_tblq_u16(s2, idx[1]); >+ } >+} >+ >+template<bool coeff2, int width, int height> >+void inline interp8_hpp_sve(const pixel *src, intptr_t srcStride, >+ pixel *dst, intptr_t dstStride, int coeffIdx) >+{ >+ const int N_TAPS = 8; >+ const uint16x8_t maxVal = vdupq_n_u16((1 << X265_DEPTH) - 1); >+ const int16x8_t filter = vld1q_s16(X265_NS::g_lumaFilter[coeffIdx]); >+ uint16x8_t idx[4]; >+ >+ idx[0] = vld1q_u16(dotprod_h_permute_tbl + 0); >+ idx[1] = vld1q_u16(dotprod_h_permute_tbl + 8); >+ idx[2] = vld1q_u16(dotprod_h_permute_tbl + 16); >+ idx[3] = vld1q_u16(dotprod_h_permute_tbl + 24); idx[2] and idx[3] are needed for <coeff2> only.
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
