Suggest replace assert() by X265_CHECK, other looks good.
At 2025-02-11 23:19:58, "Micro Daryl Robles" <[email protected]> wrote: >Relative performance compared to scalar C: > > Neoverse N1: 3.67x > Neoverse N2: 2.57x > Neoverse V1: 2.31x > Neoverse V2: 2.35x >--- > source/common/aarch64/loopfilter-prim.cpp | 75 +++++++++++++++++++++++ > 1 file changed, 75 insertions(+) > >diff --git a/source/common/aarch64/loopfilter-prim.cpp >b/source/common/aarch64/loopfilter-prim.cpp >index 9242c859e..7fef25cbb 100644 >--- a/source/common/aarch64/loopfilter-prim.cpp >+++ b/source/common/aarch64/loopfilter-prim.cpp >@@ -1,5 +1,6 @@ > #include "common.h" > #include "loopfilter-prim.h" >+#include "mem-neon.h" > > #define PIXEL_MIN 0 > >@@ -311,6 +312,79 @@ void pelFilterLumaStrong_V_neon(pixel *src, intptr_t >srcStep, intptr_t offset, > } > } > >+void pelFilterLumaStrong_H_neon(pixel *src, intptr_t srcStep, intptr_t offset, >+ int32_t tcP, int32_t tcQ) >+{ >+ assert(UNIT_SIZE == 4 && srcStep == 1); >+ (void)srcStep; >+ >+ const int16x8_t tc_vec = vcombine_s16(vdup_n_s16(tcP), vdup_n_s16(tcQ)); >+ const int16x8_t neg_tc_vec = vnegq_s16(tc_vec); >+ >+ uint8x8_t m0 = vld1_u8(src - 4 * offset); >+ uint8x8_t m1 = vld1_u8(src - 3 * offset); >+ uint8x8_t m2 = vld1_u8(src - 2 * offset); >+ uint8x8_t m3 = vld1_u8(src - 1 * offset); >+ uint8x8_t m4 = vld1_u8(src - 0 * offset); >+ uint8x8_t m5 = vld1_u8(src + 1 * offset); >+ uint8x8_t m6 = vld1_u8(src + 2 * offset); >+ uint8x8_t m7 = vld1_u8(src + 3 * offset); >+ >+ uint8x8_t m12 = >+ vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m1), >vreinterpret_u32_u8(m2))); >+ uint8x8_t m23 = >+ vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m2), >vreinterpret_u32_u8(m3))); >+ uint8x8_t m34 = >+ vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m3), >vreinterpret_u32_u8(m4))); >+ uint8x8_t m45 = >+ vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m4), >vreinterpret_u32_u8(m5))); >+ uint8x8_t m56 = >+ vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m5), >vreinterpret_u32_u8(m6))); >+ >+ // src[-1 * offset], src[0 * offset] >+ uint16x8_t p0 = vaddl_u8(m23, m34); >+ p0 = vaddw_u8(p0, m45); >+ uint16x8_t t0 = vshlq_n_u16(p0, 1); >+ uint16x8_t t1 = vaddl_u8(m12, m56); >+ uint16x8_t t01 = vaddq_u16(t0, t1); >+ t01 = vrshrq_n_u16(t01, 3); >+ t01 = vsubw_u8(t01, m34); >+ t01 = vreinterpretq_u16_s16( >+ vminq_s16(tc_vec, vmaxq_s16(neg_tc_vec, vreinterpretq_s16_u16(t01)))); >+ uint8x8_t d01 = vmovn_u16(t01); >+ d01 = vadd_u8(d01, m34); >+ store_u8x4_strided_xN<2>(&src[-1 * offset], 1 * offset, &d01); >+ >+ uint8x8_t m16 = >+ vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m1), >vreinterpret_u32_u8(m6))); >+ uint8x8_t m25 = >+ vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m2), >vreinterpret_u32_u8(m5))); >+ >+ // src[-2 * offset], src[1 * offset] >+ uint16x8_t p1 = vaddw_u8(p0, m16); >+ uint16x8_t t23 = vrshrq_n_u16(p1, 2); >+ t23 = vsubw_u8(t23, m25); >+ t23 = vreinterpretq_u16_s16( >+ vminq_s16(tc_vec, vmaxq_s16(neg_tc_vec, vreinterpretq_s16_u16(t23)))); >+ uint8x8_t d23 = vmovn_u16(t23); >+ d23 = vadd_u8(d23, m25); >+ store_u8x4_strided_xN<2>(&src[-2 * offset], 3 * offset, &d23); >+ >+ uint8x8_t m07 = >+ vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m0), >vreinterpret_u32_u8(m7))); >+ >+ // src[-3 * offset], src[2 * offset] >+ uint16x8_t p2 = vaddl_u8(m07, m16); >+ uint16x8_t t45 = vmlaq_n_u16(p1, p2, 2); >+ t45 = vrshrq_n_u16(t45, 3); >+ t45 = vsubw_u8(t45, m16); >+ t45 = vreinterpretq_u16_s16( >+ vminq_s16(tc_vec, vmaxq_s16(neg_tc_vec, vreinterpretq_s16_u16(t45)))); >+ uint8x8_t d45 = vmovn_u16(t45); >+ d45 = vadd_u8(d45, m16); >+ store_u8x4_strided_xN<2>(&src[-3 * offset], 5 * offset, &d45); >+} >+ > } // namespace > > namespace X265_NS >@@ -328,6 +402,7 @@ void setupLoopFilterPrimitives_neon(EncoderPrimitives &p) > p.sign = calSign_neon; > > p.pelFilterLumaStrong[0] = pelFilterLumaStrong_V_neon; >+ p.pelFilterLumaStrong[1] = pelFilterLumaStrong_H_neon; > } > > >-- >2.34.1 >
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
