Hi, did you get a chance to review this patch? Get Outlook for Android <https://aka.ms/AAb9ysg> ________________________________ From: Harshitha Sarangu Suresh Sent: Thursday, May 22, 2025 7:27:31 PM To: ffmpeg-devel@ffmpeg.org <ffmpeg-devel@ffmpeg.org> Cc: Dash Santosh Sathyanarayanan <dash.sathyanaraya...@multicorewareinc.com> Subject: [FFmpeg-devel] [PATCH] swscale/output: Implement neon intrinsics for yuv2planeX_10_c_template()
This optimization provides 5x improvement for the module. The boost in performance was calculated by adding C timers inside the C function and the optimized neon intrinsic function. >From 904144c2db9e5e72d56360c4c2eb38d426852901 Mon Sep 17 00:00:00 2001 From: Harshitha Suresh <harshi...@multicorewareinc.com> Date: Thu, 22 May 2025 10:23:55 +0530 Subject: [PATCH] swscale/output: Implement neon intrinsics for yuv2planeX_10_c_template() --- libswscale/output.c | 76 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/libswscale/output.c b/libswscale/output.c index c37649e7ce..345df5ce59 100644 --- a/libswscale/output.c +++ b/libswscale/output.c @@ -22,7 +22,9 @@ #include <stddef.h> #include <stdint.h> #include <string.h> - +#if defined (__aarch64__) +#include <arm_neon.h> +#endif #include "libavutil/attributes.h" #include "libavutil/avutil.h" #include "libavutil/avassert.h" @@ -337,6 +339,77 @@ yuv2plane1_10_c_template(const int16_t *src, uint16_t *dest, int dstW, } } + +#if defined (__aarch64__) && !defined(__APPLE__) +static av_always_inline void +yuv2planeX_10_c_template(const int16_t *filter, int filterSize, + const int16_t **src, uint16_t *dest, int dstW, + int big_endian, int output_bits) +{ + const int shift = 11 + 16 - output_bits; + const int bias = 1 << (shift - 1); + const int clip_max = (1 << output_bits) - 1; + int i; + + for (i = 0; i < dstW; i += 16) { + int32x4_t sum0_lo = vdupq_n_s32(bias); + int32x4_t sum0_hi = vdupq_n_s32(bias); + int32x4_t sum1_lo = vdupq_n_s32(bias); + int32x4_t sum1_hi = vdupq_n_s32(bias); + + for (int j = 0; j < filterSize; j++) { + int16x8_t src_vec0 = vld1q_s16(&src[j][i]); + int16x8_t src_vec1 = vld1q_s16(&src[j][i + 8]); + int16x8_t filter_val = vdupq_n_s16(filter[j]); + + sum0_lo = vmlal_s16(sum0_lo, vget_low_s16(src_vec0), vget_low_s16(filter_val)); + sum0_hi = vmlal_s16(sum0_hi, vget_high_s16(src_vec0), vget_high_s16(filter_val)); + sum1_lo = vmlal_s16(sum1_lo, 
vget_low_s16(src_vec1), vget_low_s16(filter_val)); + sum1_hi = vmlal_s16(sum1_hi, vget_high_s16(src_vec1), vget_high_s16(filter_val)); + } + + // Right shift with rounding + int32x4_t shift_vec = vdupq_n_s32(-shift); + sum0_lo = vshlq_s32(sum0_lo, shift_vec); + sum0_hi = vshlq_s32(sum0_hi, shift_vec); + sum1_lo = vshlq_s32(sum1_lo, shift_vec); + sum1_hi = vshlq_s32(sum1_hi, shift_vec); + + // Clip to output_bits range + sum0_lo = vmaxq_s32(vminq_s32(sum0_lo, vdupq_n_s32(clip_max)), vdupq_n_s32(0)); + sum0_hi = vmaxq_s32(vminq_s32(sum0_hi, vdupq_n_s32(clip_max)), vdupq_n_s32(0)); + sum1_lo = vmaxq_s32(vminq_s32(sum1_lo, vdupq_n_s32(clip_max)), vdupq_n_s32(0)); + sum1_hi = vmaxq_s32(vminq_s32(sum1_hi, vdupq_n_s32(clip_max)), vdupq_n_s32(0)); + + // Convert to 16-bit + uint16x8_t result0 = vcombine_u16( + vreinterpret_u16_s16(vmovn_s32(sum0_lo)), + vreinterpret_u16_s16(vmovn_s32(sum0_hi)) + ); + uint16x8_t result1 = vcombine_u16( + vreinterpret_u16_s16(vmovn_s32(sum1_lo)), + vreinterpret_u16_s16(vmovn_s32(sum1_hi)) + ); + + // Store with proper endianness + if (big_endian) { + result0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(result0))); + result1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(result1))); + } + vst1q_u16(&dest[i], result0); + vst1q_u16(&dest[i + 8], result1); + } + + // Handle remaining pixels + for (; i < dstW; i++) { + int val = bias; + for (int j = 0; j < filterSize; j++) { + val += src[j][i] * filter[j]; + } + output_pixel(&dest[i], val); + } +} +#else static av_always_inline void yuv2planeX_10_c_template(const int16_t *filter, int filterSize, const int16_t **src, uint16_t *dest, int dstW, @@ -355,6 +428,7 @@ yuv2planeX_10_c_template(const int16_t *filter, int filterSize, output_pixel(&dest[i], val); } } +#endif #undef output_pixel -- 2.36.0.windows.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link 
above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".