PR #23110 opened by Zhao Zhili (quink) URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23110 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23110.patch
The NEON loop filter always runs the full filter, so it loses to C in the best case. Add a C wrapper that checks and returns early when all groups skip; otherwise it falls through to NEON.

checkasm vc1dsp bestcase speedup vs C, NEON -> wrapper:

filter              M1              Pi 5 (A76)
v_loop_filter4      0.44 -> 2.57    0.78 -> 3.86
v_loop_filter8      1.09 -> 2.09    0.89 -> 2.40
v_loop_filter16     0.88 -> 2.16    0.61 -> 1.87
h_loop_filter4      0.96 -> 2.61    0.80 -> 2.59
h_loop_filter8      0.78 -> 2.30    0.61 -> 2.54
h_loop_filter16     0.66 -> 1.83    0.46 -> 1.88

Worstcase speedup is preserved.

From bba72d4c424d566916e61fcbb22eb66ff31361c1 Mon Sep 17 00:00:00 2001
From: Zhao Zhili <[email protected]>
Date: Sat, 16 May 2026 13:13:40 +0800
Subject: [PATCH] aarch64/vc1dsp: add early-return fast path for loop filter

The NEON loop filter always runs the full filter, so it loses to C in the best case. Add a C wrapper that checks and returns early when all groups skip; otherwise it falls through to NEON.

checkasm vc1dsp bestcase speedup vs C, NEON -> wrapper:

filter              M1              Pi 5 (A76)
v_loop_filter4      0.44 -> 2.57    0.78 -> 3.86
v_loop_filter8      1.09 -> 2.09    0.89 -> 2.40
v_loop_filter16     0.88 -> 2.16    0.61 -> 1.87
h_loop_filter4      0.96 -> 2.61    0.80 -> 2.59
h_loop_filter8      0.78 -> 2.30    0.61 -> 2.54
h_loop_filter16     0.66 -> 1.83    0.46 -> 1.88

Worstcase speedup is preserved.
--- libavcodec/aarch64/vc1dsp_init_aarch64.c | 55 +++++++++++++++++++++--- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c index 3bc0bd17ee..3acb8c77e3 100644 --- a/libavcodec/aarch64/vc1dsp_init_aarch64.c +++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c @@ -54,6 +54,49 @@ void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stri int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); +static av_always_inline int vc1_loop_filter_all_skip(uint8_t *src, + ptrdiff_t step, + ptrdiff_t stride, + int len, int pq) +{ + for (int i = 0; i < len; i += 4) { + uint8_t *s = src + 2 * step; + int a0 = (2 * (s[-2 * stride] - s[stride]) - + 5 * (s[-stride] - s[0]) + 4) >> 3; + int a0_sign = a0 >> 31; + a0 = (a0 ^ a0_sign) - a0_sign; + if (a0 < pq) + return 0; + src += step * 4; + } + return 1; +} + +#define VC1_V_LOOP_FILTER_WRAPPER(len) \ +static void vc1_v_loop_filter##len##_aarch64(uint8_t *src, ptrdiff_t stride, \ + int pq) \ +{ \ + if (vc1_loop_filter_all_skip(src, 1, stride, len, pq)) \ + return; \ + ff_vc1_v_loop_filter##len##_neon(src, stride, pq); \ +} + +#define VC1_H_LOOP_FILTER_WRAPPER(len) \ +static void vc1_h_loop_filter##len##_aarch64(uint8_t *src, ptrdiff_t stride, \ + int pq) \ +{ \ + if (vc1_loop_filter_all_skip(src, stride, 1, len, pq)) \ + return; \ + ff_vc1_h_loop_filter##len##_neon(src, stride, pq); \ +} + +VC1_V_LOOP_FILTER_WRAPPER(4) +VC1_H_LOOP_FILTER_WRAPPER(4) +VC1_V_LOOP_FILTER_WRAPPER(8) +VC1_H_LOOP_FILTER_WRAPPER(8) +VC1_V_LOOP_FILTER_WRAPPER(16) +VC1_H_LOOP_FILTER_WRAPPER(16) + static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) { /* Dealing with starting and stopping, and removing escape bytes, are @@ -124,12 +167,12 @@ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp) dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon; dsp->vc1_inv_trans_4x4_dc = 
ff_vc1_inv_trans_4x4_dc_neon; - dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; - dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; - dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; - dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; - dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; - dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; + dsp->vc1_v_loop_filter4 = vc1_v_loop_filter4_aarch64; + dsp->vc1_h_loop_filter4 = vc1_h_loop_filter4_aarch64; + dsp->vc1_v_loop_filter8 = vc1_v_loop_filter8_aarch64; + dsp->vc1_h_loop_filter8 = vc1_h_loop_filter8_aarch64; + dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_aarch64; + dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_aarch64; dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon; dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
