This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 81fb70c833ac675ac8e09b38ad845a90de4c3e1c Author: Andreas Rheinhardt <[email protected]> AuthorDate: Thu Feb 19 00:40:42 2026 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Sun Feb 22 01:01:27 2026 +0100 avcodec/x86/vvc/mc,dsp_init: Avoid pointless wrappers for w_avg They only add overhead (in form of another function call, sign-extending some parameters to 64bit (although the upper bits are not used at all) and rederiving the actual number of bits (from the maximum value (1<<bpp)-1)). Old benchmarks: w_avg_8_2x2_c: 16.4 ( 1.00x) w_avg_8_2x2_avx2: 12.9 ( 1.27x) w_avg_8_4x4_c: 48.0 ( 1.00x) w_avg_8_4x4_avx2: 14.9 ( 3.23x) w_avg_8_8x8_c: 168.2 ( 1.00x) w_avg_8_8x8_avx2: 22.4 ( 7.49x) w_avg_8_16x16_c: 396.5 ( 1.00x) w_avg_8_16x16_avx2: 47.9 ( 8.28x) w_avg_8_32x32_c: 1466.3 ( 1.00x) w_avg_8_32x32_avx2: 172.8 ( 8.48x) w_avg_8_64x64_c: 5629.3 ( 1.00x) w_avg_8_64x64_avx2: 678.7 ( 8.29x) w_avg_8_128x128_c: 22122.4 ( 1.00x) w_avg_8_128x128_avx2: 2743.5 ( 8.06x) w_avg_10_2x2_c: 18.7 ( 1.00x) w_avg_10_2x2_avx2: 13.1 ( 1.43x) w_avg_10_4x4_c: 50.3 ( 1.00x) w_avg_10_4x4_avx2: 15.9 ( 3.17x) w_avg_10_8x8_c: 109.3 ( 1.00x) w_avg_10_8x8_avx2: 20.6 ( 5.30x) w_avg_10_16x16_c: 395.5 ( 1.00x) w_avg_10_16x16_avx2: 44.8 ( 8.83x) w_avg_10_32x32_c: 1534.2 ( 1.00x) w_avg_10_32x32_avx2: 141.4 (10.85x) w_avg_10_64x64_c: 6003.6 ( 1.00x) w_avg_10_64x64_avx2: 557.4 (10.77x) w_avg_10_128x128_c: 23722.7 ( 1.00x) w_avg_10_128x128_avx2: 2205.0 (10.76x) w_avg_12_2x2_c: 18.6 ( 1.00x) w_avg_12_2x2_avx2: 13.1 ( 1.42x) w_avg_12_4x4_c: 52.2 ( 1.00x) w_avg_12_4x4_avx2: 16.1 ( 3.24x) w_avg_12_8x8_c: 109.2 ( 1.00x) w_avg_12_8x8_avx2: 20.6 ( 5.29x) w_avg_12_16x16_c: 396.1 ( 1.00x) w_avg_12_16x16_avx2: 45.0 ( 8.81x) w_avg_12_32x32_c: 1532.6 ( 1.00x) w_avg_12_32x32_avx2: 142.1 (10.79x) w_avg_12_64x64_c: 6002.2 ( 1.00x) w_avg_12_64x64_avx2: 557.3 (10.77x) w_avg_12_128x128_c: 23748.7 ( 1.00x) w_avg_12_128x128_avx2: 2206.4 (10.76x) New benchmarks: w_avg_8_2x2_c: 16.0 ( 1.00x) w_avg_8_2x2_avx2: 9.3 ( 1.71x) w_avg_8_4x4_c: 48.4 ( 1.00x) w_avg_8_4x4_avx2: 12.4 ( 3.91x) w_avg_8_8x8_c: 168.7 ( 1.00x) w_avg_8_8x8_avx2: 21.1 ( 8.00x) w_avg_8_16x16_c: 394.5 ( 1.00x) w_avg_8_16x16_avx2: 46.2 ( 8.54x) w_avg_8_32x32_c: 1456.3 ( 1.00x) w_avg_8_32x32_avx2: 171.8 ( 8.48x) w_avg_8_64x64_c: 5636.2 ( 1.00x) w_avg_8_64x64_avx2: 676.9 ( 8.33x) w_avg_8_128x128_c: 22129.1 ( 1.00x) w_avg_8_128x128_avx2: 2734.3 ( 8.09x) w_avg_10_2x2_c: 18.7 ( 1.00x) w_avg_10_2x2_avx2: 10.3 ( 1.82x) w_avg_10_4x4_c: 50.8 ( 1.00x) w_avg_10_4x4_avx2: 13.4 ( 3.79x) w_avg_10_8x8_c: 109.7 ( 1.00x) w_avg_10_8x8_avx2: 20.4 ( 5.38x) w_avg_10_16x16_c: 395.2 ( 1.00x) w_avg_10_16x16_avx2: 41.7 ( 9.48x) w_avg_10_32x32_c: 1535.6 ( 1.00x) w_avg_10_32x32_avx2: 137.9 (11.13x) w_avg_10_64x64_c: 6002.1 ( 1.00x) w_avg_10_64x64_avx2: 548.5 (10.94x) w_avg_10_128x128_c: 23742.7 ( 1.00x) w_avg_10_128x128_avx2: 2179.8 (10.89x) w_avg_12_2x2_c: 18.9 ( 1.00x) w_avg_12_2x2_avx2: 10.3 ( 1.84x) w_avg_12_4x4_c: 52.4 ( 1.00x) w_avg_12_4x4_avx2: 13.4 ( 3.91x) w_avg_12_8x8_c: 109.2 ( 1.00x) w_avg_12_8x8_avx2: 20.3 ( 5.39x) w_avg_12_16x16_c: 396.3 ( 1.00x) w_avg_12_16x16_avx2: 41.7 ( 9.51x) w_avg_12_32x32_c: 1532.6 ( 1.00x) w_avg_12_32x32_avx2: 138.6 (11.06x) w_avg_12_64x64_c: 5996.7 ( 1.00x) w_avg_12_64x64_avx2: 549.6 (10.91x) w_avg_12_128x128_c: 23738.0 ( 1.00x) w_avg_12_128x128_avx2: 2177.2 (10.90x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/dsp_init.c | 26 +++----------- libavcodec/x86/vvc/mc.asm | 84 ++++++++++++++++++++++--------------------- 2 files changed, 48 insertions(+), 62 deletions(-) diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c index 80df8e46ee..357f4ea8a1 100644 --- a/libavcodec/x86/vvc/dsp_init.c +++ b/libavcodec/x86/vvc/dsp_init.c @@ -35,14 +35,6 @@ #define bf(fn, bd, opt) fn##_##bd##_##opt #define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt -#define AVG_BPC_PROTOTYPES(bpc, opt) \ -void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, \ - intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max); - -AVG_BPC_PROTOTYPES( 8, avx2) -AVG_BPC_PROTOTYPES(16, avx2) - #define DMVR_PROTOTYPES(bd, opt) \ void ff_vvc_dmvr_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \ int height, intptr_t mx, intptr_t my, int width); \ @@ -168,19 +160,6 @@ FW_PUT_AVX2(12) FW_PUT_16BPC_AVX2(10) FW_PUT_16BPC_AVX2(12) -#define AVG_FUNCS(bpc, bd, opt) \ -static void bf(vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *src0, const int16_t *src1, int width, int height, \ - int denom, int w0, int w1, int o0, int o1) \ -{ \ - BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, \ - denom, w0, w1, o0, o1, (1 << bd) - 1); \ -} - -AVG_FUNCS(8, 8, avx2) -AVG_FUNCS(16, 10, avx2) -AVG_FUNCS(16, 12, avx2) - #define ALF_FUNCS(bpc, bd, opt) \ static void bf(vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \ int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \ @@ -249,8 +228,11 @@ SAO_FILTER_FUNCS(12, avx2) #define AVG_INIT(bd, opt) do { \ void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ const int16_t *src0, const int16_t *src1, int width, int height);\ +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *src0, const int16_t *src1, int width, int height, \ + int denom, int w0, int w1, int o0, int o1); \ c->inter.avg = bf(ff_vvc_avg, bd, opt); \ - c->inter.w_avg = bf(vvc_w_avg, bd, opt); \ + c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt); \ } while (0) #define DMVR_INIT(bd) do { \ diff --git a/libavcodec/x86/vvc/mc.asm b/libavcodec/x86/vvc/mc.asm index 7599ee2e6a..8ba493aebd 100644 --- a/libavcodec/x86/vvc/mc.asm +++ b/libavcodec/x86/vvc/mc.asm @@ -48,8 +48,8 @@ SECTION_RODATA AVG_JMP_TABLE avg, 8, 8, avx2, 2, 4, 8, 16, 32, 64, 128 AVG_JMP_TABLE avg, 16, 10, avx2, 2, 4, 8, 16, 32, 64, 128 -AVG_JMP_TABLE w_avg, 8, 8bpc, avx2, 2, 4, 8, 16, 32, 64, 128 -AVG_JMP_TABLE w_avg, 16, 16bpc, avx2, 2, 4, 8, 16, 32, 64, 128 +AVG_JMP_TABLE w_avg, 8, 8, avx2, 2, 4, 8, 16, 32, 64, 128 +AVG_JMP_TABLE w_avg, 16, 10, avx2, 2, 4, 8, 16, 32, 64, 128 SECTION .text @@ -242,64 +242,68 @@ cglobal vvc_avg_%2, 4, 7, 5, dst, stride, src0, src1, w, h AVG_FN %1, AVG, %3 %endmacro -;void ff_vvc_w_avg_%1bpc_avx(uint8_t *dst, ptrdiff_t dst_stride, -; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, -; intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max); -%macro VVC_W_AVG_AVX2 1 -cglobal vvc_w_avg_%1bpc, 4, 8, 6+2*(%1 != 8), dst, stride, src0, src1, w, h, t0, t1 - - movifnidn hd, hm - - movifnidn t0d, r8m ; w1 - shl t0d, 16 - mov t0w, r7m ; w0 - movd xm3, t0d - vpbroadcastd m3, xm3 ; w0, w1 - -%if %1 != 8 - pxor m6, m6 ;pixel min - vpbroadcastw m7, r11m ;pixel max +;void ff_vvc_w_avg_%2_avx(uint8_t *dst, ptrdiff_t dst_stride, +; const int16_t *src0, const int16_t *src1, int width, int height, +; int denom, intptr_t w0, int w1, int o0, int o1); +%macro VVC_W_AVG_AVX2 3 +cglobal vvc_w_avg_%2, 4, 7+2*UNIX64, 6+2*(%1 != 8), dst, stride, src0, src1, w, h +%if UNIX64 + ; r6-r8 are volatile and not used for parameter passing + DECLARE_REG_TMP 6, 7, 8 +%else ; Win64 + ; r4-r6 are volatile and not used for parameter passing + DECLARE_REG_TMP 4, 5, 6 %endif - mov t1q, rcx ; save ecx - mov ecx, r11m - inc ecx ; bd - tzcnt ecx, ecx - sub ecx, 8 + mov t1d, r6m ; denom mov t0d, r9m ; o0 add t0d, r10m ; o1 - shl t0d, cl - inc t0d ;((o0 + o1) << (BIT_DEPTH - 8)) + 1 - - neg ecx - add ecx, 7 - add ecx, r6m - movd xm2, ecx ; shift + movifnidn t2d, r8m ; w1 + add t1d, 15-%2 +%if %2 != 8 + shl t0d, %2 - 8 +%endif + movd xm2, t1d ; shift + inc t0d ; ((o0 + o1) << (BIT_DEPTH - 8)) + 1 + shl t2d, 16 + movd xm4, t0d + mov t2w, r7m ; w0 + movd xm3, t2d + vpbroadcastd m3, xm3 ; w0, w1 - dec ecx - shl t0d, cl - movd xm4, t0d - vpbroadcastd m4, xm4 ; offset - mov rcx, t1q ; restore ecx +%if %1 != 8 + pcmpeqw m7, m7 + pxor m6, m6 ; pixel min + psrlw m7, 16-%2 ; pixel max +%endif lea r6, [w_avg_%1 %+ SUFFIX %+ _table] tzcnt wd, wm movsxd wq, dword [r6+wq*4] + + pslld xm4, xm2 + psrad xm4, 1 + vpbroadcastd m4, xm4 ; offset + + movifnidn hd, hm + add wq, r6 - AVG_FN %1, W_AVG + AVG_FN %1, W_AVG, %3 %endmacro INIT_YMM avx2 VVC_AVG_AVX2 16, 12, 0 +VVC_W_AVG_AVX2 16, 12, 0 + VVC_AVG_AVX2 16, 10, 1 -VVC_AVG_AVX2 8, 8, 1 +VVC_W_AVG_AVX2 16, 10, 1 -VVC_W_AVG_AVX2 16 +VVC_AVG_AVX2 8, 8, 1 -VVC_W_AVG_AVX2 8 +VVC_W_AVG_AVX2 8, 8, 1 %endif %endif _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
