This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 5c830fccf4bd30481c3ccc03b8c99bd8791c8796 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Wed Apr 8 18:43:54 2026 +0200 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Mon Apr 13 12:51:35 2026 +0200 avcodec/x86/snowdsp: Add SSSE3 inner_add_yblock Compared to the MMX version, this version benefits from wider registers and pmaddubsw. It also has fewer unnecessary loads and stores: On x64, the MMX version has 12 unnecessary GPR loads and six stores per line when the width is eight; for width 16, there are 17 unnecessary GPR loads and six stores per line. Even the 32-bit SSSE3 version has only six more loads and no more stores per line than the x64 SSSE3 version. Furthermore, in contrast to the MMX version, the SSSE3 version also does not clobber the array of block pointers given to it. Benchmarks: inner_add_yblock_2_c: 29.2 ( 1.00x) inner_add_yblock_2_mmx: 32.5 ( 0.90x) inner_add_yblock_2_ssse3: 28.6 ( 1.02x) inner_add_yblock_4_c: 85.2 ( 1.00x) inner_add_yblock_4_mmx: 89.2 ( 0.96x) inner_add_yblock_4_ssse3: 84.5 ( 1.01x) inner_add_yblock_8_c: 302.0 ( 1.00x) inner_add_yblock_8_mmx: 77.0 ( 3.92x) inner_add_yblock_8_ssse3: 30.6 ( 9.85x) inner_add_yblock_16_c: 1164.7 ( 1.00x) inner_add_yblock_16_mmx: 260.4 ( 4.47x) inner_add_yblock_16_ssse3: 82.3 (14.15x) Both the MMX and SSSE3 versions leave the size 2 and 4 cases to ff_snow_inner_add_yblock_c() (but the MMX version has a prologue at the beginning that it needs to undo before the call, leading to the higher overhead for these sizes). I don't know why the SSSE3 version is marginally faster than the C version in these cases. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/snow.c | 4 +- libavcodec/snow_dwt.c | 2 +- libavcodec/snow_dwt.h | 8 +- libavcodec/x86/Makefile | 6 +- libavcodec/x86/snowdsp.asm | 191 +++++++++++++++++++++++++++ libavcodec/x86/{snowdsp.c => snowdsp_init.c} | 17 ++- 6 files changed, 216 insertions(+), 12 deletions(-) diff --git a/libavcodec/snow.c b/libavcodec/snow.c index e61f4f726a..5c13709b5c 100644 --- a/libavcodec/snow.c +++ b/libavcodec/snow.c @@ -115,8 +115,8 @@ static av_cold void init_qpel(SnowContext *const s) s->put_snow_qpel_pixels_tab[3][15] = put_snow_qpel2_mc33_8_c; } -void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, - int src_x, int src_stride, IDWTELEM *const *lines, int add, uint8_t *dst8) +void ff_snow_inner_add_yblock_c(const uint8_t *obmc, const int obmc_stride, uint8_t **block, int b_w, int b_h, + int src_x, int src_stride, IDWTELEM *const *lines, int add, uint8_t *dst8) { int y, x; diff --git a/libavcodec/snow_dwt.c b/libavcodec/snow_dwt.c index eb4d1e4d36..28fbebbe4d 100644 --- a/libavcodec/snow_dwt.c +++ b/libavcodec/snow_dwt.c @@ -852,7 +852,7 @@ av_cold void ff_dwt_init(SnowDWTContext *c) { c->vertical_compose97i = snow_vertical_compose97i; c->horizontal_compose97i = snow_horizontal_compose97i; - c->inner_add_yblock = ff_snow_inner_add_yblock; + c->inner_add_yblock = ff_snow_inner_add_yblock_c; #if ARCH_X86 && HAVE_MMX ff_dwt_init_x86(c); diff --git a/libavcodec/snow_dwt.h b/libavcodec/snow_dwt.h index a26db62d6d..d4a384b267 100644 --- a/libavcodec/snow_dwt.h +++ b/libavcodec/snow_dwt.h @@ -137,10 +137,10 @@ void ff_slice_buffer_flush(slice_buffer *buf); void ff_slice_buffer_destroy(slice_buffer *buf); IDWTELEM *ff_slice_buffer_load_line(slice_buffer *buf, int line); -void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, - uint8_t **block, int b_w, int b_h, int src_x, - int src_stride, IDWTELEM *const *lines, - int add, uint8_t *dst8); 
+void ff_snow_inner_add_yblock_c(const uint8_t *obmc, const int obmc_stride, + uint8_t **block, int b_w, int b_h, int src_x, + int src_stride, IDWTELEM *const *lines, + int add, uint8_t *dst8); int ff_w53_32_c(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t line_size, int h); int ff_w97_32_c(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t line_size, int h); diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index bf723ed1a6..e87cb750f4 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -65,8 +65,10 @@ X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o X86ASM-OBJS-$(CONFIG_PRORES_RAW_DECODER) += x86/proresdsp_init.o X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o -OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o -OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o +OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp_init.o +X86ASM-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o +OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp_init.o +X86ASM-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o diff --git a/libavcodec/x86/snowdsp.asm b/libavcodec/x86/snowdsp.asm new file mode 100644 index 0000000000..bde9054731 --- /dev/null +++ b/libavcodec/x86/snowdsp.asm @@ -0,0 +1,191 @@ +;* +;* ASM optimized Snow DSP functions +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +cextern snow_inner_add_yblock_c + +SECTION .text + +%assign FRAC_BITS 4 +%assign LOG2_OBMC_MAX 6 + +%macro ADD_YBLOCK_PROLOGUE 1 +%assign stack_offset 0 +%if ARCH_X86_32 + PROLOGUE 1, 7, 7+(%1>>4), obmc, offset, dst8, lines, b_h, src_x, dst + ; copy all four block pointers to the stack to be able to load + ; them via esp + mov r2, r2m + mov b_hd, b_hm + mov src_xd, src_xm + movups m1, [r2] + mov linesq, r7m + shl src_xd, 1 ; convert src_x from IDWTELEM to bytes + mov dst8q, r9m + mov src_xm, src_xd + ; Just reuse the space for the arguments to store the block pointers. +%if HAVE_ALIGNED_STACK + movaps r0m, m1 +%else + movups r0m, m1 +%endif + %define src_strideq r6m +%else ; X64 + PROLOGUE 1, 12, 7+(%1>>4), obmc, offset, dst8, lines, b_h, src_x, src_stride, dst, block0, block1, block2, block3 + mov block0q, [r2q] + mov block1q, [r2q + gprsize] + mov block2q, [r2q + 2*gprsize] + mov block3q, [r2q + 3*gprsize] + movifnidn b_hd, b_hm + movifnidn src_xd, src_xm + shl src_xd, 1 ; convert src_x from IDWTELEM to bytes and zero-extend it + movsxd src_strideq, src_stridem + mov linesq, r7mp + mov dst8q, r9mp +%endif + xor offsetd, offsetd + psllw m0, FRAC_BITS - 1 ; pw_m8 +%endmacro + +%macro LOAD_BLOCKPOINTER_FOR_X86_32 2 +%if ARCH_X86_32 + ; we put block #i into the spot of register r#i + mov r5, r %+ %1 %+ m + mov r6, r %+ %2 %+ m + %xdefine block%1q r5 + %xdefine block%2q r6 +%endif +%endmacro + +INIT_XMM ssse3 +; void ff_snow_inner_add_yblock_ssse3(const uint8_t *obmc, const int obmc_stride, +; uint8_t **block, int b_w, int b_h, int src_x, +; int src_stride, IDWTELEM 
*const *lines, +; int add, uint8_t *dst8); +; Don't use cglobal to load args, as we may want to perform +; a tail call to ff_snow_inner_add_yblock. +cglobal snow_inner_add_yblock + pcmpeqw m0, m0 +%if ARCH_X86_32 + mov r0d, r3m ; block width + cmp r0d, 16 + je .w16 + cmp r0d, 8 + jne snow_inner_add_yblock_c + cmp r1mp, 16 + jne snow_inner_add_yblock_c +%else + ; all arguments used to check for support are already in registers + cmp r3d, 16 + je .w16 + cmp r3d, 8 + jne snow_inner_add_yblock_c + cmp r1d, 16 + jne snow_inner_add_yblock_c +%endif + ADD_YBLOCK_PROLOGUE 8 + .loop8: + LOAD_BLOCKPOINTER_FOR_X86_32 1, 3 + movq m3, [block3q+offsetq] + movq m4, [block1q+offsetq] + mova m1, [obmcq] + mova m2, [obmcq+16*8] +%if ARCH_X86_64 + mov dstq, [linesq] +%endif + LOAD_BLOCKPOINTER_FOR_X86_32 0, 2 + movq m5, [block2q+offsetq] + movq m6, [block0q+offsetq] + punpcklbw m3, m4 +%if ARCH_X86_32 + mov dstq, [linesq] + add dstq, src_xm +%endif + SBUTTERFLY bw, 1, 2, 4 +%if ARCH_X86_64 + movu m4, [dstq+src_xq] +%else + movu m4, [dstq] +%endif + pmaddubsw m3, m1 + add obmcq, 16 + punpcklbw m5, m6 + pmaddubsw m5, m2 + add linesq, gprsize + paddw m3, m5 + psubw m4, m0 ; + 1<<(FRAC_BITS-1) + psrlw m3, LOG2_OBMC_MAX - FRAC_BITS + paddw m3, m4 + psraw m3, FRAC_BITS + packuswb m3, m3 + movq [dst8q+offsetq], m3 + add offsetq, src_strideq + dec b_hd + jnz .loop8 + RET + .w16: + ADD_YBLOCK_PROLOGUE 16 + .loop16: + LOAD_BLOCKPOINTER_FOR_X86_32 2, 3 + mova m3, [block3q+offsetq] + mova m4, [block2q+offsetq] + mova m1, [obmcq] + mova m2, [obmcq+16] + LOAD_BLOCKPOINTER_FOR_X86_32 0, 1 + SBUTTERFLY bw, 3, 4, 7 + mova m5, [block1q+offsetq] + mova m6, [block0q+offsetq] + SBUTTERFLY bw, 1, 2, 7 + mov dstq, [linesq] + pmaddubsw m3, m1 + mova m1, [obmcq+32*16] + pmaddubsw m4, m2 + mova m2, [obmcq+32*16+16] +%if ARCH_X86_32 + add dstq, src_xm +%endif + SBUTTERFLY bw, 5, 6, 7 + SBUTTERFLY bw, 1, 2, 7 + pmaddubsw m5, m1 + add linesq, gprsize + pmaddubsw m6, m2 + paddw m3, m5 + paddw m4, m6 + psrlw m3, 
LOG2_OBMC_MAX - FRAC_BITS + psrlw m4, LOG2_OBMC_MAX - FRAC_BITS + add obmcq, 32 +%if ARCH_X86_32 + paddw m3, [dstq] + paddw m4, [dstq+16] +%else + paddw m3, [dstq+src_xq] + paddw m4, [dstq+src_xq+16] +%endif + psubw m3, m0 ; + 1<<(FRAC_BITS-1) + psubw m4, m0 ; + 1<<(FRAC_BITS-1) + psraw m3, FRAC_BITS + psraw m4, FRAC_BITS + packuswb m3, m4 + movu [dst8q+offsetq], m3 + add offsetq, src_strideq + dec b_hd + jnz .loop16 + RET diff --git a/libavcodec/x86/snowdsp.c b/libavcodec/x86/snowdsp_init.c similarity index 98% rename from libavcodec/x86/snowdsp.c rename to libavcodec/x86/snowdsp_init.c index 7cd3fd5415..2a120cd0e0 100644 --- a/libavcodec/x86/snowdsp.c +++ b/libavcodec/x86/snowdsp_init.c @@ -24,8 +24,14 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" #include "libavcodec/snow_dwt.h" +void ff_snow_inner_add_yblock_ssse3(const uint8_t *obmc, const int obmc_stride, + uint8_t **block, int b_w, int b_h, int src_x, + int src_stride, IDWTELEM *const *lines, + int add, uint8_t *dst8); + #if HAVE_INLINE_ASM static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){ @@ -864,7 +870,7 @@ static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_st else inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x, src_stride, lines, add, dst8); } else - ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, src_stride, lines, add, dst8); + ff_snow_inner_add_yblock_c(obmc, obmc_stride, block, b_w, b_h, src_x, src_stride, lines, add, dst8); } static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, @@ -875,7 +881,7 @@ static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_str else if (b_w == 8 && obmc_stride == 16) inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x, src_stride, lines, add, dst8); else - 
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, src_stride, lines, add, dst8); + ff_snow_inner_add_yblock_c(obmc, obmc_stride, block, b_w, b_h, src_x, src_stride, lines, add, dst8); } #endif /* HAVE_6REGS */ @@ -883,9 +889,9 @@ static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_str av_cold void ff_dwt_init_x86(SnowDWTContext *c) { -#if HAVE_INLINE_ASM int mm_flags = av_get_cpu_flags(); +#if HAVE_INLINE_ASM if (mm_flags & AV_CPU_FLAG_MMX) { if(mm_flags & AV_CPU_FLAG_SSE2 & 0){ c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; @@ -909,4 +915,9 @@ av_cold void ff_dwt_init_x86(SnowDWTContext *c) } } #endif /* HAVE_INLINE_ASM */ +#if HAVE_SSSE3_EXTERNAL + if (EXTERNAL_SSSE3(mm_flags)) { + c->inner_add_yblock = ff_snow_inner_add_yblock_ssse3; + } +#endif } _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
