This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit ae5314a6bf1db6d1db26d7f6d21dc21a196d15e0 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Sun Apr 5 14:49:26 2026 +0200 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Wed Apr 8 21:00:00 2026 +0200 avcodec/x86/rv34dsp: Port ff_rv34_idct_add_mmxext to SSSE3 With this commit, the RV30 and RV40 decoders no longer clobber the fpu state for normal decoding (only error resilience can still do so). rv34_idct_add_c: 58.1 ( 1.00x) rv34_idct_add_mmxext: 16.5 ( 3.52x) rv34_idct_add_ssse3: 12.2 ( 4.76x) Reviewed-by: Lynne <[email protected]> Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/rv34dsp.asm | 140 ++++++++++++++++++++---------------------- libavcodec/x86/rv34dsp_init.c | 10 +-- tests/checkasm/rv34dsp.c | 2 +- 3 files changed, 74 insertions(+), 78 deletions(-) diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm index 52d2497007..39d0b2587c 100644 --- a/libavcodec/x86/rv34dsp.asm +++ b/libavcodec/x86/rv34dsp.asm @@ -1,5 +1,5 @@ ;****************************************************************************** -;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders +;* ASM-optimized functions for the RV30 and RV40 decoders ;* Copyright (C) 2012 Christophe Gisquet <[email protected]> ;* ;* This file is part of FFmpeg. 
@@ -22,14 +22,14 @@ %include "libavutil/x86/x86util.asm" SECTION_RODATA -pw_row_coeffs: times 4 dw 13 - times 4 dw 17 - times 4 dw 7 -pd_512: times 2 dd 0x200 -pw_col_coeffs: dw 13, 13, 13, -13 - dw 17, 7, 7, -17 - dw 13, -13, 13, 13 - dw -7, 17, -17, -7 +; 0 1 2 3 (words) -> 1 3 2 0 1 3 2 0 (words) +shuffle: times 2 db 2, 3, 6, 7, 4, 5, 0, 1 + +pw_13: times 8 dw 13 +pw_17: times 8 dw 17 +pw_7: times 8 dw 7 +pw_col_coeffs: dw -17, -7, -13, 13, 7, -17, 13, 13 +pd_512: times 4 dd 0x200 SECTION .text @@ -54,73 +54,69 @@ cglobal rv34_idct_dc_noround, 1, 2, 1 mova [r0+16], m0 RET -; Load coeffs and perform row transform -; Output: coeffs in mm[0467], rounder in mm5 -%macro ROW_TRANSFORM 1 - pxor mm7, mm7 - mova mm0, [%1+ 0*8] - mova mm1, [%1+ 1*8] - mova mm2, [%1+ 2*8] - mova mm3, [%1+ 3*8] - mova [%1+ 0*8], mm7 - mova [%1+ 1*8], mm7 - mova [%1+ 2*8], mm7 - mova [%1+ 3*8], mm7 - mova mm4, mm0 - mova mm6, [pw_row_coeffs+ 0] - paddsw mm0, mm2 ; b0 + b2 - psubsw mm4, mm2 ; b0 - b2 - pmullw mm0, mm6 ; *13 = z0 - pmullw mm4, mm6 ; *13 = z1 - mova mm5, mm1 - pmullw mm1, [pw_row_coeffs+ 8] ; b1*17 - pmullw mm5, [pw_row_coeffs+16] ; b1* 7 - mova mm7, mm3 - pmullw mm3, [pw_row_coeffs+ 8] ; b3*17 - pmullw mm7, [pw_row_coeffs+16] ; b3* 7 - paddsw mm1, mm7 ; z3 = b1*17 + b3* 7 - psubsw mm5, mm3 ; z2 = b1* 7 - b3*17 - mova mm7, mm0 - mova mm6, mm4 - paddsw mm0, mm1 ; z0 + z3 - psubsw mm7, mm1 ; z0 - z3 - paddsw mm4, mm5 ; z1 + z2 - psubsw mm6, mm5 ; z1 - z2 - mova mm5, [pd_512] ; 0x200 -%endmacro - -; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block); -%macro COL_TRANSFORM 4 - pshufw mm3, %2, 0xDD ; col. 1,3,1,3 - pshufw %2, %2, 0x88 ; col. 
0,2,0,2 - pmaddwd %2, %3 ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1 - pmaddwd mm3, %4 ; 17*c1+ 7*c3 | 7*c1-17*c3 = z3 | z2 - paddd %2, mm5 - pshufw mm1, %2, 01001110b ; z1 | z0 - pshufw mm2, mm3, 01001110b ; z2 | z3 - paddd %2, mm3 ; z0+z3 | z1+z2 - psubd mm1, mm2 ; z1-z2 | z0-z3 - movd mm3, %1 +%macro COL_TRANSFORM 3 + ; -17*c1-7*c3 | 13*c0-13*c2 | 7*c1-17*c3 | 13*c0+13*c2 = -z3 | z1 | z2 | z0 + pmaddwd %2, m7 + movd m3, %1 + pshufd %3, %2, q0123 ; z0 | z2 | z1 | -z3 + psignd %2, m7 ; z3 | z1 |-z2 | z0 + paddd %3, m5 + paddd %2, %3 ; z0+z3 | z1+z2 | z1-z2 | z0-z3 (+round) +%ifidn %3,m1 + pxor m1, m1 +%endif psrad %2, 10 - pxor mm2, mm2 - psrad mm1, 10 - punpcklbw mm3, mm2 - packssdw %2, mm1 - paddw %2, mm3 + punpcklbw m3, m1 + packssdw %2, %2 + paddw %2, m3 packuswb %2, %2 movd %1, %2 %endmacro -INIT_MMX mmxext -cglobal rv34_idct_add, 3, 3, 0, dst, s, b - ROW_TRANSFORM bq - COL_TRANSFORM [dstq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8] - mova mm0, [pw_col_coeffs+ 0] - COL_TRANSFORM [dstq+sq], mm4, mm0, [pw_col_coeffs+ 8] - mova mm4, [pw_col_coeffs+ 8] - lea dstq, [dstq + 2*sq] - COL_TRANSFORM [dstq], mm6, mm0, mm4 - COL_TRANSFORM [dstq+sq], mm7, mm0, mm4 - ret + +INIT_XMM ssse3 +; ff_rv34_idct_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block) +cglobal rv34_idct_add, 3, 3, 8, dst, stride, block + ; row transform + movq m0, [blockq + 0*8] + movq m1, [blockq + 1*8] + movq m2, [blockq + 2*8] + movq m3, [blockq + 3*8] + pxor m7, m7 + mova m6, [shuffle] + mova [blockq + 0], m7 + mova [blockq + 16], m7 + mova m4, m0 + mova m5, [pw_13] + paddsw m0, m2 ; b0 + b2 + pshufb m1, m6 + psubsw m4, m2 ; b0 - b2 + pmullw m0, m5 ; *13 = z0 + pshufb m3, m6 + pmullw m4, m5 ; *13 = z1 + mova m5, m1 + pmullw m1, [pw_17] ; b1*17 + pmullw m5, [pw_7] ; b1* 7 + pshufb m0, m6 + mova m2, m3 + pmullw m3, [pw_17] ; b3*17 + pmullw m2, [pw_7] ; b3* 7 + pshufb m4, m6 + mova m7, [pw_col_coeffs] + paddsw m1, m2 ; z3 = b1*17 + b3* 7 + psubsw m5, m3 ; z2 = b1* 7 - b3*17 + mova m2, m0 + mova m6, 
m4 + paddsw m0, m1 ; z0 + z3 + paddsw m4, m5 ; z1 + z2 + psubsw m2, m1 ; z0 - z3 + psubsw m6, m5 ; z1 - z2 + mova m5, [pd_512] ; 0x200 + COL_TRANSFORM [dstq], m0, m1 + COL_TRANSFORM [dstq+strideq], m4, m0 + lea dstq, [dstq + 2*strideq] + COL_TRANSFORM [dstq], m6, m0 + COL_TRANSFORM [dstq+strideq], m2, m0 + RET ; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc); %macro RV34_IDCT_DC_ADD 0 diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c index c4dcae929a..0a12b49356 100644 --- a/libavcodec/x86/rv34dsp_init.c +++ b/libavcodec/x86/rv34dsp_init.c @@ -1,5 +1,5 @@ /* - * RV30/40 MMX/SSE2 optimizations + * RV30/40 ASM optimizations * Copyright (C) 2012 Christophe Gisquet <[email protected]> * * This file is part of FFmpeg. @@ -27,19 +27,19 @@ void ff_rv34_idct_dc_noround_sse2(int16_t *block); void ff_rv34_idct_dc_add_sse2(uint8_t *dst, ptrdiff_t stride, int dc); void ff_rv34_idct_dc_add_sse4(uint8_t *dst, ptrdiff_t stride, int dc); -void ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block); +void ff_rv34_idct_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block); av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c) { int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMXEXT(cpu_flags)) { - c->rv34_idct_add = ff_rv34_idct_add_mmxext; - } if (EXTERNAL_SSE2(cpu_flags)) { c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_sse2; c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse2; } + if (EXTERNAL_SSSE3(cpu_flags)) { + c->rv34_idct_add = ff_rv34_idct_add_ssse3; + } if (EXTERNAL_SSE4(cpu_flags)) c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4; } diff --git a/tests/checkasm/rv34dsp.c b/tests/checkasm/rv34dsp.c index dc4cc2f31a..7b6dd531d4 100644 --- a/tests/checkasm/rv34dsp.c +++ b/tests/checkasm/rv34dsp.c @@ -88,7 +88,7 @@ static void test_rv34_idct_add(const RV34DSPContext *const s) enum { MAX_STRIDE = 256, ///< arbitrary, should be divisible by four }; - declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t 
stride, int16_t *block); + declare_func(void, uint8_t *dst, ptrdiff_t stride, int16_t *block); if (check_func(s->rv34_idct_add, "rv34_idct_add")) { DECLARE_ALIGNED_16(int16_t, block_ref)[4*4]; _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
