PR #23279 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23279 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23279.patch
Also deduplicate macros and avoid loading constants. >From 602ff02940fedc839df30c833f4bdb477a837e01 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 11 Apr 2026 12:20:31 +0200 Subject: [PATCH 1/8] avcodec/x86/hpeldsp: Deduplicate {avg,put}_pixels{8,16}_x2 macros Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/hpeldsp.asm | 81 +++++++------------------------------- 1 file changed, 15 insertions(+), 66 deletions(-) diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm index cbdf0e460d..d50bf5d39c 100644 --- a/libavcodec/x86/hpeldsp.asm +++ b/libavcodec/x86/hpeldsp.asm @@ -38,12 +38,8 @@ cextern pw_8192 SECTION .text ; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro PUT_PIXELS8_X2 0 -%if cpuflag(sse2) -cglobal put_pixels16_x2, 4,5,4 -%else -cglobal put_pixels8_x2, 4,5 -%endif +%macro PIXELS_X2 2 +cglobal %1_pixels%2_x2, 4,5,4 lea r4, [r2*2] .loop: movu m0, [r1+1] @@ -56,10 +52,14 @@ cglobal put_pixels8_x2, 4,5 %else pavgb m0, [r1] pavgb m1, [r1+r2] +%endif + add r1, r4 +%ifidn %1,avg + pavgb m0, [r0] + pavgb m1, [r0+r2] %endif mova [r0], m0 mova [r0+r2], m1 - add r1, r4 add r0, r4 movu m0, [r1+1] movu m1, [r1+r2+1] @@ -73,6 +73,10 @@ cglobal put_pixels8_x2, 4,5 pavgb m1, [r1+r2] %endif add r1, r4 +%ifidn %1,avg + pavgb m0, [r0] + pavgb m1, [r0+r2] +%endif mova [r0], m0 mova [r0+r2], m1 add r0, r4 @@ -82,12 +86,12 @@ cglobal put_pixels8_x2, 4,5 %endmacro INIT_MMX mmxext -PUT_PIXELS8_X2 +PIXELS_X2 put, 8 +PIXELS_X2 avg, 8 -; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -; The 8_X2 macro can easily be used here INIT_XMM sse2 -PUT_PIXELS8_X2 +PIXELS_X2 put, 16 +PIXELS_X2 avg, 16 ; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) @@ -308,61 +312,6 @@ INIT_XMM sse2 NO_RND_PIXELS_Y2 avg NO_RND_PIXELS_Y2 put -; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t 
*pixels, ptrdiff_t line_size, int h) -%macro AVG_PIXELS8_X2 0 -%if cpuflag(sse2) -cglobal avg_pixels16_x2, 4,5,4 -%else -cglobal avg_pixels8_x2, 4,5 -%endif - lea r4, [r2*2] -.loop: - movu m0, [r1] - movu m2, [r1+r2] -%if cpuflag(sse2) - movu m1, [r1+1] - movu m3, [r1+r2+1] - pavgb m0, m1 - pavgb m2, m3 -%else - pavgb m0, [r1+1] - pavgb m2, [r1+r2+1] -%endif - pavgb m0, [r0] - pavgb m2, [r0+r2] - add r1, r4 - mova [r0], m0 - mova [r0+r2], m2 - movu m0, [r1] - movu m2, [r1+r2] -%if cpuflag(sse2) - movu m1, [r1+1] - movu m3, [r1+r2+1] - pavgb m0, m1 - pavgb m2, m3 -%else - pavgb m0, [r1+1] - pavgb m2, [r1+r2+1] -%endif - add r0, r4 - add r1, r4 - pavgb m0, [r0] - pavgb m2, [r0+r2] - mova [r0], m0 - mova [r0+r2], m2 - add r0, r4 - sub r3d, 4 - jne .loop - RET -%endmacro - -INIT_MMX mmxext -AVG_PIXELS8_X2 -; actually avg_pixels16_x2 -INIT_XMM sse2 -AVG_PIXELS8_X2 - - ; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) %macro AVG_PIXELS8_Y2 0 %if cpuflag(sse2) -- 2.52.0 >From d74ae020e8490ab346c9861ab8fb6b782aef953b Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 11 Apr 2026 12:55:24 +0200 Subject: [PATCH 2/8] avcodec/x86/hpeldsp: Avoid constant in {avg,put}_pixels16_xy2_ssse3 No difference in benchmarks here. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/hpeldsp.asm | 52 ++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm index d50bf5d39c..70066e4f45 100644 --- a/libavcodec/x86/hpeldsp.asm +++ b/libavcodec/x86/hpeldsp.asm @@ -27,11 +27,9 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA cextern pb_1 cextern pw_1 cextern pw_2 -pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 cextern pw_8192 @@ -503,51 +501,49 @@ SET_PIXELS_XY2 avg, pw_1, _no_rnd %macro SSSE3_PIXELS_XY2 1-2 cglobal %1_pixels16_xy2, 4,5,%2 - mova m4, [pb_interleave16] - mova m5, [pb_1] - movu m0, [r1] movu m1, [r1+1] - pmaddubsw m0, m5 + movu m0, [r1] + mova m5, [pb_1] + mova m4, [pw_8192] pmaddubsw m1, m5 + pmaddubsw m0, m5 xor r4, r4 add r1, r2 .loop: - movu m2, [r1+r4] movu m3, [r1+r4+1] - pmaddubsw m2, m5 + movu m2, [r1+r4] pmaddubsw m3, m5 - paddw m0, m2 - paddw m1, m3 - pmulhrsw m0, [pw_8192] - pmulhrsw m1, [pw_8192] + pmaddubsw m2, m5 %ifidn %1, avg mova m6, [r0+r4] - packuswb m0, m1 - pshufb m0, m4 +%endif + paddw m1, m3 + paddw m0, m2 + pmulhrsw m1, m4 + pmulhrsw m0, m4 + pslldq m1, 1 + por m0, m1 +%ifidn %1, avg pavgb m0, m6 -%else - packuswb m0, m1 - pshufb m0, m4 %endif mova [r0+r4], m0 add r4, r2 - movu m0, [r1+r4] movu m1, [r1+r4+1] - pmaddubsw m0, m5 + movu m0, [r1+r4] pmaddubsw m1, m5 - paddw m2, m0 - paddw m3, m1 - pmulhrsw m2, [pw_8192] - pmulhrsw m3, [pw_8192] + pmaddubsw m0, m5 %ifidn %1, avg mova m6, [r0+r4] - packuswb m2, m3 - pshufb m2, m4 +%endif + paddw m3, m1 + paddw m2, m0 + pmulhrsw m3, m4 + pmulhrsw m2, m4 + pslldq m3, 1 + por m2, m3 +%ifidn %1, avg pavgb m2, m6 -%else - packuswb m2, m3 - pshufb m2, m4 %endif mova [r0+r4], m2 add r4, r2 -- 2.52.0 >From 1b3fe74479d35aca7d43217f55e77fb02eede933 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 11 Apr 2026 13:45:45 +0200 Subject: [PATCH 
3/8] avcodec/x86/hpeldsp: Add _approx to non-bitexact functions' names Right now, their exact counterparts have a "_exact" in their names; switch this around. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/hpeldsp.asm | 38 +++++++++++++---------------------- libavcodec/x86/hpeldsp_init.c | 24 +++++++++++----------- 2 files changed, 26 insertions(+), 36 deletions(-) diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm index 70066e4f45..04b316fce1 100644 --- a/libavcodec/x86/hpeldsp.asm +++ b/libavcodec/x86/hpeldsp.asm @@ -92,9 +92,9 @@ PIXELS_X2 put, 16 PIXELS_X2 avg, 16 -; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +; void ff_put_no_rnd_pixels8_x2_approx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) INIT_MMX mmxext -cglobal put_no_rnd_pixels8_x2, 4,5 +cglobal put_no_rnd_pixels8_x2_approx, 4,5 mova m6, [pb_1] lea r4, [r2*2] .loop: @@ -127,13 +127,8 @@ cglobal put_no_rnd_pixels8_x2, 4,5 RET -%macro NO_RND_PIXELS_X2 1 -%if cpuflag(sse2) -cglobal %1_no_rnd_pixels16_x2, 4,5,5 -%else -; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -cglobal %1_no_rnd_pixels8_x2_exact, 4,5 -%endif +%macro NO_RND_PIXELS_X2 2 +cglobal %1_no_rnd_pixels%2_x2, 4,5,5 lea r4, [r2*3] pcmpeqb m4, m4 .loop: @@ -181,10 +176,10 @@ cglobal %1_no_rnd_pixels8_x2_exact, 4,5 %endmacro INIT_MMX mmxext -NO_RND_PIXELS_X2 put +NO_RND_PIXELS_X2 put, 8 INIT_XMM sse2 -NO_RND_PIXELS_X2 avg -NO_RND_PIXELS_X2 put +NO_RND_PIXELS_X2 avg, 16 +NO_RND_PIXELS_X2 put, 16 ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) %macro PUT_PIXELS8_Y2 0 @@ -225,9 +220,9 @@ INIT_XMM sse2 PUT_PIXELS8_Y2 -; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +; void ff_put_no_rnd_pixels8_y2_approx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) INIT_MMX mmxext 
-cglobal put_no_rnd_pixels8_y2, 4,5 +cglobal put_no_rnd_pixels8_y2_approx, 4,5 mova m6, [pb_1] lea r4, [r2+r2] mova m0, [r1] @@ -256,13 +251,8 @@ cglobal put_no_rnd_pixels8_y2, 4,5 RET -%macro NO_RND_PIXELS_Y2 1 -%if cpuflag(sse2) -cglobal %1_no_rnd_pixels16_y2, 4,5,4 -%else -; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -cglobal %1_no_rnd_pixels8_y2_exact, 4,5 -%endif +%macro NO_RND_PIXELS_Y2 2 +cglobal %1_no_rnd_pixels%2_y2, 4,5,4 lea r4, [r2*3] movu m0, [r1] pcmpeqb m3, m3 @@ -305,10 +295,10 @@ cglobal %1_no_rnd_pixels8_y2_exact, 4,5 %endmacro INIT_MMX mmxext -NO_RND_PIXELS_Y2 put +NO_RND_PIXELS_Y2 put, 8 INIT_XMM sse2 -NO_RND_PIXELS_Y2 avg -NO_RND_PIXELS_Y2 put +NO_RND_PIXELS_Y2 avg, 16 +NO_RND_PIXELS_Y2 put, 16 ; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) %macro AVG_PIXELS8_Y2 0 diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c index 3500ad1878..f689879d51 100644 --- a/libavcodec/x86/hpeldsp_init.c +++ b/libavcodec/x86/hpeldsp_init.c @@ -43,22 +43,22 @@ void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, +void ff_put_no_rnd_pixels8_x2_approx_mmxext(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, + const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block, - const uint8_t *pixels, - ptrdiff_t line_size, int h); void ff_put_no_rnd_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_no_rnd_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, 
ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, +void ff_put_no_rnd_pixels8_y2_approx_mmxext(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, + const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block, - const uint8_t *pixels, - ptrdiff_t line_size, int h); void ff_put_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, @@ -84,12 +84,12 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags) c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext; c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext; - c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext; - c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext; + c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext; + c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext; if (!(flags & AV_CODEC_FLAG_BITEXACT)) { - c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext; - c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext; + c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_approx_mmxext; + c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_approx_mmxext; } #endif /* HAVE_MMXEXT_EXTERNAL */ } -- 2.52.0 >From 8d2ce201c0e4a029e57d4a46f245ee7e9dd46a95 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 11 Apr 2026 14:10:52 +0200 Subject: [PATCH 4/8] avcodec/x86/hpeldsp: Deduplicate {avg,put}_pixels{8,16}_y2 macros Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/hpeldsp.asm | 65 ++++++++------------------------------ 1 file changed, 14 insertions(+), 51 deletions(-) diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm index 04b316fce1..a8265749bf 100644 --- 
a/libavcodec/x86/hpeldsp.asm +++ b/libavcodec/x86/hpeldsp.asm @@ -182,12 +182,8 @@ NO_RND_PIXELS_X2 avg, 16 NO_RND_PIXELS_X2 put, 16 ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro PUT_PIXELS8_Y2 0 -%if cpuflag(sse2) -cglobal put_pixels16_y2, 4,5,3 -%else -cglobal put_pixels8_y2, 4,5 -%endif +%macro PIXELS_Y2 2 +cglobal %1_pixels%2_y2, 4,5,3 lea r4, [r2*2] movu m0, [r1] sub r0, r2 @@ -197,6 +193,10 @@ cglobal put_pixels8_y2, 4,5 add r1, r4 pavgb m0, m1 pavgb m1, m2 +%ifidn %1,avg + pavgb m0, [r0+r2] + pavgb m1, [r0+r4] +%endif mova [r0+r2], m0 mova [r0+r4], m1 movu m1, [r1+r2] @@ -205,6 +205,10 @@ cglobal put_pixels8_y2, 4,5 add r1, r4 pavgb m2, m1 pavgb m1, m0 +%ifidn %1,avg + pavgb m2, [r0+r2] + pavgb m1, [r0+r4] +%endif mova [r0+r2], m2 mova [r0+r4], m1 add r0, r4 @@ -214,10 +218,11 @@ cglobal put_pixels8_y2, 4,5 %endmacro INIT_MMX mmxext -PUT_PIXELS8_Y2 -; actually, put_pixels16_y2_sse2 +PIXELS_Y2 put, 8 +PIXELS_Y2 avg, 8 INIT_XMM sse2 -PUT_PIXELS8_Y2 +PIXELS_Y2 put, 16 +PIXELS_Y2 avg, 16 ; void ff_put_no_rnd_pixels8_y2_approx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) @@ -300,48 +305,6 @@ INIT_XMM sse2 NO_RND_PIXELS_Y2 avg, 16 NO_RND_PIXELS_Y2 put, 16 -; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro AVG_PIXELS8_Y2 0 -%if cpuflag(sse2) -cglobal avg_pixels16_y2, 4,5,3 -%else -cglobal avg_pixels8_y2, 4,5 -%endif - lea r4, [r2*2] - movu m0, [r1] - sub r0, r2 -.loop: - movu m1, [r1+r2] - movu m2, [r1+r4] - add r1, r4 - pavgb m0, m1 - pavgb m1, m2 - pavgb m0, [r0+r2] - pavgb m1, [r0+r4] - mova [r0+r2], m0 - mova [r0+r4], m1 - movu m1, [r1+r2] - movu m0, [r1+r4] - pavgb m2, m1 - pavgb m1, m0 - add r0, r4 - add r1, r4 - pavgb m2, [r0+r2] - pavgb m1, [r0+r4] - mova [r0+r2], m2 - mova [r0+r4], m1 - add r0, r4 - sub r3d, 4 - jne .loop - RET -%endmacro - -INIT_MMX mmxext -AVG_PIXELS8_Y2 -; actually avg_pixels16_y2 -INIT_XMM sse2 -AVG_PIXELS8_Y2 - 
; void ff_put_no_rnd_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) %macro SET_PIXELS8_XY2 2-3 -- 2.52.0 >From 8f215d7772ec93586977cf2d386a02d8c0f5e8da Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 11 Apr 2026 14:14:24 +0200 Subject: [PATCH 5/8] avcodec/x86/hpeldsp: Avoid offsetting unnecessarily Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/hpeldsp.asm | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm index a8265749bf..4a132957b5 100644 --- a/libavcodec/x86/hpeldsp.asm +++ b/libavcodec/x86/hpeldsp.asm @@ -186,7 +186,6 @@ NO_RND_PIXELS_X2 put, 16 cglobal %1_pixels%2_y2, 4,5,3 lea r4, [r2*2] movu m0, [r1] - sub r0, r2 .loop: movu m1, [r1+r2] movu m2, [r1+r4] @@ -194,11 +193,11 @@ cglobal %1_pixels%2_y2, 4,5,3 pavgb m0, m1 pavgb m1, m2 %ifidn %1,avg - pavgb m0, [r0+r2] - pavgb m1, [r0+r4] + pavgb m0, [r0] + pavgb m1, [r0+r2] %endif - mova [r0+r2], m0 - mova [r0+r4], m1 + mova [r0], m0 + mova [r0+r2], m1 movu m1, [r1+r2] movu m0, [r1+r4] add r0, r4 @@ -206,11 +205,11 @@ cglobal %1_pixels%2_y2, 4,5,3 pavgb m2, m1 pavgb m1, m0 %ifidn %1,avg - pavgb m2, [r0+r2] - pavgb m1, [r0+r4] + pavgb m2, [r0] + pavgb m1, [r0+r2] %endif - mova [r0+r2], m2 - mova [r0+r4], m1 + mova [r0], m2 + mova [r0+r2], m1 add r0, r4 sub r3d, 4 jne .loop @@ -231,7 +230,6 @@ cglobal put_no_rnd_pixels8_y2_approx, 4,5 mova m6, [pb_1] lea r4, [r2+r2] mova m0, [r1] - sub r0, r2 .loop: mova m1, [r1+r2] mova m2, [r1+r4] @@ -239,8 +237,8 @@ cglobal put_no_rnd_pixels8_y2_approx, 4,5 psubusb m1, m6 pavgb m0, m1 pavgb m1, m2 - mova [r0+r2], m0 - mova [r0+r4], m1 + mova [r0], m0 + mova [r0+r2], m1 mova m1, [r1+r2] mova m0, [r1+r4] add r0, r4 @@ -248,8 +246,8 @@ cglobal put_no_rnd_pixels8_y2_approx, 4,5 psubusb m1, m6 pavgb m2, m1 pavgb m1, m0 - mova [r0+r2], m2 - mova [r0+r4], m1 + mova [r0], m2 + mova [r0+r2], m1 add 
r0, r4 sub r3d, 4 jne .loop -- 2.52.0 >From b6557b7a7826186601a32f4814775ab80718c96c Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 30 May 2026 15:59:03 +0200 Subject: [PATCH 6/8] avcodec/x86/hpeldsp: Avoid loading constants No change in benchmarks here. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/hpeldsp.asm | 51 +++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm index 4a132957b5..8e29a232d7 100644 --- a/libavcodec/x86/hpeldsp.asm +++ b/libavcodec/x86/hpeldsp.asm @@ -28,8 +28,6 @@ %include "libavutil/x86/x86util.asm" cextern pb_1 -cextern pw_1 -cextern pw_2 cextern pw_8192 @@ -305,10 +303,14 @@ NO_RND_PIXELS_Y2 put, 16 ; void ff_put_no_rnd_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro SET_PIXELS8_XY2 2-3 -cglobal %1%3_pixels8_xy2, 4,5,5 +%macro SET_PIXELS8_XY2 1-2 +cglobal %1%2_pixels8_xy2, 4,5,5 mova m4, [pb_1] - mova m3, [%2] +%ifidn %2, _no_rnd + pcmpeqw m3, m3 +%else + mova m3, [pw_8192] +%endif movh m0, [r1] movh m2, [r1+1] punpcklbw m2, m0 @@ -320,8 +322,8 @@ cglobal %1%3_pixels8_xy2, 4,5,5 movh m1, [r1+r4+1] punpcklbw m0, m1 pmaddubsw m0, m4 -%ifidn %3, _no_rnd - paddw m2, m3 +%ifidn %2, _no_rnd + psubw m2, m3 paddw m2, m0 psrlw m2, 2 %else @@ -342,8 +344,8 @@ cglobal %1%3_pixels8_xy2, 4,5,5 movh m2, [r1+r4+1] punpcklbw m2, m1 pmaddubsw m2, m4 -%ifidn %3, _no_rnd - paddw m0, m3 +%ifidn %2, _no_rnd + psubw m0, m3 paddw m0, m2 psrlw m0, 2 %else @@ -365,18 +367,21 @@ cglobal %1%3_pixels8_xy2, 4,5,5 %endmacro INIT_XMM ssse3 -SET_PIXELS8_XY2 put, pw_1, _no_rnd -SET_PIXELS8_XY2 avg, pw_8192 -SET_PIXELS8_XY2 put, pw_8192 +SET_PIXELS8_XY2 put, _no_rnd +SET_PIXELS8_XY2 avg +SET_PIXELS8_XY2 put ; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro SET_PIXELS_XY2 2-3 -cglobal %1%3_pixels16_xy2, 4,5,8 +%macro 
SET_PIXELS_XY2 1-2 +cglobal %1%2_pixels16_xy2, 4,5,8 pxor m7, m7 - mova m6, [%2] movu m0, [r1] movu m4, [r1+1] + pcmpeqw m6, m6 +%ifnidn %2, _no_rnd + paddw m6, m6 +%endif mova m1, m0 mova m5, m4 punpcklbw m0, m7 @@ -398,8 +403,8 @@ cglobal %1%3_pixels16_xy2, 4,5,8 punpckhbw m3, m7 paddw m0, m2 paddw m1, m3 - paddw m4, m6 - paddw m5, m6 + psubw m4, m6 + psubw m5, m6 paddw m4, m0 paddw m5, m1 psrlw m4, 2 @@ -424,8 +429,8 @@ cglobal %1%3_pixels16_xy2, 4,5,8 punpckhbw m5, m7 paddw m4, m2 paddw m5, m3 - paddw m0, m6 - paddw m1, m6 + psubw m0, m6 + psubw m1, m6 paddw m0, m4 paddw m1, m5 psrlw m0, 2 @@ -445,10 +450,10 @@ cglobal %1%3_pixels16_xy2, 4,5,8 %endmacro INIT_XMM sse2 -SET_PIXELS_XY2 put, pw_2 -SET_PIXELS_XY2 avg, pw_2 -SET_PIXELS_XY2 put, pw_1, _no_rnd -SET_PIXELS_XY2 avg, pw_1, _no_rnd +SET_PIXELS_XY2 put +SET_PIXELS_XY2 avg +SET_PIXELS_XY2 put, _no_rnd +SET_PIXELS_XY2 avg, _no_rnd %macro SSSE3_PIXELS_XY2 1-2 cglobal %1_pixels16_xy2, 4,5,%2 -- 2.52.0 >From ea5824b0da8696ab098d3dd41545da57e1705655 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 12 Apr 2026 15:24:05 +0200 Subject: [PATCH 7/8] avcodec/x86/fpel: Use SSE2 in avg_pixels8 No change in benchmarks here; this already allows removing an emms_c from cavsdec.c. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/cavsdec.c | 2 -- libavcodec/x86/cavsdsp.c | 6 +---- libavcodec/x86/fpel.asm | 46 +++++++++++++++++++++++++++++------ libavcodec/x86/fpel.h | 8 +++--- libavcodec/x86/h264_qpel.c | 2 +- libavcodec/x86/hpeldsp_init.c | 2 +- libavcodec/x86/qpeldsp_init.c | 6 +---- libavcodec/x86/vc1dsp_init.c | 5 ++-- tests/checkasm/cavsdsp.c | 2 +- 9 files changed, 49 insertions(+), 30 deletions(-) diff --git a/libavcodec/cavsdec.c b/libavcodec/cavsdec.c index cc26a904db..bc1ed60bf0 100644 --- a/libavcodec/cavsdec.c +++ b/libavcodec/cavsdec.c @@ -27,7 +27,6 @@ #include "libavutil/attributes.h" #include "libavutil/avassert.h" -#include "libavutil/emms.h" #include "libavutil/mem.h" #include "avcodec.h" #include "get_bits.h" @@ -1161,7 +1160,6 @@ static int decode_pic(AVSContext *h) break; } while (ff_cavs_next_mb(h)); } - emms_c(); if (ret >= 0 && h->cur.f->pict_type != AV_PICTURE_TYPE_B) { av_frame_unref(h->DPB[1].f); FFSWAP(AVSFrame, h->cur, h->DPB[1]); diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c index e333bbee49..91ec866681 100644 --- a/libavcodec/x86/cavsdsp.c +++ b/libavcodec/x86/cavsdsp.c @@ -91,11 +91,6 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c) { av_unused int cpu_flags = av_get_cpu_flags(); -#if HAVE_MMX_EXTERNAL - if (EXTERNAL_MMXEXT(cpu_flags)) { - c->avg_cavs_qpel_pixels_tab[1][0] = ff_avg_pixels8x8_mmxext; - } -#endif #if HAVE_SSE2_EXTERNAL if (EXTERNAL_SSE2(cpu_flags)) { c->put_cavs_qpel_pixels_tab[0][ 0] = ff_put_pixels16x16_sse2; @@ -114,6 +109,7 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c) c->avg_cavs_qpel_pixels_tab[0][ 4] = avg_cavs_qpel16_mc01_sse2; c->avg_cavs_qpel_pixels_tab[0][ 8] = avg_cavs_qpel16_mc02_sse2; c->avg_cavs_qpel_pixels_tab[0][12] = avg_cavs_qpel16_mc03_sse2; + c->avg_cavs_qpel_pixels_tab[1][ 0] = ff_avg_pixels8x8_sse2; c->avg_cavs_qpel_pixels_tab[1][ 2] = ff_avg_cavs_qpel8_mc20_sse2; c->avg_cavs_qpel_pixels_tab[1][ 4] = 
avg_cavs_qpel8_mc01_sse2; c->avg_cavs_qpel_pixels_tab[1][ 8] = ff_avg_cavs_qpel8_mc02_sse2; diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm index e4becca5fb..598a57ab0d 100644 --- a/libavcodec/x86/fpel.asm +++ b/libavcodec/x86/fpel.asm @@ -25,8 +25,40 @@ SECTION .text -; void ff_put/avg_pixels(uint8_t *block, const uint8_t *pixels, -; ptrdiff_t line_size, int h) +INIT_XMM sse2 +; void ff_avg_pixels8x8_sse2(uint8_t *block, const uint8_t *pixels, +; ptrdiff_t line_size) +cglobal avg_pixels8x8, 3,5,6 + mov r3d, 8 + jmp avg_pixels8_after_prologue + +; void ff_avg_pixels8_sse2(uint8_t *block, const uint8_t *pixels, +; ptrdiff_t line_size, int h) +cglobal avg_pixels8, 4,5,6 +avg_pixels8_after_prologue: + lea r4, [r2*3] +.loop: + movq m0, [r1] + movq m1, [r0] + movhps m0, [r1+r2] + movhps m1, [r0+r2] + movq m2, [r1+r2*2] + movq m3, [r0+r2*2] + pavgb m0, m1 + movq m4, [r1+r4] + pavgb m2, m3 + movq m5, [r0+r4] + lea r1, [r1+r2*4] + pavgb m4, m5 + movq [r0], m0 + movhps [r0+r2], m0 + movq [r0+r2*2], m2 + movq [r0+r4], m4 + lea r0, [r0+r2*4] + sub r3d, 4 + jne .loop + RET + %macro OP_PIXELS 2-3 0 %if %2 == mmsize/2 %define LOAD movh @@ -35,11 +67,13 @@ SECTION .text %define LOAD movu %define SAVE mova %endif -cglobal %1_pixels%2x%2, 3,5+4*%3,%3 ? 4 : 0 +cglobal %1_pixels%2x%2, 3,5+4*%3,4 mov r3d, %2 jmp %1_pixels%2_after_prologue -cglobal %1_pixels%2, 4,5+4*%3,%3 ? 4 : 0 +; void ff_put/avg_pixels(uint8_t *block, const uint8_t *pixels, +; ptrdiff_t line_size, int h) +cglobal %1_pixels%2, 4,5+4*%3,4 %1_pixels%2_after_prologue: lea r4, [r2*3] .loop: @@ -76,10 +110,6 @@ cglobal %1_pixels%2, 4,5+4*%3,%3 ? 
4 : 0 RET %endmacro -INIT_MMX mmxext -OP_PIXELS avg, 8 - -INIT_XMM sse2 OP_PIXELS put, 8, UNIX64 OP_PIXELS put, 16 OP_PIXELS avg, 16 diff --git a/libavcodec/x86/fpel.h b/libavcodec/x86/fpel.h index 0b0056021e..6ec28af635 100644 --- a/libavcodec/x86/fpel.h +++ b/libavcodec/x86/fpel.h @@ -22,10 +22,10 @@ #include <stddef.h> #include <stdint.h> -void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_avg_pixels8x8_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size); +void ff_avg_pixels8_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_avg_pixels8x8_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size); void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_pixels16x16_sse2(uint8_t *block, const uint8_t *pixels, diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 5d618651a4..0cc653c6ca 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -391,7 +391,6 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth) if (EXTERNAL_MMXEXT(cpu_flags)) { if (!high_bit_depth) { SET_QPEL_FUNCS_1PP(put_h264_qpel, 2, 4, mmxext, ); - c->avg_h264_qpel_pixels_tab[1][0] = ff_avg_pixels8x8_mmxext; SET_QPEL_FUNCS_1PP(avg_h264_qpel, 2, 4, mmxext, ); c->avg_h264_qpel_pixels_tab[2][0] = ff_avg_pixels4_mmxext; } else if (bit_depth == 10) { @@ -416,6 +415,7 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth) H264_QPEL_FUNCS(3, 3, sse2); c->put_h264_qpel_pixels_tab[0][0] = ff_put_pixels16x16_sse2; c->avg_h264_qpel_pixels_tab[0][0] = ff_avg_pixels16x16_sse2; + c->avg_h264_qpel_pixels_tab[1][0] = ff_avg_pixels8x8_sse2; } if (bit_depth == 10) { diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c index f689879d51..4e4abd5273 100644 --- a/libavcodec/x86/hpeldsp_init.c +++ b/libavcodec/x86/hpeldsp_init.c @@ -80,7 +80,6 @@ static void 
hpeldsp_init_mmxext(HpelDSPContext *c, int flags) c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext; c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext; - c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext; c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext; c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext; @@ -114,6 +113,7 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags) c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2; c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2; c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2; + c->avg_pixels_tab[1][0] = ff_avg_pixels8_sse2; c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_sse2; c->avg_no_rnd_pixels_tab[1] = ff_avg_no_rnd_pixels16_x2_sse2; diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c index 18c259b0d8..771961c1b5 100644 --- a/libavcodec/x86/qpeldsp_init.c +++ b/libavcodec/x86/qpeldsp_init.c @@ -281,11 +281,6 @@ av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c) { int cpu_flags = av_get_cpu_flags(); - if (X86_MMXEXT(cpu_flags)) { -#if HAVE_MMXEXT_EXTERNAL - c->avg_qpel_pixels_tab[1][0] = ff_avg_pixels8x8_mmxext; -#endif /* HAVE_MMXEXT_EXTERNAL */ - } #if HAVE_SSE2_EXTERNAL if (EXTERNAL_SSE2(cpu_flags)) { c->put_no_rnd_qpel_pixels_tab[0][0] = @@ -293,6 +288,7 @@ av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c) c->put_no_rnd_qpel_pixels_tab[1][0] = c->put_qpel_pixels_tab[1][0] = ff_put_pixels8x8_sse2; c->avg_qpel_pixels_tab[0][0] = ff_avg_pixels16x16_sse2; + c->avg_qpel_pixels_tab[1][0] = ff_avg_pixels8x8_sse2; SET_V_QPEL_FUNCS (16, sse2,); SET_V_QPEL_FUNCS (8, sse2,); diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c index 9f80048791..3f0eb5746c 100644 --- a/libavcodec/x86/vc1dsp_init.c +++ b/libavcodec/x86/vc1dsp_init.c @@ -72,7 +72,7 @@ static void vc1_h_loop_filter16_sse4(uint8_t *src, ptrdiff_t stride, int pq) } DECLARE_FUNCTION(put_, 8, _sse2) -DECLARE_FUNCTION(avg_, 8, _mmxext) +DECLARE_FUNCTION(avg_, 8, _sse2) DECLARE_FUNCTION(put_, 16, _sse2) 
DECLARE_FUNCTION(avg_, 16, _sse2) @@ -114,8 +114,6 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT if (EXTERNAL_MMXEXT(cpu_flags)) { - dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_vc1_mspel_mc00_8_mmxext; - dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_mmxext; dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_mmxext; dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_mmxext; @@ -127,6 +125,7 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) dsp->put_vc1_mspel_pixels_tab[0][0] = put_vc1_mspel_mc00_16_sse2; dsp->put_vc1_mspel_pixels_tab[1][0] = put_vc1_mspel_mc00_8_sse2; dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_sse2; + dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_vc1_mspel_mc00_8_sse2; } if (EXTERNAL_SSSE3(cpu_flags)) { ASSIGN_LF4(ssse3); diff --git a/tests/checkasm/cavsdsp.c b/tests/checkasm/cavsdsp.c index 3e4a9ac127..ab6b695ae4 100644 --- a/tests/checkasm/cavsdsp.c +++ b/tests/checkasm/cavsdsp.c @@ -71,7 +71,7 @@ static void check_cavs_qpeldsp(void) TEST(put_cavs_qpel_pixels_tab), TEST(avg_cavs_qpel_pixels_tab), }; - declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, const uint8_t *src, ptrdiff_t stride); + declare_func(void, uint8_t *dst, const uint8_t *src, ptrdiff_t stride); ff_cavsdsp_init(&cavsdsp); -- 2.52.0 >From 4fee994de32417ec60aa5022e01755d16d89786e Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 12 Apr 2026 17:53:41 +0200 Subject: [PATCH 8/8] avcodec/x86/hpeldsp: Port mmxext functions to SSE2 The only noticeable changes in benchmarks are for the x2 horizontal no_rnd case where SSE2 and movhps are beneficial: Old benchmarks: avg_pixels_tab[1][1]_c: 42.2 ( 1.00x) avg_pixels_tab[1][1]_mmxext: 10.8 ( 3.89x) avg_pixels_tab[1][2]_c: 18.0 ( 1.00x) avg_pixels_tab[1][2]_mmxext: 6.1 ( 2.96x) put_no_rnd_pixels_tab[1][1]_c: 29.7 ( 1.00x) put_no_rnd_pixels_tab[1][1]_mmxext: 12.3 ( 2.41x) put_no_rnd_pixels_tab[1][2]_c: 
20.4 ( 1.00x) put_no_rnd_pixels_tab[1][2]_mmxext: 12.2 ( 1.67x) put_pixels_tab[1][1]_c: 29.9 ( 1.00x) put_pixels_tab[1][1]_mmxext: 7.6 ( 3.92x) put_pixels_tab[1][2]_c: 16.8 ( 1.00x) put_pixels_tab[1][2]_mmxext: 6.4 ( 2.63x) New benchmarks: avg_pixels_tab[1][1]_c: 42.3 ( 1.00x) avg_pixels_tab[1][1]_sse2: 10.7 ( 3.95x) avg_pixels_tab[1][2]_c: 17.8 ( 1.00x) avg_pixels_tab[1][2]_sse2: 6.3 ( 2.83x) put_no_rnd_pixels_tab[1][1]_c: 29.6 ( 1.00x) put_no_rnd_pixels_tab[1][1]_sse2: 10.5 ( 2.81x) put_no_rnd_pixels_tab[1][2]_c: 20.4 ( 1.00x) put_no_rnd_pixels_tab[1][2]_sse2: 12.3 ( 1.67x) put_pixels_tab[1][1]_c: 30.1 ( 1.00x) put_pixels_tab[1][1]_sse2: 7.6 ( 3.93x) put_pixels_tab[1][2]_c: 16.8 ( 1.00x) put_pixels_tab[1][2]_sse2: 6.4 ( 2.64x) Switching to SSE2 unfortunately increased codesize of the relevant functions by 160B. This makes these functions ABI compatible, i.e. they no longer rely on others calling emms_c to fix the fpu state. It also implies that many mpegvideo decoders (the exceptions are MPEG-4, RV30, RV40 and the VC-1 family) now no longer use any mmx registers at all. So one can remove the emms_c from the MPEG-1/2 decoder. The same is true for VP3. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/mpeg12dec.c | 3 - libavcodec/vp3.c | 2 - libavcodec/x86/hpeldsp.asm | 279 ++++++++++++++++++++-------------- libavcodec/x86/hpeldsp_init.c | 67 ++++---- tests/checkasm/hpeldsp.c | 2 +- 5 files changed, 197 insertions(+), 156 deletions(-) diff --git a/libavcodec/mpeg12dec.c b/libavcodec/mpeg12dec.c index 4c83bcfa90..ce3066e4a0 100644 --- a/libavcodec/mpeg12dec.c +++ b/libavcodec/mpeg12dec.c @@ -32,7 +32,6 @@ #include <stdatomic.h> #include "libavutil/attributes.h" -#include "libavutil/emms.h" #include "libavutil/imgutils.h" #include "libavutil/internal.h" #include "libavutil/mem_internal.h" @@ -1651,7 +1650,6 @@ static int slice_decode_thread(AVCodecContext *c, void *arg) int ret; ret = mpeg_decode_slice(s, mb_y, &buf, end - buf); - emms_c(); ff_dlog(c, "ret:%d resync:%d/%d mb:%d/%d ts:%d/%d ec:%d\n", ret, s->c.resync_mb_x, s->c.resync_mb_y, s->c.mb_x, s->c.mb_y, s->c.start_mb_y, s->c.end_mb_y, s->c.er.error_count); @@ -2532,7 +2530,6 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture, buf_ptr += 2; // FIXME add minimum number of bytes per slice } else { ret = mpeg_decode_slice(&s->slice, mb_y, &buf_ptr, input_size); - emms_c(); if (ret < 0) { if (avctx->err_recognition & AV_EF_EXPLODE) diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c index 7789252f11..ca468b047c 100644 --- a/libavcodec/vp3.c +++ b/libavcodec/vp3.c @@ -36,7 +36,6 @@ #include <string.h> #include "libavutil/attributes.h" -#include "libavutil/emms.h" #include "libavutil/imgutils.h" #include "libavutil/mem.h" #include "libavutil/mem_internal.h" @@ -1924,7 +1923,6 @@ static void vp3_draw_horiz_band(Vp3DecodeContext *s, int y) for (int i = 3; i < AV_NUM_DATA_POINTERS; i++) offset[i] = 0; - emms_c(); s->avctx->draw_horiz_band(s->avctx, s->current_frame.f, offset, y, 3, h); } diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm index 8e29a232d7..f79d40a84b 100644 --- a/libavcodec/x86/hpeldsp.asm +++ 
b/libavcodec/x86/hpeldsp.asm @@ -34,138 +34,183 @@ cextern pw_8192 SECTION .text ; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro PIXELS_X2 2 +%macro PIXELS_X2 4 cglobal %1_pixels%2_x2, 4,5,4 lea r4, [r2*2] .loop: - movu m0, [r1+1] - movu m1, [r1+r2+1] -%if cpuflag(sse2) - movu m2, [r1] - movu m3, [r1+r2] - pavgb m0, m2 - pavgb m1, m3 -%else + mov%3 m0, [r1+1] + mov%3 m1, [r1+r2+1] +%if %2 == mmsize && avx_enabled pavgb m0, [r1] pavgb m1, [r1+r2] +%else + mov%3 m2, [r1] + mov%3 m3, [r1+r2] + pavgb m0, m2 + pavgb m1, m3 %endif add r1, r4 %ifidn %1,avg +%if %2 == mmsize pavgb m0, [r0] pavgb m1, [r0+r2] +%else + mov%4 m2, [r0] + mov%4 m3, [r0+r2] + pavgb m0, m2 + pavgb m1, m3 %endif - mova [r0], m0 - mova [r0+r2], m1 +%endif + mov%4 [r0], m0 + mov%4 [r0+r2], m1 add r0, r4 - movu m0, [r1+1] - movu m1, [r1+r2+1] -%if cpuflag(sse2) - movu m2, [r1] - movu m3, [r1+r2] - pavgb m0, m2 - pavgb m1, m3 -%else + mov%3 m0, [r1+1] + mov%3 m1, [r1+r2+1] +%if %2 == mmsize && avx_enabled pavgb m0, [r1] pavgb m1, [r1+r2] +%else + mov%3 m2, [r1] + mov%3 m3, [r1+r2] + pavgb m0, m2 + pavgb m1, m3 %endif add r1, r4 %ifidn %1,avg +%if %2 == mmsize pavgb m0, [r0] pavgb m1, [r0+r2] +%else + mov%4 m2, [r0] + mov%4 m3, [r0+r2] + pavgb m0, m2 + pavgb m1, m3 %endif - mova [r0], m0 - mova [r0+r2], m1 +%endif + mov%4 [r0], m0 + mov%4 [r0+r2], m1 add r0, r4 sub r3d, 4 jne .loop RET %endmacro -INIT_MMX mmxext -PIXELS_X2 put, 8 -PIXELS_X2 avg, 8 - INIT_XMM sse2 -PIXELS_X2 put, 16 -PIXELS_X2 avg, 16 +PIXELS_X2 put, 8, q, q +PIXELS_X2 avg, 8, q, q + +PIXELS_X2 put, 16, u, a +PIXELS_X2 avg, 16, u, a ; void ff_put_no_rnd_pixels8_x2_approx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -INIT_MMX mmxext -cglobal put_no_rnd_pixels8_x2_approx, 4,5 - mova m6, [pb_1] +INIT_XMM sse2 +cglobal put_no_rnd_pixels8_x2_approx, 4,5,5 + mova m4, [pb_1] lea r4, [r2*2] .loop: - mova m0, [r1] - mova m2, [r1+r2] - mova m1, [r1+1] - mova m3, [r1+r2+1] + 
movq m0, [r1] + movq m1, [r1+1] + movhps m0, [r1+r2] + movhps m1, [r1+r2+1] add r1, r4 - psubusb m0, m6 - psubusb m2, m6 + psubusb m0, m4 pavgb m0, m1 - pavgb m2, m3 - mova [r0], m0 - mova [r0+r2], m2 - mova m0, [r1] - mova m1, [r1+1] - mova m2, [r1+r2] - mova m3, [r1+r2+1] + movq [r0], m0 + movhps [r0+r2], m0 + movq m0, [r1] + movq m1, [r1+1] + movhps m0, [r1+r2] + movhps m1, [r1+r2+1] add r0, r4 add r1, r4 - psubusb m0, m6 - psubusb m2, m6 + psubusb m0, m4 pavgb m0, m1 - pavgb m2, m3 - mova [r0], m0 - mova [r0+r2], m2 + movq [r0], m0 + movhps [r0+r2], m0 add r0, r4 sub r3d, 4 jne .loop RET -%macro NO_RND_PIXELS_X2 2 +%macro NO_RND_PIXELS_X2 4 cglobal %1_no_rnd_pixels%2_x2, 4,5,5 lea r4, [r2*3] pcmpeqb m4, m4 .loop: - movu m0, [r1] - movu m2, [r1+r2] - movu m1, [r1+1] - movu m3, [r1+r2+1] + mov%3 m0, [r1] +%if %2 == mmsize + mov%3 m2, [r1+r2] + mov%3 m1, [r1+1] + mov%3 m3, [r1+r2+1] +%else + movq m1, [r1+1] + movhps m0, [r1+r2] + movhps m1, [r1+r2+1] +%endif pxor m0, m4 +%if %2 == mmsize pxor m2, m4 +%endif pxor m1, m4 +%if %2 == mmsize pxor m3, m4 +%endif pavgb m0, m1 +%if %2 == mmsize pavgb m2, m3 +%endif pxor m0, m4 +%if %2 == mmsize pxor m2, m4 +%endif %ifidn %1, avg pavgb m0, [r0] pavgb m2, [r0+r2] %endif - mova [r0], m0 - mova [r0+r2], m2 - movu m0, [r1+r2*2] - movu m1, [r1+r2*2+1] - movu m2, [r1+r4] - movu m3, [r1+r4+1] + mov%4 [r0], m0 +%if %2 == mmsize + mov%4 [r0+r2], m2 +%else + movhps [r0+r2], m0 +%endif + mov%3 m0, [r1+2*r2] +%if %2 == mmsize + mov%3 m2, [r1+r4] + mov%3 m1, [r1+2*r2+1] + mov%3 m3, [r1+r4+1] +%else + movq m1, [r1+2*r2+1] + movhps m0, [r1+r4] + movhps m1, [r1+r4+1] +%endif pxor m0, m4 +%if %2 == mmsize + pxor m2, m4 +%endif pxor m1, m4 - pxor m2, m4 +%if %2 == mmsize pxor m3, m4 +%endif pavgb m0, m1 +%if %2 == mmsize pavgb m2, m3 +%endif pxor m0, m4 +%if %2 == mmsize pxor m2, m4 +%endif %ifidn %1, avg pavgb m0, [r0+r2*2] pavgb m2, [r0+r4] %endif - mova [r0+r2*2], m0 - mova [r0+r4], m2 + mov%4 [r0+2*r2], m0 +%if %2 == mmsize + mov%4 
[r0+r4], m2 +%else + movhps [r0+r4], m0 +%endif lea r1, [r1+r2*4] lea r0, [r0+r2*4] sub r3d, 4 @@ -173,95 +218,109 @@ cglobal %1_no_rnd_pixels%2_x2, 4,5,5 RET %endmacro -INIT_MMX mmxext -NO_RND_PIXELS_X2 put, 8 INIT_XMM sse2 -NO_RND_PIXELS_X2 avg, 16 -NO_RND_PIXELS_X2 put, 16 +NO_RND_PIXELS_X2 put, 8, q, q + +NO_RND_PIXELS_X2 avg, 16, u, a +NO_RND_PIXELS_X2 put, 16, u, a ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro PIXELS_Y2 2 -cglobal %1_pixels%2_y2, 4,5,3 +%macro PIXELS_Y2 4 +cglobal %1_pixels%2_y2, 4,5,5 + mov%3 m0, [r1] lea r4, [r2*2] - movu m0, [r1] .loop: - movu m1, [r1+r2] - movu m2, [r1+r4] + mov%3 m1, [r1+r2] + mov%3 m2, [r1+r4] add r1, r4 pavgb m0, m1 pavgb m1, m2 %ifidn %1,avg +%if %2 == mmsize pavgb m0, [r0] pavgb m1, [r0+r2] +%else + mov%4 m3, [r0] + mov%4 m4, [r0+r2] + pavgb m0, m3 + pavgb m1, m4 %endif - mova [r0], m0 - mova [r0+r2], m1 - movu m1, [r1+r2] - movu m0, [r1+r4] +%endif + mov%4 [r0], m0 + mov%4 [r0+r2], m1 + mov%3 m1, [r1+r2] + mov%3 m0, [r1+r4] add r0, r4 add r1, r4 pavgb m2, m1 pavgb m1, m0 %ifidn %1,avg +%if %2 == mmsize pavgb m2, [r0] pavgb m1, [r0+r2] +%else + mov%4 m3, [r0] + mov%4 m4, [r0+r2] + pavgb m2, m3 + pavgb m1, m4 %endif - mova [r0], m2 - mova [r0+r2], m1 +%endif + mov%4 [r0], m2 + mov%4 [r0+r2], m1 add r0, r4 sub r3d, 4 jne .loop RET %endmacro -INIT_MMX mmxext -PIXELS_Y2 put, 8 -PIXELS_Y2 avg, 8 INIT_XMM sse2 -PIXELS_Y2 put, 16 -PIXELS_Y2 avg, 16 +PIXELS_Y2 put, 8, q, q +PIXELS_Y2 avg, 8, q, q + +PIXELS_Y2 put, 16, u, a +PIXELS_Y2 avg, 16, u, a ; void ff_put_no_rnd_pixels8_y2_approx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -INIT_MMX mmxext -cglobal put_no_rnd_pixels8_y2_approx, 4,5 - mova m6, [pb_1] +INIT_XMM sse2 +cglobal put_no_rnd_pixels8_y2_approx, 4,5,4 + mova m3, [pb_1] + movq m0, [r1] lea r4, [r2+r2] - mova m0, [r1] .loop: - mova m1, [r1+r2] - mova m2, [r1+r4] + movq m1, [r1+r2] + movq m2, [r1+r4] add r1, r4 - psubusb m1, m6 + psubusb 
m1, m3 pavgb m0, m1 pavgb m1, m2 - mova [r0], m0 - mova [r0+r2], m1 - mova m1, [r1+r2] - mova m0, [r1+r4] + movq [r0], m0 + movq [r0+r2], m1 + movq m1, [r1+r2] + movq m0, [r1+r4] add r0, r4 add r1, r4 - psubusb m1, m6 + psubusb m1, m3 pavgb m2, m1 pavgb m1, m0 - mova [r0], m2 - mova [r0+r2], m1 + movq [r0], m2 + movq [r0+r2], m1 add r0, r4 sub r3d, 4 jne .loop RET -%macro NO_RND_PIXELS_Y2 2 +%macro NO_RND_PIXELS_Y2 4 cglobal %1_no_rnd_pixels%2_y2, 4,5,4 + mov%3 m0, [r1] lea r4, [r2*3] - movu m0, [r1] pcmpeqb m3, m3 add r1, r2 pxor m0, m3 .loop: - movu m1, [r1] - movu m2, [r1+r2] + mov%3 m1, [r1] + mov%3 m2, [r1+r2] pxor m1, m3 pxor m2, m3 pavgb m0, m1 @@ -272,10 +331,10 @@ cglobal %1_no_rnd_pixels%2_y2, 4,5,4 pavgb m0, [r0] pavgb m1, [r0+r2] %endif - mova [r0], m0 - mova [r0+r2], m1 - movu m1, [r1+r2*2] - movu m0, [r1+r4] + mov%4 [r0], m0 + mov%4 [r0+r2], m1 + mov%3 m1, [r1+r2*2] + mov%3 m0, [r1+r4] pxor m1, m3 pxor m0, m3 pavgb m2, m1 @@ -286,8 +345,8 @@ cglobal %1_no_rnd_pixels%2_y2, 4,5,4 pavgb m2,[r0+r2*2] pavgb m1,[r0+r4] %endif - mova [r0+r2*2], m2 - mova [r0+r4], m1 + mov%4 [r0+r2*2], m2 + mov%4 [r0+r4], m1 lea r1, [r1+r2*4] lea r0, [r0+r2*4] sub r3d, 4 @@ -295,11 +354,11 @@ cglobal %1_no_rnd_pixels%2_y2, 4,5,4 RET %endmacro -INIT_MMX mmxext -NO_RND_PIXELS_Y2 put, 8 INIT_XMM sse2 -NO_RND_PIXELS_Y2 avg, 16 -NO_RND_PIXELS_Y2 put, 16 +NO_RND_PIXELS_Y2 put, 8, q, q + +NO_RND_PIXELS_Y2 avg, 16, u, a +NO_RND_PIXELS_Y2 put, 16, u, a ; void ff_put_no_rnd_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c index 4e4abd5273..f337e5c85d 100644 --- a/libavcodec/x86/hpeldsp_init.c +++ b/libavcodec/x86/hpeldsp_init.c @@ -33,8 +33,8 @@ #include "fpel.h" #include "hpeldsp.h" -void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); +void ff_put_pixels8_x2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); 
void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, @@ -43,22 +43,20 @@ void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_x2_approx_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, - const uint8_t *pixels, - ptrdiff_t line_size, int h); +void ff_put_no_rnd_pixels8_x2_approx_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_put_no_rnd_pixels8_x2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); void ff_put_no_rnd_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_no_rnd_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_y2_approx_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, - const uint8_t *pixels, - ptrdiff_t line_size, int h); +void ff_put_pixels8_y2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_put_no_rnd_pixels8_y2_approx_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_put_no_rnd_pixels8_y2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); void ff_put_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, @@ -69,29 +67,10 @@ void ff_put_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void 
ff_avg_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); - -static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags) -{ -#if HAVE_MMXEXT_EXTERNAL - c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext; - c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext; - - c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext; - c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext; - - c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext; - c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext; - - if (!(flags & AV_CODEC_FLAG_BITEXACT)) { - c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_approx_mmxext; - c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_approx_mmxext; - } -#endif /* HAVE_MMXEXT_EXTERNAL */ -} +void ff_avg_pixels8_x2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_avg_pixels8_y2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); static void hpeldsp_init_sse2(HpelDSPContext *c, int flags) { @@ -106,19 +85,30 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags) c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_sse2; c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_sse2; + c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_sse2; + c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_sse2; c->put_no_rnd_pixels_tab[1][0] = c->put_pixels_tab[1][0] = ff_put_pixels8_sse2; + c->put_pixels_tab[1][1] = ff_put_pixels8_x2_sse2; + c->put_pixels_tab[1][2] = ff_put_pixels8_y2_sse2; c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2; c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2; c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2; c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2; 
c->avg_pixels_tab[1][0] = ff_avg_pixels8_sse2; + c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_sse2; + c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_sse2; c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_sse2; c->avg_no_rnd_pixels_tab[1] = ff_avg_no_rnd_pixels16_x2_sse2; c->avg_no_rnd_pixels_tab[2] = ff_avg_no_rnd_pixels16_y2_sse2; c->avg_no_rnd_pixels_tab[3] = ff_avg_no_rnd_pixels16_xy2_sse2; + + if (!(flags & AV_CODEC_FLAG_BITEXACT)) { + c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_approx_sse2; + c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_approx_sse2; + } #endif /* HAVE_SSE2_EXTERNAL */ } @@ -138,9 +128,6 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags) { int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMXEXT(cpu_flags)) - hpeldsp_init_mmxext(c, flags); - if (EXTERNAL_SSE2(cpu_flags)) hpeldsp_init_sse2(c, flags); diff --git a/tests/checkasm/hpeldsp.c b/tests/checkasm/hpeldsp.c index bf44a666ca..fd87509ddc 100644 --- a/tests/checkasm/hpeldsp.c +++ b/tests/checkasm/hpeldsp.c @@ -69,7 +69,7 @@ void checkasm_check_hpeldsp(void) TEST(put_no_rnd_pixels_tab, 2), // put_no_rnd_pixels_tab only has two usable blocksizes TEST(avg_no_rnd_pixels_tab, 1), }; - declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h); + declare_func(void, uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h); ff_hpeldsp_init(&hdsp, AV_CODEC_FLAG_BITEXACT); -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
