--- Updated patch: CLIPD now takes a parameter selecting whether min/max are dwords or floats, instead of using two separate macros.
libavcodec/x86/dsputil_mmx.c | 6 ++-- libavcodec/x86/dsputil_yasm.asm | 66 +++++++++++++++++++-------------------- libavutil/x86/x86util.asm | 34 ++++++++++++-------- 3 files changed, 56 insertions(+), 50 deletions(-) diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 5eb4a24..d9c8e96 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -2530,8 +2530,8 @@ void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, int32_t min, int32_t max, unsigned int len); void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, int32_t min, int32_t max, unsigned int len); -void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, - int32_t min, int32_t max, unsigned int len); +void ff_vector_clip_int32_sse2_atom(int32_t *dst, const int32_t *src, + int32_t min, int32_t max, unsigned int len); void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, int32_t min, int32_t max, unsigned int len); @@ -2908,7 +2908,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; if (mm_flags & AV_CPU_FLAG_ATOM) { - c->vector_clip_int32 = ff_vector_clip_int32_int_sse2; + c->vector_clip_int32 = ff_vector_clip_int32_sse2_atom; } else { c->vector_clip_int32 = ff_vector_clip_int32_sse2; } diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm index 313e774..8d70155 100644 --- a/libavcodec/x86/dsputil_yasm.asm +++ b/libavcodec/x86/dsputil_yasm.asm @@ -1054,50 +1054,50 @@ emu_edge mmx ; int32_t max, unsigned int len) ;----------------------------------------------------------------------------- -; %1 = number of xmm registers used -; %2 = number of inline load/process/store loops per asm loop -; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop -; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2) -; %5 = suffix 
-%macro VECTOR_CLIP_INT32 4-5 -cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len -%if %4 - cvtsi2ss m4, minm - cvtsi2ss m5, maxm +; %1 = number of inline load/process/store loops per asm loop +; %2 = process 4*mmsize (%2=0) or 8*mmsize (%2=1) bytes per loop + +%macro VECTOR_CLIP_INT32 2 +cglobal vector_clip_int32, 5,5,11, dst, src, min, max, len +%if notcpuflag(sse4) && cpuflag(sse2) && notcpuflag(atom) + cvtsi2ss m4, minm + cvtsi2ss m5, maxm + %assign is_float 1 %else movd m4, minm movd m5, maxm + %assign is_float 0 %endif SPLATD m4 SPLATD m5 .loop: %assign %%i 1 -%rep %2 +%rep %1 mova m0, [srcq+mmsize*0*%%i] mova m1, [srcq+mmsize*1*%%i] mova m2, [srcq+mmsize*2*%%i] mova m3, [srcq+mmsize*3*%%i] -%if %3 +%if %2 mova m7, [srcq+mmsize*4*%%i] mova m8, [srcq+mmsize*5*%%i] mova m9, [srcq+mmsize*6*%%i] mova m10, [srcq+mmsize*7*%%i] %endif - CLIPD m0, m4, m5, m6 - CLIPD m1, m4, m5, m6 - CLIPD m2, m4, m5, m6 - CLIPD m3, m4, m5, m6 -%if %3 - CLIPD m7, m4, m5, m6 - CLIPD m8, m4, m5, m6 - CLIPD m9, m4, m5, m6 - CLIPD m10, m4, m5, m6 + CLIPD m0, m4, m5, is_float, m6 + CLIPD m1, m4, m5, is_float, m6 + CLIPD m2, m4, m5, is_float, m6 + CLIPD m3, m4, m5, is_float, m6 +%if %2 + CLIPD m7, m4, m5, is_float, m6 + CLIPD m8, m4, m5, is_float, m6 + CLIPD m9, m4, m5, is_float, m6 + CLIPD m10, m4, m5, is_float, m6 %endif mova [dstq+mmsize*0*%%i], m0 mova [dstq+mmsize*1*%%i], m1 mova [dstq+mmsize*2*%%i], m2 mova [dstq+mmsize*3*%%i], m3 -%if %3 +%if %2 mova [dstq+mmsize*4*%%i], m7 mova [dstq+mmsize*5*%%i], m8 mova [dstq+mmsize*6*%%i], m9 @@ -1105,28 +1105,26 @@ cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len %endif %assign %%i %%i+1 %endrep - add srcq, mmsize*4*(%2+%3) - add dstq, mmsize*4*(%2+%3) - sub lend, mmsize*(%2+%3) + add srcq, mmsize*4*(%1+%2) + add dstq, mmsize*4*(%1+%2) + sub lend, mmsize*(%1+%2) jg .loop REP_RET %endmacro INIT_MMX mmx %define SPLATD SPLATD_MMX -%define CLIPD CLIPD_MMX -VECTOR_CLIP_INT32 0, 1, 0, 0 -INIT_XMM sse2 +VECTOR_CLIP_INT32 1, 0 
+INIT_XMM sse2,atom %define SPLATD SPLATD_SSE2 -VECTOR_CLIP_INT32 6, 1, 0, 0, _int -%define CLIPD CLIPD_SSE2 -VECTOR_CLIP_INT32 6, 2, 0, 1 +VECTOR_CLIP_INT32 1, 0 +INIT_XMM sse2 +VECTOR_CLIP_INT32 2, 0 INIT_XMM sse4 -%define CLIPD CLIPD_SSE41 %ifdef m8 -VECTOR_CLIP_INT32 11, 1, 1, 0 +VECTOR_CLIP_INT32 1, 1 %else -VECTOR_CLIP_INT32 6, 1, 0, 0 +VECTOR_CLIP_INT32 1, 0 %endif ;----------------------------------------------------------------------------- diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm index 941ec76..447bde4 100644 --- a/libavutil/x86/x86util.asm +++ b/libavutil/x86/x86util.asm @@ -584,37 +584,45 @@ pminsw %1, %3 %endmacro -%macro PMINSD_MMX 3 ; dst, src, tmp +%macro PMINSD 2-3 ; dst, src, tmp +%if cpuflag(sse4) && mmsize >= 16 + pminsd %1, %2 +%else mova %3, %2 pcmpgtd %3, %1 pxor %1, %2 pand %1, %3 pxor %1, %2 +%endif %endmacro -%macro PMAXSD_MMX 3 ; dst, src, tmp +%macro PMAXSD 2-3 ; dst, src, tmp +%if cpuflag(sse4) && mmsize >= 16 + pmaxsd %1, %2 +%else mova %3, %1 pcmpgtd %3, %2 pand %1, %3 pandn %3, %2 por %1, %3 +%endif %endmacro -%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp - PMINSD_MMX %1, %3, %4 - PMAXSD_MMX %1, %2, %4 -%endmacro - -%macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused +; %1 = src/dst +; %2 = min +; %3 = max +; %4 = min/max format: 0=dwords, 1=floats (requires SSE2) +; %5 = tmp +%macro CLIPD 3-5 0, 0 +%if %4 == 1 cvtdq2ps %1, %1 minps %1, %3 maxps %1, %2 cvtps2dq %1, %1 -%endmacro - -%macro CLIPD_SSE41 3-4 ; src/dst, min, max, unused - pminsd %1, %3 - pmaxsd %1, %2 +%else + PMINSD %1, %3, %5 + PMAXSD %1, %2, %5 +%endif %endmacro %macro VBROADCASTSS 2 ; dst xmm/ymm, src m32 -- 1.7.1 _______________________________________________ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel