---
libavcodec/dsputil.c | 17 +++++++
libavcodec/dsputil.h | 14 ++++++
libavcodec/x86/dsputil_mmx.c | 15 +++++++
libavcodec/x86/dsputil_yasm.asm | 88 +++++++++++++++++++++++++++++++++++++++
4 files changed, 134 insertions(+), 0 deletions(-)
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 4389289..4f17b43 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -2676,6 +2676,22 @@ static void apply_window_int16_c(int16_t *output, const int16_t *input,
}
}
+static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
+ int32_t max, unsigned int len)
+{ /* C implementation: writes av_clip(src[i], min, max) to dst[i] for len elements */
+ do { /* the body runs before len is tested, so len == 0 is not handled */
+ *dst++ = av_clip(*src++, min, max); /* unrolled: 8 elements per iteration */
+ *dst++ = av_clip(*src++, min, max);
+ *dst++ = av_clip(*src++, min, max);
+ *dst++ = av_clip(*src++, min, max);
+ *dst++ = av_clip(*src++, min, max);
+ *dst++ = av_clip(*src++, min, max);
+ *dst++ = av_clip(*src++, min, max);
+ *dst++ = av_clip(*src++, min, max);
+ len -= 8; /* len is unsigned: a value that is not a multiple of 8 wraps instead of reaching 0 */
+ } while (len > 0);
+}
+
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -3122,6 +3138,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
c->scalarproduct_int16 = scalarproduct_int16_c;
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
c->apply_window_int16 = apply_window_int16_c;
+ c->vector_clip_int32 = vector_clip_int32_c;
c->scalarproduct_float = scalarproduct_float_c;
c->butterflies_float = butterflies_float_c;
c->vector_fmul_scalar = vector_fmul_scalar_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index cfc574a..cff8406 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -555,6 +555,20 @@ typedef struct DSPContext {
void (*apply_window_int16)(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
+ /**
+ * Clip each element in an array of int32_t to a given minimum and maximum value.
+ * @param dst destination array
+ * constraints: 16-byte aligned
+ * @param src source array
+ * constraints: 16-byte aligned
+ * @param min minimum value; constraints: must be in the range [-(1 << 24), 1 << 24] (the SSE2 version clips via single-precision float)
+ * @param max maximum value; constraints: must be in the range [-(1 << 24), 1 << 24] (the SSE2 version clips via single-precision float)
+ * @param len number of elements in the array
+ * constraints: multiple of 16 greater than zero
+ */
+ void (*vector_clip_int32)(int32_t *dst, const int32_t *src, int32_t min,
+ int32_t max, unsigned int len);
+
/* rv30 functions */
qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 1cc6991..82981c2 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2416,6 +2416,13 @@ int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, i
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
+void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, int32_t min,
+ int32_t max, unsigned int len); /* yasm version: integer compare/mask clip (CLIPD_MMX) */
+void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, int32_t min,
+ int32_t max, unsigned int len); /* yasm version: clips after converting to float (CLIPD_SSE2) */
+void ff_vector_clip_int32_sse41 (int32_t *dst, const int32_t *src, int32_t min,
+ int32_t max, unsigned int len); /* yasm version: pminsd/pmaxsd (CLIPD_SSE41) */
+
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
int mm_flags = av_get_cpu_flags();
@@ -2556,6 +2563,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
+
+ c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif
if (mm_flags & AV_CPU_FLAG_MMX2) {
@@ -2829,6 +2838,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#if HAVE_YASM
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+ c->vector_clip_int32 = ff_vector_clip_int32_sse2;
if (avctx->flags & CODEC_FLAG_BITEXACT) {
c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
} else {
@@ -2854,6 +2864,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
#endif
}
+ if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
+#if HAVE_YASM
+ c->vector_clip_int32 = ff_vector_clip_int32_sse41;
+#endif
+ }
}
if (CONFIG_ENCODERS)
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 8b19cc1..60649c3 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1048,3 +1048,91 @@ emu_edge sse
%ifdef ARCH_X86_32
emu_edge mmx
%endif
+
+;-----------------------------------------------------------------------------
+; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
+; int32_t max, unsigned int len)
+;-----------------------------------------------------------------------------
+
+%macro PMINSD_MMX 3 ; dst, src, tmp: dst = per-dword signed min(dst, src); tmp is clobbered
+ mova %3, %2 ; tmp = src
+ pcmpgtd %3, %1 ; tmp = all-ones in every dword where src > dst
+ pxor %1, %2 ; dst = dst ^ src
+ pand %1, %3 ; keep dst ^ src only where dst is already the smaller value
+ pxor %1, %2 ; masked lanes get dst back, cleared lanes become src
+%endmacro
+
+%macro PMAXSD_MMX 3 ; dst, src, tmp: dst = per-dword signed max(dst, src); tmp is clobbered
+ mova %3, %1 ; tmp = dst
+ pcmpgtd %3, %2 ; tmp = all-ones in every dword where dst > src
+ pand %1, %3 ; dst keeps its value only where it is the larger one
+ pandn %3, %2 ; tmp = src in the lanes where the mask was clear
+ por %1, %3 ; merge the two selections
+%endmacro
+
+%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp (declared optional, but %4 is always used as scratch here)
+ PMINSD_MMX %1, %3, %4 ; upper bound first: dst = min(dst, max)
+ PMAXSD_MMX %1, %2, %4 ; then lower bound: dst = max(dst, min)
+%endmacro
+
+%macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused
+ cvtdq2ps %1, %1 ; int32 -> float; NOTE(review): inputs beyond +/-(1 << 24) are not exactly representable, confirm the min/max range used by callers
+ minps %1, %3 ; upper bound: dst = min(dst, max)
+ maxps %1, %2 ; lower bound: dst = max(dst, min)
+ cvtps2dq %1, %1 ; float -> int32
+%endmacro
+
+%macro CLIPD_SSE41 3-4 ; src/dst, min, max, unused
+ pminsd %1, %3 ; upper bound: dst = min(dst, max)
+ pmaxsd %1, %2 ; lower bound: dst = max(dst, min)
+%endmacro
+
+%macro SPLATD_MMX 1 ; reg: replicate its low dword into every dword of the register
+ punpckldq %1, %1 ; interleaving the register with itself copies the low dword into the high one
+%endmacro
+
+%macro SPLATD_SSE2 1 ; reg: replicate its low dword into every dword of the register
+ pshufd %1, %1, 0 ; shuffle selector 0 picks dword 0 for all four lanes
+%endmacro
+
+%macro VECTOR_CLIP_INT32 1 ; %1 = function name suffix (mmx, sse2 or sse41); SPLATD and CLIPD must be %defined before use
+cglobal vector_clip_int32_%1, 5,5,7, dst, src, min, max, len
+%ifidn %1, sse2
+ cvtsi2ss m4, minm ; sse2 variant: CLIPD_SSE2 compares as float, so load min/max as floats
+ cvtsi2ss m5, maxm
+%else
+ movd m4, minm ; integer variants: load min/max as int32
+ movd m5, maxm
+%endif
+ SPLATD m4 ; m4 = min in every lane
+ SPLATD m5 ; m5 = max in every lane
+.loop:
+ mova m0, [srcq ] ; aligned loads: four registers = 4*mmsize bytes per iteration
+ mova m1, [srcq+mmsize ]
+ mova m2, [srcq+mmsize*2]
+ mova m3, [srcq+mmsize*3]
+ CLIPD m0, m4, m5, m6 ; m6 is scratch; only CLIPD_MMX actually uses it
+ CLIPD m1, m4, m5, m6
+ CLIPD m2, m4, m5, m6
+ CLIPD m3, m4, m5, m6
+ mova [dstq ], m0 ; aligned stores of the clipped values
+ mova [dstq+mmsize ], m1
+ mova [dstq+mmsize*2], m2
+ mova [dstq+mmsize*3], m3
+ add srcq, mmsize*4
+ add dstq, mmsize*4
+ sub lend, mmsize ; 4*mmsize bytes of int32 == mmsize elements consumed
+ jg .loop ; signed test after the sub: loops while the remaining count is > 0
+ REP_RET
+%endmacro
+
+INIT_MMX
+%define SPLATD SPLATD_MMX
+%define CLIPD CLIPD_MMX
+VECTOR_CLIP_INT32 mmx ; integer compare/mask clip
+INIT_XMM
+%define SPLATD SPLATD_SSE2
+%define CLIPD CLIPD_SSE2
+VECTOR_CLIP_INT32 sse2 ; clip via float conversion
+%define CLIPD CLIPD_SSE41
+VECTOR_CLIP_INT32 sse41 ; pminsd/pmaxsd clip; SPLATD is still SPLATD_SSE2
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel