---
vzeroupper only needs to be called once, at the end of the function after the loop.
libavcodec/fmtconvert.h | 4 ++--
libavcodec/x86/fmtconvert.asm | 16 ++++++++++++++++
libavcodec/x86/fmtconvert_mmx.c | 15 +++++++++++++++
3 files changed, 33 insertions(+), 2 deletions(-)
diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
index 1b53401..c488180 100644
--- a/libavcodec/fmtconvert.h
+++ b/libavcodec/fmtconvert.h
@@ -75,9 +75,9 @@ typedef struct FmtConvertContext {
* @param dst destination array of interleaved float.
* constraints: 16-byte aligned
* @param src source array of float arrays, one for each channel.
- * constraints: 16-byte aligned
+ * constraints: 32-byte aligned
* @param len number of elements to convert.
- * constraints: multiple of 8
+ * constraints: multiple of 16
* @param channels number of channels
*/
void (*float_interleave)(float *dst, const float **src, unsigned int len,
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index f69cd7c..11cd138 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -342,10 +342,21 @@ cglobal float_interleave2, 3,4,5, dst, src, len, src1
PUNPCKHDQ m1, m3, m4
PUNPCKLDQ m3, m3, m4
+%if cpuflag(avx)
+ vextractf128 [dstq ], m0, 0
+ vextractf128 [dstq+ 16], m2, 0
+ vextractf128 [dstq+ 32], m0, 1
+ vextractf128 [dstq+ 48], m2, 1
+ vextractf128 [dstq+ 64], m3, 0
+ vextractf128 [dstq+ 80], m1, 0
+ vextractf128 [dstq+ 96], m3, 1
+ vextractf128 [dstq+112], m1, 1
+%else
mova [dstq ], m0
mova [dstq+1*mmsize], m2
mova [dstq+2*mmsize], m3
mova [dstq+3*mmsize], m1
+%endif
add srcq, mmsize*2
add dstq, mmsize*4
@@ -354,6 +365,9 @@ cglobal float_interleave2, 3,4,5, dst, src, len, src1
%if mmsize == 8
emms
%endif
+%if mmsize == 32
+ vzeroupper
+%endif
REP_RET
%endmacro
@@ -365,3 +379,5 @@ INIT_XMM sse
%define PUNPCKLDQ unpcklps
%define PUNPCKHDQ unpckhps
FLOAT_INTERLEAVE2
+INIT_YMM avx
+FLOAT_INTERLEAVE2
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index 78cca9c..26c86d1 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -81,6 +81,7 @@ static void float_to_int16_interleave_3dnow2(int16_t *dst, const float **src, lo
void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
+void ff_float_interleave2_avx(float *dst, const float **src, unsigned int len);
void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);
@@ -106,6 +107,17 @@ static void float_interleave_sse(float *dst, const float **src,
else
ff_float_interleave_c(dst, src, len, channels);
}
+
+static void float_interleave_avx(float *dst, const float **src,
+ unsigned int len, int channels)
+{
+ if (channels == 2) {
+ ff_float_interleave2_avx(dst, src, len);
+ } else if (channels == 6)
+ ff_float_interleave6_sse(dst, src, len);
+ else
+ ff_float_interleave_c(dst, src, len, channels);
+}
#endif
void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
@@ -138,6 +150,9 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
c->float_to_int16 = ff_float_to_int16_sse2;
c->float_to_int16_interleave = float_to_int16_interleave_sse2;
}
+ if (HAVE_AVX && mm_flags & AV_CPU_FLAG_AVX) {
+ c->float_interleave = float_interleave_avx;
+ }
}
#endif
}
--
1.7.1
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel