---
Doesn't do much, but it helps a little. Timings for scalarproduct_float,
lower is better:

            athlon64   sandybridge
c             1060        738
sse            303        215
sse3           298        217
avx              -        208
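
For reference, the plain C version the first row times is just a dot
product, something like this sketch (modeled on scalarproduct_float_c()
in dsputil.c):

/* Sketch of the scalar fallback: a straight dot product over len floats. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;
    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];
    return p;
}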
libavcodec/x86/dsputil_mmx.c | 12 ++++++++++--
libavcodec/x86/dsputil_yasm.asm | 37 ++++++++++++++++++++++++++++++-------
2 files changed, 40 insertions(+), 9 deletions(-)
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index be0ac2e..ed80ab6 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2413,7 +2413,9 @@ void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
-float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
+float ff_scalarproduct_float_sse_aligned (const float *v1, const float *v2, int order);
+float ff_scalarproduct_float_sse3_aligned(const float *v1, const float *v2, int order);
+float ff_scalarproduct_float_avx         (const float *v1, const float *v2, int order);
void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, int32_t min,
int32_t max, unsigned int len);
@@ -2867,7 +2869,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#endif
c->vector_clipf = vector_clipf_sse;
#if HAVE_YASM
- c->scalarproduct_float = ff_scalarproduct_float_sse;
+ c->scalarproduct_float = ff_scalarproduct_float_sse_aligned;
if (!high_bit_depth)
c->emulated_edge_mc = emulated_edge_mc_sse;
@@ -2894,6 +2896,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
#endif
}
+ if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE3) {
+#if HAVE_YASM
+ c->scalarproduct_float = ff_scalarproduct_float_sse3_aligned;
+#endif
+ }
if (mm_flags & AV_CPU_FLAG_SSSE3) {
#if HAVE_YASM
if (mm_flags & AV_CPU_FLAG_ATOM) {
@@ -2925,6 +2932,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx;
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx;
}
+ c->scalarproduct_float = ff_scalarproduct_float_avx;
}
#endif
}
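
[Reviewer note: with the init order above, the SSE pointer is installed
first and then overwritten by the SSE3 and AVX versions as the CPU flags
allow, so callers only ever go through the DSPContext pointer. A
hypothetical usage sketch, assuming avctx, v1, v2 and len are already set
up:]

/* Hypothetical caller: dsputil_init() picks the fastest variant at runtime. */
DSPContext dsp;
dsputil_init(&dsp, avctx);
/* len is assumed to be a multiple of 8 floats so the AVX loop stays in bounds */
float dot = dsp.scalarproduct_float(v1, v2, len);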
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 5244362..25eb19f 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -456,29 +456,52 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
ADD_HFYU_LEFT_LOOP 0
-; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
-cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
+;------------------------------------------------------------------------------
+; float ff_scalarproduct_float(const float *v1, const float *v2, int len)
+;------------------------------------------------------------------------------
+
+%macro SCALARPRODUCT_FLOAT 0
+cglobal scalarproduct_float, 3,3,3, v1, v2, offset
neg offsetq
shl offsetq, 2
sub v1q, offsetq
sub v2q, offsetq
xorps xmm0, xmm0
- .loop:
- movaps xmm1, [v1q+offsetq]
- mulps xmm1, [v2q+offsetq]
- addps xmm0, xmm1
- add offsetq, 16
+.loop:
+ movu m1, [v1q+offsetq]
+ mulps m1, m1, [v2q+offsetq]
+ addps m0, m0, m1
+ add offsetq, mmsize
js .loop
+%if cpuflag(avx)
+ vextractf128 xmm1, ymm0, 1
+ addps xmm0, xmm1
+%endif
+%if cpuflag(sse3)
+ haddps xmm0, xmm0
+ haddps xmm0, xmm0
+%else
movhlps xmm1, xmm0
addps xmm0, xmm1
movss xmm1, xmm0
shufps xmm0, xmm0, 1
addss xmm0, xmm1
+%endif
%ifndef ARCH_X86_64
movd r0m, xmm0
fld dword r0m
%endif
RET
+%endmacro
+
+INIT_XMM sse, aligned
+SCALARPRODUCT_FLOAT
+INIT_XMM sse3, aligned
+SCALARPRODUCT_FLOAT
+INIT_YMM avx
+SCALARPRODUCT_FLOAT
+
; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
; x86_reg start_y, x86_reg end_y, x86_reg block_h,
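
[Reviewer note: for those who prefer C, here is an untested intrinsics
model of the new loop and of the two reduction strategies; the function
name is made up, and the asm additionally uses a negative-offset loop
(pointers advanced past the end, offset counting up to zero) that this
model replaces with plain forward indexing:]

#include <immintrin.h>

/* Untested model of SCALARPRODUCT_FLOAT above; len assumed a multiple of 4
 * and v1/v2 16-byte aligned, matching the _aligned variants. */
static float scalarproduct_float_model(const float *v1, const float *v2, int len)
{
    __m128 acc = _mm_setzero_ps();
    int i;
    for (i = 0; i < len; i += 4)
        acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(v1 + i),
                                         _mm_load_ps(v2 + i)));
#ifdef __SSE3__
    /* SSE3 path: two horizontal adds collapse the four partial sums */
    acc = _mm_hadd_ps(acc, acc);
    acc = _mm_hadd_ps(acc, acc);
#else
    /* SSE path: same movhlps/shufps/addss sequence as the asm */
    acc = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
    acc = _mm_add_ss(acc, _mm_shuffle_ps(acc, acc, 1));
#endif
    return _mm_cvtss_f32(acc);
}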
--
1.7.1