ffmpeg | branch: master | James Almer <jamr...@gmail.com> | Mon Jul 20 00:50:52 2015 -0300 | [e3851169eedce6c90534ca41edf3e05e2576453e] | committer: James Almer
x86/vf_ssim: add ff_ssim_4x4_line_xop ~20% faster than ssse3. Also enabled for x86_32 Reviewed-by: Ronald S. Bultje <rsbul...@gmail.com> Signed-off-by: James Almer <jamr...@gmail.com> > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=e3851169eedce6c90534ca41edf3e05e2576453e --- libavfilter/x86/vf_ssim.asm | 62 ++++++++++++++++++++++++++++++++++++++-- libavfilter/x86/vf_ssim_init.c | 5 ++++ 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/libavfilter/x86/vf_ssim.asm b/libavfilter/x86/vf_ssim.asm index 6661987..3293e66 100644 --- a/libavfilter/x86/vf_ssim.asm +++ b/libavfilter/x86/vf_ssim.asm @@ -30,16 +30,50 @@ ssim_c2: times 4 dd 235963 ;(.03*.03*255*255*64*63 + .5) SECTION .text +%macro SSIM_4X4_LINE 1 %if ARCH_X86_64 - -INIT_XMM ssse3 -cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_stride3, ref_stride3 +cglobal ssim_4x4_line, 6, 8, %1, buf, buf_stride, ref, ref_stride, sums, w, buf_stride3, ref_stride3 +%else +cglobal ssim_4x4_line, 5, 7, %1, buf, buf_stride, ref, ref_stride, sums, buf_stride3, ref_stride3 +%define wd r5mp +%endif lea ref_stride3q, [ref_strideq*3] lea buf_stride3q, [buf_strideq*3] +%if notcpuflag(xop) pxor m7, m7 mova m15, [pw_1] +%endif .loop: +%if cpuflag(xop) + pmovzxbw m0, [bufq+buf_strideq*0] + pmovzxbw m1, [refq+ref_strideq*0] + pmaddwd m4, m0, m0 + pmaddwd m6, m0, m1 + pmovzxbw m2, [bufq+buf_strideq*1] + vpmadcswd m4, m1, m1, m4 + pmovzxbw m3, [refq+ref_strideq*1] + paddw m0, m2 + vpmadcswd m4, m2, m2, m4 + vpmadcswd m6, m2, m3, m6 + paddw m1, m3 + vpmadcswd m4, m3, m3, m4 + + pmovzxbw m2, [bufq+buf_strideq*2] + pmovzxbw m3, [refq+ref_strideq*2] + vpmadcswd m4, m2, m2, m4 + vpmadcswd m6, m2, m3, m6 + pmovzxbw m5, [bufq+buf_stride3q] + pmovzxbw m7, [refq+ref_stride3q] + vpmadcswd m4, m3, m3, m4 + vpmadcswd m6, m5, m7, m6 + paddw m0, m2 + paddw m1, m3 + vpmadcswd m4, m5, m5, m4 + paddw m0, m5 + paddw m1, m7 + vpmadcswd m4, m7, m7, m4 +%else movh m0, [bufq+buf_strideq*0] ; a1 movh m1, 
[refq+ref_strideq*0] ; b1 movh m2, [bufq+buf_strideq*1] ; a2 @@ -85,12 +119,25 @@ cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_ paddd m4, m9 paddd m6, m14 paddd m4, m12 +%endif ; m0 = [word] s1 a,a,a,a,b,b,b,b ; m1 = [word] s2 a,a,a,a,b,b,b,b ; m4 = [dword] ss a,a,b,b ; m6 = [dword] s12 a,a,b,b +%if cpuflag(xop) + vphaddwq m0, m0 ; [dword] s1 a, 0, b, 0 + vphaddwq m1, m1 ; [dword] s2 a, 0, b, 0 + vphadddq m4, m4 ; [dword] ss a, 0, b, 0 + vphadddq m6, m6 ; [dword] s12 a, 0, b, 0 + punpckhdq m2, m0, m1 ; [dword] s1 b, s2 b, 0, 0 + punpckldq m0, m1 ; [dword] s1 a, s2 a, 0, 0 + punpckhdq m3, m4, m6 ; [dword] ss b, s12 b, 0, 0 + punpckldq m4, m6 ; [dword] ss a, s12 a, 0, 0 + punpcklqdq m1, m2, m3 ; [dword] b s1, s2, ss, s12 + punpcklqdq m0, m4 ; [dword] a s1, s2, ss, s12 +%else pmaddwd m0, m15 ; [dword] s1 a,a,b,b pmaddwd m1, m15 ; [dword] s2 a,a,b,b phaddd m0, m4 ; [dword] s1 a, b, ss a, b @@ -99,6 +146,7 @@ cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_ punpckldq m0, m1 ; [dword] s1 a, s2 a, s1 b, s2 b punpckhqdq m1, m0, m2 ; [dword] b s1, s2, ss, s12 punpcklqdq m0, m2 ; [dword] a s1, s2, ss, s12 +%endif mova [sumsq+ 0], m0 mova [sumsq+mmsize], m1 @@ -109,7 +157,15 @@ cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_ sub wd, mmsize/8 jg .loop RET +%endmacro +%if ARCH_X86_64 +INIT_XMM ssse3 +SSIM_4X4_LINE 16 +%endif +%if HAVE_XOP_EXTERNAL +INIT_XMM xop +SSIM_4X4_LINE 8 %endif INIT_XMM sse4 diff --git a/libavfilter/x86/vf_ssim_init.c b/libavfilter/x86/vf_ssim_init.c index 9514b25..599c928 100644 --- a/libavfilter/x86/vf_ssim_init.c +++ b/libavfilter/x86/vf_ssim_init.c @@ -25,6 +25,9 @@ void ff_ssim_4x4_line_ssse3(const uint8_t *buf, ptrdiff_t buf_stride, const uint8_t *ref, ptrdiff_t ref_stride, int (*sums)[4], int w); +void ff_ssim_4x4_line_xop (const uint8_t *buf, ptrdiff_t buf_stride, + const uint8_t *ref, ptrdiff_t ref_stride, + int (*sums)[4], int w); float 
ff_ssim_end_line_sse4(const int (*sum0)[4], const int (*sum1)[4], int w); void ff_ssim_init_x86(SSIMDSPContext *dsp) @@ -35,4 +38,6 @@ void ff_ssim_init_x86(SSIMDSPContext *dsp) dsp->ssim_4x4_line = ff_ssim_4x4_line_ssse3; if (EXTERNAL_SSE4(cpu_flags)) dsp->ssim_end_line = ff_ssim_end_line_sse4; + if (EXTERNAL_XOP(cpu_flags)) + dsp->ssim_4x4_line = ff_ssim_4x4_line_xop; } _______________________________________________ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog