Re: [FFmpeg-devel] [PATCHv3] lavu/x86/lls: add fma3 optimizations for update_lls
On Thu, Jan 14, 2016 at 7:39 PM, Ganesh Ajjanagadde wrote: > This improves accuracy (very slightly) and speed for processors having > fma3. > > Sample benchmark (fate flac-16-lpc-cholesky, Haswell): > old: > 5993610 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips > 5951528 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips > > new: > 5252410 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips > 5232869 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips > > Tested with FATE and --disable-fma3, also examined contents of > lavu/lls-test. > > Reviewed-by: James Almer > Reviewed-by: Henrik Gramner > Signed-off-by: Ganesh Ajjanagadde > --- > libavutil/x86/lls.asm| 59 > ++-- > libavutil/x86/lls_init.c | 4 > 2 files changed, 61 insertions(+), 2 deletions(-) > > diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm > index 769befb..317fba6 100644 > --- a/libavutil/x86/lls.asm > +++ b/libavutil/x86/lls.asm > @@ -125,8 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2 > .ret: > REP_RET > > -%if HAVE_AVX_EXTERNAL > -INIT_YMM avx > +%macro UPDATE_LLS 0 > cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 > %define covarq ctxq > mov countd, [ctxq + LLSModel.indep_count] > @@ -140,6 +139,18 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 > vbroadcastsd ymm6, [varq + iq*8 + 16] > vbroadcastsd ymm7, [varq + iq*8 + 24] > vextractf128 xmm3, ymm1, 1 > +%if cpuflag(fma3) > +mova ymm0, COVAR(iq ,0) > +mova xmm2, COVAR(iq+2,2) > +fmaddpd ymm0, ymm1, ymm4, ymm0 > +fmaddpd xmm2, xmm3, xmm6, xmm2 > +fmaddpd ymm1, ymm5, ymm1, COVAR(iq ,1) > +fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3) > +mova COVAR(iq ,0), ymm0 > +mova COVAR(iq ,1), ymm1 > +mova COVAR(iq+2,2), xmm2 > +mova COVAR(iq+2,3), xmm3 > +%else > vmulpd ymm0, ymm1, ymm4 > vmulpd ymm1, ymm1, ymm5 > vmulpd xmm2, xmm3, xmm6 > @@ -148,12 +159,26 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 > ADDPD_MEM COVAR(iq ,1), ymm1 > ADDPD_MEM COVAR(iq+2,2), xmm2 > ADDPD_MEM COVAR(iq+2,3), xmm3 > +%endif ; 
cpuflag(fma3) > lea jd, [iq + 4] > cmp jd, count2d > jg .skip4x4 > .loop4x4: > ; Compute all 16 pairwise products of a 4x4 block > mova ymm3, [varq + jq*8] > +%if cpuflag(fma3) > +mova ymm0, COVAR(jq, 0) > +mova ymm1, COVAR(jq, 1) > +mova ymm2, COVAR(jq, 2) > +fmaddpd ymm0, ymm3, ymm4, ymm0 > +fmaddpd ymm1, ymm3, ymm5, ymm1 > +fmaddpd ymm2, ymm3, ymm6, ymm2 > +fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3) > +mova COVAR(jq, 0), ymm0 > +mova COVAR(jq, 1), ymm1 > +mova COVAR(jq, 2), ymm2 > +mova COVAR(jq, 3), ymm3 > +%else > vmulpd ymm0, ymm3, ymm4 > vmulpd ymm1, ymm3, ymm5 > vmulpd ymm2, ymm3, ymm6 > @@ -162,6 +187,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 > ADDPD_MEM COVAR(jq,1), ymm1 > ADDPD_MEM COVAR(jq,2), ymm2 > ADDPD_MEM COVAR(jq,3), ymm3 > +%endif ; cpuflag(fma3) > add jd, 4 > cmp jd, count2d > jle .loop4x4 > @@ -169,6 +195,19 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 > cmp jd, countd > jg .skip2x4 > mova xmm3, [varq + jq*8] > +%if cpuflag(fma3) > +mova xmm0, COVAR(jq, 0) > +mova xmm1, COVAR(jq, 1) > +mova xmm2, COVAR(jq, 2) > +fmaddpd xmm0, xmm3, xmm4, xmm0 > +fmaddpd xmm1, xmm3, xmm5, xmm1 > +fmaddpd xmm2, xmm3, xmm6, xmm2 > +fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3) > +mova COVAR(jq, 0), xmm0 > +mova COVAR(jq, 1), xmm1 > +mova COVAR(jq, 2), xmm2 > +mova COVAR(jq, 3), xmm3 > +%else > vmulpd xmm0, xmm3, xmm4 > vmulpd xmm1, xmm3, xmm5 > vmulpd xmm2, xmm3, xmm6 > @@ -177,6 +216,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 > ADDPD_MEM COVAR(jq,1), xmm1 > ADDPD_MEM COVAR(jq,2), xmm2 > ADDPD_MEM COVAR(jq,3), xmm3 > +%endif ; cpuflag(fma3) > .skip2x4: > add id, 4 > add covarq, 4*COVAR_STRIDE > @@ -187,14 +227,29 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 > mov jd, id > .loop2x1: > vmovddup xmm0, [varq + iq*8] > +%if cpuflag(fma3) > +mova xmm1, [varq + jq*8] > +fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0) > +mova COVAR(jq,0), xmm0 > +%else > vmulpd xmm0, [varq + jq*8] > ADDPD_MEM COVAR(jq,0), xmm0 > +%endif ; 
cpuflag(fma3) > inc id > add covarq, COVAR_STRIDE > cmp id, countd > jle .loop2x1 > .ret: > REP_RET > +%endmacro ; UPDATE_LLS > + > +%if HAVE_AVX_EXTERNAL > +INIT_YMM avx > +UPDATE_LLS > +%endif > +%if HAVE_FMA3_EXTERNAL > +INIT_YMM fma3 > +UPDATE_LLS > %endif > > INIT_XMM sse2 > diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c > index 81f141c..9f0d862 100644 > --- a/libavutil/x86/lls_init.c > +++ b/libavutil/x86/lls_init.c > @@ -25,6 +25,7 @@ > >
[FFmpeg-devel] [PATCHv3] lavu/x86/lls: add fma3 optimizations for update_lls
This improves accuracy (very slightly) and speed for processors having fma3. Sample benchmark (fate flac-16-lpc-cholesky, Haswell): old: 5993610 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips 5951528 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips new: 5252410 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips 5232869 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips Tested with FATE and --disable-fma3, also examined contents of lavu/lls-test. Reviewed-by: James Almer Reviewed-by: Henrik Gramner Signed-off-by: Ganesh Ajjanagadde --- libavutil/x86/lls.asm | 59 ++-- libavutil/x86/lls_init.c | 4 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm index 769befb..317fba6 100644 --- a/libavutil/x86/lls.asm +++ b/libavutil/x86/lls.asm @@ -125,8 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2 .ret: REP_RET -%if HAVE_AVX_EXTERNAL -INIT_YMM avx +%macro UPDATE_LLS 0 cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 %define covarq ctxq mov countd, [ctxq + LLSModel.indep_count] @@ -140,6 +139,18 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 vbroadcastsd ymm6, [varq + iq*8 + 16] vbroadcastsd ymm7, [varq + iq*8 + 24] vextractf128 xmm3, ymm1, 1 +%if cpuflag(fma3) +mova ymm0, COVAR(iq ,0) +mova xmm2, COVAR(iq+2,2) +fmaddpd ymm0, ymm1, ymm4, ymm0 +fmaddpd xmm2, xmm3, xmm6, xmm2 +fmaddpd ymm1, ymm5, ymm1, COVAR(iq ,1) +fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3) +mova COVAR(iq ,0), ymm0 +mova COVAR(iq ,1), ymm1 +mova COVAR(iq+2,2), xmm2 +mova COVAR(iq+2,3), xmm3 +%else vmulpd ymm0, ymm1, ymm4 vmulpd ymm1, ymm1, ymm5 vmulpd xmm2, xmm3, xmm6 @@ -148,12 +159,26 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 ADDPD_MEM COVAR(iq ,1), ymm1 ADDPD_MEM COVAR(iq+2,2), xmm2 ADDPD_MEM COVAR(iq+2,3), xmm3 +%endif ; cpuflag(fma3) lea jd, [iq + 4] cmp jd, count2d jg .skip4x4 .loop4x4: ; Compute all 16 pairwise products of a 4x4 block mova ymm3, [varq + jq*8] +%if cpuflag(fma3) +mova ymm0, COVAR(jq, 
0) +mova ymm1, COVAR(jq, 1) +mova ymm2, COVAR(jq, 2) +fmaddpd ymm0, ymm3, ymm4, ymm0 +fmaddpd ymm1, ymm3, ymm5, ymm1 +fmaddpd ymm2, ymm3, ymm6, ymm2 +fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3) +mova COVAR(jq, 0), ymm0 +mova COVAR(jq, 1), ymm1 +mova COVAR(jq, 2), ymm2 +mova COVAR(jq, 3), ymm3 +%else vmulpd ymm0, ymm3, ymm4 vmulpd ymm1, ymm3, ymm5 vmulpd ymm2, ymm3, ymm6 @@ -162,6 +187,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 ADDPD_MEM COVAR(jq,1), ymm1 ADDPD_MEM COVAR(jq,2), ymm2 ADDPD_MEM COVAR(jq,3), ymm3 +%endif ; cpuflag(fma3) add jd, 4 cmp jd, count2d jle .loop4x4 @@ -169,6 +195,19 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 cmp jd, countd jg .skip2x4 mova xmm3, [varq + jq*8] +%if cpuflag(fma3) +mova xmm0, COVAR(jq, 0) +mova xmm1, COVAR(jq, 1) +mova xmm2, COVAR(jq, 2) +fmaddpd xmm0, xmm3, xmm4, xmm0 +fmaddpd xmm1, xmm3, xmm5, xmm1 +fmaddpd xmm2, xmm3, xmm6, xmm2 +fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3) +mova COVAR(jq, 0), xmm0 +mova COVAR(jq, 1), xmm1 +mova COVAR(jq, 2), xmm2 +mova COVAR(jq, 3), xmm3 +%else vmulpd xmm0, xmm3, xmm4 vmulpd xmm1, xmm3, xmm5 vmulpd xmm2, xmm3, xmm6 @@ -177,6 +216,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 ADDPD_MEM COVAR(jq,1), xmm1 ADDPD_MEM COVAR(jq,2), xmm2 ADDPD_MEM COVAR(jq,3), xmm3 +%endif ; cpuflag(fma3) .skip2x4: add id, 4 add covarq, 4*COVAR_STRIDE @@ -187,14 +227,29 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 mov jd, id .loop2x1: vmovddup xmm0, [varq + iq*8] +%if cpuflag(fma3) +mova xmm1, [varq + jq*8] +fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0) +mova COVAR(jq,0), xmm0 +%else vmulpd xmm0, [varq + jq*8] ADDPD_MEM COVAR(jq,0), xmm0 +%endif ; cpuflag(fma3) inc id add covarq, COVAR_STRIDE cmp id, countd jle .loop2x1 .ret: REP_RET +%endmacro ; UPDATE_LLS + +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +UPDATE_LLS +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +UPDATE_LLS %endif INIT_XMM sse2 diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c index 81f141c..9f0d862 
100644 --- a/libavutil/x86/lls_init.c +++ b/libavutil/x86/lls_init.c @@ -25,6 +25,7 @@ void ff_update_lls_sse2(LLSModel *m, const double *var); void ff_update_lls_avx(LLSModel *m, const double *var); +void ff_update_lls_fma3(LLSModel *m, const double *var); double ff_evaluate_lls_sse2(LLSModel *m, const double *var, int order); av_cold void ff_init_lls_x86(LLSModel *m) @@ -38,4 +39,7 @@ av_cold void ff_init_lls_x86(LLSModel *m) if (EXTE