Re: [FFmpeg-devel] [PATCHv3] lavu/x86/lls: add fma3 optimizations for update_lls

2016-01-15 Thread Ganesh Ajjanagadde
On Thu, Jan 14, 2016 at 7:39 PM, Ganesh Ajjanagadde
 wrote:
> This improves accuracy (very slightly) and speed for processors having
> fma3.
>
> Sample benchmark (fate flac-16-lpc-cholesky, Haswell):
> old:
> 5993610 decicycles in ff_lpc_calc_coefs,  64 runs,  0 skips
> 5951528 decicycles in ff_lpc_calc_coefs, 128 runs,  0 skips
>
> new:
> 5252410 decicycles in ff_lpc_calc_coefs,  64 runs,  0 skips
> 5232869 decicycles in ff_lpc_calc_coefs, 128 runs,  0 skips
>
> Tested with FATE and --disable-fma3, also examined contents of
> lavu/lls-test.
>
> Reviewed-by: James Almer 
> Reviewed-by: Henrik Gramner 
> Signed-off-by: Ganesh Ajjanagadde 
> ---
>  libavutil/x86/lls.asm| 59 ++--
>  libavutil/x86/lls_init.c |  4 
>  2 files changed, 61 insertions(+), 2 deletions(-)
>
> diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
> index 769befb..317fba6 100644
> --- a/libavutil/x86/lls.asm
> +++ b/libavutil/x86/lls.asm
> @@ -125,8 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
>  .ret:
>  REP_RET
>
> -%if HAVE_AVX_EXTERNAL
> -INIT_YMM avx
> +%macro UPDATE_LLS 0
>  cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>  %define covarq ctxq
>  mov  countd, [ctxq + LLSModel.indep_count]
> @@ -140,6 +139,18 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>  vbroadcastsd ymm6, [varq + iq*8 + 16]
>  vbroadcastsd ymm7, [varq + iq*8 + 24]
>  vextractf128 xmm3, ymm1, 1
> +%if cpuflag(fma3)
> +mova ymm0, COVAR(iq  ,0)
> +mova xmm2, COVAR(iq+2,2)
> +fmaddpd ymm0, ymm1, ymm4, ymm0
> +fmaddpd xmm2, xmm3, xmm6, xmm2
> +fmaddpd ymm1, ymm5, ymm1, COVAR(iq  ,1)
> +fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3)
> +mova COVAR(iq  ,0), ymm0
> +mova COVAR(iq  ,1), ymm1
> +mova COVAR(iq+2,2), xmm2
> +mova COVAR(iq+2,3), xmm3
> +%else
>  vmulpd  ymm0, ymm1, ymm4
>  vmulpd  ymm1, ymm1, ymm5
>  vmulpd  xmm2, xmm3, xmm6
> @@ -148,12 +159,26 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>  ADDPD_MEM COVAR(iq  ,1), ymm1
>  ADDPD_MEM COVAR(iq+2,2), xmm2
>  ADDPD_MEM COVAR(iq+2,3), xmm3
> +%endif ; cpuflag(fma3)
>  lea jd, [iq + 4]
>  cmp jd, count2d
>  jg .skip4x4
>  .loop4x4:
>  ; Compute all 16 pairwise products of a 4x4 block
>  movaymm3, [varq + jq*8]
> +%if cpuflag(fma3)
> +mova ymm0, COVAR(jq, 0)
> +mova ymm1, COVAR(jq, 1)
> +mova ymm2, COVAR(jq, 2)
> +fmaddpd ymm0, ymm3, ymm4, ymm0
> +fmaddpd ymm1, ymm3, ymm5, ymm1
> +fmaddpd ymm2, ymm3, ymm6, ymm2
> +fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3)
> +mova COVAR(jq, 0), ymm0
> +mova COVAR(jq, 1), ymm1
> +mova COVAR(jq, 2), ymm2
> +mova COVAR(jq, 3), ymm3
> +%else
>  vmulpd  ymm0, ymm3, ymm4
>  vmulpd  ymm1, ymm3, ymm5
>  vmulpd  ymm2, ymm3, ymm6
> @@ -162,6 +187,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>  ADDPD_MEM COVAR(jq,1), ymm1
>  ADDPD_MEM COVAR(jq,2), ymm2
>  ADDPD_MEM COVAR(jq,3), ymm3
> +%endif ; cpuflag(fma3)
>  add jd, 4
>  cmp jd, count2d
>  jle .loop4x4
> @@ -169,6 +195,19 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>  cmp jd, countd
>  jg .skip2x4
>  movaxmm3, [varq + jq*8]
> +%if cpuflag(fma3)
> +mova xmm0, COVAR(jq, 0)
> +mova xmm1, COVAR(jq, 1)
> +mova xmm2, COVAR(jq, 2)
> +fmaddpd xmm0, xmm3, xmm4, xmm0
> +fmaddpd xmm1, xmm3, xmm5, xmm1
> +fmaddpd xmm2, xmm3, xmm6, xmm2
> +fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3)
> +mova COVAR(jq, 0), xmm0
> +mova COVAR(jq, 1), xmm1
> +mova COVAR(jq, 2), xmm2
> +mova COVAR(jq, 3), xmm3
> +%else
>  vmulpd  xmm0, xmm3, xmm4
>  vmulpd  xmm1, xmm3, xmm5
>  vmulpd  xmm2, xmm3, xmm6
> @@ -177,6 +216,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>  ADDPD_MEM COVAR(jq,1), xmm1
>  ADDPD_MEM COVAR(jq,2), xmm2
>  ADDPD_MEM COVAR(jq,3), xmm3
> +%endif ; cpuflag(fma3)
>  .skip2x4:
>  add id, 4
>  add covarq, 4*COVAR_STRIDE
> @@ -187,14 +227,29 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
>  mov jd, id
>  .loop2x1:
>  vmovddup xmm0, [varq + iq*8]
> +%if cpuflag(fma3)
> +mova xmm1, [varq + jq*8]
> +fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0)
> +mova COVAR(jq,0), xmm0
> +%else
>  vmulpd   xmm0, [varq + jq*8]
>  ADDPD_MEM COVAR(jq,0), xmm0
> +%endif ; cpuflag(fma3)
>  inc id
>  add covarq, COVAR_STRIDE
>  cmp id, countd
>  jle .loop2x1
>  .ret:
>  REP_RET
> +%endmacro ; UPDATE_LLS
> +
> +%if HAVE_AVX_EXTERNAL
> +INIT_YMM avx
> +UPDATE_LLS
> +%endif
> +%if HAVE_FMA3_EXTERNAL
> +INIT_YMM fma3
> +UPDATE_LLS
>  %endif
>
>  INIT_XMM sse2
> diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c
> index 81f141c..9f0d862 100644
> --- a/libavutil/x86/lls_init.c
> +++ b/libavutil/x86/lls_init.c
> @@ -25,6 +25,7 @@
>
> 

[FFmpeg-devel] [PATCHv3] lavu/x86/lls: add fma3 optimizations for update_lls

2016-01-14 Thread Ganesh Ajjanagadde
This improves accuracy (very slightly) and speed for processors having
fma3.

Sample benchmark (fate flac-16-lpc-cholesky, Haswell):
old:
5993610 decicycles in ff_lpc_calc_coefs,  64 runs,  0 skips
5951528 decicycles in ff_lpc_calc_coefs, 128 runs,  0 skips

new:
5252410 decicycles in ff_lpc_calc_coefs,  64 runs,  0 skips
5232869 decicycles in ff_lpc_calc_coefs, 128 runs,  0 skips

Tested with FATE and --disable-fma3, also examined contents of
lavu/lls-test.

Reviewed-by: James Almer 
Reviewed-by: Henrik Gramner 
Signed-off-by: Ganesh Ajjanagadde 
---
 libavutil/x86/lls.asm| 59 ++--
 libavutil/x86/lls_init.c |  4 
 2 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
index 769befb..317fba6 100644
--- a/libavutil/x86/lls.asm
+++ b/libavutil/x86/lls.asm
@@ -125,8 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
 .ret:
 REP_RET
 
-%if HAVE_AVX_EXTERNAL
-INIT_YMM avx
+%macro UPDATE_LLS 0
 cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
 %define covarq ctxq
 mov  countd, [ctxq + LLSModel.indep_count]
@@ -140,6 +139,18 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
 vbroadcastsd ymm6, [varq + iq*8 + 16]
 vbroadcastsd ymm7, [varq + iq*8 + 24]
 vextractf128 xmm3, ymm1, 1
+%if cpuflag(fma3)
+mova ymm0, COVAR(iq  ,0)
+mova xmm2, COVAR(iq+2,2)
+fmaddpd ymm0, ymm1, ymm4, ymm0
+fmaddpd xmm2, xmm3, xmm6, xmm2
+fmaddpd ymm1, ymm5, ymm1, COVAR(iq  ,1)
+fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3)
+mova COVAR(iq  ,0), ymm0
+mova COVAR(iq  ,1), ymm1
+mova COVAR(iq+2,2), xmm2
+mova COVAR(iq+2,3), xmm3
+%else
 vmulpd  ymm0, ymm1, ymm4
 vmulpd  ymm1, ymm1, ymm5
 vmulpd  xmm2, xmm3, xmm6
@@ -148,12 +159,26 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
 ADDPD_MEM COVAR(iq  ,1), ymm1
 ADDPD_MEM COVAR(iq+2,2), xmm2
 ADDPD_MEM COVAR(iq+2,3), xmm3
+%endif ; cpuflag(fma3)
 lea jd, [iq + 4]
 cmp jd, count2d
 jg .skip4x4
 .loop4x4:
 ; Compute all 16 pairwise products of a 4x4 block
 movaymm3, [varq + jq*8]
+%if cpuflag(fma3)
+mova ymm0, COVAR(jq, 0)
+mova ymm1, COVAR(jq, 1)
+mova ymm2, COVAR(jq, 2)
+fmaddpd ymm0, ymm3, ymm4, ymm0
+fmaddpd ymm1, ymm3, ymm5, ymm1
+fmaddpd ymm2, ymm3, ymm6, ymm2
+fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3)
+mova COVAR(jq, 0), ymm0
+mova COVAR(jq, 1), ymm1
+mova COVAR(jq, 2), ymm2
+mova COVAR(jq, 3), ymm3
+%else
 vmulpd  ymm0, ymm3, ymm4
 vmulpd  ymm1, ymm3, ymm5
 vmulpd  ymm2, ymm3, ymm6
@@ -162,6 +187,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
 ADDPD_MEM COVAR(jq,1), ymm1
 ADDPD_MEM COVAR(jq,2), ymm2
 ADDPD_MEM COVAR(jq,3), ymm3
+%endif ; cpuflag(fma3)
 add jd, 4
 cmp jd, count2d
 jle .loop4x4
@@ -169,6 +195,19 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
 cmp jd, countd
 jg .skip2x4
 movaxmm3, [varq + jq*8]
+%if cpuflag(fma3)
+mova xmm0, COVAR(jq, 0)
+mova xmm1, COVAR(jq, 1)
+mova xmm2, COVAR(jq, 2)
+fmaddpd xmm0, xmm3, xmm4, xmm0
+fmaddpd xmm1, xmm3, xmm5, xmm1
+fmaddpd xmm2, xmm3, xmm6, xmm2
+fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3)
+mova COVAR(jq, 0), xmm0
+mova COVAR(jq, 1), xmm1
+mova COVAR(jq, 2), xmm2
+mova COVAR(jq, 3), xmm3
+%else
 vmulpd  xmm0, xmm3, xmm4
 vmulpd  xmm1, xmm3, xmm5
 vmulpd  xmm2, xmm3, xmm6
@@ -177,6 +216,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
 ADDPD_MEM COVAR(jq,1), xmm1
 ADDPD_MEM COVAR(jq,2), xmm2
 ADDPD_MEM COVAR(jq,3), xmm3
+%endif ; cpuflag(fma3)
 .skip2x4:
 add id, 4
 add covarq, 4*COVAR_STRIDE
@@ -187,14 +227,29 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
 mov jd, id
 .loop2x1:
 vmovddup xmm0, [varq + iq*8]
+%if cpuflag(fma3)
+mova xmm1, [varq + jq*8]
+fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0)
+mova COVAR(jq,0), xmm0
+%else
 vmulpd   xmm0, [varq + jq*8]
 ADDPD_MEM COVAR(jq,0), xmm0
+%endif ; cpuflag(fma3)
 inc id
 add covarq, COVAR_STRIDE
 cmp id, countd
 jle .loop2x1
 .ret:
 REP_RET
+%endmacro ; UPDATE_LLS
+
+%if HAVE_AVX_EXTERNAL
+INIT_YMM avx
+UPDATE_LLS
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+UPDATE_LLS
 %endif
 
 INIT_XMM sse2
diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c
index 81f141c..9f0d862 100644
--- a/libavutil/x86/lls_init.c
+++ b/libavutil/x86/lls_init.c
@@ -25,6 +25,7 @@
 
 void ff_update_lls_sse2(LLSModel *m, const double *var);
 void ff_update_lls_avx(LLSModel *m, const double *var);
+void ff_update_lls_fma3(LLSModel *m, const double *var);
 double ff_evaluate_lls_sse2(LLSModel *m, const double *var, int order);
 
 av_cold void ff_init_lls_x86(LLSModel *m)
@@ -38,4 +39,7 @@ av_cold void ff_init_lls_x86(LLSModel *m)
 if (EXTE