Re: [libav-devel] [PATCH 1/2] hevc: x86: Add add_residual optimizations

2016-10-21 Thread Diego Biurrun
On Fri, Oct 21, 2016 at 09:41:28AM +0200, Alexandra Hájková wrote:
> --- /dev/null
> +++ b/libavcodec/x86/hevc_add_res.asm
> @@ -0,0 +1,371 @@
> +%macro ADD_RES_MMX_4_8 0
> +mova  m0, [r1]
> +mova  m2, [r1+8]
> +pxor  m1, m1
> +pxor  m3, m3
> +psubw m1, m0
> +psubw m3, m2
> +packuswb  m0, m2
> +packuswb  m1, m3
> +
> +movd  m2, [r0]
> +movd  m3, [r0+r2]
> +punpckldq m2, m3
> +paddusb   m0, m2
> +psubusbm0, m1

Alignment is off.

> +INIT_MMX mmxext
> +; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t 
> stride)
> +cglobal hevc_add_residual_4_8, 3, 3, 6
> +ADD_RES_MMX_4_8
> +add   r1, 16
> +lea   r0, [r0+r2*2]
> +ADD_RES_MMX_4_8
> +RET
> +
> +%macro ADD_RES_SSE_16_32_8 3
> +mova xm2, [r1+%1]
> +mova xm6, [r1+%1+16]
> +%if cpuflag(avx2)
> +vinserti128   m2, m2, [r1+%1+32], 1
> +vinserti128   m6, m6, [r1+%1+48], 1
> +%endif
> +psubw m1, m0, m2
> +psubw m5, m0, m6

Here you kept the 3-arg variant of the %if-%else ..

> +%if cpuflag(avx2)
> +vinserti128   m4, m4, [r1+%1+96 ], 1
> +vinserti128   m6, m6, [r1+%1+112], 1
> +%endif
> +mova  m3, m0
> +mova  m5, m0
> +psubw m3, m4
> +psubw m5, m6

.. and here you did not. Why?

Diego
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 1/2] hevc: x86: Add add_residual optimizations

2016-10-21 Thread Alexandra Hájková
From: Pierre Edouard Lepere 

Initially written by Pierre Edouard Lepere 
,
extended by James Almer .

Signed-off-by: Alexandra Hájková 
Signed-off-by: Diego Biurrun 
---
Applied review comments from Henrik and Diego (more consistent naming).

 libavcodec/x86/Makefile |   7 +-
 libavcodec/x86/hevc_add_res.asm | 371 
 libavcodec/x86/hevcdsp_init.c   |  42 +
 3 files changed, 417 insertions(+), 3 deletions(-)
 create mode 100644 libavcodec/x86/hevc_add_res.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index a38535b..094c1fa 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -115,9 +115,10 @@ YASM-OBJS-$(CONFIG_AAC_DECODER)+= x86/sbrdsp.o
 YASM-OBJS-$(CONFIG_APE_DECODER)+= x86/apedsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)+= x86/dcadsp.o
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)  += x86/dnxhdenc.o
-YASM-OBJS-$(CONFIG_HEVC_DECODER)   += x86/hevc_deblock.o\
-  x86/hevc_mc.o \
-  x86/hevc_idct.o
+YASM-OBJS-$(CONFIG_HEVC_DECODER)   += x86/hevc_add_res.o\
+  x86/hevc_deblock.o\
+  x86/hevc_idct.o   \
+  x86/hevc_mc.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)+= x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)   += x86/rv40dsp.o
diff --git a/libavcodec/x86/hevc_add_res.asm b/libavcodec/x86/hevc_add_res.asm
new file mode 100644
index 000..a1740b5
--- /dev/null
+++ b/libavcodec/x86/hevc_add_res.asm
@@ -0,0 +1,371 @@
+; *
+; * Provide SIMD optimizations for add_residual functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; *
+; * This file is part of Libav.
+; *
+; * Libav is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * Libav is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with Libav; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; 
**
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+max_pixels_10:  times 16  dw ((1 << 10)-1)
+
+SECTION .text
+
+; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
+%macro ADD_RES_MMX_4_8 0
+mova  m0, [r1]
+mova  m2, [r1+8]
+pxor  m1, m1
+pxor  m3, m3
+psubw m1, m0
+psubw m3, m2
+packuswb  m0, m2
+packuswb  m1, m3
+
+movd  m2, [r0]
+movd  m3, [r0+r2]
+punpckldq m2, m3
+paddusb   m0, m2
+psubusbm0, m1
+movd[r0], m0
+psrlq m0, 32
+movd [r0+r2], m0
+%endmacro
+
+
+INIT_MMX mmxext
+; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_4_8, 3, 3, 6
+ADD_RES_MMX_4_8
+add   r1, 16
+lea   r0, [r0+r2*2]
+ADD_RES_MMX_4_8
+RET
+
+%macro ADD_RES_SSE_8_8 0
+pxor  m3, m3
+mova  m4, [r1]
+mova  m6, [r1+16]
+mova  m0, [r1+32]
+mova  m2, [r1+48]
+psubw m5, m3, m4
+psubw m7, m3, m6
+psubw m1, m3, m0
+packuswb  m4, m0
+packuswb  m5, m1
+psubw m3, m2
+packuswb  m6, m2
+packuswb  m7, m3
+
+movq  m0, [r0]
+movq  m1, [r0+r2]
+movhpsm0, [r0+r2*2]
+movhpsm1, [r0+r3]
+paddusb   m0, m4
+paddusb   m1, m6
+psubusb   m0, m5
+psubusb   m1, m7
+movq[r0], m0
+movq [r0+r2], m1
+movhps [r0+2*r2], m0
+movhps   [r0+r3], m1
+%endmacro
+
+%macro ADD_RES_SSE_16_32_8 3
+mova xm2, [r1+%1]
+mova xm6, [r1+%1+16]
+%if cpuflag(avx2)
+vinserti128   

Re: [libav-devel] [PATCH 1/2] hevc/x86: Add add_residual

2016-10-17 Thread Luca Barbato
On 13/10/2016 20:04, Diego Biurrun wrote:
> This does not match the conditions in the .asm file.

Something along those lines seems to work on x86_32 according to
checkasm. Folded in my github tree.

diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 73279c2..d60ae5e 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -337,6 +337,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
 c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
 }
+if (EXTERNAL_AVX2(cpu_flags)) {
+c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
+}
 } else if (bit_depth == 10) {
 if (EXTERNAL_MMXEXT(cpu_flags)) {
 c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
@@ -370,6 +373,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 c->idct[0] = ff_hevc_idct_4x4_10_avx;
 c->idct[1] = ff_hevc_idct_8x8_10_avx;
 }
+if (EXTERNAL_AVX2(cpu_flags)) {
+c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
+c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
+}
 }

 #if ARCH_X86_64
@@ -401,8 +408,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 if (EXTERNAL_AVX2(cpu_flags)) {
 c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
 c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
-
-c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
 }
 } else if (bit_depth == 10) {
 if (EXTERNAL_SSE2(cpu_flags)) {
@@ -434,9 +439,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 if (EXTERNAL_AVX2(cpu_flags)) {
 c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
 c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
-
-c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
-c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
 }
 }
 #endif /* ARCH_X86_64 */
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/2] hevc/x86: Add add_residual

2016-10-17 Thread Diego Biurrun
On Mon, Oct 17, 2016 at 09:18:19PM +0200, Luca Barbato wrote:
> On 10/16/16 14:00, Luca Barbato wrote:
> >On 10/13/16 16:02, Alexandra Hájková wrote:
> >>From: Pierre Edouard Lepere 
> 
> If nobody has a say I'd push it with the mentioned changes.

This is much too fuzzy IMO. If you have updated patches, send them.

Diego
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 1/2] hevc/x86: Add add_residual

2016-10-17 Thread Luca Barbato

On 10/16/16 14:00, Luca Barbato wrote:

On 10/13/16 16:02, Alexandra Hájková wrote:

From: Pierre Edouard Lepere 




If nobody has a say I'd push it with the mentioned changes.

lu

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 1/2] hevc/x86: Add add_residual

2016-10-16 Thread Luca Barbato

On 10/13/16 16:02, Alexandra Hájková wrote:

From: Pierre Edouard Lepere 

Initially written by Pierre Edouard Lepere 
,
extended by James Almer .

Signed-off-by: Alexandra Hájková 
---
 libavcodec/x86/Makefile |   3 +-
 libavcodec/x86/hevc_res_add.asm | 391 


Maybe hevc_add_res.asm for consistency?


+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+
+cglobal hevc_add_residual_16_10,3,5,6
+pxor  m4, m4
+mova  m5, [max_pixels_10]
+lea   r3, [r2*3]
+
+mov r4d, 4
+.loop
+ADD_RESIDUAL_16_AVX2  r0, r2, r3, r1
+lea   r0, [r0+r2*4]
+add   r1, 128
+dec r4d
+jnz .loop
+RET
+
+cglobal hevc_add_residual_32_10,3,5,6
+pxor  m4, m4
+mova  m5, [max_pixels_10]
+
+mov r4d, 6


This loop count should be 16, not 6.


+.loop
+ADD_RESIDUAL_32_AVX2  r0, r2, r1
+lea   r0, [r0+r2*2]
+add   r1, 128
+dec r4d
+jnz .loop
+RET
+%endif ;HAVE_AVX2_EXTERNAL


With that change it passes checkasm on avx2 as well.

lu
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 1/2] hevc/x86: Add add_residual

2016-10-13 Thread Diego Biurrun
On Thu, Oct 13, 2016 at 04:02:34PM +0200, Alexandra Hájková wrote:
> From: Pierre Edouard Lepere 
> 
> Initially written by Pierre Edouard Lepere 
> ,
> extended by James Almer .
> 
> Signed-off-by: Alexandra Hájková 
> ---
>  libavcodec/x86/Makefile |   3 +-
>  libavcodec/x86/hevc_res_add.asm | 391 
> 
>  libavcodec/x86/hevcdsp_init.c   |  40 
>  3 files changed, 433 insertions(+), 1 deletion(-)
>  create mode 100644 libavcodec/x86/hevc_res_add.asm

Has this survived Oracle? It has to.

Diego
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 1/2] hevc/x86: Add add_residual

2016-10-13 Thread Diego Biurrun
On Thu, Oct 13, 2016 at 04:02:34PM +0200, Alexandra Hájková wrote:
> --- a/libavcodec/x86/hevcdsp_init.c
> +++ b/libavcodec/x86/hevcdsp_init.c
> @@ -278,17 +297,24 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int 
> bit_depth)
> +c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
> +c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
> +c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
> +
>  c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
>  c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
>  c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
>  
>  c->idct[0]= ff_hevc_idct_4x4_8_sse2;
>  c->idct[1]= ff_hevc_idct_8x8_8_sse2;
> +
>  SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
>  SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);

grmbl

> @@ -307,11 +333,16 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int 
> bit_depth)
>  if (EXTERNAL_AVX(cpu_flags)) {
>  c->idct[0] = ff_hevc_idct_4x4_8_avx;
>  c->idct[1] = ff_hevc_idct_8x8_8_avx;
> +c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
> +c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
> +c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
>  }
>  } else if (bit_depth == 10) {
>  if (EXTERNAL_MMXEXT(cpu_flags)) {
>  c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
>  c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
> +
> +c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
>  }
>  if (EXTERNAL_SSE2(cpu_flags)) {
>  c->hevc_v_loop_filter_chroma = 
> ff_hevc_v_loop_filter_chroma_10_sse2;
> @@ -330,6 +361,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int 
> bit_depth)
>  SET_LUMA_FUNCS(put_unweighted_pred_avg,  
> ff_hevc_put_unweighted_pred_avg, 10, sse2);
>  SET_CHROMA_FUNCS(put_unweighted_pred_chroma, 
> ff_hevc_put_unweighted_pred, 10, sse2);
>  SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, 
> ff_hevc_put_unweighted_pred_avg, 10, sse2);
> +
> +c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
> +c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
> +c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
>  }
>  if (EXTERNAL_AVX(cpu_flags)) {
>  c->idct[0] = ff_hevc_idct_4x4_10_avx;
> @@ -366,6 +401,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int 
> bit_depth)
>  if (EXTERNAL_AVX2(cpu_flags)) {
>  c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
>  c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
> +
> +c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
>  }
>  } else if (bit_depth == 10) {
>  if (EXTERNAL_SSE2(cpu_flags)) {
> @@ -397,6 +434,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int 
> bit_depth)
>  if (EXTERNAL_AVX2(cpu_flags)) {
>  c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
>  c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
> +
> +c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
> +c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
>  }
>  }
>  #endif /* ARCH_X86_64 */

This does not match the conditions in the .asm file.

Diego
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 1/2] hevc/x86: Add add_residual

2016-10-13 Thread Alexandra Hájková
From: Pierre Edouard Lepere 

Initially written by Pierre Edouard Lepere 
,
extended by James Almer .

Signed-off-by: Alexandra Hájková 
---
 libavcodec/x86/Makefile |   3 +-
 libavcodec/x86/hevc_res_add.asm | 391 
 libavcodec/x86/hevcdsp_init.c   |  40 
 3 files changed, 433 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/x86/hevc_res_add.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index a38535b..aa93e67 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -117,7 +117,8 @@ YASM-OBJS-$(CONFIG_DCA_DECODER)+= x86/dcadsp.o
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)  += x86/dnxhdenc.o
 YASM-OBJS-$(CONFIG_HEVC_DECODER)   += x86/hevc_deblock.o\
   x86/hevc_mc.o \
-  x86/hevc_idct.o
+  x86/hevc_idct.o   \
+  x86/hevc_res_add.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)+= x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)   += x86/rv40dsp.o
diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
new file mode 100644
index 000..f8d9fd7
--- /dev/null
+++ b/libavcodec/x86/hevc_res_add.asm
@@ -0,0 +1,391 @@
+; *
+; * Provide SIMD optimizations for add_residual functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; *
+; * This file is part of Libav.
+; *
+; * Libav is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * Libav is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with Libav; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; 
**
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+max_pixels_10:  times 16  dw ((1 << 10)-1)
+
+SECTION .text
+
+; the add_res macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
+%macro ADD_RES_MMX_4_8 0
+mova  m2, [r1]
+mova  m4, [r1+8]
+pxor  m3, m3
+psubw m3, m2
+packuswb  m2, m2
+packuswb  m3, m3
+pxor  m5, m5
+psubw m5, m4
+packuswb  m4, m4
+packuswb  m5, m5
+
+movh  m0, [r0 ]
+movh  m1, [r0+r2  ]
+paddusb   m0, m2
+paddusb   m1, m4
+psubusb   m0, m3
+psubusb   m1, m5
+movh   [r0 ], m0
+movh   [r0+r2  ], m1
+%endmacro
+
+
+INIT_MMX mmxext
+; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual_4_8, 3, 4, 6
+ADD_RES_MMX_4_8
+add   r1, 16
+lea   r0, [r0+r2*2]
+ADD_RES_MMX_4_8
+RET
+
+%macro ADD_RES_SSE_8_8 0
+pxor  m3, m3
+mova  m4, [r1]
+mova  m6, [r1+16]
+mova  m0, [r1+32]
+mova  m2, [r1+48]
+psubw m5, m3, m4
+psubw m7, m3, m6
+psubw m1, m3, m0
+packuswb  m4, m0
+packuswb  m5, m1
+psubw m3, m2
+packuswb  m6, m2
+packuswb  m7, m3
+
+movqm0, [r0 ]
+movqm1, [r0+r2  ]
+movhps  m0, [r0+r2*2]
+movhps  m1, [r0+r3  ]
+paddusb m0, m4
+paddusb m1, m6
+psubusb m0, m5
+psubusb m1, m7
+movq [r0 ], m0
+movq [r0+r2  ], m1
+movhps   [r0+2*r2], m0
+movhps   [r0+r3  ], m1
+%endmacro
+
+%macro ADD_RES_SSE_16_32_8 3
+mova xm2, [r1+%1   ]
+mova xm6, [r1+%1+16]
+%if cpuflag(avx2)
+vinserti128   m2, m2, [r1+%1+32], 1
+vinserti128   m6, m6, [r1+%1+48], 1
+%endif
+%if cpuflag(avx)
+psubw m1, m0, m2
+psubw m5, m0, m6
+%else
+mova  m1, m0
+mova  m5, m0
+psubw m1, m2
+

Re: [libav-devel] [PATCH 1/2] hevc/x86: Add add_residual

2016-10-12 Thread Diego Biurrun
On Wed, Oct 12, 2016 at 06:24:39PM +0200, Alexandra Hájková wrote:
> --- /dev/null
> +++ b/libavcodec/x86/hevc_res_add.asm
> @@ -0,0 +1,391 @@
> +; /*

Drop the /, this is not C.

> +; * Provide SIMD optimizations for add_residual functions for HEVC decoding

s/Provide//

> +; * Copyright (c) 2014 Pierre-Edouard LEPERE
> +; *
> +; * This file is part of Libav.
> +; *
> +; * FFmpeg is free software; you can redistribute it and/or

This is not FFmpeg.

> +; * modify it under the terms of the GNU Lesser General Public
> +; * License as published by the Free Software Foundation; either
> +; * version 2.1 of the License, or (at your option) any later version.
> +; *
> +; * FFmpeg is distributed in the hope that it will be useful,
> +; * but WITHOUT ANY WARRANTY; without even the implied warranty of
> +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +; * Lesser General Public License for more details.
> +; *
> +; * You should have received a copy of the GNU Lesser General Public
> +; * License along with FFmpeg; if not, write to the Free Software
> +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
> USA
> +; */
> +%include "libavutil/x86/x86util.asm"

Drop the / and add an empty line.

> +;-
> +; void ff_hevc_add_residual__10(pixel *dst, int16_t *block, int stride)
> +;-
> +%macro ADD_RES_SSE_8_10 4

I don't think this function uses an int stride, stray double underscore.

> +;-
> +; void ff_hevc_add_residual__10(pixel *dst, int16_t *block, int stride)
> +;-

same

> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +
> +cglobal hevc_add_residual_16_10,3,5,6
> +%endif ;HAVE_AVX_EXTERNAL

The %if and the %endif comment do not match.

Diego
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 1/2] hevc/x86: Add add_residual

2016-10-12 Thread Alexandra Hájková
From: Pierre Edouard Lepere 

Initially written by Pierre Edouard Lepere 
,
extended by James Almer .

Signed-off-by: Alexandra Hájková 
---
 libavcodec/x86/Makefile |   3 +-
 libavcodec/x86/hevc_res_add.asm | 391 
 libavcodec/x86/hevcdsp_init.c   |  40 
 3 files changed, 433 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/x86/hevc_res_add.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index a38535b..aa93e67 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -117,7 +117,8 @@ YASM-OBJS-$(CONFIG_DCA_DECODER)+= x86/dcadsp.o
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)  += x86/dnxhdenc.o
 YASM-OBJS-$(CONFIG_HEVC_DECODER)   += x86/hevc_deblock.o\
   x86/hevc_mc.o \
-  x86/hevc_idct.o
+  x86/hevc_idct.o   \
+  x86/hevc_res_add.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)+= x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)   += x86/rv40dsp.o
diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
new file mode 100644
index 000..1e3bfc2
--- /dev/null
+++ b/libavcodec/x86/hevc_res_add.asm
@@ -0,0 +1,391 @@
+; /*
+; * Provide SIMD optimizations for add_residual functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; *
+; * This file is part of Libav.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+max_pixels_10:  times 16  dw ((1 << 10)-1)
+
+SECTION .text
+
+; the add_res macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
+%macro ADD_RES_MMX_4_8 0
+mova  m2, [r1]
+mova  m4, [r1+8]
+pxor  m3, m3
+psubw m3, m2
+packuswb  m2, m2
+packuswb  m3, m3
+pxor  m5, m5
+psubw m5, m4
+packuswb  m4, m4
+packuswb  m5, m5
+
+movh  m0, [r0 ]
+movh  m1, [r0+r2  ]
+paddusb   m0, m2
+paddusb   m1, m4
+psubusb   m0, m3
+psubusb   m1, m5
+movh   [r0 ], m0
+movh   [r0+r2  ], m1
+%endmacro
+
+
+INIT_MMX mmxext
+; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual_4_8, 3, 4, 6
+ADD_RES_MMX_4_8
+add   r1, 16
+lea   r0, [r0+r2*2]
+ADD_RES_MMX_4_8
+RET
+
+%macro ADD_RES_SSE_8_8 0
+pxor  m3, m3
+mova  m4, [r1]
+mova  m6, [r1+16]
+mova  m0, [r1+32]
+mova  m2, [r1+48]
+psubw m5, m3, m4
+psubw m7, m3, m6
+psubw m1, m3, m0
+packuswb  m4, m0
+packuswb  m5, m1
+psubw m3, m2
+packuswb  m6, m2
+packuswb  m7, m3
+
+movqm0, [r0 ]
+movqm1, [r0+r2  ]
+movhps  m0, [r0+r2*2]
+movhps  m1, [r0+r3  ]
+paddusb m0, m4
+paddusb m1, m6
+psubusb m0, m5
+psubusb m1, m7
+movq [r0 ], m0
+movq [r0+r2  ], m1
+movhps   [r0+2*r2], m0
+movhps   [r0+r3  ], m1
+%endmacro
+
+%macro ADD_RES_SSE_16_32_8 3
+mova xm2, [r1+%1   ]
+mova xm6, [r1+%1+16]
+%if cpuflag(avx2)
+vinserti128   m2, m2, [r1+%1+32], 1
+vinserti128   m6, m6, [r1+%1+48], 1
+%endif
+%if cpuflag(avx)
+psubw m1, m0, m2
+psubw m5, m0, m6
+%else
+mova  m1, m0
+mova  m5, m0
+psubw m1, m2
+psubw m5, m6
+%endif
+packuswb  m2, m6
+packuswb  m1, m5
+
+mova xm4, [r1+%1+mmsize*2   ]
+mova