Re: [libav-devel] [PATCH 1/2] hevc: x86: Add add_residual optimizations
On Fri, Oct 21, 2016 at 09:41:28AM +0200, Alexandra Hájková wrote: > --- /dev/null > +++ b/libavcodec/x86/hevc_add_res.asm > @@ -0,0 +1,371 @@ > +%macro ADD_RES_MMX_4_8 0 > +mova m0, [r1] > +mova m2, [r1+8] > +pxor m1, m1 > +pxor m3, m3 > +psubw m1, m0 > +psubw m3, m2 > +packuswb m0, m2 > +packuswb m1, m3 > + > +movd m2, [r0] > +movd m3, [r0+r2] > +punpckldq m2, m3 > +paddusb m0, m2 > +psubusbm0, m1 Alignment is off. > +INIT_MMX mmxext > +; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t > stride) > +cglobal hevc_add_residual_4_8, 3, 3, 6 > +ADD_RES_MMX_4_8 > +add r1, 16 > +lea r0, [r0+r2*2] > +ADD_RES_MMX_4_8 > +RET > + > +%macro ADD_RES_SSE_16_32_8 3 > +mova xm2, [r1+%1] > +mova xm6, [r1+%1+16] > +%if cpuflag(avx2) > +vinserti128 m2, m2, [r1+%1+32], 1 > +vinserti128 m6, m6, [r1+%1+48], 1 > +%endif > +psubw m1, m0, m2 > +psubw m5, m0, m6 Here you kept the 3-arg variant of the %if-%else .. > +%if cpuflag(avx2) > +vinserti128 m4, m4, [r1+%1+96 ], 1 > +vinserti128 m6, m6, [r1+%1+112], 1 > +%endif > +mova m3, m0 > +mova m5, m0 > +psubw m3, m4 > +psubw m5, m6 .. and here you did not. Why? Diego ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 1/2] hevc: x86: Add add_residual optimizations
From: Pierre Edouard LepereInitially written by Pierre Edouard Lepere , extended by James Almer . Signed-off-by: Alexandra Hájková Signed-off-by: Diego Biurrun --- Applied review comments from Henrink and Diego (more consistent naming). libavcodec/x86/Makefile | 7 +- libavcodec/x86/hevc_add_res.asm | 371 libavcodec/x86/hevcdsp_init.c | 42 + 3 files changed, 417 insertions(+), 3 deletions(-) create mode 100644 libavcodec/x86/hevc_add_res.asm diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index a38535b..094c1fa 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -115,9 +115,10 @@ YASM-OBJS-$(CONFIG_AAC_DECODER)+= x86/sbrdsp.o YASM-OBJS-$(CONFIG_APE_DECODER)+= x86/apedsp.o YASM-OBJS-$(CONFIG_DCA_DECODER)+= x86/dcadsp.o YASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o -YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_deblock.o\ - x86/hevc_mc.o \ - x86/hevc_idct.o +YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o\ + x86/hevc_deblock.o\ + x86/hevc_idct.o \ + x86/hevc_mc.o YASM-OBJS-$(CONFIG_PNG_DECODER)+= x86/pngdsp.o YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o diff --git a/libavcodec/x86/hevc_add_res.asm b/libavcodec/x86/hevc_add_res.asm new file mode 100644 index 000..a1740b5 --- /dev/null +++ b/libavcodec/x86/hevc_add_res.asm @@ -0,0 +1,371 @@ +; * +; * Provide SIMD optimizations for add_residual functions for HEVC decoding +; * Copyright (c) 2014 Pierre-Edouard LEPERE +; * +; * This file is part of Libav. +; * +; * Libav is free software; you can redistribute it and/or +; * modify it under the terms of the GNU Lesser General Public +; * License as published by the Free Software Foundation; either +; * version 2.1 of the License, or (at your option) any later version. +; * +; * Libav is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +; * Lesser General Public License for more details. +; * +; * You should have received a copy of the GNU Lesser General Public +; * License along with Libav; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +; ** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 +max_pixels_10: times 16 dw ((1 << 10)-1) + +SECTION .text + +; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project +%macro ADD_RES_MMX_4_8 0 +mova m0, [r1] +mova m2, [r1+8] +pxor m1, m1 +pxor m3, m3 +psubw m1, m0 +psubw m3, m2 +packuswb m0, m2 +packuswb m1, m3 + +movd m2, [r0] +movd m3, [r0+r2] +punpckldq m2, m3 +paddusb m0, m2 +psubusbm0, m1 +movd[r0], m0 +psrlq m0, 32 +movd [r0+r2], m0 +%endmacro + + +INIT_MMX mmxext +; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride) +cglobal hevc_add_residual_4_8, 3, 3, 6 +ADD_RES_MMX_4_8 +add r1, 16 +lea r0, [r0+r2*2] +ADD_RES_MMX_4_8 +RET + +%macro ADD_RES_SSE_8_8 0 +pxor m3, m3 +mova m4, [r1] +mova m6, [r1+16] +mova m0, [r1+32] +mova m2, [r1+48] +psubw m5, m3, m4 +psubw m7, m3, m6 +psubw m1, m3, m0 +packuswb m4, m0 +packuswb m5, m1 +psubw m3, m2 +packuswb m6, m2 +packuswb m7, m3 + +movq m0, [r0] +movq m1, [r0+r2] +movhpsm0, [r0+r2*2] +movhpsm1, [r0+r3] +paddusb m0, m4 +paddusb m1, m6 +psubusb m0, m5 +psubusb m1, m7 +movq[r0], m0 +movq [r0+r2], m1 +movhps [r0+2*r2], m0 +movhps [r0+r3], m1 +%endmacro + +%macro ADD_RES_SSE_16_32_8 3 +mova xm2, [r1+%1] +mova xm6, [r1+%1+16] +%if cpuflag(avx2) +vinserti128
Re: [libav-devel] [PATCH 1/2] hevc/x86: Add add_residual
On 13/10/2016 20:04, Diego Biurrun wrote: > This does not match the conditions in the .asm file. Something along those lines seems to work on x86_32 according to checkasm. Folded in my github tree. diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c index 73279c2..d60ae5e 100644 --- a/libavcodec/x86/hevcdsp_init.c +++ b/libavcodec/x86/hevcdsp_init.c @@ -337,6 +337,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) c->add_residual[2] = ff_hevc_add_residual_16_8_avx; c->add_residual[3] = ff_hevc_add_residual_32_8_avx; } +if (EXTERNAL_AVX2(cpu_flags)) { +c->add_residual[3] = ff_hevc_add_residual_32_8_avx2; +} } else if (bit_depth == 10) { if (EXTERNAL_MMXEXT(cpu_flags)) { c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext; @@ -370,6 +373,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) c->idct[0] = ff_hevc_idct_4x4_10_avx; c->idct[1] = ff_hevc_idct_8x8_10_avx; } +if (EXTERNAL_AVX2(cpu_flags)) { +c->add_residual[2] = ff_hevc_add_residual_16_10_avx2; +c->add_residual[3] = ff_hevc_add_residual_32_10_avx2; +} } #if ARCH_X86_64 @@ -401,8 +408,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) if (EXTERNAL_AVX2(cpu_flags)) { c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2; c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2; - -c->add_residual[3] = ff_hevc_add_residual_32_8_avx2; } } else if (bit_depth == 10) { if (EXTERNAL_SSE2(cpu_flags)) { @@ -434,9 +439,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) if (EXTERNAL_AVX2(cpu_flags)) { c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2; c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2; - -c->add_residual[2] = ff_hevc_add_residual_16_10_avx2; -c->add_residual[3] = ff_hevc_add_residual_32_10_avx2; } } #endif /* ARCH_X86_64 */ ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/2] hevc/x86: Add add_residual
On Mon, Oct 17, 2016 at 09:18:19PM +0200, Luca Barbato wrote: > On 10/16/16 14:00, Luca Barbato wrote: > >On 10/13/16 16:02, Alexandra Hájková wrote: > >>From: Pierre Edouard Lepere> > If nobody has a say I'd push it with the mentioned changes. This is much too fuzzy IMO. If you have updated patches, send them. Diego ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/2] hevc/x86: Add add_residual
On 10/16/16 14:00, Luca Barbato wrote: > On 10/13/16 16:02, Alexandra Hájková wrote: >> From: Pierre Edouard Lepere If nobody has a say I'd push it with the mentioned changes. lu ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/2] hevc/x86: Add add_residual
On 10/13/16 16:02, Alexandra Hájková wrote: From: Pierre Edouard Lepere Initially written by Pierre Edouard Lepere, extended by James Almer. Signed-off-by: Alexandra Hájková --- libavcodec/x86/Makefile | 3 +- libavcodec/x86/hevc_res_add.asm | 391 Maybe hevc_add_res.asm for consistency? +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 + +cglobal hevc_add_residual_16_10,3,5,6 +pxor m4, m4 +mova m5, [max_pixels_10] +lea r3, [r2*3] + +mov r4d, 4 +.loop +ADD_RESIDUAL_16_AVX2 r0, r2, r3, r1 +lea r0, [r0+r2*4] +add r1, 128 +dec r4d +jnz .loop +RET + +cglobal hevc_add_residual_32_10,3,5,6 +pxor m4, m4 +mova m5, [max_pixels_10] + +mov r4d, 6 -> 16 +.loop +ADD_RESIDUAL_32_AVX2 r0, r2, r1 +lea r0, [r0+r2*2] +add r1, 128 +dec r4d +jnz .loop +RET +%endif ;HAVE_AVX2_EXTERNAL With that change it passes checkasm on avx2 as well. lu ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/2] hevc/x86: Add add_residual
On Thu, Oct 13, 2016 at 04:02:34PM +0200, Alexandra Hájková wrote: > From: Pierre Edouard Lepere > > Initially written by Pierre Edouard Lepere, > extended by James Almer. > > Signed-off-by: Alexandra Hájková > --- > libavcodec/x86/Makefile | 3 +- > libavcodec/x86/hevc_res_add.asm | 391 > > libavcodec/x86/hevcdsp_init.c | 40 > 3 files changed, 433 insertions(+), 1 deletion(-) > create mode 100644 libavcodec/x86/hevc_res_add.asm Has this survived Oracle? It has to. Diego ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/2] hevc/x86: Add add_residual
On Thu, Oct 13, 2016 at 04:02:34PM +0200, Alexandra Hájková wrote: > --- a/libavcodec/x86/hevcdsp_init.c > +++ b/libavcodec/x86/hevcdsp_init.c > @@ -278,17 +297,24 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int > bit_depth) > +c->add_residual[1] = ff_hevc_add_residual_8_8_sse2; > +c->add_residual[2] = ff_hevc_add_residual_16_8_sse2; > +c->add_residual[3] = ff_hevc_add_residual_32_8_sse2; > + > c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2; > c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2; > c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2; > > c->idct[0]= ff_hevc_idct_4x4_8_sse2; > c->idct[1]= ff_hevc_idct_8x8_8_sse2; > + > SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels); > SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels); grmbl > @@ -307,11 +333,16 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int > bit_depth) > if (EXTERNAL_AVX(cpu_flags)) { > c->idct[0] = ff_hevc_idct_4x4_8_avx; > c->idct[1] = ff_hevc_idct_8x8_8_avx; > +c->add_residual[1] = ff_hevc_add_residual_8_8_avx; > +c->add_residual[2] = ff_hevc_add_residual_16_8_avx; > +c->add_residual[3] = ff_hevc_add_residual_32_8_avx; > } > } else if (bit_depth == 10) { > if (EXTERNAL_MMXEXT(cpu_flags)) { > c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext; > c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext; > + > +c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext; > } > if (EXTERNAL_SSE2(cpu_flags)) { > c->hevc_v_loop_filter_chroma = > ff_hevc_v_loop_filter_chroma_10_sse2; > @@ -330,6 +361,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int > bit_depth) > SET_LUMA_FUNCS(put_unweighted_pred_avg, > ff_hevc_put_unweighted_pred_avg, 10, sse2); > SET_CHROMA_FUNCS(put_unweighted_pred_chroma, > ff_hevc_put_unweighted_pred, 10, sse2); > SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, > ff_hevc_put_unweighted_pred_avg, 10, sse2); > + > +c->add_residual[1] = ff_hevc_add_residual_8_10_sse2; > +c->add_residual[2] = ff_hevc_add_residual_16_10_sse2; > +c->add_residual[3] = ff_hevc_add_residual_32_10_sse2; > } > 
if (EXTERNAL_AVX(cpu_flags)) { > c->idct[0] = ff_hevc_idct_4x4_10_avx; > @@ -366,6 +401,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int > bit_depth) > if (EXTERNAL_AVX2(cpu_flags)) { > c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2; > c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2; > + > +c->add_residual[3] = ff_hevc_add_residual_32_8_avx2; > } > } else if (bit_depth == 10) { > if (EXTERNAL_SSE2(cpu_flags)) { > @@ -397,6 +434,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int > bit_depth) > if (EXTERNAL_AVX2(cpu_flags)) { > c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2; > c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2; > + > +c->add_residual[2] = ff_hevc_add_residual_16_10_avx2; > +c->add_residual[3] = ff_hevc_add_residual_32_10_avx2; > } > } > #endif /* ARCH_X86_64 */ This does not match the conditions in the .asm file. Diego ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 1/2] hevc/x86: Add add_residual
From: Pierre Edouard LepereInitially written by Pierre Edouard Lepere , extended by James Almer . Signed-off-by: Alexandra Hájková --- libavcodec/x86/Makefile | 3 +- libavcodec/x86/hevc_res_add.asm | 391 libavcodec/x86/hevcdsp_init.c | 40 3 files changed, 433 insertions(+), 1 deletion(-) create mode 100644 libavcodec/x86/hevc_res_add.asm diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index a38535b..aa93e67 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -117,7 +117,8 @@ YASM-OBJS-$(CONFIG_DCA_DECODER)+= x86/dcadsp.o YASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_deblock.o\ x86/hevc_mc.o \ - x86/hevc_idct.o + x86/hevc_idct.o \ + x86/hevc_res_add.o YASM-OBJS-$(CONFIG_PNG_DECODER)+= x86/pngdsp.o YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm new file mode 100644 index 000..f8d9fd7 --- /dev/null +++ b/libavcodec/x86/hevc_res_add.asm @@ -0,0 +1,391 @@ +; * +; * Provide SIMD optimizations for add_residual functions for HEVC decoding +; * Copyright (c) 2014 Pierre-Edouard LEPERE +; * +; * This file is part of Libav. +; * +; * Libav is free software; you can redistribute it and/or +; * modify it under the terms of the GNU Lesser General Public +; * License as published by the Free Software Foundation; either +; * version 2.1 of the License, or (at your option) any later version. +; * +; * Libav is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; * Lesser General Public License for more details. 
+; * +; * You should have received a copy of the GNU Lesser General Public +; * License along with Libav; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +; ** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 +max_pixels_10: times 16 dw ((1 << 10)-1) + +SECTION .text + +; the add_res macros and functions were largely inspired by x264 project's code in the h264_idct.asm file +%macro ADD_RES_MMX_4_8 0 +mova m2, [r1] +mova m4, [r1+8] +pxor m3, m3 +psubw m3, m2 +packuswb m2, m2 +packuswb m3, m3 +pxor m5, m5 +psubw m5, m4 +packuswb m4, m4 +packuswb m5, m5 + +movh m0, [r0 ] +movh m1, [r0+r2 ] +paddusb m0, m2 +paddusb m1, m4 +psubusb m0, m3 +psubusb m1, m5 +movh [r0 ], m0 +movh [r0+r2 ], m1 +%endmacro + + +INIT_MMX mmxext +; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) +cglobal hevc_add_residual_4_8, 3, 4, 6 +ADD_RES_MMX_4_8 +add r1, 16 +lea r0, [r0+r2*2] +ADD_RES_MMX_4_8 +RET + +%macro ADD_RES_SSE_8_8 0 +pxor m3, m3 +mova m4, [r1] +mova m6, [r1+16] +mova m0, [r1+32] +mova m2, [r1+48] +psubw m5, m3, m4 +psubw m7, m3, m6 +psubw m1, m3, m0 +packuswb m4, m0 +packuswb m5, m1 +psubw m3, m2 +packuswb m6, m2 +packuswb m7, m3 + +movqm0, [r0 ] +movqm1, [r0+r2 ] +movhps m0, [r0+r2*2] +movhps m1, [r0+r3 ] +paddusb m0, m4 +paddusb m1, m6 +psubusb m0, m5 +psubusb m1, m7 +movq [r0 ], m0 +movq [r0+r2 ], m1 +movhps [r0+2*r2], m0 +movhps [r0+r3 ], m1 +%endmacro + +%macro ADD_RES_SSE_16_32_8 3 +mova xm2, [r1+%1 ] +mova xm6, [r1+%1+16] +%if cpuflag(avx2) +vinserti128 m2, m2, [r1+%1+32], 1 +vinserti128 m6, m6, [r1+%1+48], 1 +%endif +%if cpuflag(avx) +psubw m1, m0, m2 +psubw m5, m0, m6 +%else +mova m1, m0 +mova m5, m0 +psubw m1, m2 +
Re: [libav-devel] [PATCH 1/2] hevc/x86: Add add_residual
On Wed, Oct 12, 2016 at 06:24:39PM +0200, Alexandra Hájková wrote: > --- /dev/null > +++ b/libavcodec/x86/hevc_res_add.asm > @@ -0,0 +1,391 @@ > +; /* Drop the /, this is not C. > +; * Provide SIMD optimizations for add_residual functions for HEVC decoding s/Provide// > +; * Copyright (c) 2014 Pierre-Edouard LEPERE > +; * > +; * This file is part of Libav. > +; * > +; * FFmpeg is free software; you can redistribute it and/or This is not FFmpeg. > +; * modify it under the terms of the GNU Lesser General Public > +; * License as published by the Free Software Foundation; either > +; * version 2.1 of the License, or (at your option) any later version. > +; * > +; * FFmpeg is distributed in the hope that it will be useful, > +; * but WITHOUT ANY WARRANTY; without even the implied warranty of > +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +; * Lesser General Public License for more details. > +; * > +; * You should have received a copy of the GNU Lesser General Public > +; * License along with FFmpeg; if not, write to the Free Software > +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > +; */ > +%include "libavutil/x86/x86util.asm" Drop the / and add an empty line. > +;- > +; void ff_hevc_add_residual__10(pixel *dst, int16_t *block, int stride) > +;- > +%macro ADD_RES_SSE_8_10 4 I don't think this function uses an int stride, stray double underscore. > +;- > +; void ff_hevc_add_residual__10(pixel *dst, int16_t *block, int stride) > +;- same > +%if HAVE_AVX2_EXTERNAL > +INIT_YMM avx2 > + > +cglobal hevc_add_residual_16_10,3,5,6 > +%endif ;HAVE_AVX_EXTERNAL The %if and the %endif comment do not match. Diego ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 1/2] hevc/x86: Add add_residual
From: Pierre Edouard LepereInitially written by Pierre Edouard Lepere , extended by James Almer . Signed-off-by: Alexandra Hájková --- libavcodec/x86/Makefile | 3 +- libavcodec/x86/hevc_res_add.asm | 391 libavcodec/x86/hevcdsp_init.c | 40 3 files changed, 433 insertions(+), 1 deletion(-) create mode 100644 libavcodec/x86/hevc_res_add.asm diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index a38535b..aa93e67 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -117,7 +117,8 @@ YASM-OBJS-$(CONFIG_DCA_DECODER)+= x86/dcadsp.o YASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_deblock.o\ x86/hevc_mc.o \ - x86/hevc_idct.o + x86/hevc_idct.o \ + x86/hevc_res_add.o YASM-OBJS-$(CONFIG_PNG_DECODER)+= x86/pngdsp.o YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm new file mode 100644 index 000..1e3bfc2 --- /dev/null +++ b/libavcodec/x86/hevc_res_add.asm @@ -0,0 +1,391 @@ +; /* +; * Provide SIMD optimizations for add_residual functions for HEVC decoding +; * Copyright (c) 2014 Pierre-Edouard LEPERE +; * +; * This file is part of Libav. +; * +; * FFmpeg is free software; you can redistribute it and/or +; * modify it under the terms of the GNU Lesser General Public +; * License as published by the Free Software Foundation; either +; * version 2.1 of the License, or (at your option) any later version. +; * +; * FFmpeg is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; * Lesser General Public License for more details. 
+; * +; * You should have received a copy of the GNU Lesser General Public +; * License along with FFmpeg; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +; */ +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 +max_pixels_10: times 16 dw ((1 << 10)-1) + +SECTION .text + +; the add_res macros and functions were largely inspired by x264 project's code in the h264_idct.asm file +%macro ADD_RES_MMX_4_8 0 +mova m2, [r1] +mova m4, [r1+8] +pxor m3, m3 +psubw m3, m2 +packuswb m2, m2 +packuswb m3, m3 +pxor m5, m5 +psubw m5, m4 +packuswb m4, m4 +packuswb m5, m5 + +movh m0, [r0 ] +movh m1, [r0+r2 ] +paddusb m0, m2 +paddusb m1, m4 +psubusb m0, m3 +psubusb m1, m5 +movh [r0 ], m0 +movh [r0+r2 ], m1 +%endmacro + + +INIT_MMX mmxext +; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) +cglobal hevc_add_residual_4_8, 3, 4, 6 +ADD_RES_MMX_4_8 +add r1, 16 +lea r0, [r0+r2*2] +ADD_RES_MMX_4_8 +RET + +%macro ADD_RES_SSE_8_8 0 +pxor m3, m3 +mova m4, [r1] +mova m6, [r1+16] +mova m0, [r1+32] +mova m2, [r1+48] +psubw m5, m3, m4 +psubw m7, m3, m6 +psubw m1, m3, m0 +packuswb m4, m0 +packuswb m5, m1 +psubw m3, m2 +packuswb m6, m2 +packuswb m7, m3 + +movqm0, [r0 ] +movqm1, [r0+r2 ] +movhps m0, [r0+r2*2] +movhps m1, [r0+r3 ] +paddusb m0, m4 +paddusb m1, m6 +psubusb m0, m5 +psubusb m1, m7 +movq [r0 ], m0 +movq [r0+r2 ], m1 +movhps [r0+2*r2], m0 +movhps [r0+r3 ], m1 +%endmacro + +%macro ADD_RES_SSE_16_32_8 3 +mova xm2, [r1+%1 ] +mova xm6, [r1+%1+16] +%if cpuflag(avx2) +vinserti128 m2, m2, [r1+%1+32], 1 +vinserti128 m6, m6, [r1+%1+48], 1 +%endif +%if cpuflag(avx) +psubw m1, m0, m2 +psubw m5, m0, m6 +%else +mova m1, m0 +mova m5, m0 +psubw m1, m2 +psubw m5, m6 +%endif +packuswb m2, m6 +packuswb m1, m5 + +mova xm4, [r1+%1+mmsize*2 ] +mova