[libav-devel] [PATCH] x86: Port gradfun to yasm
--- libavfilter/x86/Makefile|1 + libavfilter/x86/vf_gradfun.c| 168 +-- libavfilter/x86/vf_gradfun_yasm.asm | 144 ++ 3 files changed, 165 insertions(+), 148 deletions(-) create mode 100644 libavfilter/x86/vf_gradfun_yasm.asm diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index 59cefe8..b50b373 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -3,6 +3,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o +YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_yasm.o YASM-OBJS-$(CONFIG_HQDN3D_FILTER)+= x86/vf_hqdn3d.o YASM-OBJS-$(CONFIG_VOLUME_FILTER)+= x86/af_volume.o YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o diff --git a/libavfilter/x86/vf_gradfun.c b/libavfilter/x86/vf_gradfun.c index e571af7..3bac16d 100644 --- a/libavfilter/x86/vf_gradfun.c +++ b/libavfilter/x86/vf_gradfun.c @@ -24,12 +24,10 @@ #include libavutil/x86/asm.h #include libavfilter/gradfun.h -#if HAVE_INLINE_ASM - -DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F}; -DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF}; - -#if HAVE_MMXEXT_INLINE +#if HAVE_YASM +void ff_gradfun_filter_line_mmxext(intptr_t x, uint8_t *dst, uint8_t *src, + uint16_t *dc, int thresh, + const uint16_t *dithers); static void gradfun_filter_line_mmxext(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers) @@ -41,72 +39,13 @@ static void gradfun_filter_line_mmxext(uint8_t *dst, uint8_t *src, uint16_t *dc, width = x; } x = -width; -__asm__ volatile( -movd %4, %%mm5 \n -pxor %%mm7, %%mm7 \n -pshufw $0, %%mm5, %%mm5 \n -movq %6, %%mm6 \n -movq (%5), %%mm3 \n -movq 8(%5), %%mm4 \n - -1: \n -movd (%2,%0), %%mm0 \n -movd (%3,%0), %%mm1 \n -punpcklbw %%mm7, %%mm0 \n -punpcklwd %%mm1, %%mm1 \n -psllw $7, %%mm0 \n -pxor %%mm2, %%mm2 \n -psubw %%mm0, %%mm1 \n 
// delta = dc - pix -psubw %%mm1, %%mm2 \n -pmaxsw %%mm1, %%mm2 \n -pmulhuw%%mm5, %%mm2 \n // m = abs(delta) * thresh 16 -psubw %%mm6, %%mm2 \n -pminsw %%mm7, %%mm2 \n // m = -max(0, 127-m) -pmullw %%mm2, %%mm2 \n -paddw %%mm3, %%mm0 \n // pix += dither -psllw $2, %%mm1 \n // m = m*m*delta 14 -pmulhw %%mm2, %%mm1 \n -paddw %%mm1, %%mm0 \n // pix += m -psraw $7, %%mm0 \n -packuswb %%mm0, %%mm0 \n -movd %%mm0, (%1,%0) \n // dst = clip(pix7) -add $4, %0 \n -jnl 2f \n - -movd (%2,%0), %%mm0 \n -movd (%3,%0), %%mm1 \n -punpcklbw %%mm7, %%mm0 \n -punpcklwd %%mm1, %%mm1 \n -psllw $7, %%mm0 \n -pxor %%mm2, %%mm2 \n -psubw %%mm0, %%mm1 \n // delta = dc - pix -psubw %%mm1, %%mm2 \n -pmaxsw %%mm1, %%mm2 \n -pmulhuw%%mm5, %%mm2 \n // m = abs(delta) * thresh 16 -psubw %%mm6, %%mm2 \n -pminsw %%mm7, %%mm2 \n // m = -max(0, 127-m) -pmullw %%mm2, %%mm2 \n -paddw %%mm4, %%mm0 \n // pix += dither -psllw $2, %%mm1 \n // m = m*m*delta 14 -pmulhw %%mm2, %%mm1 \n -paddw %%mm1, %%mm0 \n // pix += m -psraw $7, %%mm0 \n -packuswb %%mm0, %%mm0 \n -movd %%mm0, (%1,%0) \n // dst = clip(pix7) -add $4, %0 \n -jl 1b \n - -2: \n -emms \n -:+r(x) -:r(dst+width), r(src+width), r(dc+width/2), - rm(thresh), r(dithers), m(*pw_7f) -:memory -); +ff_gradfun_filter_line_mmxext(x, dst+width, src+width, dc+width/2, + thresh, dithers); } -#endif -#if HAVE_SSSE3_INLINE +void ff_gradfun_filter_line_ssse3(intptr_t x, uint8_t *dst, uint8_t *src, + uint16_t *dc, int thresh, + const uint16_t *dithers); static void gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers) { intptr_t x; @@ -117,100 +56,33 @@ static void gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, width = x; } x = -width; -__asm__ volatile( -
[libav-devel] [PATCH] x86: Port gradfun to yasm
--- Fixed all of Diego's comments --- libavfilter/x86/Makefile |3 +- libavfilter/x86/vf_gradfun.asm| 144 libavfilter/x86/vf_gradfun.c | 217 - libavfilter/x86/vf_gradfun_init.c | 93 4 files changed, 239 insertions(+), 218 deletions(-) create mode 100644 libavfilter/x86/vf_gradfun.asm delete mode 100644 libavfilter/x86/vf_gradfun.c create mode 100644 libavfilter/x86/vf_gradfun_init.c diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index 59cefe8..16b1307 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -1,8 +1,9 @@ -OBJS-$(CONFIG_GRADFUN_FILTER)+= x86/vf_gradfun.o +OBJS-$(CONFIG_GRADFUN_FILTER)+= x86/vf_gradfun_init.o OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o +YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o YASM-OBJS-$(CONFIG_HQDN3D_FILTER)+= x86/vf_hqdn3d.o YASM-OBJS-$(CONFIG_VOLUME_FILTER)+= x86/af_volume.o YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o diff --git a/libavfilter/x86/vf_gradfun.asm b/libavfilter/x86/vf_gradfun.asm new file mode 100644 index 000..e1737dd --- /dev/null +++ b/libavfilter/x86/vf_gradfun.asm @@ -0,0 +1,144 @@ +;* +;* x86-optimized functions for gradfun filter +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. 
+;* +;* You should have received a copy of the GNU General Public License along +;* with Libav; if not, write to the Free Software Foundation, Inc., +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +;** + +%include libavutil/x86/x86util.asm + +SECTION_RODATA + +pw_7f: times 8 dw 0x7F +pw_ff: times 8 dw 0xFF + +SECTION .text + +INIT_MMX mmxext +cglobal gradfun_filter_line, 6, 6 +movh m5, r4d +pxor m7, m7 +pshufwm5, m5,0 +mova m6, [pw_7f] +mova m3, [r5] +mova m4, [r5+8] +.loop: +movh m0, [r2+r0] +movh m1, [r3+r0] +punpcklbw m0, m7 +punpcklwd m1, m1 +psllw m0, 7 +pxor m2, m2 +psubw m1, m0 +psubw m2, m1 +pmaxswm2, m1 +pmulhuw m2, m5 +psubw m2, m6 +pminswm2, m7 +pmullwm2, m2 +paddw m0, m3 +psllw m1, 2 +pmulhwm1, m2 +paddw m0, m1 +psraw m0, 7 +packuswb m0, m0 +movh [r1+r0], m0 +add r0, 4 +jge .end +movh m0, [r2+r0] +movh m1, [r3+r0] +punpcklbw m0, m7 +punpcklwd m1, m1 +psllw m0, 7 +pxor m2, m2 +psubw m1, m0 +psubw m2, m1 +pmaxswm2, m1 +pmulhuw m2, m5 +psubw m2, m6 +pminswm2, m7 +pmullwm2, m2 +paddw m0, m4 +psllw m1, 2 +pmulhwm1, m2 +paddw m0, m1 +psraw m0, 7 +packuswb m0, m0 +movh [r1+r0], m0 +add r0, 4 +jl .loop +.end: +REP_RET + +INIT_XMM ssse3 +cglobal gradfun_filter_line, 6, 6, 8 +movd m5, r4d +pxor m7, m7 +pshuflwm5, m5, 0 +mova m6, [pw_7f] +punpcklqdq m5, m5 +mova m4, [r5] +.loop: +movh m0, [r2+r0] +movh m1, [r3+r0] +punpcklbw m0, m7 +punpcklwd m1, m1 +psllw m0, 7 +psubw m1, m0 +pabsw m2, m1 +pmulhuwm2, m5 +psubw m2, m6 +pminsw m2, m7 +pmullw m2, m2 +psllw m1, 2 +paddw m0, m4 +pmulhw m1, m2 +paddw m0, m1 +psraw m0, 7 +packuswb m0, m0 +movh [r1+r0], m0 +addr0, 8 +jl .loop +REP_RET + +%macro BLUR_LINE 1 +cglobal gradfun_blur_line_%1, 6, 6, 8 +movam7, [pw_ff] +.loop: +%1 m0, [r4+r0] +%1 m1, [r5+r0] +movam2, m0 +movam3, m1 +psrlw m0, 8 +psrlw m1, 8 +pandm2, m7 +pandm3, m7 +paddw m0, m1 +paddw m2, m3 +paddw m0, m2 +paddw m0, [r2+r0] +movam1, [r1+r0] +mova [r1+r0], m0 +psubw m0, m1 +mova [r3+r0], m0 +add r0, 16 +jl .loop +REP_RET +%endmacro + +INIT_XMM
Re: [libav-devel] [PATCH] x86: Port gradfun to yasm
On Mon, Oct 21, 2013 at 9:50 AM, Diego Biurrun di...@biurrun.de wrote: On Mon, Oct 21, 2013 at 09:30:23AM -0400, Daniel Kang wrote: --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -3,6 +3,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o +YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_yasm.o YASM-OBJS-$(CONFIG_HQDN3D_FILTER)+= x86/vf_hqdn3d.o YASM-OBJS-$(CONFIG_VOLUME_FILTER)+= x86/af_volume.o YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o The file with the init code should be renamed to vf_gradfun_init.c instead and vf_gradfun.asm should contain the assembly. --- a/libavfilter/x86/vf_gradfun.c +++ b/libavfilter/x86/vf_gradfun.c @@ -41,72 +39,13 @@ static void gradfun_filter_line_mmxext(uint8_t *dst, uint8_t *src, uint16_t *dc, width = x; } x = -width; +ff_gradfun_filter_line_mmxext(x, dst+width, src+width, dc+width/2, + thresh, dithers); spaces around operators @@ -117,100 +56,33 @@ static void gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, width = x; } x = -width; +ff_gradfun_filter_line_ssse3(x, dst+width, src+width, dc+width/2, + thresh, dithers); same av_cold void ff_gradfun_init_x86(GradFunContext *gf) { -#if HAVE_MMXEXT_INLINE +#if HAVE_YASM int cpu_flags = av_get_cpu_flags(); - if (cpu_flags AV_CPU_FLAG_MMXEXT) Keep the empty line. gf-filter_line = gradfun_filter_line_mmxext; -#endif -#if HAVE_SSSE3_INLINE if (cpu_flags AV_CPU_FLAG_SSSE3) gf-filter_line = gradfun_filter_line_ssse3; -#endif -#if HAVE_SSE2_INLINE if (cpu_flags AV_CPU_FLAG_SSE2) gf-blur_line = gradfun_blur_line_sse2; #endif Please comment the #endif. Look at vf_yadif_init.c to see how to replace the ifdefs with the right macros. All done. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] x86: Port gradfun to yasm
--- Fix licence to LGPL, add newline for readability. Loren's update. --- libavfilter/x86/Makefile |3 +- libavfilter/x86/vf_gradfun.asm| 110 +++ libavfilter/x86/vf_gradfun.c | 217 - libavfilter/x86/vf_gradfun_init.c | 94 4 files changed, 206 insertions(+), 218 deletions(-) create mode 100644 libavfilter/x86/vf_gradfun.asm delete mode 100644 libavfilter/x86/vf_gradfun.c create mode 100644 libavfilter/x86/vf_gradfun_init.c diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index 59cefe8..16b1307 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -1,8 +1,9 @@ -OBJS-$(CONFIG_GRADFUN_FILTER)+= x86/vf_gradfun.o +OBJS-$(CONFIG_GRADFUN_FILTER)+= x86/vf_gradfun_init.o OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o +YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o YASM-OBJS-$(CONFIG_HQDN3D_FILTER)+= x86/vf_hqdn3d.o YASM-OBJS-$(CONFIG_VOLUME_FILTER)+= x86/af_volume.o YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o diff --git a/libavfilter/x86/vf_gradfun.asm b/libavfilter/x86/vf_gradfun.asm new file mode 100644 index 000..00fcb16 --- /dev/null +++ b/libavfilter/x86/vf_gradfun.asm @@ -0,0 +1,110 @@ +;** +;* x86-optimized functions for gradfun filter +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;** + +%include libavutil/x86/x86util.asm + +SECTION_RODATA + +pw_7f: times 8 dw 0x7F +pw_ff: times 8 dw 0xFF + +SECTION .text + +%macro FILTER_LINE 1 +movh m0, [r2+r0] +movh m1, [r3+r0] +punpcklbw m0, m7 +punpcklwd m1, m1 +psllw m0, 7 +psubw m1, m0 +PABSW m2, m1 +pmulhuwm2, m5 +psubw m2, m6 +pminsw m2, m7 +pmullw m2, m2 +psllw m1, 2 +paddw m0, %1 +pmulhw m1, m2 +paddw m0, m1 +psraw m0, 7 +packuswb m0, m0 +movh [r1+r0], m0 +%endmacro + +INIT_MMX mmxext +cglobal gradfun_filter_line, 6, 6 +movh m5, r4d +pxor m7, m7 +pshufwm5, m5,0 +mova m6, [pw_7f] +mova m3, [r5] +mova m4, [r5+8] +.loop: +FILTER_LINE m3 +add r0, 4 +jge .end +FILTER_LINE m4 +add r0, 4 +jl .loop +.end: +REP_RET + +INIT_XMM ssse3 +cglobal gradfun_filter_line, 6, 6, 8 +movd m5, r4d +pxor m7, m7 +pshuflwm5, m5, 0 +mova m6, [pw_7f] +punpcklqdq m5, m5 +mova m4, [r5] +.loop: +FILTER_LINE m4 +addr0, 8 +jl .loop +REP_RET + +%macro BLUR_LINE 1 +cglobal gradfun_blur_line_%1, 6, 6, 8 +movam7, [pw_ff] +.loop: +%1 m0, [r4+r0] +%1 m1, [r5+r0] +movam2, m0 +movam3, m1 +psrlw m0, 8 +psrlw m1, 8 +pandm2, m7 +pandm3, m7 +paddw m0, m1 +paddw m2, m3 +paddw m0, m2 +paddw m0, [r2+r0] +movam1, [r1+r0] +mova [r1+r0], m0 +psubw m0, m1 +mova [r3+r0], m0 +add r0, 16 +jl .loop +REP_RET +%endmacro + +INIT_XMM sse2 +BLUR_LINE movdqa +BLUR_LINE movdqu diff --git a/libavfilter/x86/vf_gradfun.c b/libavfilter/x86/vf_gradfun.c deleted file mode 100644 index e571af7..000 --- a/libavfilter/x86/vf_gradfun.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (C) 2009 Loren Merritt lor...@u.washignton.edu - * - * This file is part of Libav. 
- * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY
[libav-devel] [PATCH] dsputil: x86: Fix compile error by conditionally compiling code.
Specifically related to the H263 encoder/decoder. --- libavcodec/x86/dsputil.asm |4 1 file changed, 4 insertions(+) diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm index 7ea796d..d245300 100644 --- a/libavcodec/x86/dsputil.asm +++ b/libavcodec/x86/dsputil.asm @@ -727,6 +727,7 @@ BSWAP32_BUF INIT_MMX mmx ; void h263_v_loop_filter(uint8_t *src, int stride, int qscale) cglobal h263_v_loop_filter, 3,5 +%if CONFIG_H263_DECODER || CONFIG_H263_ENCODER movsxdifnidn r1, r1d movsxdifnidn r2, r2d @@ -745,6 +746,7 @@ cglobal h263_v_loop_filter, 3,5 mova [r0], m4 mova [r4], m5 mova[r0+r1], m6 +%endif RET %macro TRANSPOSE4X4 2 @@ -769,6 +771,7 @@ cglobal h263_v_loop_filter, 3,5 ; void h263_h_loop_filter(uint8_t *src, int stride, int qscale) INIT_MMX mmx cglobal h263_h_loop_filter, 3,5,0,32 +%if CONFIG_H263_DECODER || CONFIG_H263_ENCODER movsxdifnidn r1, r1d movsxdifnidn r2, r2d @@ -810,4 +813,5 @@ cglobal h263_h_loop_filter, 3,5,0,32 movd [r4+r1*2], m6 punpckhdqm6, m6 movd[r4+r3], m6 +%endif RET -- 1.7.10.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] dsputil: x86: Fix int -> ptrdiff_t
On Fri, Feb 15, 2013 at 3:13 PM, Daniel Kang daniel.d.k...@gmail.com wrote: --- libavcodec/x86/dsputil_mmx.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) ping ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] dsputil: x86: Fix compile error by conditionally compiling code.
On Mon, Feb 18, 2013 at 9:21 AM, Diego Biurrun di...@biurrun.de wrote: On Mon, Feb 18, 2013 at 09:08:35AM -0500, Daniel Kang wrote: Specifically related to the H263 encoder/decoder. --- libavcodec/x86/dsputil.asm |4 1 file changed, 4 insertions(+) Note that it's linking, not a compilation failure. --- a/libavcodec/x86/dsputil.asm +++ b/libavcodec/x86/dsputil.asm @@ -727,6 +727,7 @@ BSWAP32_BUF INIT_MMX mmx ; void h263_v_loop_filter(uint8_t *src, int stride, int qscale) cglobal h263_v_loop_filter, 3,5 +%if CONFIG_H263_DECODER || CONFIG_H263_ENCODER movsxdifnidn r1, r1d movsxdifnidn r2, r2d @@ -745,6 +746,7 @@ cglobal h263_v_loop_filter, 3,5 mova [r0], m4 mova [r4], m5 mova[r0+r1], m6 +%endif RET %macro TRANSPOSE4X4 2 @@ -769,6 +771,7 @@ cglobal h263_v_loop_filter, 3,5 ; void h263_h_loop_filter(uint8_t *src, int stride, int qscale) INIT_MMX mmx cglobal h263_h_loop_filter, 3,5,0,32 +%if CONFIG_H263_DECODER || CONFIG_H263_ENCODER movsxdifnidn r1, r1d movsxdifnidn r2, r2d @@ -810,4 +813,5 @@ cglobal h263_h_loop_filter, 3,5,0,32 movd [r4+r1*2], m6 punpckhdqm6, m6 movd[r4+r3], m6 +%endif RET Can't you move the H.263 code to a separate file? That would be much cleaner than this ifdeffery. A cleaner solution would be to just remove this from dsputil since it's only called in h263 specific contexts, but that falls outside the scope of my work. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] hpeldsp: x86: Convert dsputil_rnd_template to yasm
On Fri, Feb 15, 2013 at 6:33 PM, Loren Merritt lor...@u.washington.edu wrote: On Fri, 15 Feb 2013, Daniel Kang wrote: +%macro PAVGBP_MMX 6 +mova %3, %1 +mova %6, %4 +por%3, %2 +por%6, %5 +pxor %2, %1 +pxor %5, %4 +pand %2, m6 +pand %5, m6 +psrlq %2, 1 +psrlq %5, 1 +psubb %3, %2 +psubb %6, %5 +%endmacro + +%macro PAVGBP_NO_RND_MMX 6 +mova %3, %1 +mova %6, %4 +pand %3, %2 +pand %6, %5 +pxor %2, %1 +pxor %5, %4 +pand %2, m6 +pand %5, m6 +psrlq%2, 1 +psrlq%5, 1 +paddb%3, %2 +paddb%6, %5 +%endmacro Does this need to be interleaved, not just two calls to PAVGB_OP_MMX? No, fixed. +; put_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h) +%macro PUT_PIXELS8_XY2_MMX 0-1 +cglobal put%1_pixels8_xy2, 4,5 +pxor m7, m7 +SET_RND(m6) +mova m0, [r1] +mova m4, [r1+1] +mova m1, m0 +mova m5, m4 +punpcklbwm0, m7 +punpcklbwm4, m7 +punpckhbwm1, m7 +punpckhbwm5, m7 +paddusw m4, m0 +paddusw m5, m1 +xor r4, r4 +add r1, r2 +.loop: +mova m0, [r1+r4] +mova m2, [r1+r4+1] +mova m1, m0 +mova m3, m2 +punpcklbwm0, m7 +punpcklbwm2, m7 +punpckhbwm1, m7 +punpckhbwm3, m7 +paddusw m0, m2 +paddusw m1, m3 +paddusw m4, m6 +paddusw m5, m6 +paddusw m4, m0 +paddusw m5, m1 +psrlwm4, 2 +psrlwm5, 2 +packuswb m4, m5 +mova[r0+r4], m4 +add r4, r2 +mova m2, [r1+r4] +mova m3, [r1+r4+1] +mova m3, m2 +mova m5, m4 +punpcklbwm2, m7 +punpcklbwm4, m7 +punpckhbwm3, m7 +punpckhbwm5, m7 +paddusw m4, m2 +paddusw m5, m3 +paddusw m0, m6 +paddusw m1, m6 +paddusw m0, m4 +paddusw m1, m5 +psrlwm0, 2 +psrlwm1, 2 +packuswb m0, m1 +mova[r0+r4], m0 +add r4, r2 +sub r3d, 2 +jne .loop +RET +%endmacro Does this and similar functions really need to be unrolled? If so, use %rep. Yes, due to the way this is written. I rep'd the one I could. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] dsputil: x86: Fix linker error with h263 loop filter.
This was caused by referencing a conditionally compiled table. Now the code is also compiled conditionally. --- libavcodec/x86/Makefile|2 + libavcodec/x86/dsputil.asm | 162 -- libavcodec/x86/h263_lf.asm | 187 3 files changed, 189 insertions(+), 162 deletions(-) create mode 100644 libavcodec/x86/h263_lf.asm diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index c740573..24a96a5 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -44,6 +44,8 @@ YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o YASM-OBJS-$(CONFIG_DCT)+= x86/dct32.o YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o YASM-OBJS-$(CONFIG_FFT)+= x86/fft.o +YASM-OBJS-$(CONFIG_H263_ENCODER) += x86/h263_lf.o +YASM-OBJS-$(CONFIG_H263_DECODER) += x86/h263_lf.o YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \ x86/h264_chromamc_10bit.o YASM-OBJS-$(CONFIG_H264DSP)+= x86/h264_deblock.o\ diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm index 7ea796d..4539e5c 100644 --- a/libavcodec/x86/dsputil.asm +++ b/libavcodec/x86/dsputil.asm @@ -22,8 +22,6 @@ %include libavutil/x86/x86util.asm SECTION_RODATA -cextern pb_FC -cextern h263_loop_filter_strength pb_f: times 16 db 15 pb_: times 8 db -1 pb_7: times 8 db 7 @@ -651,163 +649,3 @@ BSWAP32_BUF INIT_XMM ssse3 BSWAP32_BUF - -%macro H263_LOOP_FILTER 5 -pxor m7, m7 -mova m0, [%1] -mova m1, [%1] -mova m2, [%4] -mova m3, [%4] -punpcklbwm0, m7 -punpckhbwm1, m7 -punpcklbwm2, m7 -punpckhbwm3, m7 -psubwm0, m2 -psubwm1, m3 -mova m2, [%2] -mova m3, [%2] -mova m4, [%3] -mova m5, [%3] -punpcklbwm2, m7 -punpckhbwm3, m7 -punpcklbwm4, m7 -punpckhbwm5, m7 -psubwm4, m2 -psubwm5, m3 -psllwm4, 2 -psllwm5, 2 -paddwm4, m0 -paddwm5, m1 -pxor m6, m6 -pcmpgtw m6, m4 -pcmpgtw m7, m5 -pxor m4, m6 -pxor m5, m7 -psubwm4, m6 -psubwm5, m7 -psrlwm4, 3 -psrlwm5, 3 -packuswb m4, m5 -packsswb m6, m7 -pxor m7, m7 -movd m2, %5 -punpcklbwm2, m2 -punpcklbwm2, m2 -punpcklbwm2, m2 -psubusb m2, m4 -mova m3, m2 -psubusb m3, m4 -psubbm2, m3 -mova m3, [%2] -mova m4, 
[%3] -pxor m3, m6 -pxor m4, m6 -paddusb m3, m2 -psubusb m4, m2 -pxor m3, m6 -pxor m4, m6 -paddusb m2, m2 -packsswb m0, m1 -pcmpgtb m7, m0 -pxor m0, m7 -psubbm0, m7 -mova m1, m0 -psubusb m0, m2 -psubbm1, m0 -pand m1, [pb_FC] -psrlwm1, 2 -pxor m1, m7 -psubbm1, m7 -mova m5, [%1] -mova m6, [%4] -psubbm5, m1 -paddbm6, m1 -%endmacro - -INIT_MMX mmx -; void h263_v_loop_filter(uint8_t *src, int stride, int qscale) -cglobal h263_v_loop_filter, 3,5 -movsxdifnidn r1, r1d -movsxdifnidn r2, r2d - -lea r4, [h263_loop_filter_strength] -movzx r3d, BYTE [r4+r2] -movsxr2, r3b -shl r2, 1 - -mov r3, r0 -sub r3, r1 -mov r4, r3 -sub r4, r1 -H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d - -mova [r3], m3 -mova [r0], m4 -mova [r4], m5 -mova[r0+r1], m6 -RET - -%macro TRANSPOSE4X4 2 -movd m0, [%1] -movd m1, [%1+r1] -movd m2, [%1+r1*2] -movd m3, [%1+r3] -punpcklbw m0, m1 -punpcklbw m2, m3 -mova m1, m0 -punpcklwd m0, m2 -punpckhwd m1, m2 -movd [%2+ 0], m0 -punpckhdq m0, m0 -movd [%2+ 8], m0 -movd [%2+16], m1 -punpckhdq m1, m1 -movd [%2+24], m1 -%endmacro - - -; void h263_h_loop_filter(uint8_t *src, int stride, int qscale) -INIT_MMX mmx -cglobal h263_h_loop_filter, 3,5,0,32 -movsxdifnidn r1, r1d -movsxdifnidn r2, r2d - -lea r4, [h263_loop_filter_strength] -movzx r3d, BYTE [r4+r2] -movsxr2, r3b -shl r2, 1 - -sub r0, 2 -lea r3, [r1*3] - -TRANSPOSE4X4 r0, rsp -lea r4, [r0+r1*4] -TRANSPOSE4X4 r4, rsp+4 - -H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d - -mova m1, m5 -mova m0, m4 -punpcklbwm5, m3 -punpcklbwm4, m6 -punpckhbwm1, m3 -punpckhbwm0, m6 -mova m3, m5 -mova
[libav-devel] [PATCH] dsputil: x86: Fix linker error with h263 loop filter.
This was caused by referencing a conditionally compiled table. Now the code is also compiled conditionally. --- Change order and filename --- libavcodec/x86/Makefile|2 + libavcodec/x86/dsputil.asm | 162 --- libavcodec/x86/h263_loopfilter.asm | 187 3 files changed, 189 insertions(+), 162 deletions(-) create mode 100644 libavcodec/x86/h263_loopfilter.asm diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index c740573..a759e6e 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -44,6 +44,8 @@ YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o YASM-OBJS-$(CONFIG_DCT)+= x86/dct32.o YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o YASM-OBJS-$(CONFIG_FFT)+= x86/fft.o +YASM-OBJS-$(CONFIG_H263_DECODER) += x86/h263_loopfilter.o +YASM-OBJS-$(CONFIG_H263_ENCODER) += x86/h263_loopfilter.o YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \ x86/h264_chromamc_10bit.o YASM-OBJS-$(CONFIG_H264DSP)+= x86/h264_deblock.o\ diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm index 7ea796d..4539e5c 100644 --- a/libavcodec/x86/dsputil.asm +++ b/libavcodec/x86/dsputil.asm @@ -22,8 +22,6 @@ %include libavutil/x86/x86util.asm SECTION_RODATA -cextern pb_FC -cextern h263_loop_filter_strength pb_f: times 16 db 15 pb_: times 8 db -1 pb_7: times 8 db 7 @@ -651,163 +649,3 @@ BSWAP32_BUF INIT_XMM ssse3 BSWAP32_BUF - -%macro H263_LOOP_FILTER 5 -pxor m7, m7 -mova m0, [%1] -mova m1, [%1] -mova m2, [%4] -mova m3, [%4] -punpcklbwm0, m7 -punpckhbwm1, m7 -punpcklbwm2, m7 -punpckhbwm3, m7 -psubwm0, m2 -psubwm1, m3 -mova m2, [%2] -mova m3, [%2] -mova m4, [%3] -mova m5, [%3] -punpcklbwm2, m7 -punpckhbwm3, m7 -punpcklbwm4, m7 -punpckhbwm5, m7 -psubwm4, m2 -psubwm5, m3 -psllwm4, 2 -psllwm5, 2 -paddwm4, m0 -paddwm5, m1 -pxor m6, m6 -pcmpgtw m6, m4 -pcmpgtw m7, m5 -pxor m4, m6 -pxor m5, m7 -psubwm4, m6 -psubwm5, m7 -psrlwm4, 3 -psrlwm5, 3 -packuswb m4, m5 -packsswb m6, m7 -pxor m7, m7 -movd m2, %5 -punpcklbwm2, m2 -punpcklbwm2, m2 -punpcklbwm2, m2 -psubusb m2, m4 -mova 
m3, m2 -psubusb m3, m4 -psubbm2, m3 -mova m3, [%2] -mova m4, [%3] -pxor m3, m6 -pxor m4, m6 -paddusb m3, m2 -psubusb m4, m2 -pxor m3, m6 -pxor m4, m6 -paddusb m2, m2 -packsswb m0, m1 -pcmpgtb m7, m0 -pxor m0, m7 -psubbm0, m7 -mova m1, m0 -psubusb m0, m2 -psubbm1, m0 -pand m1, [pb_FC] -psrlwm1, 2 -pxor m1, m7 -psubbm1, m7 -mova m5, [%1] -mova m6, [%4] -psubbm5, m1 -paddbm6, m1 -%endmacro - -INIT_MMX mmx -; void h263_v_loop_filter(uint8_t *src, int stride, int qscale) -cglobal h263_v_loop_filter, 3,5 -movsxdifnidn r1, r1d -movsxdifnidn r2, r2d - -lea r4, [h263_loop_filter_strength] -movzx r3d, BYTE [r4+r2] -movsxr2, r3b -shl r2, 1 - -mov r3, r0 -sub r3, r1 -mov r4, r3 -sub r4, r1 -H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d - -mova [r3], m3 -mova [r0], m4 -mova [r4], m5 -mova[r0+r1], m6 -RET - -%macro TRANSPOSE4X4 2 -movd m0, [%1] -movd m1, [%1+r1] -movd m2, [%1+r1*2] -movd m3, [%1+r3] -punpcklbw m0, m1 -punpcklbw m2, m3 -mova m1, m0 -punpcklwd m0, m2 -punpckhwd m1, m2 -movd [%2+ 0], m0 -punpckhdq m0, m0 -movd [%2+ 8], m0 -movd [%2+16], m1 -punpckhdq m1, m1 -movd [%2+24], m1 -%endmacro - - -; void h263_h_loop_filter(uint8_t *src, int stride, int qscale) -INIT_MMX mmx -cglobal h263_h_loop_filter, 3,5,0,32 -movsxdifnidn r1, r1d -movsxdifnidn r2, r2d - -lea r4, [h263_loop_filter_strength] -movzx r3d, BYTE [r4+r2] -movsxr2, r3b -shl r2, 1 - -sub r0, 2 -lea r3, [r1*3] - -TRANSPOSE4X4 r0, rsp -lea r4, [r0+r1*4] -TRANSPOSE4X4 r4, rsp+4 - -H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d - -mova m1, m5 -mova m0, m4 -punpcklbwm5, m3 -punpcklbwm4, m6 -punpckhbwm1, m3 -
[libav-devel] [PATCH] dsputil: x86: Fix int -> ptrdiff_t
--- libavcodec/x86/dsputil_mmx.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index c011a21..fbc4b01 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -147,7 +147,7 @@ void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels, void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, - int line_size, int h) + ptrdiff_t line_size, int h) { ff_put_pixels8_mmxext(block, pixels, line_size, h); ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h); -- 1.7.10.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] hpeldsp: x86: Convert dsputil_rnd_template to yasm
On Fri, Feb 15, 2013 at 1:53 PM, Diego Biurrun di...@biurrun.de wrote: On Fri, Feb 15, 2013 at 12:53:44AM -0500, Daniel Kang wrote: On Thu, Feb 14, 2013 at 7:59 AM, Diego Biurrun di...@biurrun.de wrote: On Wed, Feb 13, 2013 at 05:53:36PM -0500, Daniel Kang wrote: --- a/libavcodec/x86/cavsdsp.c +++ b/libavcodec/x86/cavsdsp.c @@ -475,12 +481,18 @@ CAVS_MC(put_, 8, 3dnow) CAVS_MC(put_, 16,3dnow) CAVS_MC(avg_, 8, 3dnow) CAVS_MC(avg_, 16,3dnow) +#endif /* HAVE_AMD3DNOW_INLINE */ static av_cold void ff_cavsdsp_init_3dnow(CAVSDSPContext *c, AVCodecContext *avctx) { +#if HAVE_YASM +c-put_cavs_qpel_pixels_tab[0][0] = ff_put_cavs_qpel16_mc00_mmxext; +c-put_cavs_qpel_pixels_tab[1][0] = ff_put_cavs_qpel16_mc00_mmxext; +#endif +#if HAVE_INLINE_ASM #define dspfunc(PFX, IDX, NUM) \ -c-PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmxext; \ c-PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \ c-PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \ mmxext functions in the 3dnow init function? Yes this is correct. Does not contain any mmxext specific instructions. That's not my definition of correct. It is not wrongly placed, just wrongly named then. Please fix the name. What do you suggest? --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -128,26 +136,45 @@ void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block, void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, - int line_size, int h) + ptrdiff_t line_size, int h) Is there a reason not to do this separately, i.e. right away? No. So let's go ahead and change it separately. 
:) Sure --- a/libavcodec/x86/dsputil_rnd_template.c +++ b/libavcodec/x86/dsputil_rnd_template.c @@ -25,570 +25,28 @@ //FIXME optimize -static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ -DEF(put, pixels8_y2)(block , pixels , line_size, h); -DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); +static void DEF(ff_put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ +DEF(ff_put, pixels8_y2)(block , pixels , line_size, h); +DEF(ff_put, pixels8_y2)(block+8, pixels+8, line_size, h); } Is the FIXME comment still valid in some way? Yes and no. There are mmxext versions of the same thing and they're faster anyway. So it's cruft more than anything else, please delete it. Done +lea r1, [r1+r2*2] +lea r0, [r0+r2*2] +sub r3d, 4 +jne .loop +RET Weird placement of .loop; I suggest aligning it with the rest. Probably it is handled inconsistently throughout... All my code I've written has the loop in that placement. It's very possible it's inconsistent across files. I suggest no idiosyncratic formatting for jump instructions and/or maintaining the style of the file. The majority of the code in that file already has loops in that format. The exceptions are the two functions at the bottom (the sse2 ones). ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] hpeldsp: x86: Convert dsputil_rnd_template to yasm
On Thu, Feb 14, 2013 at 7:59 AM, Diego Biurrun di...@biurrun.de wrote: On Wed, Feb 13, 2013 at 05:53:36PM -0500, Daniel Kang wrote: --- a/libavcodec/x86/cavsdsp.c +++ b/libavcodec/x86/cavsdsp.c @@ -475,12 +481,18 @@ CAVS_MC(put_, 8, 3dnow) CAVS_MC(put_, 16,3dnow) CAVS_MC(avg_, 8, 3dnow) CAVS_MC(avg_, 16,3dnow) +#endif /* HAVE_AMD3DNOW_INLINE */ static av_cold void ff_cavsdsp_init_3dnow(CAVSDSPContext *c, AVCodecContext *avctx) { +#if HAVE_YASM +c-put_cavs_qpel_pixels_tab[0][0] = ff_put_cavs_qpel16_mc00_mmxext; +c-put_cavs_qpel_pixels_tab[1][0] = ff_put_cavs_qpel16_mc00_mmxext; +#endif +#if HAVE_INLINE_ASM #define dspfunc(PFX, IDX, NUM) \ -c-PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmxext; \ c-PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \ c-PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \ mmxext functions in the 3dnow init function? Yes this is correct. Does not contain any mmxext specific instructions. --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -128,26 +136,45 @@ void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block, void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, - int line_size, int h) + ptrdiff_t line_size, int h) Is there a reason not to do this separately, i.e. right away? No. 
--- a/libavcodec/x86/dsputil_rnd_template.c +++ b/libavcodec/x86/dsputil_rnd_template.c @@ -25,570 +25,28 @@ //FIXME optimize -static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ -DEF(put, pixels8_y2)(block , pixels , line_size, h); -DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); +static void DEF(ff_put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){ +DEF(ff_put, pixels8_y2)(block , pixels , line_size, h); +DEF(ff_put, pixels8_y2)(block+8, pixels+8, line_size, h); } Is the FIXME comment still valid in some way? Yes and no. There are mmxext versions of the same thing and they're faster anyway. Please prettyprint those lines that you are changing anyway, I think I gave you an sed expression to do it automatically the last time. It should still work and/or be easy to adopt. Done. @@ -56,6 +107,44 @@ PUT_PIXELS8_X2 +%macro PUT_PIXELS8_X2_MMX 0-1 +%if %0 == 1 +cglobal put%1_pixels8_x2, 4,4 +%else +cglobal put_pixels8_x2, 4,4 +%endif IIRC you don't need the %if, but you can just pass an empty first parameter and it should do the right thing. .. more below .. I tried this (and Ronald's suggestion) and I get the error: libavcodec/x86/hpeldsp.asm:142: error: (cglobal_internal:8) `%ifndef' expects macro identifiers I suspect this has to do with putting the %1 in the middle of the string. Suggestions appreciated. +lea r1, [r1+r2*2] +lea r0, [r0+r2*2] +sub r3d, 4 +jne .loop +RET Weird placement of .loop; I suggest aligning it with the rest. Probably it is handled inconsistently throughout... All my code I've written has the loop in that placement. It's very possible it's inconsistent across files. @@ -453,6 +753,201 @@ AVG_PIXELS8_XY2 +%macro AVG_PIXELS8_XY2_MMX 0-1 Some macros have comments with the C functions they implement, some don't. Please add the comments everywhere, I consider them helpful. Done. 
___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] mpeg4qpel: Make movsxdifnidn do the right thing
Fixes an instruction that does nothing by changing the source to dword --- libavcodec/x86/mpeg4qpel.asm |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/x86/mpeg4qpel.asm b/libavcodec/x86/mpeg4qpel.asm index 6b5d203..df20ea9 100644 --- a/libavcodec/x86/mpeg4qpel.asm +++ b/libavcodec/x86/mpeg4qpel.asm @@ -100,7 +100,7 @@ PUT_NO_RND_PIXELS8_L2 ; put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) %macro PUT_NO_RND_PIXELS16_l2 0 cglobal put_no_rnd_pixels16_l2, 6,6 -movsxdifnidn r3, r3 +movsxdifnidn r3, r3d movsxdifnidn r4, r4d pcmpeqb m6, m6 testr5d, 1 -- 1.7.10.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] dsputil: x86: Convert h263 loop filter to yasm
On Tue, Feb 5, 2013 at 1:23 PM, Diego Biurrun di...@biurrun.de wrote: On Fri, Feb 01, 2013 at 06:28:30PM -0500, Daniel Kang wrote: --- a/libavcodec/x86/dsputil.asm +++ b/libavcodec/x86/dsputil.asm @@ -648,3 +650,160 @@ BSWAP32_BUF + +INIT_MMX mmx +cglobal h263_v_loop_filter, 3,5 +movsxdifnidn r1, r1 +movsxdifnidn r2, r2 + +INIT_MMX mmx +cglobal h263_h_loop_filter, 3,5,0,32 +movsxdifnidn r1, r1 +movsxdifnidn r2, r2 Is the sign extension still needed after all the ptrdiff_t changes I did? Yes. You didn't touch h263 loop filter, either that or your changes haven't been pushed. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] dsputil: x86: Convert h263 loop filter to yasm
On Fri, Feb 1, 2013 at 6:28 PM, Daniel Kang daniel.d.k...@gmail.com wrote: --- I am very skeptical when assembly works on the first time. More testing would be appreciated. --- libavcodec/x86/dsputil.asm | 159 libavcodec/x86/dsputil_mmx.c | 185 ++ 2 files changed, 167 insertions(+), 177 deletions(-) Ping? ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] dsputil: x86: Convert h263 loop filter to yasm
On Tue, Feb 5, 2013 at 9:16 AM, Luca Barbato lu_z...@gentoo.org wrote: On 02/02/13 00:28, Daniel Kang wrote: --- I am very skeptical when assembly works on the first time. More testing would be appreciated. --- libavcodec/x86/dsputil.asm | 159 libavcodec/x86/dsputil_mmx.c | 185 ++ 2 files changed, 167 insertions(+), 177 deletions(-) /usr/lib/gcc/x86_64-pc-linux-gnu/4.6.3/../../../../x86_64-pc-linux-gnu/bin/ld: libavcodec/libavcodec.a(dsputil.o): relocation R_X86_64_32 against `ff_h263_loop_filter_strength' can not be used when making a shared object; recompile with -fPIC libavcodec/libavcodec.a: could not read symbols: Bad value Looks like something is broken for x86_64. What configure options are you using? ./configure make works just fine with me with a clean build, on master. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] dsputil: x86: Convert h263 loop filter to yasm
On Tue, Feb 5, 2013 at 1:04 PM, Luca Barbato lu_z...@gentoo.org wrote: On 05/02/13 18:31, Daniel Kang wrote: On Tue, Feb 5, 2013 at 9:16 AM, Luca Barbato lu_z...@gentoo.org wrote: On 02/02/13 00:28, Daniel Kang wrote: --- I am very skeptical when assembly works on the first time. More testing would be appreciated. --- libavcodec/x86/dsputil.asm | 159 libavcodec/x86/dsputil_mmx.c | 185 ++ 2 files changed, 167 insertions(+), 177 deletions(-) /usr/lib/gcc/x86_64-pc-linux-gnu/4.6.3/../../../../x86_64-pc-linux-gnu/bin/ld: libavcodec/libavcodec.a(dsputil.o): relocation R_X86_64_32 against `ff_h263_loop_filter_strength' can not be used when making a shared object; recompile with -fPIC libavcodec/libavcodec.a: could not read symbols: Bad value Looks like something is broken for x86_64. What configure options are you using? ./configure make works just fine with me with a clean build, on master. Here does not and it is a clean checkout with that patch on top. Which compiler are you using? lu ddkang@ddkang ~/code/libav $ gcc --version gcc (Ubuntu/Linaro 4.7.2-2ubuntu1) 4.7.2 Copyright (C) 2012 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] dsputil: x86: Convert h263 loop filter to yasm
--- I am very skeptical when assembly works on the first time. More testing would be appreciated. --- libavcodec/x86/dsputil.asm | 159 libavcodec/x86/dsputil_mmx.c | 185 ++ 2 files changed, 167 insertions(+), 177 deletions(-) diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm index 8002779..b7b7046 100644 --- a/libavcodec/x86/dsputil.asm +++ b/libavcodec/x86/dsputil.asm @@ -22,6 +22,8 @@ %include libavutil/x86/x86util.asm SECTION_RODATA +cextern pb_FC +cextern h263_loop_filter_strength pb_f: times 16 db 15 pb_: times 8 db -1 pb_7: times 8 db 7 @@ -648,3 +650,160 @@ BSWAP32_BUF INIT_XMM ssse3 BSWAP32_BUF + + +%macro H263_LOOP_FILTER 5 +pxor m7, m7 +mova m0, [%1] +mova m1, [%1] +mova m2, [%4] +mova m3, [%4] +punpcklbwm0, m7 +punpckhbwm1, m7 +punpcklbwm2, m7 +punpckhbwm3, m7 +psubwm0, m2 +psubwm1, m3 +mova m2, [%2] +mova m3, [%2] +mova m4, [%3] +mova m5, [%3] +punpcklbwm2, m7 +punpckhbwm3, m7 +punpcklbwm4, m7 +punpckhbwm5, m7 +psubwm4, m2 +psubwm5, m3 +psllwm4, 2 +psllwm5, 2 +paddwm4, m0 +paddwm5, m1 +pxor m6, m6 +pcmpgtw m6, m4 +pcmpgtw m7, m5 +pxor m4, m6 +pxor m5, m7 +psubwm4, m6 +psubwm5, m7 +psrlwm4, 3 +psrlwm5, 3 +packuswb m4, m5 +packsswb m6, m7 +pxor m7, m7 +movd m2, %5 +punpcklbwm2, m2 +punpcklbwm2, m2 +punpcklbwm2, m2 +psubusb m2, m4 +mova m3, m2 +psubusb m3, m4 +psubbm2, m3 +mova m3, [%2] +mova m4, [%3] +pxor m3, m6 +pxor m4, m6 +paddusb m3, m2 +psubusb m4, m2 +pxor m3, m6 +pxor m4, m6 +paddusb m2, m2 +packsswb m0, m1 +pcmpgtb m7, m0 +pxor m0, m7 +psubbm0, m7 +mova m1, m0 +psubusb m0, m2 +psubbm1, m0 +pand m1, [pb_FC] +psrlwm1, 2 +pxor m1, m7 +psubbm1, m7 +mova m5, [%1] +mova m6, [%4] +psubbm5, m1 +paddbm6, m1 +%endmacro + +INIT_MMX mmx +cglobal h263_v_loop_filter, 3,5 +movsxdifnidn r1, r1 +movsxdifnidn r2, r2 + +movzx r3d, BYTE [ff_h263_loop_filter_strength+r2] +movsxr2, r3b +shl r2, 1 + +mov r3, r0 +sub r3, r1 +mov r4, r3 +sub r4, r1 +H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d + +mova [r3], m3 +mova [r0], m4 +mova [r4], m5 +mova[r0+r1], m6 
+RET + +%macro TRANSPOSE4X4 2 +movd m0, [%1] +movd m1, [%1+r1] +movd m2, [%1+r1*2] +movd m3, [%1+r3] +punpcklbw m0, m1 +punpcklbw m2, m3 +mova m1, m0 +punpcklwd m0, m2 +punpckhwd m1, m2 +movd [%2+ 0], m0 +punpckhdq m0, m0 +movd [%2+ 8], m0 +movd [%2+16], m1 +punpckhdq m1, m1 +movd [%2+24], m1 +%endmacro + + +INIT_MMX mmx +cglobal h263_h_loop_filter, 3,5,0,32 +movsxdifnidn r1, r1 +movsxdifnidn r2, r2 + +movzx r3d, BYTE [ff_h263_loop_filter_strength+r2] +movsxr2, r3b +shl r2, 1 + +sub r0, 2 +lea r3, [r1*3] + +TRANSPOSE4X4 r0, rsp +lea r4, [r0+r1*4] +TRANSPOSE4X4 r4, rsp+4 + +H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d + +mova m1, m5 +mova m0, m4 +punpcklbwm5, m3 +punpcklbwm4, m6 +punpckhbwm1, m3 +punpckhbwm0, m6 +mova m3, m5 +mova m6, m1 +punpcklwdm5, m4 +punpcklwdm1, m0 +punpckhwdm3, m4 +punpckhwdm6, m0 +movd [r0], m5 +punpckhdqm5, m5 +movd [r0+r1*1], m5 +movd [r0+r1*2], m3 +punpckhdqm3, m3 +movd[r0+r3], m3 +movd [r4], m1 +punpckhdqm1, m1 +movd [r4+r1*1], m1 +movd [r4+r1*2], m6 +punpckhdqm6, m6 +movd[r4+r3], m6 +RET diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 3ccef62..7039095 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -650,181 +650,12 @@ static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, *left_top = tl; } #endif +#endif /* HAVE_INLINE_ASM */ -static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){ -__asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ... -movd (%1), %%mm0 \n\t -add %3,
[libav-devel] [PATCH] dsputil: Fix error by not using redzone
--- I currently have no way of testing if this fixes the mingw32 failures or not. Testing would be appreciated --- libavcodec/x86/mpeg4qpel.asm |6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libavcodec/x86/mpeg4qpel.asm b/libavcodec/x86/mpeg4qpel.asm index a5525d2..6b5d203 100644 --- a/libavcodec/x86/mpeg4qpel.asm +++ b/libavcodec/x86/mpeg4qpel.asm @@ -168,7 +168,7 @@ INIT_MMX 3dnow PUT_NO_RND_PIXELS16_l2 %macro MPEG4_QPEL16_H_LOWPASS 1 -cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 8 +cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16 movsxdifnidn r2, r2d movsxdifnidn r3, r3d pxor m7, m7 @@ -201,7 +201,7 @@ cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 8 paddwm6, [PW_ROUND] paddwm0, m6 psrawm0, 5 -mova[rsp-8], m0 +mova[rsp+8], m0 mova m0, [r1+5] mova m5, m0 mova m6, m0 @@ -225,7 +225,7 @@ cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 8 paddwm1, [PW_ROUND] paddwm3, m1 psrawm3, 5 -mova m1, [rsp-8] +mova m1, [rsp+8] packuswb m1, m3 OP_MOV [r0], m1, m4 mova m1, [r1+9] -- 1.7.10.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH v2] dsputil: x86: Convert mpeg4 qpel and dsputil avg to yasm
On Sat, Jan 26, 2013 at 1:25 PM, Diego Biurrun di...@biurrun.de wrote: On Sat, Jan 26, 2013 at 01:01:09PM -0500, Daniel Kang wrote: On Sat, Jan 26, 2013 at 3:23 AM, Diego Biurrun di...@biurrun.de wrote: On Sat, Jan 26, 2013 at 12:32:16AM -0500, Daniel Kang wrote: --- a/libavcodec/x86/dsputil.asm +++ b/libavcodec/x86/dsputil.asm @@ -879,3 +884,984 @@ cglobal avg_pixels16, 4,5,4 +; mpeg4 qpel + +%macro MPEG4_QPEL16_H_LOWPASS 1 +cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 8 So it seems like dsputil.asm is becoming the new dumping ground for functions of all kind. It doubles in size after your patch and at around 2k lines it starts to work against our current efforts of splitting dsputil into sensibly-sized pieces. If you continue your porting efforts, it will probably end up around 5k lines or so. Whenever there is an opportunity to make dsputil less monolithic comes up, we should exploit it. That seems to be the case here. I was trying to avoid drama and bikeshedding re: file names and save that for another patch. I guess I could split it in this patch if you want. Come on, don't blow the issue out of proportion. Just come up with a suitable name, maybe ask one or two other people that know the code for suitable suggestions. My suggestion would be mpeg4qpel.asm, maybe h263qpel.asm, but the former is probably more fitting, not sure. Even in case you should get three different suggestions and change to one after the other, it's easy enough with git and will not hinder your workflow at all. However, going back and forth after your patch has been pushed just creates unnecessary churn and annoyance. 
Very well, moved to mpeg4qpel.asm +%macro QPEL_V_LOW 5 +paddw m0, m1 +mova m4, [pw_20] +pmullw m4, m0 +mova m0, %4 +mova m5, %1 +paddw m5, m0 +psubw m4, m5 +mova m5, %2 +mova m6, %3 +paddw m5, m3 +paddw m6, m2 +paddw m6, m6 +psubw m5, m6 +pmullw m5, [pw_3] +paddw m4, [PW_ROUND] +paddw m5, m4 +psraw m5, 5 +packuswb m5, m5 +OP_MOV %5, m5, m7 +SWAP 0,1,2,3 +%endmacro nit: SWAP is not special, format its arguments like the rest of the macro instructions. I disagree on this one, I think SWAP is special. The rest of the codebase disagrees with you then. In the rest of the files SWAP has spaces after comma and arguments aligned with the other instructions. Only some of it does, but changed. --- a/libavcodec/x86/dsputil_avg_template.c +++ b/libavcodec/x86/dsputil_avg_template.c @@ -24,781 +24,32 @@ //FIXME the following could be optimized too ... +static void DEF(ff_put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ +DEF(ff_put_no_rnd_pixels8_x2)(block , pixels , line_size, h); +DEF(ff_put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); } +static void DEF(ff_put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ +DEF(ff_put_pixels8_y2)(block , pixels , line_size, h); +DEF(ff_put_pixels8_y2)(block+8, pixels+8, line_size, h); } +static void DEF(ff_put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ +DEF(ff_put_no_rnd_pixels8_y2)(block , pixels , line_size, h); +DEF(ff_put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); } +static void DEF(ff_avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ +DEF(ff_avg_pixels8)(block , pixels , line_size, h); +DEF(ff_avg_pixels8)(block+8, pixels+8, line_size, h); } +static void DEF(ff_avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ +DEF(ff_avg_pixels8_x2)(block , pixels , line_size, h); +DEF(ff_avg_pixels8_x2)(block+8, pixels+8, line_size, h); } +static void 
DEF(ff_avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ +DEF(ff_avg_pixels8_y2)(block , pixels , line_size, h); +DEF(ff_avg_pixels8_y2)(block+8, pixels+8, line_size, h); } +static void DEF(ff_avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ +DEF(ff_avg_pixels8_xy2)(block , pixels , line_size, h); +DEF(ff_avg_pixels8_xy2)(block+8, pixels+8, line_size, h); } If you feel motivated, you could fix the formatting as you are changing all lines anyway. Fixed. Hehe, sort of :) Try running the following (GNU) sed command on your tree: sed -i -e 's/+/ + /g' -e 's/ ,/,/g' libavcodec/x86/dsputil_avg_template.c That should prettyprint it nicely. Fixed. --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -80,6 +80,143 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEF
[libav-devel] [PATCH] dsputil: Fix compile error
Fixes under --disable-optimizations --disable-yasm --disable-inline-asm Due to misplaced HAVE_YASM guard --- libavcodec/x86/dsputil_mmx.c |2 ++ 1 file changed, 2 insertions(+) diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 743a7c1..3ccef62 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -81,6 +81,7 @@ DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; +#if HAVE_YASM void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h); void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, @@ -179,6 +180,7 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride); #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext +#endif /* HAVE_YASM */ #if HAVE_INLINE_ASM -- 1.7.10.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] dsputil: Fix error with wrong number of registers
Allocated 5 instead of 6 registers --- libavcodec/x86/mpeg4qpel.asm |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/x86/mpeg4qpel.asm b/libavcodec/x86/mpeg4qpel.asm index 39c9fc8..a5525d2 100644 --- a/libavcodec/x86/mpeg4qpel.asm +++ b/libavcodec/x86/mpeg4qpel.asm @@ -99,7 +99,7 @@ PUT_NO_RND_PIXELS8_L2 ; put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) %macro PUT_NO_RND_PIXELS16_l2 0 -cglobal put_no_rnd_pixels16_l2, 5,5 +cglobal put_no_rnd_pixels16_l2, 6,6 movsxdifnidn r3, r3 movsxdifnidn r4, r4d pcmpeqb m6, m6 -- 1.7.10.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH v2] dsputil: x86: Convert mpeg4 qpel and dsputil avg to yasm
On Wed, Jan 23, 2013 at 12:36 PM, Ronald S. Bultje rsbul...@gmail.com wrote: Hi Daniel, On Tue, Jan 22, 2013 at 11:19 PM, Daniel Kang daniel.d.k...@gmail.com wrote: @@ -1330,10 +1087,12 @@ static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,\ { \ uint64_t half[8 + 9]; \ uint8_t * const halfH = ((uint8_t*)half); \ -put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ -stride, 9); \ -put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \ -OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \ +ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ + stride, 9); \ +ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \ +8, stride, 9); \ +ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \ + stride, 8); \ } \ So, for all cases like this, does this actually affect speed? I mean, previously this could be inlined, now it no longer can be. I wonder if that has any effect on speed (i.e. was it ever inlined previously?). Depending on the architecture (??) the functions are inlined, but are often not. I suspect GCC's insane method of reordering registers swallows any overhead from calling these functions, but due to macro hell, I'm not sure of the best way to test this. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH v2] dsputil: x86: Convert mpeg4 qpel and dsputil avg to yasm
On Wed, Jan 23, 2013 at 4:14 PM, Daniel Kang daniel.d.k...@gmail.com wrote: On Wed, Jan 23, 2013 at 12:36 PM, Ronald S. Bultje rsbul...@gmail.com wrote: Hi Daniel, On Tue, Jan 22, 2013 at 11:19 PM, Daniel Kang daniel.d.k...@gmail.com wrote: @@ -1330,10 +1087,12 @@ static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,\ { \ uint64_t half[8 + 9]; \ uint8_t * const halfH = ((uint8_t*)half); \ -put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ -stride, 9); \ -put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \ -OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \ +ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ + stride, 9); \ +ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \ +8, stride, 9); \ +ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \ + stride, 8); \ } \ So, for all cases like this, does this actually affect speed? I mean, previously this could be inlined, now it no longer can be. I wonder if that has any effect on speed (i.e. was it ever inlined previously?). Depending on the architecture (??) the functions are inlined, but are often not. I suspect GCC's insane method of reordering registers swallows any overhead from calling these functions, but due to macro hell, I'm not sure of the best way to test this. Sorry, this was not very clear. I think the yasm version is faster despite calling overhead, because GCC uses some ridiculous method of reordering registers for the inline assembly. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] dsputil: x86: Convert some inline asm to yasm
On Tue, Jan 22, 2013 at 5:10 PM, Diego Biurrun di...@biurrun.de wrote: On Tue, Jan 22, 2013 at 04:40:34PM -0500, Daniel Kang wrote: --- a/libavcodec/x86/dsputil_avg_template.c +++ b/libavcodec/x86/dsputil_avg_template.c @@ -24,781 +24,32 @@ //FIXME the following could be optimized too ... +static void DEF(ff_put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ +DEF(ff_put_no_rnd_pixels8_x2)(block , pixels , line_size, h); +DEF(ff_put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); } +static void DEF(ff_put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ +DEF(ff_put_pixels8_y2)(block , pixels , line_size, h); +DEF(ff_put_pixels8_y2)(block+8, pixels+8, line_size, h); } +static void DEF(ff_put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ +DEF(ff_put_no_rnd_pixels8_y2)(block , pixels , line_size, h); +DEF(ff_put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); } +static void DEF(ff_avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ +DEF(ff_avg_pixels8)(block , pixels , line_size, h); +DEF(ff_avg_pixels8)(block+8, pixels+8, line_size, h); } +static void DEF(ff_avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ +DEF(ff_avg_pixels8_x2)(block , pixels , line_size, h); +DEF(ff_avg_pixels8_x2)(block+8, pixels+8, line_size, h); } +static void DEF(ff_avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ +DEF(ff_avg_pixels8_y2)(block , pixels , line_size, h); +DEF(ff_avg_pixels8_y2)(block+8, pixels+8, line_size, h); } +static void DEF(ff_avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ +DEF(ff_avg_pixels8_xy2)(block , pixels , line_size, h); +DEF(ff_avg_pixels8_xy2)(block+8, pixels+8, line_size, h); } Moving this to a macro and deleting the file seems saner to me. Maybe there are other opinions though... I was trying to avoid more macro hell in dsputil. 
Suggestions appreciated. --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -83,6 +83,147 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEF +#if HAVE_YASM +/* VC-1-specific */ +#define ff_put_pixels8_mmx ff_put_pixels8_mmxext +void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, + int stride, int rnd) +{ +ff_put_pixels8_mmx(dst, src, stride, 8); +} + +void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src, + int stride, int rnd) +{ +ff_avg_pixels8_mmxext(dst, src, stride, 8); +} Is this used outside of VC-1? If no, this should be split out and moved to a VC-1-specific file. The avg and put pixels functions are. I am fairly confident the others aren't. +/***/ +/* 3Dnow specific */ + +#define DEF(x) x ## _3dnow + +#include dsputil_avg_template.c + +#undef DEF + +/***/ +/* MMXEXT specific */ + +#define DEF(x) x ## _mmxext + +#include dsputil_avg_template.c + +#undef DEF + + + +#endif /* HAVE_YASM */ + + + + #if HAVE_INLINE_ASM nit: stray large amount of empty lines Fixed. --- a/libavcodec/x86/dsputil.asm +++ b/libavcodec/x86/dsputil.asm @@ -879,3 +884,986 @@ cglobal avg_pixels16, 4,5,4 lea r0, [r0+r2*4] jnz .loop REP_RET + + + + +; HPEL mmxext +%macro PAVGB_OP 2 nit: 4 empty lines looks slightly weird; in that file 2 empty lines between unrelated blocks seem to be the norm. Fixed. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH v2] dsputilenc: x86: Convert pixel inline asm to yasm
On Wed, Jan 16, 2013 at 2:41 AM, Daniel Kang daniel.d.k...@gmail.com wrote: --- Fixed movu -> mova comment from Loren --- libavcodec/x86/dsputilenc.asm | 152 + libavcodec/x86/dsputilenc_mmx.c | 201 --- 2 files changed, 172 insertions(+), 181 deletions(-) Ping? ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH v2] dsputilenc: x86: Convert pixel inline asm to yasm
--- Fixed movu - mova comment from Loren --- libavcodec/x86/dsputilenc.asm | 152 + libavcodec/x86/dsputilenc_mmx.c | 201 --- 2 files changed, 172 insertions(+), 181 deletions(-) diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm index a2cb7f9..7b8763c 100644 --- a/libavcodec/x86/dsputilenc.asm +++ b/libavcodec/x86/dsputilenc.asm @@ -333,3 +333,155 @@ cglobal sse16, 5, 5, 8 paddd m7, m1 movd eax, m7 ; return value RET + +INIT_MMX mmx +; get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) +cglobal get_pixels, 3,4 +movsxdifnidn r2, r2d +add r0, 128 +mov r3, -128 +pxor m7, m7 +.loop: +mova m0, [r1] +mova m2, [r1+r2] +mova m1, m0 +mova m3, m2 +punpcklbwm0, m7 +punpckhbwm1, m7 +punpcklbwm2, m7 +punpckhbwm3, m7 +mova [r0+r3+ 0], m0 +mova [r0+r3+ 8], m1 +mova [r0+r3+16], m2 +mova [r0+r3+24], m3 +lea r1, [r1+r2*2] +add r3, 32 +js .loop +REP_RET + +INIT_XMM sse2 +cglobal get_pixels, 3, 4 +movsxdifnidn r2, r2d +lea r3, [r2*3] +pxor m4, m4 +movh m0, [r1] +movh m1, [r1+r2] +movh m2, [r1+r2*2] +movh m3, [r1+r3] +lea r1, [r1+r2*4] +punpcklbwm0, m4 +punpcklbwm1, m4 +punpcklbwm2, m4 +punpcklbwm3, m4 +mova [r0], m0 +mova [r0+0x10], m1 +mova [r0+0x20], m2 +mova [r0+0x30], m3 +movh m0, [r1] +movh m1, [r1+r2*1] +movh m2, [r1+r2*2] +movh m3, [r1+r3] +punpcklbwm0, m4 +punpcklbwm1, m4 +punpcklbwm2, m4 +punpcklbwm3, m4 +mova [r0+0x40], m0 +mova [r0+0x50], m1 +mova [r0+0x60], m2 +mova [r0+0x70], m3 +RET + +INIT_MMX mmx +; diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const unint8_t *s2, stride) +cglobal diff_pixels, 4,5 +movsxdifnidn r3, r3d +pxor m7, m7 +add r0, 128 +mov r4, -128 +.loop: +mova m0, [r1] +mova m2, [r2] +mova m1, m0 +mova m3, m2 +punpcklbwm0, m7 +punpckhbwm1, m7 +punpcklbwm2, m7 +punpckhbwm3, m7 +psubwm0, m2 +psubwm1, m3 +mova [r0+r4+0], m0 +mova [r0+r4+8], m1 +add r1, r3 +add r2, r3 +add r4, 16 +jne .loop +REP_RET + +INIT_MMX mmx +; pix_sum16_mmx(uint8_t * pix, int line_size) +cglobal pix_sum16, 2, 3 +movsxdifnidn r1, r1d +mov 
r2, r1 +neg r2 +shl r2, 4 +sub r0, r2 +pxor m7, m7 +pxor m6, m6 +.loop: +mova m0, [r0+r2+0] +mova m1, [r0+r2+0] +mova m2, [r0+r2+8] +mova m3, [r0+r2+8] +punpcklbwm0, m7 +punpckhbwm1, m7 +punpcklbwm2, m7 +punpckhbwm3, m7 +paddwm1, m0 +paddwm3, m2 +paddwm3, m1 +paddwm6, m3 +add r2, r1 +js .loop +mova m5, m6 +psrlqm6, 32 +paddwm6, m5 +mova m5, m6 +psrlqm6, 16 +paddwm6, m5 +movdeax, m6 +and eax, 0x +RET + +INIT_MMX mmx +; pix_norm1_mmx(uint8_t *pix, int line_size) +cglobal pix_norm1, 2, 4 +movsxdifnidn r1, r1d +mov r2, 16 +pxor m0, m0 +pxor m7, m7 +.loop: +mova m2, [r0+0] +mova m3, [r0+8] +mova m1, m2 +punpckhbwm1, m0 +punpcklbwm2, m0 +mova m4, m3 +punpckhbwm3, m0 +punpcklbwm4, m0 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +pmaddwd m4, m4 +padddm2, m1 +padddm4, m3 +padddm7, m2 +add r0, r1 +padddm7, m4 +dec r2 +jne .loop +mova m1, m7 +psrlqm7, 32 +padddm1, m7 +movdeax, m1 +RET + diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c index e5d2473..fa126d6 100644 --- a/libavcodec/x86/dsputilenc_mmx.c +++ b/libavcodec/x86/dsputilenc_mmx.c @@ -30,181 +30,14 @@ #include libavcodec/mathops.h #include dsputil_mmx.h +void ff_get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size); +void ff_get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size); +void ff_diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride); +int ff_pix_sum16_mmx(uint8_t * pix, int line_size); +int ff_pix_norm1_mmx(uint8_t *pix, int line_size); #if HAVE_INLINE_ASM -static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) -{ -__asm__ volatile( -mov
[libav-devel] [PATCH] dsputilenc: x86: Convert pixel inline asm to yasm
--- Tested on a variety of configs, but that pesky emms bug prevents full testing of x86_32 --disable-asm. So, more testing would be appreciated (and on MSVC). --- libavcodec/x86/dsputilenc.asm | 152 + libavcodec/x86/dsputilenc_mmx.c | 201 --- 2 files changed, 172 insertions(+), 181 deletions(-) diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm index 3bb1f2f..d5a0206 100644 --- a/libavcodec/x86/dsputilenc.asm +++ b/libavcodec/x86/dsputilenc.asm @@ -335,3 +335,155 @@ cglobal sse16, 5, 5, 8 paddd m7, m1 movd eax, m7 ; return value RET + +INIT_MMX mmx +; get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) +cglobal get_pixels, 3,4 +movsxdifnidn r2, r2d +add r0, 128 +mov r3, -128 +pxor m7, m7 +.loop: +movu m0, [r1] +movu m2, [r1+r2] +movu m1, m0 +movu m3, m2 +punpcklbwm0, m7 +punpckhbwm1, m7 +punpcklbwm2, m7 +punpckhbwm3, m7 +movu [r0+r3+ 0], m0 +movu [r0+r3+ 8], m1 +movu [r0+r3+16], m2 +movu [r0+r3+24], m3 +lea r1, [r1+r2*2] +add r3, 32 +js .loop +REP_RET + +INIT_XMM sse2 +cglobal get_pixels, 3, 4 +movsxdifnidn r2, r2d +lea r3, [r2*3] +pxor m4, m4 +movh m0, [r1] +movh m1, [r1+r2] +movh m2, [r1+r2*2] +movh m3, [r1+r3] +lea r1, [r1+r2*4] +punpcklbwm0, m4 +punpcklbwm1, m4 +punpcklbwm2, m4 +punpcklbwm3, m4 +mova [r0], m0 +mova [r0+0x10], m1 +mova [r0+0x20], m2 +mova [r0+0x30], m3 +movh m0, [r1] +movh m1, [r1+r2*1] +movh m2, [r1+r2*2] +movh m3, [r1+r3] +punpcklbwm0, m4 +punpcklbwm1, m4 +punpcklbwm2, m4 +punpcklbwm3, m4 +mova [r0+0x40], m0 +mova [r0+0x50], m1 +mova [r0+0x60], m2 +mova [r0+0x70], m3 +RET + +INIT_MMX mmx +; diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const unint8_t *s2, stride) +cglobal diff_pixels, 4,5 +movsxdifnidn r3, r3d +pxor m7, m7 +add r0, 128 +mov r4, -128 +.loop: +mova m0, [r1] +mova m2, [r2] +mova m1, m0 +mova m3, m2 +punpcklbwm0, m7 +punpckhbwm1, m7 +punpcklbwm2, m7 +punpckhbwm3, m7 +psubwm0, m2 +psubwm1, m3 +mova [r0+r4+0], m0 +mova [r0+r4+8], m1 +add r1, r3 +add r2, r3 +add r4, 16 +jne .loop 
+REP_RET + +INIT_MMX mmx +; pix_sum16_mmx(uint8_t * pix, int line_size) +cglobal pix_sum16, 2, 3 +movsxdifnidn r1, r1d +mov r2, r1 +neg r2 +shl r2, 4 +sub r0, r2 +pxor m7, m7 +pxor m6, m6 +.loop: +mova m0, [r0+r2+0] +mova m1, [r0+r2+0] +mova m2, [r0+r2+8] +mova m3, [r0+r2+8] +punpcklbwm0, m7 +punpckhbwm1, m7 +punpcklbwm2, m7 +punpckhbwm3, m7 +paddwm1, m0 +paddwm3, m2 +paddwm3, m1 +paddwm6, m3 +add r2, r1 +js .loop +mova m5, m6 +psrlqm6, 32 +paddwm6, m5 +mova m5, m6 +psrlqm6, 16 +paddwm6, m5 +movdeax, m6 +and eax, 0x +RET + +INIT_MMX mmx +; pix_norm1_mmx(uint8_t *pix, int line_size) +cglobal pix_norm1, 2, 4 +movsxdifnidn r1, r1d +mov r2, 16 +pxor m0, m0 +pxor m7, m7 +.loop: +mova m2, [r0+0] +mova m3, [r0+8] +mova m1, m2 +punpckhbwm1, m0 +punpcklbwm2, m0 +mova m4, m3 +punpckhbwm3, m0 +punpcklbwm4, m0 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +pmaddwd m4, m4 +padddm2, m1 +padddm4, m3 +padddm7, m2 +add r0, r1 +padddm7, m4 +dec r2 +jne .loop +mova m1, m7 +psrlqm7, 32 +padddm1, m7 +movdeax, m1 +RET + diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c index e5d2473..fa126d6 100644 --- a/libavcodec/x86/dsputilenc_mmx.c +++ b/libavcodec/x86/dsputilenc_mmx.c @@ -30,181 +30,14 @@ #include libavcodec/mathops.h #include dsputil_mmx.h +void ff_get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size); +void ff_get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size); +void ff_diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride); +int ff_pix_sum16_mmx(uint8_t * pix, int line_size); +int ff_pix_norm1_mmx(uint8_t *pix, int line_size); #if HAVE_INLINE_ASM
[libav-devel] [PATCH v4] yadif: x86: fix build for compilers without aligned stack
Manually load registers to avoid using 8 registers with compilers that do not align the stack (e.g. MSVC). --- Now with named args. --- libavfilter/x86/yadif.asm | 56 +++-- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/libavfilter/x86/yadif.asm b/libavfilter/x86/yadif.asm index 5e406a4..bc4b3ce 100644 --- a/libavfilter/x86/yadif.asm +++ b/libavfilter/x86/yadif.asm @@ -31,8 +31,8 @@ pw_1: times 8 dw 1 SECTION .text %macro CHECK 2 -movu m2, [curq+mrefsq+%1] -movu m3, [curq+prefsq+%2] +movu m2, [curq+t1+%1] +movu m3, [curq+t0+%2] mova m4, m2 mova m5, m2 pxor m4, m3 @@ -97,8 +97,8 @@ SECTION .text %macro FILTER 3 .loop%1: pxor m7, m7 -LOAD 0, [curq+mrefsq] -LOAD 1, [curq+prefsq] +LOAD 0, [curq+t1] +LOAD 1, [curq+t0] LOAD 2, [%2] LOAD 3, [%3] mova m4, m3 @@ -109,8 +109,8 @@ SECTION .text mova [rsp+32], m1 psubwm2, m4 ABS1 m2, m4 -LOAD 3, [prevq+mrefsq] -LOAD 4, [prevq+prefsq] +LOAD 3, [prevq+t1] +LOAD 4, [prevq+t0] psubwm3, m0 psubwm4, m1 ABS1 m3, m5 @@ -119,8 +119,8 @@ SECTION .text psrlwm2, 1 psrlwm3, 1 pmaxsw m2, m3 -LOAD 3, [nextq+mrefsq] -LOAD 4, [nextq+prefsq] +LOAD 3, [nextq+t1] +LOAD 4, [nextq+t0] psubwm3, m0 psubwm4, m1 ABS1 m3, m5 @@ -136,8 +136,8 @@ SECTION .text psrlwm1, 1 ABS1 m0, m2 -movu m2, [curq+mrefsq-1] -movu m3, [curq+prefsq-1] +movu m2, [curq+t1-1] +movu m3, [curq+t0-1] mova m4, m2 psubusb m2, m3 psubusb m3, m4 @@ -164,12 +164,12 @@ SECTION .text CHECK2 mova m6, [rsp+48] -cmp DWORD modem, 2 +cmp DWORD r8m, 2 jge .end%1 -LOAD 2, [%2+mrefsq*2] -LOAD 4, [%3+mrefsq*2] -LOAD 3, [%2+prefsq*2] -LOAD 5, [%3+prefsq*2] +LOAD 2, [%2+t1*2] +LOAD 4, [%3+t1*2] +LOAD 3, [%2+t0*2] +LOAD 5, [%3+t0*2] paddwm2, m4 paddwm3, m5 psrlwm2, 1 @@ -208,19 +208,31 @@ SECTION .text add prevq, mmsize/2 addcurq, mmsize/2 add nextq, mmsize/2 -sub wd, mmsize/2 +sub DWORD r4m, mmsize/2 jg .loop%1 %endmacro %macro YADIF 0 -cglobal yadif_filter_line, 7, 7, 8, 16*5, dst, prev, cur, next, w, prefs, \ - mrefs, parity, mode -test wq, wq +%if ARCH_X86_32 +cglobal 
yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \ +mrefs, parity, mode +%else +cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \ +mrefs, parity, mode +%endif +cmp DWORD wm, 0 jle .ret -movsxdifnidn prefsq, prefsd -movsxdifnidn mrefsq, mrefsd +%if ARCH_X86_32 +movr4, r5mp +movr5, r6mp +DECLARE_REG_TMP 4,5 +%else +movsxd r5, DWORD r5m +movsxd r6, DWORD r6m +DECLARE_REG_TMP 5,6 +%endif -cmp DWORD paritym, 0 +cmp DWORD paritym, 0 je .parity0 FILTER 1, prevq, curq jmp .ret -- 1.7.10.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] yadif: x86: fix build for compilers without aligned stack
Manually load registers to avoid using 8 registers on x86_32 with compilers that do not align the stack (e.g. MSVC). --- Add which platform it fixes --- libavfilter/x86/yadif.asm | 56 +++-- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/libavfilter/x86/yadif.asm b/libavfilter/x86/yadif.asm index 5e406a4..bc4b3ce 100644 --- a/libavfilter/x86/yadif.asm +++ b/libavfilter/x86/yadif.asm @@ -31,8 +31,8 @@ pw_1: times 8 dw 1 SECTION .text %macro CHECK 2 -movu m2, [curq+mrefsq+%1] -movu m3, [curq+prefsq+%2] +movu m2, [curq+t1+%1] +movu m3, [curq+t0+%2] mova m4, m2 mova m5, m2 pxor m4, m3 @@ -97,8 +97,8 @@ SECTION .text %macro FILTER 3 .loop%1: pxor m7, m7 -LOAD 0, [curq+mrefsq] -LOAD 1, [curq+prefsq] +LOAD 0, [curq+t1] +LOAD 1, [curq+t0] LOAD 2, [%2] LOAD 3, [%3] mova m4, m3 @@ -109,8 +109,8 @@ SECTION .text mova [rsp+32], m1 psubwm2, m4 ABS1 m2, m4 -LOAD 3, [prevq+mrefsq] -LOAD 4, [prevq+prefsq] +LOAD 3, [prevq+t1] +LOAD 4, [prevq+t0] psubwm3, m0 psubwm4, m1 ABS1 m3, m5 @@ -119,8 +119,8 @@ SECTION .text psrlwm2, 1 psrlwm3, 1 pmaxsw m2, m3 -LOAD 3, [nextq+mrefsq] -LOAD 4, [nextq+prefsq] +LOAD 3, [nextq+t1] +LOAD 4, [nextq+t0] psubwm3, m0 psubwm4, m1 ABS1 m3, m5 @@ -136,8 +136,8 @@ SECTION .text psrlwm1, 1 ABS1 m0, m2 -movu m2, [curq+mrefsq-1] -movu m3, [curq+prefsq-1] +movu m2, [curq+t1-1] +movu m3, [curq+t0-1] mova m4, m2 psubusb m2, m3 psubusb m3, m4 @@ -164,12 +164,12 @@ SECTION .text CHECK2 mova m6, [rsp+48] -cmp DWORD modem, 2 +cmp DWORD r8m, 2 jge .end%1 -LOAD 2, [%2+mrefsq*2] -LOAD 4, [%3+mrefsq*2] -LOAD 3, [%2+prefsq*2] -LOAD 5, [%3+prefsq*2] +LOAD 2, [%2+t1*2] +LOAD 4, [%3+t1*2] +LOAD 3, [%2+t0*2] +LOAD 5, [%3+t0*2] paddwm2, m4 paddwm3, m5 psrlwm2, 1 @@ -208,19 +208,31 @@ SECTION .text add prevq, mmsize/2 addcurq, mmsize/2 add nextq, mmsize/2 -sub wd, mmsize/2 +sub DWORD r4m, mmsize/2 jg .loop%1 %endmacro %macro YADIF 0 -cglobal yadif_filter_line, 7, 7, 8, 16*5, dst, prev, cur, next, w, prefs, \ - mrefs, parity, mode -test wq, wq +%if ARCH_X86_32 
+cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \ +mrefs, parity, mode +%else +cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \ +mrefs, parity, mode +%endif +cmp DWORD wm, 0 jle .ret -movsxdifnidn prefsq, prefsd -movsxdifnidn mrefsq, mrefsd +%if ARCH_X86_32 +movr4, r5mp +movr5, r6mp +DECLARE_REG_TMP 4,5 +%else +movsxd r5, DWORD r5m +movsxd r6, DWORD r6m +DECLARE_REG_TMP 5,6 +%endif -cmp DWORD paritym, 0 +cmp DWORD paritym, 0 je .parity0 FILTER 1, prevq, curq jmp .ret -- 1.7.10.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] yadif: Fix bug with x86_32 MSVC
--- More testing would be appreciated --- libavfilter/x86/yadif.asm | 83 - 1 file changed, 45 insertions(+), 38 deletions(-) diff --git a/libavfilter/x86/yadif.asm b/libavfilter/x86/yadif.asm index 5e406a4..da0d6eb 100644 --- a/libavfilter/x86/yadif.asm +++ b/libavfilter/x86/yadif.asm @@ -31,8 +31,8 @@ pw_1: times 8 dw 1 SECTION .text %macro CHECK 2 -movu m2, [curq+mrefsq+%1] -movu m3, [curq+prefsq+%2] +movu m2, [r2+t1+%1] +movu m3, [r2+t0+%2] mova m4, m2 mova m5, m2 pxor m4, m3 @@ -97,20 +97,20 @@ SECTION .text %macro FILTER 3 .loop%1: pxor m7, m7 -LOAD 0, [curq+mrefsq] -LOAD 1, [curq+prefsq] +LOAD 0, [r2+t1] +LOAD 1, [r2+t0] LOAD 2, [%2] LOAD 3, [%3] mova m4, m3 paddwm3, m2 psrawm3, 1 -mova [rsp+ 0], m0 -mova [rsp+16], m3 -mova [rsp+32], m1 +mova [rsp- 0], m0 +mova [rsp-16], m3 +mova [rsp-32], m1 psubwm2, m4 ABS1 m2, m4 -LOAD 3, [prevq+mrefsq] -LOAD 4, [prevq+prefsq] +LOAD 3, [r1+t1] +LOAD 4, [r1+t0] psubwm3, m0 psubwm4, m1 ABS1 m3, m5 @@ -119,8 +119,8 @@ SECTION .text psrlwm2, 1 psrlwm3, 1 pmaxsw m2, m3 -LOAD 3, [nextq+mrefsq] -LOAD 4, [nextq+prefsq] +LOAD 3, [r3+t1] +LOAD 4, [r3+t0] psubwm3, m0 psubwm4, m1 ABS1 m3, m5 @@ -128,7 +128,7 @@ SECTION .text paddwm3, m4 psrlwm3, 1 pmaxsw m2, m3 -mova [rsp+48], m2 +mova [rsp-48], m2 paddwm1, m0 paddwm0, m0 @@ -136,8 +136,8 @@ SECTION .text psrlwm1, 1 ABS1 m0, m2 -movu m2, [curq+mrefsq-1] -movu m3, [curq+prefsq-1] +movu m2, [r2+t1-1] +movu m3, [r2+t0-1] mova m4, m2 psubusb m2, m3 psubusb m3, m4 @@ -163,20 +163,20 @@ SECTION .text CHECK 1, -3 CHECK2 -mova m6, [rsp+48] -cmp DWORD modem, 2 +mova m6, [rsp-48] +cmp DWORD r8m, 2 jge .end%1 -LOAD 2, [%2+mrefsq*2] -LOAD 4, [%3+mrefsq*2] -LOAD 3, [%2+prefsq*2] -LOAD 5, [%3+prefsq*2] +LOAD 2, [%2+t1*2] +LOAD 4, [%3+t1*2] +LOAD 3, [%2+t0*2] +LOAD 5, [%3+t0*2] paddwm2, m4 paddwm3, m5 psrlwm2, 1 psrlwm3, 1 -mova m4, [rsp+ 0] -mova m5, [rsp+16] -mova m7, [rsp+32] +mova m4, [rsp- 0] +mova m5, [rsp-16] +mova m7, [rsp-32] psubwm2, m4 psubwm3, m7 mova m0, m5 @@ -195,7 +195,7 @@ SECTION 
.text pmaxsw m6, m4 .end%1: -mova m2, [rsp+16] +mova m2, [rsp-16] mova m3, m2 psubwm2, m6 paddwm3, m6 @@ -203,30 +203,37 @@ SECTION .text pminsw m1, m3 packuswb m1, m1 -movh [dstq], m1 -adddstq, mmsize/2 -add prevq, mmsize/2 -addcurq, mmsize/2 -add nextq, mmsize/2 -sub wd, mmsize/2 +movh [r0], m1 +add r0, mmsize/2 +add r1, mmsize/2 +add r2, mmsize/2 +add r3, mmsize/2 +sub DWORD r4m, mmsize/2 jg .loop%1 %endmacro +%assign PAD -1*80 %macro YADIF 0 -cglobal yadif_filter_line, 7, 7, 8, 16*5, dst, prev, cur, next, w, prefs, \ - mrefs, parity, mode -test wq, wq +cglobal yadif_filter_line, 4, 7, 8, PAD +cmp DWORD r4m, 0 jle .ret -movsxdifnidn prefsq, prefsd -movsxdifnidn mrefsq, mrefsd +%if ARCH_X86_32 +movifnidn r4, r5mp +movifnidn r5, r6mp + DECLARE_REG_TMP 4,5 +%else +movsxdifnidn r5, DWORD r5m +movsxdifnidn r6, DWORD r6m +DECLARE_REG_TMP 5,6 +%endif -cmp DWORD paritym, 0 +cmp DWORD r7m, 0 je .parity0 -FILTER 1, prevq, curq +FILTER 1, r1, r2 jmp .ret .parity0: -FILTER 0, curq, nextq +FILTER 0, r2, r3 .ret: RET -- 1.7.10.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH v2] yadif: x86: fix building with automatic stack alignment
Manually reload registers to avoid trying to use 8 registers when compilers that do not align the stack. MSVC among those. --- Update based on suggestion and commit message. --- libavfilter/x86/yadif.asm | 83 - 1 file changed, 45 insertions(+), 38 deletions(-) diff --git a/libavfilter/x86/yadif.asm b/libavfilter/x86/yadif.asm index 5e406a4..d2b4be5 100644 --- a/libavfilter/x86/yadif.asm +++ b/libavfilter/x86/yadif.asm @@ -31,8 +31,8 @@ pw_1: times 8 dw 1 SECTION .text %macro CHECK 2 -movu m2, [curq+mrefsq+%1] -movu m3, [curq+prefsq+%2] +movu m2, [r2+t1+%1] +movu m3, [r2+t0+%2] mova m4, m2 mova m5, m2 pxor m4, m3 @@ -97,20 +97,20 @@ SECTION .text %macro FILTER 3 .loop%1: pxor m7, m7 -LOAD 0, [curq+mrefsq] -LOAD 1, [curq+prefsq] +LOAD 0, [r2+t1] +LOAD 1, [r2+t0] LOAD 2, [%2] LOAD 3, [%3] mova m4, m3 paddwm3, m2 psrawm3, 1 -mova [rsp+ 0], m0 -mova [rsp+16], m3 -mova [rsp+32], m1 +mova [rsp- 0], m0 +mova [rsp-16], m3 +mova [rsp-32], m1 psubwm2, m4 ABS1 m2, m4 -LOAD 3, [prevq+mrefsq] -LOAD 4, [prevq+prefsq] +LOAD 3, [r1+t1] +LOAD 4, [r1+t0] psubwm3, m0 psubwm4, m1 ABS1 m3, m5 @@ -119,8 +119,8 @@ SECTION .text psrlwm2, 1 psrlwm3, 1 pmaxsw m2, m3 -LOAD 3, [nextq+mrefsq] -LOAD 4, [nextq+prefsq] +LOAD 3, [r3+t1] +LOAD 4, [r3+t0] psubwm3, m0 psubwm4, m1 ABS1 m3, m5 @@ -128,7 +128,7 @@ SECTION .text paddwm3, m4 psrlwm3, 1 pmaxsw m2, m3 -mova [rsp+48], m2 +mova [rsp-48], m2 paddwm1, m0 paddwm0, m0 @@ -136,8 +136,8 @@ SECTION .text psrlwm1, 1 ABS1 m0, m2 -movu m2, [curq+mrefsq-1] -movu m3, [curq+prefsq-1] +movu m2, [r2+t1-1] +movu m3, [r2+t0-1] mova m4, m2 psubusb m2, m3 psubusb m3, m4 @@ -163,20 +163,20 @@ SECTION .text CHECK 1, -3 CHECK2 -mova m6, [rsp+48] -cmp DWORD modem, 2 +mova m6, [rsp-48] +cmp DWORD r8m, 2 jge .end%1 -LOAD 2, [%2+mrefsq*2] -LOAD 4, [%3+mrefsq*2] -LOAD 3, [%2+prefsq*2] -LOAD 5, [%3+prefsq*2] +LOAD 2, [%2+t1*2] +LOAD 4, [%3+t1*2] +LOAD 3, [%2+t0*2] +LOAD 5, [%3+t0*2] paddwm2, m4 paddwm3, m5 psrlwm2, 1 psrlwm3, 1 -mova m4, [rsp+ 0] -mova m5, [rsp+16] -mova 
m7, [rsp+32] +mova m4, [rsp- 0] +mova m5, [rsp-16] +mova m7, [rsp-32] psubwm2, m4 psubwm3, m7 mova m0, m5 @@ -195,7 +195,7 @@ SECTION .text pmaxsw m6, m4 .end%1: -mova m2, [rsp+16] +mova m2, [rsp-16] mova m3, m2 psubwm2, m6 paddwm3, m6 @@ -203,30 +203,37 @@ SECTION .text pminsw m1, m3 packuswb m1, m1 -movh [dstq], m1 -adddstq, mmsize/2 -add prevq, mmsize/2 -addcurq, mmsize/2 -add nextq, mmsize/2 -sub wd, mmsize/2 +movh [r0], m1 +add r0, mmsize/2 +add r1, mmsize/2 +add r2, mmsize/2 +add r3, mmsize/2 +sub DWORD r4m, mmsize/2 jg .loop%1 %endmacro +%assign PAD -1*80 %macro YADIF 0 -cglobal yadif_filter_line, 7, 7, 8, 16*5, dst, prev, cur, next, w, prefs, \ - mrefs, parity, mode -test wq, wq +cglobal yadif_filter_line, 4, 7, 8, PAD +cmp DWORD r4m, 0 jle .ret -movsxdifnidn prefsq, prefsd -movsxdifnidn mrefsq, mrefsd +%if ARCH_X86_32 +mov r4, r5mp +mov r5, r6mp +DECLARE_REG_TMP 4,5 +%else +movsxdifnidn r5, DWORD r5m +movsxdifnidn r6, DWORD r6m +DECLARE_REG_TMP 5,6 +%endif -cmp DWORD paritym, 0 +cmp DWORD r7m, 0 je .parity0 -FILTER 1, prevq, curq +FILTER 1, r1, r2 jmp .ret .parity0: -FILTER 0, curq, nextq +FILTER 0, r2, r3 .ret: RET -- 1.7.10.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH v2] yadif: x86: fix building with automatic stack alignment
On Sat, Jan 12, 2013 at 4:13 PM, Ronald S. Bultje rsbul...@gmail.com wrote: Hi, On Sat, Jan 12, 2013 at 12:03 PM, Daniel Kang daniel.d.k...@gmail.com wrote: Manually reload registers to avoid trying to use 8 registers when compilers that do not align the stack. MSVC among those. [..] -adddstq, mmsize/2 -add prevq, mmsize/2 -addcurq, mmsize/2 -add nextq, mmsize/2 -sub wd, mmsize/2 [..] +add r0, mmsize/2 +add r1, mmsize/2 +add r2, mmsize/2 +add r3, mmsize/2 +sub DWORD r4m, mmsize/2 [..] -cglobal yadif_filter_line, 7, 7, 8, 16*5, dst, prev, cur, next, w, prefs, \ - mrefs, parity, mode [..] +cglobal yadif_filter_line, 4, 7, 8, PAD Do you think it's possible to somehow keep the named arguments, at least for the ones where you load arguments from the stack (mrefsm, paritym, etc) and for the first 3-4 arguments that are not temp-based? I tried when making the patch and I think it makes things more confusing. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] yadif: Fix bug with x86_32 MSVC
On Sat, Jan 12, 2013 at 10:36 PM, Loren Merritt lor...@u.washington.edu wrote: On Sat, 12 Jan 2013, Daniel Kang wrote: -mova [rsp+ 0], m0 -mova [rsp+16], m3 -mova [rsp+32], m1 +mova [rsp- 0], m0 +mova [rsp-16], m3 +mova [rsp-32], m1 You can't do that on x86_32. What do I do instead? Also this seemed to work in my tests, why won't this work on x86_32? +%assign PAD -1*80 Unused? cglobal errors if I put -80 for the stack space. %macro YADIF 0 -cglobal yadif_filter_line, 7, 7, 8, 16*5, dst, prev, cur, next, w, prefs, \ - mrefs, parity, mode -test wq, wq +cglobal yadif_filter_line, 4, 7, 8, PAD Do you have a reason for removing all the named args? I can't use half of the named args, and I thought it was less confusing if I didn't use them at all. +cmp DWORD r4m, 0 jle .ret -movsxdifnidn prefsq, prefsd -movsxdifnidn mrefsq, mrefsd +%if ARCH_X86_32 +movifnidn r4, r5mp +movifnidn r5, r6mp +DECLARE_REG_TMP 4,5 +%else +movsxdifnidn r5, DWORD r5m +movsxdifnidn r6, DWORD r6m +DECLARE_REG_TMP 5,6 +%endif No ifnidn. After your change, they will not in fact be identical. Will fix. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH v3] yadif: x86: fix build for compilers without aligned stack
Manually load registers to avoid using 8 registers with compilers that do not align the stack (e.g. MSVC). --- Better commit message and avoid redzone (Loren's comments). --- libavfilter/x86/yadif.asm | 68 ++--- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/libavfilter/x86/yadif.asm b/libavfilter/x86/yadif.asm index 5e406a4..adfd3db 100644 --- a/libavfilter/x86/yadif.asm +++ b/libavfilter/x86/yadif.asm @@ -31,8 +31,8 @@ pw_1: times 8 dw 1 SECTION .text %macro CHECK 2 -movu m2, [curq+mrefsq+%1] -movu m3, [curq+prefsq+%2] +movu m2, [r2+t1+%1] +movu m3, [r2+t0+%2] mova m4, m2 mova m5, m2 pxor m4, m3 @@ -97,8 +97,8 @@ SECTION .text %macro FILTER 3 .loop%1: pxor m7, m7 -LOAD 0, [curq+mrefsq] -LOAD 1, [curq+prefsq] +LOAD 0, [r2+t1] +LOAD 1, [r2+t0] LOAD 2, [%2] LOAD 3, [%3] mova m4, m3 @@ -109,8 +109,8 @@ SECTION .text mova [rsp+32], m1 psubwm2, m4 ABS1 m2, m4 -LOAD 3, [prevq+mrefsq] -LOAD 4, [prevq+prefsq] +LOAD 3, [r1+t1] +LOAD 4, [r1+t0] psubwm3, m0 psubwm4, m1 ABS1 m3, m5 @@ -119,8 +119,8 @@ SECTION .text psrlwm2, 1 psrlwm3, 1 pmaxsw m2, m3 -LOAD 3, [nextq+mrefsq] -LOAD 4, [nextq+prefsq] +LOAD 3, [r3+t1] +LOAD 4, [r3+t0] psubwm3, m0 psubwm4, m1 ABS1 m3, m5 @@ -136,8 +136,8 @@ SECTION .text psrlwm1, 1 ABS1 m0, m2 -movu m2, [curq+mrefsq-1] -movu m3, [curq+prefsq-1] +movu m2, [r2+t1-1] +movu m3, [r2+t0-1] mova m4, m2 psubusb m2, m3 psubusb m3, m4 @@ -164,12 +164,12 @@ SECTION .text CHECK2 mova m6, [rsp+48] -cmp DWORD modem, 2 +cmp DWORD r8m, 2 jge .end%1 -LOAD 2, [%2+mrefsq*2] -LOAD 4, [%3+mrefsq*2] -LOAD 3, [%2+prefsq*2] -LOAD 5, [%3+prefsq*2] +LOAD 2, [%2+t1*2] +LOAD 4, [%3+t1*2] +LOAD 3, [%2+t0*2] +LOAD 5, [%3+t0*2] paddwm2, m4 paddwm3, m5 psrlwm2, 1 @@ -203,30 +203,40 @@ SECTION .text pminsw m1, m3 packuswb m1, m1 -movh [dstq], m1 -adddstq, mmsize/2 -add prevq, mmsize/2 -addcurq, mmsize/2 -add nextq, mmsize/2 -sub wd, mmsize/2 +movh [r0], m1 +add r0, mmsize/2 +add r1, mmsize/2 +add r2, mmsize/2 +add r3, mmsize/2 +sub DWORD r4m, mmsize/2 jg .loop%1 
%endmacro %macro YADIF 0 -cglobal yadif_filter_line, 7, 7, 8, 16*5, dst, prev, cur, next, w, prefs, \ - mrefs, parity, mode -test wq, wq +%if ARCH_X86_32 +cglobal yadif_filter_line, 4, 6, 8, 80 +%else +cglobal yadif_filter_line, 4, 7, 8, 80 +%endif +cmp DWORD r4m, 0 jle .ret -movsxdifnidn prefsq, prefsd -movsxdifnidn mrefsq, mrefsd +%if ARCH_X86_32 +mov r4, r5mp +mov r5, r6mp +DECLARE_REG_TMP 4,5 +%else +movsxd r5, DWORD r5m +movsxd r6, DWORD r6m +DECLARE_REG_TMP 5,6 +%endif -cmp DWORD paritym, 0 +cmp DWORD r7m, 0 je .parity0 -FILTER 1, prevq, curq +FILTER 1, r1, r2 jmp .ret .parity0: -FILTER 0, curq, nextq +FILTER 0, r2, r3 .ret: RET -- 1.7.10.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] yadif: Port inline assembly to YASM
--- Updated with Loren's suggestion. --- libavfilter/x86/Makefile |3 +- libavfilter/x86/yadif.asm| 242 +++ libavfilter/x86/yadif.c | 74 --- libavfilter/x86/yadif_init.c | 54 libavfilter/x86/yadif_template.c | 261 -- 5 files changed, 298 insertions(+), 336 deletions(-) create mode 100644 libavfilter/x86/yadif.asm delete mode 100644 libavfilter/x86/yadif.c create mode 100644 libavfilter/x86/yadif_init.c delete mode 100644 libavfilter/x86/yadif_template.c diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index 0f08e39..47569cf 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -1,6 +1,7 @@ OBJS-$(CONFIG_GRADFUN_FILTER)+= x86/gradfun.o OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o -OBJS-$(CONFIG_YADIF_FILTER) += x86/yadif.o +OBJS-$(CONFIG_YADIF_FILTER) += x86/yadif_init.o YASM-OBJS-$(CONFIG_HQDN3D_FILTER)+= x86/hqdn3d.o YASM-OBJS-$(CONFIG_VOLUME_FILTER)+= x86/af_volume.o +YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/yadif.o diff --git a/libavfilter/x86/yadif.asm b/libavfilter/x86/yadif.asm new file mode 100644 index 000..5e406a4 --- /dev/null +++ b/libavfilter/x86/yadif.asm @@ -0,0 +1,242 @@ +;* +;* x86-optimized functions for yadif filter +;* +;* Copyright (C) 2006 Michael Niedermayer michae...@gmx.at +;* Copyright (c) 2013 Daniel Kang daniel.d.k...@gmail.com +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. 
+;* +;* You should have received a copy of the GNU General Public License along +;* with Libav; if not, write to the Free Software Foundation, Inc., +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +;** + +%include libavutil/x86/x86util.asm + +SECTION_RODATA + +pb_1: times 16 db 1 +pw_1: times 8 dw 1 + +SECTION .text + +%macro CHECK 2 +movu m2, [curq+mrefsq+%1] +movu m3, [curq+prefsq+%2] +mova m4, m2 +mova m5, m2 +pxor m4, m3 +pavgb m5, m3 +pand m4, [pb_1] +psubusb m5, m4 +%if mmsize == 16 +psrldqm5, 1 +%else +psrlq m5, 8 +%endif +punpcklbw m5, m7 +mova m4, m2 +psubusb m2, m3 +psubusb m3, m4 +pmaxubm2, m3 +mova m3, m2 +mova m4, m2 +%if mmsize == 16 +psrldqm3, 1 +psrldqm4, 2 +%else +psrlq m3, 8 +psrlq m4, 16 +%endif +punpcklbw m2, m7 +punpcklbw m3, m7 +punpcklbw m4, m7 +paddw m2, m3 +paddw m2, m4 +%endmacro + +%macro CHECK1 0 +movam3, m0 +pcmpgtw m3, m2 +pminsw m0, m2 +movam6, m3 +pandm5, m3 +pandn m3, m1 +por m3, m5 +movam1, m3 +%endmacro + +%macro CHECK2 0 +paddw m6, [pw_1] +psllw m6, 14 +paddsw m2, m6 +movam3, m0 +pcmpgtw m3, m2 +pminsw m0, m2 +pandm5, m3 +pandn m3, m1 +por m3, m5 +movam1, m3 +%endmacro + +%macro LOAD 2 +movh m%1, %2 +punpcklbw m%1, m7 +%endmacro + +%macro FILTER 3 +.loop%1: +pxor m7, m7 +LOAD 0, [curq+mrefsq] +LOAD 1, [curq+prefsq] +LOAD 2, [%2] +LOAD 3, [%3] +mova m4, m3 +paddwm3, m2 +psrawm3, 1 +mova [rsp+ 0], m0 +mova [rsp+16], m3 +mova [rsp+32], m1 +psubwm2, m4 +ABS1 m2, m4 +LOAD 3, [prevq+mrefsq] +LOAD 4, [prevq+prefsq] +psubwm3, m0 +psubwm4, m1 +ABS1 m3, m5 +ABS1 m4, m5 +paddwm3, m4 +psrlwm2, 1 +psrlwm3, 1 +pmaxsw m2, m3 +LOAD 3, [nextq+mrefsq] +LOAD 4, [nextq+prefsq] +psubwm3, m0 +psubwm4, m1 +ABS1 m3, m5 +ABS1 m4, m5 +paddwm3, m4 +psrlwm3, 1 +pmaxsw m2, m3 +mova [rsp+48], m2 + +paddwm1, m0 +paddwm0, m0 +psubwm0, m1 +psrlwm1, 1 +ABS1 m0, m2 + +movu m2, [curq+mrefsq-1] +movu m3, [curq+prefsq-1] +mova m4, m2 +psubusb m2, m3 +psubusb m3, m4 +pmaxub
Re: [libav-devel] [PATCH v2] YADIF: Port inline assembly to YASM
On Sun, Jan 6, 2013 at 12:54 PM, Diego Biurrun di...@biurrun.de wrote: On Sun, Jan 06, 2013 at 11:32:15AM -0600, Daniel Kang wrote: --- Updated to use ABS1 --- libavfilter/x86/Makefile |1 + libavfilter/x86/yadif.c | 60 +++-- libavfilter/x86/yadif_template.c | 261 -- libavfilter/x86/yadif_yasm.asm | 241 +++ 4 files changed, 262 insertions(+), 301 deletions(-) delete mode 100644 libavfilter/x86/yadif_template.c create mode 100644 libavfilter/x86/yadif_yasm.asm No further issues to be noticed from me. Does this pass the test script that I gave you? While the script doesn't work for unrelated reasons, it builds with every configuration the script tests. It also passes on x86_32 and x86_64 for the yadif test for me. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH v2] YADIF: Port inline assembly to YASM
On Tue, Jan 8, 2013 at 10:04 AM, Daniel Kang daniel.d.k...@gmail.com wrote: On Sun, Jan 6, 2013 at 12:54 PM, Diego Biurrun di...@biurrun.de wrote: On Sun, Jan 06, 2013 at 11:32:15AM -0600, Daniel Kang wrote: --- Updated to use ABS1 --- libavfilter/x86/Makefile |1 + libavfilter/x86/yadif.c | 60 +++-- libavfilter/x86/yadif_template.c | 261 -- libavfilter/x86/yadif_yasm.asm | 241 +++ 4 files changed, 262 insertions(+), 301 deletions(-) delete mode 100644 libavfilter/x86/yadif_template.c create mode 100644 libavfilter/x86/yadif_yasm.asm No further issues to be noticed from me. Does this pass the test script that I gave you? While the script doesn't work for unrelated reasons, it builds with every configuration the script tests. It also passes on x86_32 and x86_64 for the yadif test for me. Oops ignore that. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH v3] YADIF: Port inline assembly to YASM
\ -psubusb MM3, MM2 \n\t\ -psubusb MM4, MM3 \n\t\ -pmaxubMM3, MM2 \n\t\ -PSHUF(MM3, MM2) \ -punpcklbw MM7, MM2 \n\t /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\ -punpcklbw MM7, MM3 \n\t /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\ -paddw MM2, MM0 \n\t\ -paddw MM3, MM0 \n\t\ -psubwMANGLE(pw_1), MM0 \n\t /* spatial_score */\ -\ -CHECK(-2,0)\ -CHECK1\ -CHECK(-3,1)\ -CHECK2\ -CHECK(0,-2)\ -CHECK1\ -CHECK(1,-3)\ -CHECK2\ -\ -/* if(p-mode2) ... */\ -MOVQ 48(%[tmp]), MM6 \n\t /* diff */\ -cmpl $2, %[mode] \n\t\ -jge 1f \n\t\ -LOAD((%[prev2],%[mrefs],2), MM2) /* prev2[x-2*refs] */\ -LOAD((%[next2],%[mrefs],2), MM4) /* next2[x-2*refs] */\ -LOAD((%[prev2],%[prefs],2), MM3) /* prev2[x+2*refs] */\ -LOAD((%[next2],%[prefs],2), MM5) /* next2[x+2*refs] */\ -paddw MM4, MM2 \n\t\ -paddw MM5, MM3 \n\t\ -psrlw $1,MM2 \n\t /* b */\ -psrlw $1,MM3 \n\t /* f */\ -MOVQ (%[tmp]), MM4 \n\t /* c */\ -MOVQ 16(%[tmp]), MM5 \n\t /* d */\ -MOVQ 32(%[tmp]), MM7 \n\t /* e */\ -psubw MM4, MM2 \n\t /* b-c */\ -psubw MM7, MM3 \n\t /* f-e */\ -MOVQ MM5, MM0 \n\t\ -psubw MM4, MM5 \n\t /* d-c */\ -psubw MM7, MM0 \n\t /* d-e */\ -MOVQ MM2, MM4 \n\t\ -pminswMM3, MM2 \n\t\ -pmaxswMM4, MM3 \n\t\ -pmaxswMM5, MM2 \n\t\ -pminswMM5, MM3 \n\t\ -pmaxswMM0, MM2 \n\t /* max */\ -pminswMM0, MM3 \n\t /* min */\ -pxor MM4, MM4 \n\t\ -pmaxswMM3, MM6 \n\t\ -psubw MM2, MM4 \n\t /* -max */\ -pmaxswMM4, MM6 \n\t /* diff= MAX3(diff, min, -max); */\ -1: \n\t\ -\ -MOVQ 16(%[tmp]), MM2 \n\t /* d */\ -MOVQ MM2, MM3 \n\t\ -psubw MM6, MM2 \n\t /* d-diff */\ -paddw MM6, MM3 \n\t /* d+diff */\ -pmaxswMM2, MM1 \n\t\ -pminswMM3, MM1 \n\t /* d = clip(spatial_pred, d-diff, d+diff); */\ -packuswb MM1, MM1 \n\t\ -\ -::[prev] r(prev),\ - [cur] r(cur),\ - [next] r(next),\ - [prefs]r((x86_reg)prefs),\ - [mrefs]r((x86_reg)mrefs),\ - [mode] g(mode),\ - [tmp] r(tmp)\ -);\ -__asm__ volatile(MOV MM1, %0 :=m(*dst));\ -dst += STEP;\ -prev+= STEP;\ -cur += STEP;\ -next+= STEP;\ -} - -if (parity) { -#define prev2 prev -#define next2 cur -FILTER -#undef 
prev2 -#undef next2 -} else { -#define prev2 cur -#define next2 next -FILTER -#undef prev2 -#undef next2 -} -} -#undef STEP -#undef MM -#undef MOV -#undef MOVQ -#undef MOVQU -#undef PSHUF -#undef PSRL1 -#undef PSRL2 -#undef LOAD -#undef PABS -#undef CHECK -#undef CHECK1 -#undef CHECK2 -#undef FILTER diff --git a/libavfilter/x86/yadif_yasm.asm b/libavfilter/x86/yadif_yasm.asm new file mode 100644 index 000..79265e6 --- /dev/null +++ b/libavfilter/x86/yadif_yasm.asm @@ -0,0 +1,241 @@ +;* +;* x86-optimized functions for yadif filter +;* Copyright (C) 2006 Michael Niedermayer michae...@gmx.at +;* Copyright (c) 2013 Daniel Kang daniel.d.k...@gmail.com +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License along +;* with Libav; if not, write to the Free Software Foundation, Inc., +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +;** + +%include libavutil/x86/x86util.asm + +SECTION_RODATA + +pb_1: times 16 db 1 +pw_1: times 8 dw 1 + +SECTION .text + +%macro CHECK 2 +movu m2, [curq+mrefsq+%1] +movu m3, [curq+prefsq+%2] +mova m4, m2 +mova m5, m2 +pxor m4, m3 +pavgb m5, m3 +pand m4, [pb_1] +psubusb m5, m4 +%if mmsize == 16 +psrldqm5, 1 +%else +psrlq m5, 8 +%endif +punpcklbw m5, m7 +mova m4
[libav-devel] [PATCH v2] YADIF: Port inline assembly to YASM
\ -pmaxubMM3, MM2 \n\t\ -PSHUF(MM3, MM2) \ -punpcklbw MM7, MM2 \n\t /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\ -punpcklbw MM7, MM3 \n\t /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\ -paddw MM2, MM0 \n\t\ -paddw MM3, MM0 \n\t\ -psubwMANGLE(pw_1), MM0 \n\t /* spatial_score */\ -\ -CHECK(-2,0)\ -CHECK1\ -CHECK(-3,1)\ -CHECK2\ -CHECK(0,-2)\ -CHECK1\ -CHECK(1,-3)\ -CHECK2\ -\ -/* if(p-mode2) ... */\ -MOVQ 48(%[tmp]), MM6 \n\t /* diff */\ -cmpl $2, %[mode] \n\t\ -jge 1f \n\t\ -LOAD((%[prev2],%[mrefs],2), MM2) /* prev2[x-2*refs] */\ -LOAD((%[next2],%[mrefs],2), MM4) /* next2[x-2*refs] */\ -LOAD((%[prev2],%[prefs],2), MM3) /* prev2[x+2*refs] */\ -LOAD((%[next2],%[prefs],2), MM5) /* next2[x+2*refs] */\ -paddw MM4, MM2 \n\t\ -paddw MM5, MM3 \n\t\ -psrlw $1,MM2 \n\t /* b */\ -psrlw $1,MM3 \n\t /* f */\ -MOVQ (%[tmp]), MM4 \n\t /* c */\ -MOVQ 16(%[tmp]), MM5 \n\t /* d */\ -MOVQ 32(%[tmp]), MM7 \n\t /* e */\ -psubw MM4, MM2 \n\t /* b-c */\ -psubw MM7, MM3 \n\t /* f-e */\ -MOVQ MM5, MM0 \n\t\ -psubw MM4, MM5 \n\t /* d-c */\ -psubw MM7, MM0 \n\t /* d-e */\ -MOVQ MM2, MM4 \n\t\ -pminswMM3, MM2 \n\t\ -pmaxswMM4, MM3 \n\t\ -pmaxswMM5, MM2 \n\t\ -pminswMM5, MM3 \n\t\ -pmaxswMM0, MM2 \n\t /* max */\ -pminswMM0, MM3 \n\t /* min */\ -pxor MM4, MM4 \n\t\ -pmaxswMM3, MM6 \n\t\ -psubw MM2, MM4 \n\t /* -max */\ -pmaxswMM4, MM6 \n\t /* diff= MAX3(diff, min, -max); */\ -1: \n\t\ -\ -MOVQ 16(%[tmp]), MM2 \n\t /* d */\ -MOVQ MM2, MM3 \n\t\ -psubw MM6, MM2 \n\t /* d-diff */\ -paddw MM6, MM3 \n\t /* d+diff */\ -pmaxswMM2, MM1 \n\t\ -pminswMM3, MM1 \n\t /* d = clip(spatial_pred, d-diff, d+diff); */\ -packuswb MM1, MM1 \n\t\ -\ -::[prev] r(prev),\ - [cur] r(cur),\ - [next] r(next),\ - [prefs]r((x86_reg)prefs),\ - [mrefs]r((x86_reg)mrefs),\ - [mode] g(mode),\ - [tmp] r(tmp)\ -);\ -__asm__ volatile(MOV MM1, %0 :=m(*dst));\ -dst += STEP;\ -prev+= STEP;\ -cur += STEP;\ -next+= STEP;\ -} - -if (parity) { -#define prev2 prev -#define next2 cur -FILTER -#undef prev2 -#undef next2 -} else { -#define prev2 cur 
-#define next2 next -FILTER -#undef prev2 -#undef next2 -} -} -#undef STEP -#undef MM -#undef MOV -#undef MOVQ -#undef MOVQU -#undef PSHUF -#undef PSRL1 -#undef PSRL2 -#undef LOAD -#undef PABS -#undef CHECK -#undef CHECK1 -#undef CHECK2 -#undef FILTER diff --git a/libavfilter/x86/yadif_yasm.asm b/libavfilter/x86/yadif_yasm.asm new file mode 100644 index 000..6519de8 --- /dev/null +++ b/libavfilter/x86/yadif_yasm.asm @@ -0,0 +1,241 @@ +;* +;* x86-optimized functions for yadif filter +;* Copyright (C) 2006 Michael Niedermayer michae...@gmx.at +;* Copyright (c) 2013 Daniel Kang daniel.d.k...@gmail.com +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License along +;* with Libav; if not, write to the Free Software Foundation, Inc., +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +;** + +%include libavutil/x86/x86util.asm + +SECTION_RODATA + +pb_1: times 16 db 1 +pw_1: times 8 dw 1 + +SECTION .text + +%macro CHECK 2 +movu m2, [curq+mrefsq+%1] +movu m3, [curq+prefsq+%2] +mova m4, m2 +mova m5, m2 +pxor m4, m3 +pavgb m5, m3 +pand m4, [pb_1] +psubusb m5, m4 +%if mmsize == 16 +psrldqm5, 1 +%else +psrlq m5, 8 +%endif +punpcklbw m5, m7 +mova m4, m2 +psubusb m2, m3 +psubusb m3, m4 +pmaxubm2, m3 +mova m3
[libav-devel] [PATCH] YADIF: Port inline assembly to YASM
\ -psubwMANGLE(pw_1), MM0 \n\t /* spatial_score */\ -\ -CHECK(-2,0)\ -CHECK1\ -CHECK(-3,1)\ -CHECK2\ -CHECK(0,-2)\ -CHECK1\ -CHECK(1,-3)\ -CHECK2\ -\ -/* if(p-mode2) ... */\ -MOVQ 48(%[tmp]), MM6 \n\t /* diff */\ -cmpl $2, %[mode] \n\t\ -jge 1f \n\t\ -LOAD((%[prev2],%[mrefs],2), MM2) /* prev2[x-2*refs] */\ -LOAD((%[next2],%[mrefs],2), MM4) /* next2[x-2*refs] */\ -LOAD((%[prev2],%[prefs],2), MM3) /* prev2[x+2*refs] */\ -LOAD((%[next2],%[prefs],2), MM5) /* next2[x+2*refs] */\ -paddw MM4, MM2 \n\t\ -paddw MM5, MM3 \n\t\ -psrlw $1,MM2 \n\t /* b */\ -psrlw $1,MM3 \n\t /* f */\ -MOVQ (%[tmp]), MM4 \n\t /* c */\ -MOVQ 16(%[tmp]), MM5 \n\t /* d */\ -MOVQ 32(%[tmp]), MM7 \n\t /* e */\ -psubw MM4, MM2 \n\t /* b-c */\ -psubw MM7, MM3 \n\t /* f-e */\ -MOVQ MM5, MM0 \n\t\ -psubw MM4, MM5 \n\t /* d-c */\ -psubw MM7, MM0 \n\t /* d-e */\ -MOVQ MM2, MM4 \n\t\ -pminswMM3, MM2 \n\t\ -pmaxswMM4, MM3 \n\t\ -pmaxswMM5, MM2 \n\t\ -pminswMM5, MM3 \n\t\ -pmaxswMM0, MM2 \n\t /* max */\ -pminswMM0, MM3 \n\t /* min */\ -pxor MM4, MM4 \n\t\ -pmaxswMM3, MM6 \n\t\ -psubw MM2, MM4 \n\t /* -max */\ -pmaxswMM4, MM6 \n\t /* diff= MAX3(diff, min, -max); */\ -1: \n\t\ -\ -MOVQ 16(%[tmp]), MM2 \n\t /* d */\ -MOVQ MM2, MM3 \n\t\ -psubw MM6, MM2 \n\t /* d-diff */\ -paddw MM6, MM3 \n\t /* d+diff */\ -pmaxswMM2, MM1 \n\t\ -pminswMM3, MM1 \n\t /* d = clip(spatial_pred, d-diff, d+diff); */\ -packuswb MM1, MM1 \n\t\ -\ -::[prev] r(prev),\ - [cur] r(cur),\ - [next] r(next),\ - [prefs]r((x86_reg)prefs),\ - [mrefs]r((x86_reg)mrefs),\ - [mode] g(mode),\ - [tmp] r(tmp)\ -);\ -__asm__ volatile(MOV MM1, %0 :=m(*dst));\ -dst += STEP;\ -prev+= STEP;\ -cur += STEP;\ -next+= STEP;\ -} - -if (parity) { -#define prev2 prev -#define next2 cur -FILTER -#undef prev2 -#undef next2 -} else { -#define prev2 cur -#define next2 next -FILTER -#undef prev2 -#undef next2 -} -} -#undef STEP -#undef MM -#undef MOV -#undef MOVQ -#undef MOVQU -#undef PSHUF -#undef PSRL1 -#undef PSRL2 -#undef LOAD -#undef PABS -#undef CHECK -#undef 
CHECK1 -#undef CHECK2 -#undef FILTER diff --git a/libavfilter/x86/yadif_yasm.asm b/libavfilter/x86/yadif_yasm.asm new file mode 100644 index 000..ce76ff3 --- /dev/null +++ b/libavfilter/x86/yadif_yasm.asm @@ -0,0 +1,246 @@ +;* +;* x86-optimized functions for volume filter +;* Copyright (c) 2013 Daniel Kang daniel.d.k...@gmail.com +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;** + +%include libavutil/x86/x86util.asm + +SECTION .text + +cextern pb_1 +cextern pw_1 + +%macro CHECK 2 +movu m2, [curq+mrefsq+%1] +movu m3, [curq+prefsq+%2] +mova m4, m2 +mova m5, m2 +pxor m4, m3 +pavgb m5, m3 +pand m4, [pb_1] +psubusb m5, m4 +%if mmsize == 16 +psrldqm5, 1 +%else +psrlq m5, 8 +%endif +punpcklbw m5, m7 +mova m4, m2 +psubusb m2, m3 +psubusb m3, m4 +pmaxubm2, m3 +mova m3, m2 +mova m4, m2 +%if mmsize == 16 +psrldqm3, 1 +psrldqm4, 2 +%else +psrlq m3, 8 +psrlq m4, 16 +%endif +punpcklbw m2, m7 +punpcklbw m3, m7 +punpcklbw m4, m7 +paddw m2, m3 +paddw m2, m4 +%endmacro + +%macro CHECK1 0 +movam3, m0 +pcmpgtw m3, m2 +pminsw m0, m2 +movam6, m3 +pand
Re: [libav-devel] [PATCH] YADIF: Port inline assembly to YASM
If someone could test this on msvc 32/64-bit that would be great. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] YADIF: Port inline assembly to YASM
On Sat, Jan 5, 2013 at 12:17 PM, Ronald S. Bultje rsbul...@gmail.com wrote: Hi, On Sat, Jan 5, 2013 at 9:01 AM, Daniel Kang daniel.d.k...@gmail.com wrote: --- a/libavfilter/x86/yadif.c +++ b/libavfilter/x86/yadif.c @@ -26,49 +26,34 @@ #include libavcodec/x86/dsputil_mmx.h #include libavfilter/yadif.h -#if HAVE_INLINE_ASM +#if HAVE_YASM DECLARE_ASM_CONST(16, const xmm_reg, pb_1) = {0x0101010101010101ULL, 0x0101010101010101ULL}; DECLARE_ASM_CONST(16, const xmm_reg, pw_1) = {0x0001000100010001ULL, 0x0001000100010001ULL}; Move to .asm file also (SECTION_RODATA). Just remove the whole _mmx.c file except for the glue bits. Fixed. +%macro YADIF 0 +cglobal yadif_filter_line, 9, 9, 8, dst, prev, cur, next, w, prefs, \ +mrefs, parity, mode +%assign pad 16*5-gprsize-(stack_offset15) +SUBrsp, pad [..] +ADDrsp, pad +RET +%endmacro cglobal yadif_filter_line, 9, 9, 8, 16*5, names... Fixed. That way stack alignment works on msvc also. Now, this is harder because you'll need to use only 6 regs on msvc (instead of 7), because the 7th one needs to hold the stack pointer. You can test locally by changing HAVE_INLINE_ASM from 1 to 0 in your config.{mak,h,asm}. Sorry, how changing that test MSVC? +INIT_XMM sse2 +YADIF +INIT_MMX mmxext +YADIF %if ARCH_X86_32 INIT_MMX mmxext YADIF %endif Same change in c wrapper glue, gives smaller object files. Fixed. Is there a fate test? Yes, filter-yadif-mode1 and filter-yadif-mode0 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] YADIF: Port inline assembly to YASM
On Sat, Jan 5, 2013 at 12:29 PM, Diego Biurrun di...@biurrun.de wrote: On Sat, Jan 05, 2013 at 11:01:19AM -0600, Daniel Kang wrote: --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -4,3 +4,4 @@ OBJS-$(CONFIG_YADIF_FILTER) += x86/yadif.o YASM-OBJS-$(CONFIG_HQDN3D_FILTER)+= x86/hqdn3d.o YASM-OBJS-$(CONFIG_VOLUME_FILTER)+= x86/af_volume.o +YASM-OBJS-$(CONFIG_VOLUME_FILTER)+= x86/yadif_yasm.o copy and paste typo :) Fixed. As a result, your code is probably untested, please check again. Interestingly enough it is tested (adding an int 3 kills the program as expected). I'm not sure why. --- a/libavfilter/x86/yadif.c +++ b/libavfilter/x86/yadif.c @@ -26,49 +26,34 @@ -#if HAVE_INLINE_ASM +#if HAVE_YASM -#if HAVE_MMXEXT_INLINE -#undef RENAME -#define RENAME(a) a ## _mmxext -#include yadif_template.c #endif -#endif /* HAVE_INLINE_ASM */ Please comment the #endif, these files have a tendency to collect a lot of them and then the commented endifs help keep track. Fixed. av_cold void ff_yadif_init_x86(YADIFContext *yadif) { int cpu_flags = av_get_cpu_flags(); -#if HAVE_MMXEXT_INLINE +#if HAVE_YASM if (cpu_flags AV_CPU_FLAG_MMXEXT) -yadif-filter_line = yadif_filter_line_mmxext; -#endif -#if HAVE_SSE2_INLINE +yadif-filter_line = ff_yadif_filter_line_mmxext; if (cpu_flags AV_CPU_FLAG_SSE2) -yadif-filter_line = yadif_filter_line_sse2; -#endif -#if HAVE_SSSE3_INLINE +yadif-filter_line = ff_yadif_filter_line_sse2; if (cpu_flags AV_CPU_FLAG_SSSE3) -yadif-filter_line = yadif_filter_line_ssse3; +yadif-filter_line = ff_yadif_filter_line_ssse3; #endif These could likely use HAVE_EXTERNAL_MMXEXT, etc... Maybe I'm missing something? AVX@AVX-PC /cygdrive/c/Code/libav $ git grep HAVE_EXTERNAL nothing --- /dev/null +++ b/libavfilter/x86/yadif_yasm.asm @@ -0,0 +1,246 @@ +;* +;* x86-optimized functions for volume filter volume? Copy/paste fail. Fixed. +;* Copyright (c) 2013 Daniel Kang daniel.d.k...@gmail.com Keep the previous copyright line as well. Fixed. 
+%macro CHECK 2 +%if mmsize == 16 +psrldqm5, 1 +%else +psrlq m5, 8 +%endif +%if mmsize == 16 +psrldqm3, 1 +psrldqm4, 2 +%else +psrlq m3, 8 +psrlq m4, 16 +%endif idea (untested): %macro PSRLQ 2 %if mmsize == 16 psrlq %1, %2 * 8 %else psrlq %1, %2 %endif I don't think it's worth it for those couple of lines. +%macro CHECK1 0 +%endmacro + +%macro CHECK2 0 +%endmacro These names are not terribly descriptive. They weren't in the original file. I have no idea what to call them. +%macro LOAD 2 +movh m%1, %2 +punpcklbw m%1, m7 +%endmacro + +%macro ABSY 1-2 +%if cpuflag(ssse3) +pabsw %1, %1 +%else +ABS1_MMXEXT %1, %2 +%endif +%endmacro Is this a candidate for simply extending ABS1_MMXEXT? No. The MMXEXT is there for a reason. +%macro FILTER 3 +.loop%1: +pxorm7, m7 +LOAD 0, [curq+mrefsq] +LOAD 1, [curq+prefsq] +LOAD 2, [%2] +LOAD 3, [%3] +movam4, m3 +paddw m3, m2 +psraw m3, 1 +mova [rsp+ 0], m0 +mova [rsp+16], m3 +mova [rsp+32], m1 Indentation is off by one char, keep the ',' aligned. Oops, fixed. +%if mmsize == 16 +mova m3, m2 +psrldq m3, 2 +%else +pshufw m3, m2, 9 +%endif Didn't we have a macro for this? If we do, I can't find it. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] YADIF: Port inline assembly to YASM
\ -PSHUF(MM3, MM2) \ -punpcklbw MM7, MM2 \n\t /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\ -punpcklbw MM7, MM3 \n\t /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\ -paddw MM2, MM0 \n\t\ -paddw MM3, MM0 \n\t\ -psubwMANGLE(pw_1), MM0 \n\t /* spatial_score */\ -\ -CHECK(-2,0)\ -CHECK1\ -CHECK(-3,1)\ -CHECK2\ -CHECK(0,-2)\ -CHECK1\ -CHECK(1,-3)\ -CHECK2\ -\ -/* if(p-mode2) ... */\ -MOVQ 48(%[tmp]), MM6 \n\t /* diff */\ -cmpl $2, %[mode] \n\t\ -jge 1f \n\t\ -LOAD((%[prev2],%[mrefs],2), MM2) /* prev2[x-2*refs] */\ -LOAD((%[next2],%[mrefs],2), MM4) /* next2[x-2*refs] */\ -LOAD((%[prev2],%[prefs],2), MM3) /* prev2[x+2*refs] */\ -LOAD((%[next2],%[prefs],2), MM5) /* next2[x+2*refs] */\ -paddw MM4, MM2 \n\t\ -paddw MM5, MM3 \n\t\ -psrlw $1,MM2 \n\t /* b */\ -psrlw $1,MM3 \n\t /* f */\ -MOVQ (%[tmp]), MM4 \n\t /* c */\ -MOVQ 16(%[tmp]), MM5 \n\t /* d */\ -MOVQ 32(%[tmp]), MM7 \n\t /* e */\ -psubw MM4, MM2 \n\t /* b-c */\ -psubw MM7, MM3 \n\t /* f-e */\ -MOVQ MM5, MM0 \n\t\ -psubw MM4, MM5 \n\t /* d-c */\ -psubw MM7, MM0 \n\t /* d-e */\ -MOVQ MM2, MM4 \n\t\ -pminswMM3, MM2 \n\t\ -pmaxswMM4, MM3 \n\t\ -pmaxswMM5, MM2 \n\t\ -pminswMM5, MM3 \n\t\ -pmaxswMM0, MM2 \n\t /* max */\ -pminswMM0, MM3 \n\t /* min */\ -pxor MM4, MM4 \n\t\ -pmaxswMM3, MM6 \n\t\ -psubw MM2, MM4 \n\t /* -max */\ -pmaxswMM4, MM6 \n\t /* diff= MAX3(diff, min, -max); */\ -1: \n\t\ -\ -MOVQ 16(%[tmp]), MM2 \n\t /* d */\ -MOVQ MM2, MM3 \n\t\ -psubw MM6, MM2 \n\t /* d-diff */\ -paddw MM6, MM3 \n\t /* d+diff */\ -pmaxswMM2, MM1 \n\t\ -pminswMM3, MM1 \n\t /* d = clip(spatial_pred, d-diff, d+diff); */\ -packuswb MM1, MM1 \n\t\ -\ -::[prev] r(prev),\ - [cur] r(cur),\ - [next] r(next),\ - [prefs]r((x86_reg)prefs),\ - [mrefs]r((x86_reg)mrefs),\ - [mode] g(mode),\ - [tmp] r(tmp)\ -);\ -__asm__ volatile(MOV MM1, %0 :=m(*dst));\ -dst += STEP;\ -prev+= STEP;\ -cur += STEP;\ -next+= STEP;\ -} - -if (parity) { -#define prev2 prev -#define next2 cur -FILTER -#undef prev2 -#undef next2 -} else { -#define prev2 cur -#define next2 next 
-FILTER -#undef prev2 -#undef next2 -} -} -#undef STEP -#undef MM -#undef MOV -#undef MOVQ -#undef MOVQU -#undef PSHUF -#undef PSRL1 -#undef PSRL2 -#undef LOAD -#undef PABS -#undef CHECK -#undef CHECK1 -#undef CHECK2 -#undef FILTER diff --git a/libavfilter/x86/yadif_yasm.asm b/libavfilter/x86/yadif_yasm.asm new file mode 100644 index 000..91ced62 --- /dev/null +++ b/libavfilter/x86/yadif_yasm.asm @@ -0,0 +1,250 @@ +;* +;* x86-optimized functions for yadif filter +;* Copyright (C) 2006 Michael Niedermayer michae...@gmx.at +;* Copyright (c) 2013 Daniel Kang daniel.d.k...@gmail.com +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License along +;* with Libav; if not, write to the Free Software Foundation, Inc., +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +;** + +%include libavutil/x86/x86util.asm + +SECTION_RODATA + +pb_1: times 16 db 1 +pw_1: times 8 dw 1 + +SECTION .text + +%macro CHECK 2 +movu m2, [curq+mrefsq+%1] +movu m3, [curq+prefsq+%2] +mova m4, m2 +mova m5, m2 +pxor m4, m3 +pavgb m5, m3 +pand m4, [pb_1] +psubusb m5, m4 +%if mmsize == 16 +psrldqm5, 1 +%else +psrlq m5, 8 +%endif +punpcklbw m5, m7 +mova m4, m2 +psubusb m2, m3 +psubusb m3, m4 +pmaxubm2, m3 +mova m3, m2 +mova m4, m2 +%if mmsize
Re: [libav-devel] [PATCH] YADIF: Port inline assembly to YASM
On Sat, Jan 5, 2013 at 5:47 PM, Diego Biurrun di...@biurrun.de wrote: On Sat, Jan 05, 2013 at 12:48:58PM -0500, Daniel Kang wrote: On Sat, Jan 5, 2013 at 12:29 PM, Diego Biurrun di...@biurrun.de wrote: On Sat, Jan 05, 2013 at 11:01:19AM -0600, Daniel Kang wrote: --- a/libavfilter/x86/yadif.c +++ b/libavfilter/x86/yadif.c av_cold void ff_yadif_init_x86(YADIFContext *yadif) { int cpu_flags = av_get_cpu_flags(); -#if HAVE_MMXEXT_INLINE +#if HAVE_YASM if (cpu_flags AV_CPU_FLAG_MMXEXT) -yadif-filter_line = yadif_filter_line_mmxext; -#endif -#if HAVE_SSE2_INLINE +yadif-filter_line = ff_yadif_filter_line_mmxext; if (cpu_flags AV_CPU_FLAG_SSE2) -yadif-filter_line = yadif_filter_line_sse2; -#endif -#if HAVE_SSSE3_INLINE +yadif-filter_line = ff_yadif_filter_line_sse2; if (cpu_flags AV_CPU_FLAG_SSSE3) -yadif-filter_line = yadif_filter_line_ssse3; +yadif-filter_line = ff_yadif_filter_line_ssse3; #endif These could likely use HAVE_EXTERNAL_MMXEXT, etc... Maybe I'm missing something? AVX@AVX-PC /cygdrive/c/Code/libav $ git grep HAVE_EXTERNAL nothing I confused the order, it's HAVE_MMXEXT_EXTERNAL, etc... Fixed. +%macro CHECK1 0 +%endmacro + +%macro CHECK2 0 +%endmacro These names are not terribly descriptive. They weren't in the original file. I have no idea what to call them. Maybe ask Loren? Asked. +%macro LOAD 2 +movh m%1, %2 +punpcklbw m%1, m7 +%endmacro + +%macro ABSY 1-2 +%if cpuflag(ssse3) +pabsw %1, %1 +%else +ABS1_MMXEXT %1, %2 +%endif +%endmacro Is this a candidate for simply extending ABS1_MMXEXT? No. The MMXEXT is there for a reason. I was being totally unclear, as I was referring to some changes I only have locally. Please review the following patch: http://patches.libav.org/patch/25264/ That looks right. +%if mmsize == 16 +mova m3, m2 +psrldq m3, 2 +%else +pshufw m3, m2, 9 +%endif Didn't we have a macro for this? If we do, I can't find it. h264_intrapred.asm has some very similar code that likely should be extracted into a common macro. In another patch? 
This patch is supposed to be a port. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] YADIF: Port inline assembly to YASM
\ -psubusb MM4, MM3 \n\t\ -pmaxubMM3, MM2 \n\t\ -PSHUF(MM3, MM2) \ -punpcklbw MM7, MM2 \n\t /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\ -punpcklbw MM7, MM3 \n\t /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\ -paddw MM2, MM0 \n\t\ -paddw MM3, MM0 \n\t\ -psubwMANGLE(pw_1), MM0 \n\t /* spatial_score */\ -\ -CHECK(-2,0)\ -CHECK1\ -CHECK(-3,1)\ -CHECK2\ -CHECK(0,-2)\ -CHECK1\ -CHECK(1,-3)\ -CHECK2\ -\ -/* if(p-mode2) ... */\ -MOVQ 48(%[tmp]), MM6 \n\t /* diff */\ -cmpl $2, %[mode] \n\t\ -jge 1f \n\t\ -LOAD((%[prev2],%[mrefs],2), MM2) /* prev2[x-2*refs] */\ -LOAD((%[next2],%[mrefs],2), MM4) /* next2[x-2*refs] */\ -LOAD((%[prev2],%[prefs],2), MM3) /* prev2[x+2*refs] */\ -LOAD((%[next2],%[prefs],2), MM5) /* next2[x+2*refs] */\ -paddw MM4, MM2 \n\t\ -paddw MM5, MM3 \n\t\ -psrlw $1,MM2 \n\t /* b */\ -psrlw $1,MM3 \n\t /* f */\ -MOVQ (%[tmp]), MM4 \n\t /* c */\ -MOVQ 16(%[tmp]), MM5 \n\t /* d */\ -MOVQ 32(%[tmp]), MM7 \n\t /* e */\ -psubw MM4, MM2 \n\t /* b-c */\ -psubw MM7, MM3 \n\t /* f-e */\ -MOVQ MM5, MM0 \n\t\ -psubw MM4, MM5 \n\t /* d-c */\ -psubw MM7, MM0 \n\t /* d-e */\ -MOVQ MM2, MM4 \n\t\ -pminswMM3, MM2 \n\t\ -pmaxswMM4, MM3 \n\t\ -pmaxswMM5, MM2 \n\t\ -pminswMM5, MM3 \n\t\ -pmaxswMM0, MM2 \n\t /* max */\ -pminswMM0, MM3 \n\t /* min */\ -pxor MM4, MM4 \n\t\ -pmaxswMM3, MM6 \n\t\ -psubw MM2, MM4 \n\t /* -max */\ -pmaxswMM4, MM6 \n\t /* diff= MAX3(diff, min, -max); */\ -1: \n\t\ -\ -MOVQ 16(%[tmp]), MM2 \n\t /* d */\ -MOVQ MM2, MM3 \n\t\ -psubw MM6, MM2 \n\t /* d-diff */\ -paddw MM6, MM3 \n\t /* d+diff */\ -pmaxswMM2, MM1 \n\t\ -pminswMM3, MM1 \n\t /* d = clip(spatial_pred, d-diff, d+diff); */\ -packuswb MM1, MM1 \n\t\ -\ -::[prev] r(prev),\ - [cur] r(cur),\ - [next] r(next),\ - [prefs]r((x86_reg)prefs),\ - [mrefs]r((x86_reg)mrefs),\ - [mode] g(mode),\ - [tmp] r(tmp)\ -);\ -__asm__ volatile(MOV MM1, %0 :=m(*dst));\ -dst += STEP;\ -prev+= STEP;\ -cur += STEP;\ -next+= STEP;\ -} - -if (parity) { -#define prev2 prev -#define next2 cur -FILTER -#undef prev2 -#undef next2 -} 
else { -#define prev2 cur -#define next2 next -FILTER -#undef prev2 -#undef next2 -} -} -#undef STEP -#undef MM -#undef MOV -#undef MOVQ -#undef MOVQU -#undef PSHUF -#undef PSRL1 -#undef PSRL2 -#undef LOAD -#undef PABS -#undef CHECK -#undef CHECK1 -#undef CHECK2 -#undef FILTER diff --git a/libavfilter/x86/yadif_yasm.asm b/libavfilter/x86/yadif_yasm.asm new file mode 100644 index 000..76553a3 --- /dev/null +++ b/libavfilter/x86/yadif_yasm.asm @@ -0,0 +1,249 @@ +;* +;* x86-optimized functions for yadif filter +;* Copyright (C) 2006 Michael Niedermayer michae...@gmx.at +;* Copyright (c) 2013 Daniel Kang daniel.d.k...@gmail.com +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License along +;* with Libav; if not, write to the Free Software Foundation, Inc., +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +;** + +%include libavutil/x86/x86util.asm + +SECTION_RODATA + +pb_1: times 16 db 1 +pw_1: times 8 dw 1 + +SECTION .text + +%macro CHECK 2 +movu m2, [curq+mrefsq+%1] +movu m3, [curq+prefsq+%2] +mova m4, m2 +mova m5, m2 +pxor m4, m3 +pavgb m5, m3 +pand m4, [pb_1] +psubusb m5, m4 +%if mmsize == 16 +psrldqm5, 1 +%else +psrlq m5, 8 +%endif +punpcklbw m5, m7 +mova m4, m2 +psubusb m2, m3 +psubusb m3, m4
[libav-devel] [PATCH] YADIF: Port inline assembly to YASM
\ -pmaxubMM3, MM2 \n\t\ -PSHUF(MM3, MM2) \ -punpcklbw MM7, MM2 \n\t /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\ -punpcklbw MM7, MM3 \n\t /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\ -paddw MM2, MM0 \n\t\ -paddw MM3, MM0 \n\t\ -psubwMANGLE(pw_1), MM0 \n\t /* spatial_score */\ -\ -CHECK(-2,0)\ -CHECK1\ -CHECK(-3,1)\ -CHECK2\ -CHECK(0,-2)\ -CHECK1\ -CHECK(1,-3)\ -CHECK2\ -\ -/* if(p-mode2) ... */\ -MOVQ 48(%[tmp]), MM6 \n\t /* diff */\ -cmpl $2, %[mode] \n\t\ -jge 1f \n\t\ -LOAD((%[prev2],%[mrefs],2), MM2) /* prev2[x-2*refs] */\ -LOAD((%[next2],%[mrefs],2), MM4) /* next2[x-2*refs] */\ -LOAD((%[prev2],%[prefs],2), MM3) /* prev2[x+2*refs] */\ -LOAD((%[next2],%[prefs],2), MM5) /* next2[x+2*refs] */\ -paddw MM4, MM2 \n\t\ -paddw MM5, MM3 \n\t\ -psrlw $1,MM2 \n\t /* b */\ -psrlw $1,MM3 \n\t /* f */\ -MOVQ (%[tmp]), MM4 \n\t /* c */\ -MOVQ 16(%[tmp]), MM5 \n\t /* d */\ -MOVQ 32(%[tmp]), MM7 \n\t /* e */\ -psubw MM4, MM2 \n\t /* b-c */\ -psubw MM7, MM3 \n\t /* f-e */\ -MOVQ MM5, MM0 \n\t\ -psubw MM4, MM5 \n\t /* d-c */\ -psubw MM7, MM0 \n\t /* d-e */\ -MOVQ MM2, MM4 \n\t\ -pminswMM3, MM2 \n\t\ -pmaxswMM4, MM3 \n\t\ -pmaxswMM5, MM2 \n\t\ -pminswMM5, MM3 \n\t\ -pmaxswMM0, MM2 \n\t /* max */\ -pminswMM0, MM3 \n\t /* min */\ -pxor MM4, MM4 \n\t\ -pmaxswMM3, MM6 \n\t\ -psubw MM2, MM4 \n\t /* -max */\ -pmaxswMM4, MM6 \n\t /* diff= MAX3(diff, min, -max); */\ -1: \n\t\ -\ -MOVQ 16(%[tmp]), MM2 \n\t /* d */\ -MOVQ MM2, MM3 \n\t\ -psubw MM6, MM2 \n\t /* d-diff */\ -paddw MM6, MM3 \n\t /* d+diff */\ -pmaxswMM2, MM1 \n\t\ -pminswMM3, MM1 \n\t /* d = clip(spatial_pred, d-diff, d+diff); */\ -packuswb MM1, MM1 \n\t\ -\ -::[prev] r(prev),\ - [cur] r(cur),\ - [next] r(next),\ - [prefs]r((x86_reg)prefs),\ - [mrefs]r((x86_reg)mrefs),\ - [mode] g(mode),\ - [tmp] r(tmp)\ -);\ -__asm__ volatile(MOV MM1, %0 :=m(*dst));\ -dst += STEP;\ -prev+= STEP;\ -cur += STEP;\ -next+= STEP;\ -} - -if (parity) { -#define prev2 prev -#define next2 cur -FILTER -#undef prev2 -#undef next2 -} else { -#define prev2 cur 
-#define next2 next -FILTER -#undef prev2 -#undef next2 -} -} -#undef STEP -#undef MM -#undef MOV -#undef MOVQ -#undef MOVQU -#undef PSHUF -#undef PSRL1 -#undef PSRL2 -#undef LOAD -#undef PABS -#undef CHECK -#undef CHECK1 -#undef CHECK2 -#undef FILTER diff --git a/libavfilter/x86/yadif_yasm.asm b/libavfilter/x86/yadif_yasm.asm new file mode 100644 index 000..e51ed7e --- /dev/null +++ b/libavfilter/x86/yadif_yasm.asm @@ -0,0 +1,249 @@ +;* +;* x86-optimized functions for yadif filter +;* Copyright (C) 2006 Michael Niedermayer michae...@gmx.at +;* Copyright (c) 2013 Daniel Kang daniel.d.k...@gmail.com +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License along +;* with Libav; if not, write to the Free Software Foundation, Inc., +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +;** + +%include libavutil/x86/x86util.asm + +SECTION_RODATA + +pb_1: times 16 db 1 +pw_1: times 8 dw 1 + +SECTION .text + +%macro CHECK 2 +movu m2, [curq+mrefsq+%1] +movu m3, [curq+prefsq+%2] +mova m4, m2 +mova m5, m2 +pxor m4, m3 +pavgb m5, m3 +pand m4, [pb_1] +psubusb m5, m4 +%if mmsize == 16 +psrldqm5, 1 +%else +psrlq m5, 8 +%endif +punpcklbw m5, m7 +mova m4, m2 +psubusb m2, m3 +psubusb m3, m4 +pmaxubm2, m3 +mova
Re: [libav-devel] [PATCH] x86: h264_qpel: sign-extend stride arguments
On Tue, Nov 6, 2012 at 7:34 AM, Diego Biurrun di...@biurrun.de wrote: --- Now sign-extends all stride arguments, not just put_pixels16_sse2(). Also adds a colon to a jump label to make NASM happy. Since this patch is intended for squashing, I did not separate it from the rest. libavcodec/x86/dsputil.asm| 11 ++- libavcodec/x86/h264_qpel_8bit.asm | 30 ++ 2 files changed, 40 insertions(+), 1 deletions(-) If it works, the patch looks okay to me? ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 1/2] H264: Remove 3dnow qpel code.
The only CPUs that have 3dnow and don't have mmxext are 12 years old. Moreover, AMD has deprecated 3dnow. --- libavcodec/x86/dsputil_avg_template.c |8 +- libavcodec/x86/dsputil_mmx.c | 142 + libavcodec/x86/h264_qpel.c|4 - 3 files changed, 8 insertions(+), 146 deletions(-) diff --git a/libavcodec/x86/dsputil_avg_template.c b/libavcodec/x86/dsputil_avg_template.c index 8b116b7..b514746 100644 --- a/libavcodec/x86/dsputil_avg_template.c +++ b/libavcodec/x86/dsputil_avg_template.c @@ -55,6 +55,7 @@ static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_ :%REG_a, memory); } +#ifndef SKIP_FOR_3DNOW static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) { __asm__ volatile( @@ -104,7 +105,7 @@ static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int :S((x86_reg)src1Stride), D((x86_reg)dstStride) :memory); } - +#endif static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) { @@ -226,6 +227,7 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src :memory);*/ } +#ifndef SKIP_FOR_3DNOW static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) { __asm__ volatile( @@ -276,7 +278,7 @@ static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int :S((x86_reg)src1Stride), D((x86_reg)dstStride) :memory); } - +#endif static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) { @@ -872,6 +874,7 @@ static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line :%REG_a, memory); } +#ifndef SKIP_FOR_3DNOW static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { do { @@ -896,6 +899,7 @@ static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_siz h -= 4; } while(h 0); } +#endif //FIXME the following 
could be optimized too ... static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 86a08cb..a0231b7 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -197,12 +197,14 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; #define DEF(x) x ## _3dnow #define PAVGB pavgusb #define OP_AVG PAVGB +#define SKIP_FOR_3DNOW #include dsputil_avg_template.c #undef DEF #undef PAVGB #undef OP_AVG +#undef SKIP_FOR_3DNOW /***/ /* MMX2 specific */ @@ -1051,73 +1053,6 @@ static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, \ );\ } \ \ -static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, \ - uint8_t *src, \ - int dstStride, \ - int srcStride, \ - int h) \ -{ \ -int i;\ -int16_t temp[16]; \ -/* quick HACK, XXX FIXME MUST be optimized */ \ -for (i = 0; i h; i++) { \ -temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 + \ - (src[ 1] + src[ 3]) * 3 - (src[ 2] + src[ 4]);\ -temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 + \ - (src[ 0] + src[ 4]) * 3 - (src[ 1] + src[ 5]);\ -temp[ 2] = (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 + \ - (src[ 0] + src[ 5]) * 3 - (src[ 0] + src[ 6]);\ -temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 + \ - (src[ 1] + src[ 6]) * 3 - (src[ 0] + src[ 7]);\ -temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 + \ - (src[ 2] + src[ 7]) * 3 - (src[ 1] + src[ 8]);\ -temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 + \ - (src[ 3] + src[ 8]) * 3 - (src[ 2] + src[ 9]);\ -temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 + \ - (src[ 4] + src[ 9]) * 3 -
Re: [libav-devel] [PATCH 2/2] H.264: Convert 8-bit qpel inlined assembly to yasm
On Sat, Oct 13, 2012 at 11:04 AM, Daniel Kang daniel.d.k...@gmail.com wrote: --- libavcodec/x86/Makefile |4 +- libavcodec/x86/dsputil.asm| 222 +++ libavcodec/x86/dsputil_avg_template.c | 136 +--- libavcodec/x86/dsputil_mmx.c | 105 +-- libavcodec/x86/h264_qpel.c| 1138 + libavcodec/x86/h264_qpel_8bit.asm | 833 6 files changed, 1239 insertions(+), 1199 deletions(-) create mode 100644 libavcodec/x86/h264_qpel_8bit.asm This introduces many unused function warnings, but the functions are being called... ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/2] H.264: Convert 8-bit qpel inlined assembly to yasm
On Sat, Oct 13, 2012 at 11:31 AM, Måns Rullgård m...@mansr.com wrote: Daniel Kang daniel.d.k...@gmail.com writes: On Sat, Oct 13, 2012 at 11:04 AM, Daniel Kang daniel.d.k...@gmail.com wrote: --- libavcodec/x86/Makefile |4 +- libavcodec/x86/dsputil.asm| 222 +++ libavcodec/x86/dsputil_avg_template.c | 136 +--- libavcodec/x86/dsputil_mmx.c | 105 +-- libavcodec/x86/h264_qpel.c| 1138 + libavcodec/x86/h264_qpel_8bit.asm | 833 6 files changed, 1239 insertions(+), 1199 deletions(-) create mode 100644 libavcodec/x86/h264_qpel_8bit.asm This introduces many unused function warnings, but the functions are being called... Obviously they are not. GCC does not simply make things like that up. If I add an exit in the appropriate places, it exits. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] H.264: Convert 8-bit qpel inlined assembly to yasm
On Wed, Aug 22, 2012 at 11:30 PM, Loren Merritt lor...@u.washington.edu wrote: On Wed, 22 Aug 2012, daniel.d.k...@gmail.com wrote: +; void pixels8_l2_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +%macro PIXELS8_L2 1 +%define OP op_%1 +cglobal %1_pixels8_l2, 6,6 +test r5d, 1 +je .loop +movam0, [r1] +movam1, [r2] +add r1, r4 +add r2, 8 +pavgb m0, m1 +OP m0, [r0] +add r0, r3 +decr5d +.loop: +movam0, [r1] +add r1, r4 +movam1, [r1] +add r1, r4 +pavgb m0, [r2] +pavgb m1, [r2+8] +OP m0, [r0] +add r0, r3 +OP m1, [r0] +add r0, r3 +movam0, [r1] +add r1, r4 +movam1, [r1] +add r1, r4 +pavgb m0, [r2+16] +pavgb m1, [r2+24] +OP m0, [r0] +add r0, r3 +OP m1, [r0] +add r0, r3 +add r2, 32 +subr5d, 4 +jne .loop +REP_RET +%endmacro More adds than necessary. Use [r1+r4]. Fixed locally. +%macro QPEL4_H_LOWPASS_OP 1 +cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride +%define OP op_%1h I don't think this define clarifies anything, and it's only used once or twice each function. What do you suggest I do instead? +%macro QPEL8_H_LOWPASS_OP_XMM 1 +%define OP op_%1h +cglobal %1_h264_qpel8_h_lowpass, 4,5,7 ; dst, src, dstStride, srcStride +mov r4d, 8 +pxor m7, m7 +mova m6, [pw_5] +.loop: +lddqu m1, [r1-2] +mova m0, m1 +punpckhbw m1, m7 +punpcklbw m0, m7 +mova m2, m1 +mova m3, m1 +mova m4, m1 +mova m5, m1 +palignr m4, m0, 2 +palignr m3, m0, 4 +palignr m2, m0, 6 +palignr m1, m0, 8 +palignr m5, m0, 10 +paddw m0, m5 +paddw m2, m3 +paddw m1, m4 +psllw m2, 2 +psubw m2, m1 +paddw m0, [pw_16] +pmullwm2, m6 +paddw m2, m0 +psraw m2, 5 +packuswb m2, m2 +OPm2, [r0], m4 +add r1, r3 +add r0, r2 +dec r4d +jne .loop +REP_RET +%endmacro + +INIT_XMM ssse3 +QPEL8_H_LOWPASS_OP_XMM put +QPEL8_H_LOWPASS_OP_XMM avg There aren't any cpus that have both lddqu and ssse3. Use movu instead, since that's what lddqu actually does on everything other than pentium4. Fixed locally. 
___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] H.264: Convert 8-bit qpel inlined assembly to yasm
On Thu, Aug 23, 2012 at 12:22 PM, Derek Buitenhuis derek.buitenh...@gmail.com wrote: On 22/08/2012 10:56 PM, daniel.d.k...@gmail.com wrote: From: Daniel Kang daniel.d.k...@gmail.com --- libavcodec/x86/Makefile |3 +- libavcodec/x86/dsputil.asm| 138 libavcodec/x86/dsputil_mmx.c | 79 +- libavcodec/x86/dsputil_mmx_avg_template.c |8 +- libavcodec/x86/h264_qpel.asm | 849 ++ libavcodec/x86/h264_qpel_mmx.c| 1107 - 6 files changed, 1149 insertions(+), 1035 deletions(-) create mode 100644 libavcodec/x86/h264_qpel.asm Doesn't this break !HAVE_INLINE_ASM? My tests indicate it does... No idea. There's no way to configure !HAVE_INLINE_ASM (before you say disable in config.h, you also have to disable HAVE_EBX, blah blah). Get me a system with VS or a configure option and I'll fix the breakage. A little birdie told me you haven't switched over all funcs to YASM versions yet. Depends on what you mean by that. I have yasm versions for everything needed. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/2] H.264: Convert 8-bit qpel inlined assembly to yasm
On Wed, Aug 22, 2012 at 5:01 AM, Diego Biurrun di...@biurrun.de wrote: On Tue, Aug 21, 2012 at 10:08:03PM -0500, daniel.d.k...@gmail.com wrote: From: Daniel Kang daniel.d.k...@gmail.com --- libavcodec/x86/Makefile |3 +- libavcodec/x86/dsputil.asm| 138 libavcodec/x86/dsputil_mmx.c | 79 +- libavcodec/x86/dsputil_mmx_avg_template.c |8 +- libavcodec/x86/h264_qpel.asm | 853 ++ libavcodec/x86/h264_qpel_mmx.c| 1107 - 6 files changed, 1153 insertions(+), 1035 deletions(-) create mode 100644 libavcodec/x86/h264_qpel.asm What changed? Again: Please annotate your patches with the --annotate option of git-send-email. My bad -- I forgot to do this. I fixed the first patch to shut up compiler warnings. --- /dev/null +++ b/libavcodec/x86/h264_qpel.asm @@ -0,0 +1,853 @@ + +%macro op_avgh 3 ; op_avgh +movh %3, %2 +pavgb %1, %3 +movh %2, %1 +%endmacro + +%macro op_avg 3 ; op_avg +pavgb %1, %2 +mova %2, %1 +%endmacro + +%macro op_puth 3 ; op_puth +movh %2, %1 +%endmacro + +%macro op_put 3 ; op_put +mova %2, %1 +%endmacro The the comments comments look look very very redundant redundant. Added for debugging and forgot to change. Fixed. +%if ARCH_X86_64 +; Is there a has ssse3 flag? Yes of course, why do you ask in a comment buried deep in the code? Fixed. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/2] H264: Remove 3dnow qpel code.
On Wed, Aug 22, 2012 at 9:14 AM, Diego Biurrun di...@biurrun.de wrote: On Tue, Aug 21, 2012 at 10:08:02PM -0500, daniel.d.k...@gmail.com wrote: From: Daniel Kang daniel.d.k...@gmail.com The only CPUs that have 3dnow and don't have mmxext are 12 years old. Moreover, AMD has deprecated 3dnow. --- libavcodec/x86/dsputil_mmx.c | 142 + libavcodec/x86/dsputil_mmx_avg_template.c |8 +- libavcodec/x86/h264_qpel_mmx.c|4 - 3 files changed, 8 insertions(+), 146 deletions(-) You have sent this patch before, we have discussed this before and my last round of questions remains unanswered. What questions? I fixed the SKIP_FOR_3DNOW ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/2] H264: Remove 3dnow qpel code.
On Wed, Aug 22, 2012 at 10:19 AM, Diego Biurrun di...@biurrun.de wrote: On Wed, Aug 22, 2012 at 10:15:14AM -0700, Daniel Kang wrote: On Wed, Aug 22, 2012 at 9:14 AM, Diego Biurrun di...@biurrun.de wrote: On Tue, Aug 21, 2012 at 10:08:02PM -0500, daniel.d.k...@gmail.com wrote: From: Daniel Kang daniel.d.k...@gmail.com The only CPUs that have 3dnow and don't have mmxext are 12 years old. Moreover, AMD has deprecated 3dnow. --- libavcodec/x86/dsputil_mmx.c | 142 + libavcodec/x86/dsputil_mmx_avg_template.c |8 +- libavcodec/x86/h264_qpel_mmx.c|4 - 3 files changed, 8 insertions(+), 146 deletions(-) You have sent this patch before, we have discussed this before and my last round of questions remains unanswered. What questions? I fixed the SKIP_FOR_3DNOW See my mail from July 25th. Size: The binary is about 418kb smaller. Why drop this: As mentioned in the commit message, 3dnow is deprecated by AMD. I see no reason to support a deprecated instruction set. Also, it's a maintenance burden for anyone who makes functional changes to the code. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/2] H264: Remove 3dnow qpel code.
On Wed, Aug 22, 2012 at 4:04 PM, Diego Biurrun di...@biurrun.de wrote: On Wed, Aug 22, 2012 at 08:56:46PM +0200, Luca Barbato wrote: On 8/22/12 8:45 PM, Måns Rullgård wrote: Daniel Kang daniel.d.k...@gmail.com writes: Why drop this: As mentioned in the commit message, 3dnow is deprecated by AMD. I see no reason to support a deprecated instruction set. Also, it's a maintenance burden for anyone who makes functional changes to the code. It is deprecated on current CPUs, not on the ancient ones predating SSE. We should mention which cpu might have a regression in performance (yes I can only think only of one) the rest should be fine. Do we have mmxext or sse for all functions that we have 3dnow for? For everything I removed, there's an mmxext counterpart. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/4] vc1dec: Fix motion vector scaling for field pictures
On Sun, Aug 19, 2012 at 1:10 PM, Diego Biurrun di...@biurrun.de wrote: On Sun, Aug 19, 2012 at 07:00:32PM +0200, Hendrik Leppkes wrote: On Sun, Aug 19, 2012 at 6:52 PM, Diego Biurrun di...@biurrun.de wrote: --- a/libavcodec/vc1dec.c +++ b/libavcodec/vc1dec.c @@ -1164,177 +1164,57 @@ static av_always_inline void get_mvdata_interlaced(VC1Context *v, int *dmv_x, +static void scale_field_mv(VC1Context *v, int16_t *mv, int opp, int dir) { +px = (px * scale) 8; +py = (py * scale) 8; + +if (FFABS(px) scalezone1_x) +px = (px * scale1) 8; +else if (px 0) +px = ((px * scale2) 8) - zone1offset_x; + else +px = ((px * scale2) 8) + zone1offset_x; + +if (FFABS(py) scalezone1_y) +py = (py * scale1) 8; +else if (py 0) +py = ((py * scale2) 8) - zone1offset_y; + else +py = ((py * scale2) 8) + zone1offset_y; many unnecessary () Just because they are not necessary for the syntax does not mean that they don't greatly help the readability of the code. Sure, but clearly not the case here IMO. The () around the multiplications just clutter the code, which will in any case be evaluated left to right, matching reading direction. Myself I looked twice to see why the () were added, only to note that they were added for no reason. I suggest we paint the bikeshed bright pink. Please, there's no need to reject a functional patch on the basis of too many () ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/3] x86: avcodec: Consistently name all init files
On Wed, Aug 15, 2012 at 2:40 AM, Diego Biurrun di...@biurrun.de wrote: On Tue, Aug 14, 2012 at 02:58:52PM +0200, Janne Grunau wrote: On 2012-08-13 13:42:57 +0200, Diego Biurrun wrote: --- libavcodec/x86/Makefile|6 +++--- libavcodec/x86/{pngdsp-init.c => pngdsp_init.c}|0 .../x86/{proresdsp-init.c => proresdsp_init.c} |0 libavcodec/x86/{vp8dsp-init.c => vp8dsp_init.c}|0 4 files changed, 3 insertions(+), 3 deletions(-) rename libavcodec/x86/{pngdsp-init.c => pngdsp_init.c} (100%) rename libavcodec/x86/{proresdsp-init.c => proresdsp_init.c} (100%) rename libavcodec/x86/{vp8dsp-init.c => vp8dsp_init.c} (100%) ok, since you obviously care. do we have a rule in the developer docs to use underscores instead of dash in filenames? I'm just asking because it annoys me in the Linux kernel that both are used. We don't have a rule. We could of course add one if you want. I haven't given a general rule much thought yet, but, as you may have guessed, inconsistencies between similar files in the same subdirectory irked me. Can we choose something to prevent potential multiple renames of files? ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/3] x86: avcodec: Consistently name all init files
On Wed, Aug 15, 2012 at 3:41 PM, Diego Biurrun di...@biurrun.de wrote: On Wed, Aug 15, 2012 at 09:47:28AM -0700, Daniel Kang wrote: On Wed, Aug 15, 2012 at 2:40 AM, Diego Biurrun di...@biurrun.de wrote: On Tue, Aug 14, 2012 at 02:58:52PM +0200, Janne Grunau wrote: On 2012-08-13 13:42:57 +0200, Diego Biurrun wrote: --- libavcodec/x86/Makefile|6 +++--- libavcodec/x86/{pngdsp-init.c = pngdsp_init.c}|0 .../x86/{proresdsp-init.c = proresdsp_init.c} |0 libavcodec/x86/{vp8dsp-init.c = vp8dsp_init.c}|0 4 files changed, 3 insertions(+), 3 deletions(-) rename libavcodec/x86/{pngdsp-init.c = pngdsp_init.c} (100%) rename libavcodec/x86/{proresdsp-init.c = proresdsp_init.c} (100%) rename libavcodec/x86/{vp8dsp-init.c = vp8dsp_init.c} (100%) ok, since you obviously care. do we have a rule in the developer docs to use underscores instead of dash in filenames? I'm just asking because the it annoys me in the Linux kernel that both are used. We don't have a rule. We could of course add one if you want. I haven't given a general rule much thought yet, but, as you may have guessed, inconsistencies between similar files in the same subdirectory irked me. Can we choose something to prevent potential multiple renames of files? Which file is at risk of being renamed multiple times and what would be the problem? I am not worried about a particular file, but the case of having all files in one directory conforming to one standard, and deciding later to adapt a global standard. I think deciding on a particular format now would save trouble down the road. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 4/5] x86: fix rNmp macros with nasm
On Sun, Aug 5, 2012 at 7:36 PM, Mans Rullgard m...@mansr.com wrote: For some reason, nasm requires this. No harm done to yasm. Signed-off-by: Mans Rullgard m...@mansr.com --- libavutil/x86/x86inc.asm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Has this been synced with x264? ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 07/45] x86: mmx2 --- mmxext in asm constructs
On Sun, Aug 5, 2012 at 10:20 AM, Ronald S. Bultje rsbul...@gmail.com wrote: Plus, I didn't say it was a good idea, I said I could live with it if others want it. Right now, it seems others (i.e. Loren) don't. FYI, it makes my life harder. Also I agree with Loren. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/2] h264: convert 8-bit qpel inlined assembly to yasm
As it turns out, the qpel functions use stuff from dsputil. I'll just rewrite dsputil while I'm at it, so this will be somewhat delayed. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/2] H264: Remove 3dnow qpel code.
On Sat, Aug 4, 2012 at 1:10 PM, Diego Biurrun di...@biurrun.de wrote: On Thu, Aug 02, 2012 at 12:30:48AM -0500, daniel.d.k...@gmail.com wrote: --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -198,12 +198,14 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; #define DEF(x) x ## _3dnow #define PAVGB pavgusb #define OP_AVG PAVGB +#define IS_3DNOW #include dsputil_mmx_avg_template.c #undef DEF #undef PAVGB #undef OP_AVG +#undef IS_3DNOW IS_3DNOW supposedly stands for - what? SKIP_FOR_3DNOW would be a much more sensible name IMO. Fixed locally. Will resend once I port the rest of the parts of dsputil I need to. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/2] h264: convert 8-bit qpel inlined assembly to yasm
On Wed, Aug 1, 2012 at 1:03 AM, Loren Merritt lor...@u.washington.eduwrote: On Wed, 1 Aug 2012, Luca Barbato wrote: +%macro OP_MOVH_MMX 3 +movh %3, %2 +pavgb %1, %3 +movh %2, %1 +%endmacro + +%macro MOVH_MMX 3 +movh %2, %1 +%endmacro + +%macro OP_MOV_MMX 3 +mova %3, %2 +pavgb %1, %3 +mova %2, %1 pavgb %1, %2 mova %2, %1 (Just for the full width one) Fixed locally. +%endmacro + +%macro MOV_MMX 3 +mova %2, %1 +%endmacro It's op_put vs op_avg (or mov vs avg), not mov vs op_mov. Plus, naming them put vs avg would allow you to exploit the same put vs avg that's already in all the function names, rather than a separate %define OP. Fixed locally. +%macro QPEL8OR16_V_LOWPASS_OP 1 +cglobal %1_h264_qpel8or16_v_lowpass, 5,5,7 ; dst, src, dstStride, srcStride, h +%if cpuflag(sse2) +sub r1, r3 +sub r1, r3 +%endif +pxor m7, m7 +movh m0, [r1] +movh m1, [r1+r3] +lea r1, [r1+2*r3] +movh m2, [r1] +movh m3, [r1+r3] +lea r1, [r1+2*r3] +movh m4, [r1] +add r1, r3 +punpcklbw m0, m7 +punpcklbw m1, m7 +punpcklbw m2, m7 +punpcklbw m3, m7 +punpcklbw m4, m7 +FILT_V +FILT_V +FILT_V +FILT_V +FILT_V +FILT_V +FILT_V +FILT_V +cmp r4d, 16 +jne .end +FILT_V +FILT_V +FILT_V +FILT_V +FILT_V +FILT_V +FILT_V +FILT_V +.end: +RET +%endmacro (and other cases of this) REP_RET Fixed locally. ... and I'll skip the suggestions for improvement, since Daniel Kang has a separate branch for that. Thank you. I'll get to rewriting the qpel eventually. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/2] h264: convert 8-bit qpel inlined assembly to yasm
On Wed, Aug 1, 2012 at 5:22 AM, Måns Rullgård m...@mansr.com wrote: Daniel Kang daniel.d.k...@gmail.com writes: Can I have access to a setup that doesn't have inlined assembly? --extra-cflags=-D__asm__=error should make it fail nicely. This gives me bizarre errors like: /usr/include/x86_64-linux-gnu/asm/posix_types_64.h:25:14: error: storage class specified for parameter ‘__kernel_clockid_t’ Is there a way to just disable inlined assembly in libav? ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/2] H.264: Convert 8-bit qpel inlined assembly to yasm
On Thu, Aug 2, 2012 at 1:30 AM, daniel.d.k...@gmail.com wrote: From: Daniel Kang daniel.d.k...@gmail.com --- libavcodec/x86/Makefile |3 +- libavcodec/x86/dsputil_mmx.c| 81 +--- libavcodec/x86/dsputil_yasm.asm | 42 ++ libavcodec/x86/h264_qpel.asm| 850 +++ libavcodec/x86/h264_qpel_mmx.c | 946 --- 5 files changed, 984 insertions(+), 938 deletions(-) create mode 100644 libavcodec/x86/h264_qpel.asm Compiles and passes fate-h264 for me with --disable-yasm and with yasm. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/2] h264: remove 3dnow qpel code
On Wed, Aug 1, 2012 at 12:57 AM, Loren Merritt lor...@u.washington.edu wrote: On Wed, 1 Aug 2012, Luca Barbato wrote: From: Daniel Kang daniel.d.k...@gmail.com Remove the code to ease porting the other qpel optimizations to yasm. AMD has deprecated 3dnow and the only CPUs that have 3dnow and do not have mmxext are 12 years old. libavcodec/x86/dsputil_mmx_avg_template.c:58:1: warning: `put_pixels4_l2_3dnow' defined but not used libavcodec/x86/dsputil_mmx_avg_template.c:229:1: warning: `avg_pixels4_l2_3dnow' defined but not used libavcodec/x86/dsputil_mmx_avg_template.c:875:1: warning: `avg_pixels4_3dnow' defined but not used Those errors are fairly difficult to fix. Whoever wrote the original inlined assembly made it very hard to disable only certain functions at specific CPU types. Which would you prefer, a hack-ish workaround or to wait until I port the rest over to yasm? ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/2] h264: remove 3dnow qpel code
On Wed, Aug 1, 2012 at 1:03 AM, Daniel Kang daniel.d.k...@gmail.com wrote: On Wed, Aug 1, 2012 at 12:57 AM, Loren Merritt lor...@u.washington.eduwrote: libavcodec/x86/dsputil_mmx_avg_template.c:58:1: warning: `put_pixels4_l2_3dnow' defined but not used libavcodec/x86/dsputil_mmx_avg_template.c:229:1: warning: `avg_pixels4_l2_3dnow' defined but not used libavcodec/x86/dsputil_mmx_avg_template.c:875:1: warning: `avg_pixels4_3dnow' defined but not used Those errors are fairly difficult to fix. Whoever wrote the original inlined assembly made it very hard to disable only certain functions at specific CPU types. Which would you prefer, a hack-ish workaround or to wait until I port the rest over to yasm? I'm thinking something along the lines of: diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 47d99f9..126a0b9 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -198,12 +198,14 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; #define DEF(x) x ## _3dnow #define PAVGB pavgusb #define OP_AVG PAVGB +#define IS_3DNOW #include dsputil_mmx_avg_template.c #undef DEF #undef PAVGB #undef OP_AVG +#undef IS_3DNOW /***/ /* MMX2 specific */ diff --git a/libavcodec/x86/dsputil_mmx_avg_template.c b/libavcodec/x86/dsputil_mmx_avg_template.c index 8b116b7..29d0e51 100644 --- a/libavcodec/x86/dsputil_mmx_avg_template.c +++ b/libavcodec/x86/dsputil_mmx_avg_template.c @@ -55,6 +55,7 @@ static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_ :%REG_a, memory); } +#ifndef IS_3DNOW static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) { __asm__ volatile( @@ -104,7 +105,7 @@ static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int :S((x86_reg)src1Stride), D((x86_reg)dstStride) :memory); } - +#endif static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) { @@ 
-226,6 +227,7 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src :memory);*/ } +#ifndef IS_3DNOW static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) { __asm__ volatile( @@ -276,7 +278,7 @@ static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int :S((x86_reg)src1Stride), D((x86_reg)dstStride) :memory); } - +#endif static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) { @@ -872,6 +874,7 @@ static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line :%REG_a, memory); } +#ifndef IS_3DNOW static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { do { @@ -896,6 +899,7 @@ static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_siz h -= 4; } while(h 0); } +#endif //FIXME the following could be optimized too ... static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/2] h264: convert 8-bit qpel inlined assembly to yasm
On Wed, Aug 1, 2012 at 12:28 AM, Ronald S. Bultje rsbul...@gmail.comwrote: Hi, On Tue, Jul 31, 2012 at 9:12 PM, Luca Barbato lu_z...@gentoo.org wrote: @@ -2879,10 +2879,9 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, int mm_flags) { const int bit_depth = avctx-bits_per_raw_sample; - -#if HAVE_INLINE_ASM const int high_bit_depth = bit_depth 8; +#if HAVE_INLINE_ASM if (!(mm_flags AV_CPU_FLAG_SSE2SLOW)) { // these functions are slower than mmx on AMD, but faster on Intel if (!high_bit_depth) { @@ -2893,7 +2892,9 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, H264_QPEL_FUNCS(0, 0, sse2); } } +#endif /* HAVE_INLINE_ASM */ The H264_QPEL_FUNCS() call should go outside HAVE_INLINE_ASM. Fixed locally. +;* +;* MMX/SSE2/SSSE3-optimized H.264 qpel code +;* +;* Copyright (C) 2012 Daniel Kang +;* +;* Authors: Daniel Kang daniel.d.k...@gmail.com You told me you based this substantially on the existing implementation; if that's the case, you should acknowledge the original author(s) of that code also. Fixed locally. Can I have access to a setup that doesn't have inlined assembly? ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] H.264: Convert 8-bit qpel inlined assembly to yasm
0.1% slower, probably due to alignment issues and a very small amount of calling overhead. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] H264: Remove 3dnow qpel code.
From: Daniel Kang daniel.d.k...@gmail.com The only CPUs that have 3dnow and don't have mmxext are 12 years old. --- libavcodec/x86/dsputil_mmx.c |9 - libavcodec/x86/h264_qpel_mmx.c |4 2 files changed, 0 insertions(+), 13 deletions(-) diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 5eb4a24..f7dbb0b 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -2783,15 +2783,6 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx, SET_QPEL_FUNCS(avg_qpel,0, 16, 3dnow, ); SET_QPEL_FUNCS(avg_qpel,1, 8, 3dnow, ); -if (!high_bit_depth) { -SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, ); -SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, ); -SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, ); -SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, ); -SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, ); -SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, ); -} - SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, ); SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, ); SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, ); diff --git a/libavcodec/x86/h264_qpel_mmx.c b/libavcodec/x86/h264_qpel_mmx.c index 85ae07e..cd7ea99 100644 --- a/libavcodec/x86/h264_qpel_mmx.c +++ b/libavcodec/x86/h264_qpel_mmx.c @@ -1161,9 +1161,6 @@ QPEL(put_, 16,XMM, 16)\ QPEL(avg_, 8, XMM, 16)\ QPEL(avg_, 16,XMM, 16)\ -#define PAVGB pavgusb -QPEL_H264(put_, PUT_OP, 3dnow) -QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow) #undef PAVGB #define PAVGB pavgb QPEL_H264(put_, PUT_OP, mmx2) @@ -1182,7 +1179,6 @@ QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3) #endif #undef PAVGB -H264_MC_4816(3dnow) H264_MC_4816(mmx2) H264_MC_816(H264_MC_V, sse2) H264_MC_816(H264_MC_HV, sse2) -- 1.7.7.3 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] H264: Remove 3dnow qpel code.
On Mon, Jul 23, 2012 at 5:21 PM, Diego Biurrun di...@biurrun.de wrote: On Mon, Jul 23, 2012 at 05:12:23PM -0700, Daniel Kang wrote: From: Daniel Kang daniel.d.k...@gmail.com The only CPUs that have 3dnow and don't have mmxext are 12 years old. --- libavcodec/x86/dsputil_mmx.c |9 - libavcodec/x86/h264_qpel_mmx.c |4 2 files changed, 0 insertions(+), 13 deletions(-) What sort of maintenance burden does this relieve us from? I'm writing this mail on a system fitting the description you mention, my trusty old K6-III. 1. Decreases binary size. 2. We don't support Windows ME (12 years old), I don't see a reason to keep this cruft. 3. 3dnow becomes a pain when I'm trying to port code to yasm. 4. You can probably decode 260p H.264 with a K6-III. Who seriously would use this? ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] VP8: Implement sliced threading.
On Thu, Jul 12, 2012 at 1:07 PM, Daniel Kang daniel.d.k...@gmail.comwrote: Testing gives 25-30% gain on HD clips with two threads and up to 50% gain with eight threads. Sliced threading uses more memory than single or frame threading. --- libavcodec/vp8.c | 514 ++ libavcodec/vp8.h | 63 --- 2 files changed, 399 insertions(+), 178 deletions(-) This should have addressed all of Luca's and Diego's comments. Ping? ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] VP8: Implement sliced threading.
On Fri, Jul 13, 2012 at 10:34 AM, Luca Barbato lu_z...@gentoo.org wrote: This should have addressed all of Luca's and Diego's comments. Ping? Looks fine for me, shall we push it in 4 hours? If no one else has objections, fine by me. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] VP8: Implement sliced threading.
Testing gives 25-30% gain on HD clips with two threads and up to 50% gain with eight threads. Sliced threading uses more memory than single or frame threading. --- libavcodec/vp8.c | 514 ++ libavcodec/vp8.h | 63 --- 2 files changed, 399 insertions(+), 178 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 2181976..936c16a 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -4,6 +4,7 @@ * Copyright (C) 2010 David Conrad * Copyright (C) 2010 Ronald S. Bultje * Copyright (C) 2010 Jason Garrett-Glaser + * Copyright (C) 2012 Daniel Kang * * This file is part of Libav. * @@ -30,17 +31,28 @@ #include rectangle.h #include thread.h +#if HAVE_PTHREADS +#include pthread.h +#elif HAVE_W32THREADS +#include w32pthreads.h +#endif + #if ARCH_ARM # include arm/vp8.h #endif static void free_buffers(VP8Context *s) { +int i; +if (s-thread_data) +for (i = 0; i MAX_THREADS; i++) { +av_freep(s-thread_data[i].filter_strength); +av_freep(s-thread_data[i].edge_emu_buffer); +} +av_freep(s-thread_data); av_freep(s-macroblocks_base); -av_freep(s-filter_strength); av_freep(s-intra4x4_pred_mode_top); av_freep(s-top_nnz); -av_freep(s-edge_emu_buffer); av_freep(s-top_border); s-macroblocks = NULL; @@ -108,6 +120,9 @@ static void vp8_decode_flush(AVCodecContext *avctx) static int update_dimensions(VP8Context *s, int width, int height) { +AVCodecContext *avctx = s-avctx; +int i; + if (width != s-avctx-width || height != s-avctx-height) { if (av_image_check_size(width, height, 0, s-avctx)) @@ -121,14 +136,25 @@ static int update_dimensions(VP8Context *s, int width, int height) s-mb_width = (s-avctx-coded_width +15) / 16; s-mb_height = (s-avctx-coded_height+15) / 16; -s-macroblocks_base= av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks)); -s-filter_strength = av_mallocz(s-mb_width*sizeof(*s-filter_strength)); -s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4); -s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz)); -s-top_border = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); +s-mb_layout = (avctx-active_thread_type == FF_THREAD_SLICE) (FFMIN(s-num_coeff_partitions, avctx-thread_count) 1); +if (s-mb_layout == 0) { // Frame threading and one thread +s-macroblocks_base = av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks)); +s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4); +} +else // Sliced threading +s-macroblocks_base = av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks)); +s-top_nnz= av_mallocz(s-mb_width*sizeof(*s-top_nnz)); +s-top_border = av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); +s-thread_data= av_mallocz(MAX_THREADS*sizeof(VP8ThreadData)); -if (!s-macroblocks_base || !s-filter_strength || !s-intra4x4_pred_mode_top || -!s-top_nnz || !s-top_border) +for (i = 0; i MAX_THREADS; i++) { +s-thread_data[i].filter_strength = av_mallocz(s-mb_width*sizeof(*s-thread_data[0].filter_strength)); +pthread_mutex_init(s-thread_data[i].lock, NULL); +pthread_cond_init(s-thread_data[i].cond, NULL); +} + +if (!s-macroblocks_base || !s-top_nnz || !s-top_border || +(!s-intra4x4_pred_mode_top s-mb_layout == 0)) return AVERROR(ENOMEM); s-macroblocks= s-macroblocks_base + 1; @@ -332,12 +358,6 @@ static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size) memset(s-segmentation, 0, sizeof(s-segmentation)); } -if (!s-macroblocks_base || /* first frame */ -width != s-avctx-width || height != s-avctx-height) { -if ((ret = update_dimensions(s, width, height)) 0) -return ret; -} - ff_vp56_init_range_decoder(c, buf, header_size); buf += header_size; buf_size -= header_size; @@ -366,6 +386,12 @@ static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size) return AVERROR_INVALIDDATA; } +if (!s-macroblocks_base || /* first frame */ +width != s-avctx-width || height != s-avctx-height) { +if ((ret = update_dimensions(s, width, height)) 0) +return ret; +} + get_quants(s); if (!s-keyframe) { @@ -468,19 +494,26 @@ const uint8_t 
*get_submv_prob(uint32_t left, uint32_t top) * @returns the number of motion vectors parsed (2, 4 or 16) */ static av_always_inline -int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb) +int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int layout) { int part_idx; int n, num; -VP8Macroblock *top_mb = mb[2]; +VP8Macroblock *top_mb
Re: [libav-devel] [PATCH 3/3] VP8: Implement sliced threading.
On Thu, Jul 12, 2012 at 12:27 PM, Måns Rullgård m...@mansr.com wrote: Luca Barbato lu_z...@gentoo.org writes: On 07/12/2012 09:42 AM, Måns Rullgård wrote: Are you still increasing the memory usage by a huge amount? If so, I'm a bit concerned about how that will affect performance on systems with relatively small caches. Not sure if those systems would use slice threading, single thread and frame threading should keep the previous memory usage. It was my understanding that some version of this patch increased memory usage for all modes. If that is no longer the case, great. This is no longer the case. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] VP8: Implement sliced threading.
Testing gives 25-30% gain on HD clips with two threads and up to 50% gain with eight threads. Sliced threading uses more memory than single or frame threading. --- libavcodec/vp8.c | 514 ++ libavcodec/vp8.h | 63 --- 2 files changed, 399 insertions(+), 178 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 2181976..ee954fc 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -4,6 +4,7 @@ * Copyright (C) 2010 David Conrad * Copyright (C) 2010 Ronald S. Bultje * Copyright (C) 2010 Jason Garrett-Glaser + * Copyright (C) 2012 Daniel Kang * * This file is part of Libav. * @@ -30,17 +31,28 @@ #include rectangle.h #include thread.h +#if HAVE_PTHREADS +#include pthread.h +#elif HAVE_W32THREADS +#include w32pthreads.h +#endif + #if ARCH_ARM # include arm/vp8.h #endif static void free_buffers(VP8Context *s) { +int i; +if (s-thread_data) +for (i = 0; i MAX_THREADS; i++) { +av_freep(s-thread_data[i].filter_strength); +av_freep(s-thread_data[i].edge_emu_buffer); +} +av_freep(s-thread_data); av_freep(s-macroblocks_base); -av_freep(s-filter_strength); av_freep(s-intra4x4_pred_mode_top); av_freep(s-top_nnz); -av_freep(s-edge_emu_buffer); av_freep(s-top_border); s-macroblocks = NULL; @@ -108,6 +120,9 @@ static void vp8_decode_flush(AVCodecContext *avctx) static int update_dimensions(VP8Context *s, int width, int height) { +AVCodecContext *avctx = s-avctx; +int i; + if (width != s-avctx-width || height != s-avctx-height) { if (av_image_check_size(width, height, 0, s-avctx)) @@ -121,14 +136,25 @@ static int update_dimensions(VP8Context *s, int width, int height) s-mb_width = (s-avctx-coded_width +15) / 16; s-mb_height = (s-avctx-coded_height+15) / 16; -s-macroblocks_base= av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks)); -s-filter_strength = av_mallocz(s-mb_width*sizeof(*s-filter_strength)); -s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4); -s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz)); -s-top_border = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); +s-mb_layout = (avctx-active_thread_type == FF_THREAD_SLICE) (FFMIN(s-num_coeff_partitions, avctx-thread_count) 1); +if (!s-mb_layout) { // Frame threading and one thread +s-macroblocks_base = av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks)); +s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4); +} +else // Sliced threading +s-macroblocks_base = av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks)); +s-top_nnz= av_mallocz(s-mb_width*sizeof(*s-top_nnz)); +s-top_border = av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); +s-thread_data= av_mallocz(MAX_THREADS*sizeof(VP8ThreadData)); -if (!s-macroblocks_base || !s-filter_strength || !s-intra4x4_pred_mode_top || -!s-top_nnz || !s-top_border) +for (i = 0; i MAX_THREADS; i++) { +s-thread_data[i].filter_strength = av_mallocz(s-mb_width*sizeof(*s-thread_data[0].filter_strength)); +pthread_mutex_init(s-thread_data[i].lock, NULL); +pthread_cond_init(s-thread_data[i].cond, NULL); +} + +if (!s-macroblocks_base || !s-top_nnz || !s-top_border || +(!s-intra4x4_pred_mode_top !s-mb_layout)) return AVERROR(ENOMEM); s-macroblocks= s-macroblocks_base + 1; @@ -332,12 +358,6 @@ static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size) memset(s-segmentation, 0, sizeof(s-segmentation)); } -if (!s-macroblocks_base || /* first frame */ -width != s-avctx-width || height != s-avctx-height) { -if ((ret = update_dimensions(s, width, height)) 0) -return ret; -} - ff_vp56_init_range_decoder(c, buf, header_size); buf += header_size; buf_size -= header_size; @@ -366,6 +386,12 @@ static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size) return AVERROR_INVALIDDATA; } +if (!s-macroblocks_base || /* first frame */ +width != s-avctx-width || height != s-avctx-height) { +if ((ret = update_dimensions(s, width, height)) 0) +return ret; +} + get_quants(s); if (!s-keyframe) { @@ -468,19 +494,26 @@ const uint8_t *get_submv_prob(uint32_t 
left, uint32_t top) * @returns the number of motion vectors parsed (2, 4 or 16) */ static av_always_inline -int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb) +int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int layout) { int part_idx; int n, num; -VP8Macroblock *top_mb = mb[2]; +VP8Macroblock *top_mb
[libav-devel] [PATCH 1/3] VP8: Refactor decoding a single mb_row.
This is in preperation for sliced threading. --- libavcodec/vp8.c | 164 -- 1 files changed, 86 insertions(+), 78 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 94200f6..8ebc445 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -1574,11 +1574,95 @@ static void release_queued_segmaps(VP8Context *s, int is_close) s-maps_are_invalid = 0; } +#define MARGIN (16 2) +static void vp8_decode_mb_row(AVCodecContext *avctx, AVFrame *curframe, + AVFrame *prev_frame, int mb_y) +{ +VP8Context *s = avctx-priv_data; +VP56RangeCoder *c = s-coeff_partition[mb_y (s-num_coeff_partitions-1)]; +VP8Macroblock *mb = s-macroblocks + (s-mb_height - mb_y - 1)*2; +int i, y, mb_x, mb_xy = mb_y*s-mb_width; +uint8_t *dst[3] = { +curframe-data[0] + 16*mb_y*s-linesize, +curframe-data[1] + 8*mb_y*s-uvlinesize, +curframe-data[2] + 8*mb_y*s-uvlinesize +}; + +memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock +memset(s-left_nnz, 0, sizeof(s-left_nnz)); +AV_WN32A(s-intra4x4_pred_mode_left, DC_PRED*0x01010101); + +// left edge of 129 for intra prediction +if (!(avctx-flags CODEC_FLAG_EMU_EDGE)) { +for (i = 0; i 3; i++) +for (y = 0; y 16!!i; y++) +dst[i][y*curframe-linesize[i]-1] = 129; +if (mb_y == 1) // top left edge is also 129 +s-top_border[0][15] = s-top_border[0][23] = s-top_border[0][31] = 129; +} + +s-mv_min.x = -MARGIN; +s-mv_max.x = ((s-mb_width - 1) 6) + MARGIN; + +for (mb_x = 0; mb_x s-mb_width; mb_x++, mb_xy++, mb++) { +/* Prefetch the current frame, 4 MBs ahead */ +s-dsp.prefetch(dst[0] + (mb_x3)*4*s-linesize + 64, s-linesize, 4); +s-dsp.prefetch(dst[1] + (mb_x7)*s-uvlinesize + 64, dst[2] - dst[1], 2); + +decode_mb_mode(s, mb, mb_x, mb_y, curframe-ref_index[0] + mb_xy, + prev_frame prev_frame-ref_index[0] ? 
prev_frame-ref_index[0] + mb_xy : NULL); + +prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS); + +if (!mb-skip) +decode_mb_coeffs(s, c, mb, s-top_nnz[mb_x], s-left_nnz); + +if (mb-mode = MODE_I4x4) +intra_predict(s, dst, mb, mb_x, mb_y); +else +inter_predict(s, dst, mb, mb_x, mb_y); + +prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN); + +if (!mb-skip) { +idct_mb(s, dst, mb); +} else { +AV_ZERO64(s-left_nnz); +AV_WN64(s-top_nnz[mb_x], 0); // array of 9, so unaligned + +// Reset DC block predictors if they would exist if the mb had coefficients +if (mb-mode != MODE_I4x4 mb-mode != VP8_MVMODE_SPLIT) { +s-left_nnz[8] = 0; +s-top_nnz[mb_x][8] = 0; +} +} + +if (s-deblock_filter) +filter_level_for_mb(s, mb, s-filter_strength[mb_x]); + +prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2); + +dst[0] += 16; +dst[1] += 8; +dst[2] += 8; +s-mv_min.x -= 64; +s-mv_max.x -= 64; +} +if (s-deblock_filter) { +if (s-filter.simple) +filter_mb_row_simple(s, curframe, mb_y); +else +filter_mb_row(s, curframe, mb_y); +} +s-mv_min.y -= 64; +s-mv_max.y -= 64; +} + static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPacket *avpkt) { VP8Context *s = avctx-priv_data; -int ret, mb_x, mb_y, i, y, referenced; +int ret, mb_y, i, referenced; enum AVDiscard skip_thresh; AVFrame *av_uninit(curframe), *prev_frame; @@ -1686,90 +1770,14 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, if (s-keyframe) memset(s-intra4x4_pred_mode_top, DC_PRED, s-mb_width*4); -#define MARGIN (16 2) s-mv_min.y = -MARGIN; s-mv_max.y = ((s-mb_height - 1) 6) + MARGIN; for (mb_y = 0; mb_y s-mb_height; mb_y++) { -VP56RangeCoder *c = s-coeff_partition[mb_y (s-num_coeff_partitions-1)]; -VP8Macroblock *mb = s-macroblocks + (s-mb_height - mb_y - 1)*2; -int mb_xy = mb_y*s-mb_width; -uint8_t *dst[3] = { -curframe-data[0] + 16*mb_y*s-linesize, -curframe-data[1] + 8*mb_y*s-uvlinesize, -curframe-data[2] + 8*mb_y*s-uvlinesize -}; - -memset(mb - 
1, 0, sizeof(*mb)); // zero left macroblock -memset(s-left_nnz, 0, sizeof(s-left_nnz)); -AV_WN32A(s-intra4x4_pred_mode_left, DC_PRED*0x01010101); - -// left edge of 129 for intra prediction -if (!(avctx-flags CODEC_FLAG_EMU_EDGE)) { -for (i = 0; i 3; i++) -for (y = 0; y 16!!i; y++) -dst[i][y*curframe-linesize[i]-1] = 129; -if (mb_y
[libav-devel] [PATCH 2/3] VP8: Move data from VP8Context->VP8Macroblock
In preparation for sliced threading. --- libavcodec/vp8.c | 25 + libavcodec/vp8.h |7 --- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 8ebc445..2181976 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -622,10 +622,11 @@ void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y) } static av_always_inline -void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, +void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int mb_x, int keyframe) { -uint8_t *intra4x4 = s-intra4x4_pred_mode_mb; +uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb; + if (keyframe) { int x, y; uint8_t* const top = s-intra4x4_pred_mode_top + 4 * mb_x; @@ -655,7 +656,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s-prob-segmentid); else if (s-segmentation.enabled) *segment = ref ? *ref : *segment; -s-segment = *segment; +mb-segment = *segment; mb-skip = s-mbskip_enabled ? 
vp56_rac_get_prob(c, s-prob-mbskip) : 0; @@ -663,14 +664,14 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra); if (mb-mode == MODE_I4x4) { -decode_intra4x4_modes(s, c, mb_x, 1); +decode_intra4x4_modes(s, c, mb, mb_x, 1); } else { const uint32_t modes = vp8_pred4x4_mode[mb-mode] * 0x01010101u; AV_WN32A(s-intra4x4_pred_mode_top + 4 * mb_x, modes); AV_WN32A(s-intra4x4_pred_mode_left, modes); } -s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra); +mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra); mb-ref_frame = VP56_FRAME_CURRENT; } else if (vp56_rac_get_prob_branchy(c, s-prob-intra)) { // inter MB, 16.2 @@ -688,9 +689,9 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s-prob-pred16x16); if (mb-mode == MODE_I4x4) -decode_intra4x4_modes(s, c, mb_x, 0); +decode_intra4x4_modes(s, c, mb, mb_x, 0); -s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s-prob-pred8x8c); +mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s-prob-pred8x8c); mb-ref_frame = VP56_FRAME_CURRENT; mb-partitioning = VP8_SPLITMVMODE_NONE; AV_ZERO32(mb-bmv[0]); @@ -791,7 +792,7 @@ void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, { int i, x, y, luma_start = 0, luma_ctx = 3; int nnz_pred, nnz, nnz_total = 0; -int segment = s-segment; +int segment = mb-segment; int block_dc = 0; if (mb-mode != MODE_I4x4 mb-mode != VP8_MVMODE_SPLIT) { @@ -1002,7 +1003,7 @@ void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb, s-hpc.pred16x16[mode](dst[0], s-linesize); } else { uint8_t *ptr = dst[0]; -uint8_t *intra4x4 = s-intra4x4_pred_mode_mb; +uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb; uint8_t tr_top[4] = { 127, 127, 127, 127 }; // all blocks on the right edge of the macroblock use 
bottom edge @@ -1087,9 +1088,9 @@ void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb, } if (avctx-flags CODEC_FLAG_EMU_EDGE) { -mode = check_intra_pred8x8_mode_emuedge(s-chroma_pred_mode, mb_x, mb_y); +mode = check_intra_pred8x8_mode_emuedge(mb-chroma_pred_mode, mb_x, mb_y); } else { -mode = check_intra_pred8x8_mode(s-chroma_pred_mode, mb_x, mb_y); +mode = check_intra_pred8x8_mode(mb-chroma_pred_mode, mb_x, mb_y); } s-hpc.pred8x8[mode](dst[1], s-uvlinesize); s-hpc.pred8x8[mode](dst[2], s-uvlinesize); @@ -1408,7 +1409,7 @@ static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *m int interior_limit, filter_level; if (s-segmentation.enabled) { -filter_level = s-segmentation.filter_level[s-segment]; +filter_level = s-segmentation.filter_level[mb-segment]; if (!s-segmentation.absolute_vals) filter_level += s-filter.level; } else diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h index a738cb7..2f2cb80 100644 --- a/libavcodec/vp8.h +++ b/libavcodec/vp8.h @@ -79,6 +79,10 @@ typedef struct { uint8_t mode; uint8_t ref_frame; uint8_t partitioning; +uint8_t chroma_pred_mode; +uint8_t segment; +uint8_t intra4x4_pred_mode_mb[16]; +uint8_t intra4x4_pred_mode_top[4]; VP56mv mv; VP56mv bmv[16]; } VP8Macroblock; @@ -97,8 +101,6 @@ typedef struct { uint8_t keyframe;
[libav-devel] [PATCH 3/3] VP8: Implement sliced threading.
Testing gives 25-30% gain on HD clips with two threads and up to 50% gain with eight threads. Sliced threading uses more memory than single or frame threading. --- libavcodec/vp8.c | 521 ++ libavcodec/vp8.h | 63 --- 2 files changed, 407 insertions(+), 177 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 2181976..756714e 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -4,6 +4,7 @@ * Copyright (C) 2010 David Conrad * Copyright (C) 2010 Ronald S. Bultje * Copyright (C) 2010 Jason Garrett-Glaser + * Copyright (C) 2012 Daniel Kang * * This file is part of Libav. * @@ -22,6 +23,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include pthread.h + #include libavutil/imgutils.h #include avcodec.h #include internal.h @@ -36,11 +39,16 @@ static void free_buffers(VP8Context *s) { +int i; +if (s-thread_data) +for (i = 0; i MAX_THREADS; i++) { +av_freep(s-thread_data[i].filter_strength); +av_freep(s-thread_data[i].edge_emu_buffer); +} +av_freep(s-thread_data); av_freep(s-macroblocks_base); -av_freep(s-filter_strength); av_freep(s-intra4x4_pred_mode_top); av_freep(s-top_nnz); -av_freep(s-edge_emu_buffer); av_freep(s-top_border); s-macroblocks = NULL; @@ -108,6 +116,9 @@ static void vp8_decode_flush(AVCodecContext *avctx) static int update_dimensions(VP8Context *s, int width, int height) { +AVCodecContext *avctx = s-avctx; +int i; + if (width != s-avctx-width || height != s-avctx-height) { if (av_image_check_size(width, height, 0, s-avctx)) @@ -121,14 +132,25 @@ static int update_dimensions(VP8Context *s, int width, int height) s-mb_width = (s-avctx-coded_width +15) / 16; s-mb_height = (s-avctx-coded_height+15) / 16; -s-macroblocks_base= av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks)); -s-filter_strength = av_mallocz(s-mb_width*sizeof(*s-filter_strength)); -s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4); -s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz)); -s-top_border = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); +s-mlayout = (avctx-active_thread_type == FF_THREAD_SLICE) (FFMIN(s-num_coeff_partitions, avctx-thread_count) 1); +if (s-mlayout == 0) { // Frame threading and one thread +s-macroblocks_base = av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks)); +s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4); +} +else // Sliced threading +s-macroblocks_base = av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks)); +s-top_nnz= av_mallocz(s-mb_width*sizeof(*s-top_nnz)); +s-top_border = av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); +s-thread_data= av_mallocz(MAX_THREADS*sizeof(VP8ThreadData)); -if (!s-macroblocks_base || !s-filter_strength || !s-intra4x4_pred_mode_top || -!s-top_nnz || !s-top_border) +for (i = 0; i MAX_THREADS; i++) { +s-thread_data[i].filter_strength = av_mallocz(s-mb_width*sizeof(*s-thread_data[0].filter_strength)); +//pthread_mutex_init(s-thread_data[i]-lock, NULL); +//pthread_cond_init(s-thread_data[i]-cond, NULL); +} + +if (!s-macroblocks_base || !s-top_nnz || !s-top_border || +(!s-intra4x4_pred_mode_top s-mlayout == 0)) return AVERROR(ENOMEM); s-macroblocks= s-macroblocks_base + 1; @@ -332,12 +354,6 @@ static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size) memset(s-segmentation, 0, sizeof(s-segmentation)); } -if (!s-macroblocks_base || /* first frame */ -width != s-avctx-width || height != s-avctx-height) { -if ((ret = update_dimensions(s, width, height)) 0) -return ret; -} - ff_vp56_init_range_decoder(c, buf, header_size); buf += header_size; buf_size -= header_size; @@ -366,6 +382,12 @@ static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size) return AVERROR_INVALIDDATA; } +if (!s-macroblocks_base || /* first frame */ +width != s-avctx-width || height != s-avctx-height) { +if ((ret = update_dimensions(s, width, height)) 0) +return ret; +} + get_quants(s); if (!s-keyframe) { @@ -468,19 +490,26 @@ const uint8_t 
*get_submv_prob(uint32_t left, uint32_t top) * @returns the number of motion vectors parsed (2, 4 or 16) */ static av_always_inline -int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb) +int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int layout) { int part_idx; int n, num; -VP8Macroblock *top_mb = mb[2]; +VP8Macroblock
[libav-devel] [PATCH] VP8: Implement sliced threading.
Testing gives 25-30% gain on HD clips with two threads and up to 50% gain with eight threads. Sliced threading uses more memory than single or frame threading. --- libavcodec/vp8.c | 526 ++ libavcodec/vp8.h | 63 --- 2 files changed, 412 insertions(+), 177 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 2181976..a0040d0 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -4,6 +4,7 @@ * Copyright (C) 2010 David Conrad * Copyright (C) 2010 Ronald S. Bultje * Copyright (C) 2010 Jason Garrett-Glaser + * Copyright (C) 2012 Daniel Kang * * This file is part of Libav. * @@ -22,6 +23,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ + #include libavutil/imgutils.h #include avcodec.h #include internal.h @@ -30,17 +32,28 @@ #include rectangle.h #include thread.h +#if HAVE_PTHREADS +#include pthread.h +#elif HAVE_W32THREADS +#include w32pthreads.h +#endif + #if ARCH_ARM # include arm/vp8.h #endif static void free_buffers(VP8Context *s) { +int i; +if (s-thread_data) +for (i = 0; i MAX_THREADS; i++) { +av_freep(s-thread_data[i].filter_strength); +av_freep(s-thread_data[i].edge_emu_buffer); +} +av_freep(s-thread_data); av_freep(s-macroblocks_base); -av_freep(s-filter_strength); av_freep(s-intra4x4_pred_mode_top); av_freep(s-top_nnz); -av_freep(s-edge_emu_buffer); av_freep(s-top_border); s-macroblocks = NULL; @@ -108,6 +121,9 @@ static void vp8_decode_flush(AVCodecContext *avctx) static int update_dimensions(VP8Context *s, int width, int height) { +AVCodecContext *avctx = s-avctx; +int i; + if (width != s-avctx-width || height != s-avctx-height) { if (av_image_check_size(width, height, 0, s-avctx)) @@ -121,14 +137,25 @@ static int update_dimensions(VP8Context *s, int width, int height) s-mb_width = (s-avctx-coded_width +15) / 16; s-mb_height = (s-avctx-coded_height+15) / 16; -s-macroblocks_base= av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks)); -s-filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-filter_strength)); -s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4); -s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz)); -s-top_border = av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); +s-mlayout = (avctx-active_thread_type == FF_THREAD_SLICE) (FFMIN(s-num_coeff_partitions, avctx-thread_count) 1); +if (s-mlayout == 0) { // Frame threading and one thread +s-macroblocks_base = av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks)); +s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4); +} +else // Sliced threading +s-macroblocks_base = av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks)); +s-top_nnz= av_mallocz(s-mb_width*sizeof(*s-top_nnz)); +s-top_border = av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); +s-thread_data= av_mallocz(MAX_THREADS*sizeof(VP8ThreadData)); + +for (i = 0; i MAX_THREADS; i++) { +s-thread_data[i].filter_strength = av_mallocz(s-mb_width*sizeof(*s-thread_data[0].filter_strength)); +//pthread_mutex_init(s-thread_data[i]-lock, NULL); +//pthread_cond_init(s-thread_data[i]-cond, NULL); +} -if (!s-macroblocks_base || !s-filter_strength || !s-intra4x4_pred_mode_top || -!s-top_nnz || !s-top_border) +if (!s-macroblocks_base || !s-top_nnz || !s-top_border || +(!s-intra4x4_pred_mode_top s-mlayout == 0)) return AVERROR(ENOMEM); s-macroblocks= s-macroblocks_base + 1; @@ -332,12 +359,6 @@ static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size) memset(s-segmentation, 0, sizeof(s-segmentation)); } -if (!s-macroblocks_base || /* first frame */ -width != s-avctx-width || height != s-avctx-height) { -if ((ret = update_dimensions(s, width, height)) 0) -return ret; -} - ff_vp56_init_range_decoder(c, buf, header_size); buf += header_size; buf_size -= header_size; @@ -366,6 +387,12 @@ static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size) return AVERROR_INVALIDDATA; } +if (!s-macroblocks_base || /* first frame */ +width != s-avctx-width || height != 
s-avctx-height) { +if ((ret = update_dimensions(s, width, height)) 0) +return ret; +} + get_quants(s); if (!s-keyframe) { @@ -468,19 +495,26 @@ const uint8_t *get_submv_prob(uint32_t left, uint32_t top) * @returns the number of motion vectors parsed (2, 4 or 16) */ static av_always_inline -int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb) +int decode_splitmvs
Re: [libav-devel] [PATCH 3/3] VP8: Implement sliced threading.
On Wed, Jul 11, 2012 at 5:22 PM, Luca Barbato lu_z...@gentoo.org wrote: On 07/11/2012 08:34 PM, Daniel Kang wrote: +#include <pthread.h> + Check pthread.c, we apparently have non-pthread users, for the rest I'm not sure which comments you addressed. This addresses Mans' objection to sched_yield() (I no longer use it). Refactoring thread.h code no longer applies, so I have no outstanding comments to address (I think). ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 2/3] VP8: Move data from VP8Context->VP8Macroblock
In preparation for sliced threading. --- libavcodec/vp8.c | 25 + libavcodec/vp8.h |7 --- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 8ebc445..2181976 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -622,10 +622,11 @@ void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y) } static av_always_inline -void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, +void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int mb_x, int keyframe) { -uint8_t *intra4x4 = s-intra4x4_pred_mode_mb; +uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb; + if (keyframe) { int x, y; uint8_t* const top = s-intra4x4_pred_mode_top + 4 * mb_x; @@ -655,7 +656,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s-prob-segmentid); else if (s-segmentation.enabled) *segment = ref ? *ref : *segment; -s-segment = *segment; +mb-segment = *segment; mb-skip = s-mbskip_enabled ? 
vp56_rac_get_prob(c, s-prob-mbskip) : 0; @@ -663,14 +664,14 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra); if (mb-mode == MODE_I4x4) { -decode_intra4x4_modes(s, c, mb_x, 1); +decode_intra4x4_modes(s, c, mb, mb_x, 1); } else { const uint32_t modes = vp8_pred4x4_mode[mb-mode] * 0x01010101u; AV_WN32A(s-intra4x4_pred_mode_top + 4 * mb_x, modes); AV_WN32A(s-intra4x4_pred_mode_left, modes); } -s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra); +mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra); mb-ref_frame = VP56_FRAME_CURRENT; } else if (vp56_rac_get_prob_branchy(c, s-prob-intra)) { // inter MB, 16.2 @@ -688,9 +689,9 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s-prob-pred16x16); if (mb-mode == MODE_I4x4) -decode_intra4x4_modes(s, c, mb_x, 0); +decode_intra4x4_modes(s, c, mb, mb_x, 0); -s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s-prob-pred8x8c); +mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s-prob-pred8x8c); mb-ref_frame = VP56_FRAME_CURRENT; mb-partitioning = VP8_SPLITMVMODE_NONE; AV_ZERO32(mb-bmv[0]); @@ -791,7 +792,7 @@ void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, { int i, x, y, luma_start = 0, luma_ctx = 3; int nnz_pred, nnz, nnz_total = 0; -int segment = s-segment; +int segment = mb-segment; int block_dc = 0; if (mb-mode != MODE_I4x4 mb-mode != VP8_MVMODE_SPLIT) { @@ -1002,7 +1003,7 @@ void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb, s-hpc.pred16x16[mode](dst[0], s-linesize); } else { uint8_t *ptr = dst[0]; -uint8_t *intra4x4 = s-intra4x4_pred_mode_mb; +uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb; uint8_t tr_top[4] = { 127, 127, 127, 127 }; // all blocks on the right edge of the macroblock use 
bottom edge @@ -1087,9 +1088,9 @@ void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb, } if (avctx-flags CODEC_FLAG_EMU_EDGE) { -mode = check_intra_pred8x8_mode_emuedge(s-chroma_pred_mode, mb_x, mb_y); +mode = check_intra_pred8x8_mode_emuedge(mb-chroma_pred_mode, mb_x, mb_y); } else { -mode = check_intra_pred8x8_mode(s-chroma_pred_mode, mb_x, mb_y); +mode = check_intra_pred8x8_mode(mb-chroma_pred_mode, mb_x, mb_y); } s-hpc.pred8x8[mode](dst[1], s-uvlinesize); s-hpc.pred8x8[mode](dst[2], s-uvlinesize); @@ -1408,7 +1409,7 @@ static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *m int interior_limit, filter_level; if (s-segmentation.enabled) { -filter_level = s-segmentation.filter_level[s-segment]; +filter_level = s-segmentation.filter_level[mb-segment]; if (!s-segmentation.absolute_vals) filter_level += s-filter.level; } else diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h index a738cb7..2f2cb80 100644 --- a/libavcodec/vp8.h +++ b/libavcodec/vp8.h @@ -79,6 +79,10 @@ typedef struct { uint8_t mode; uint8_t ref_frame; uint8_t partitioning; +uint8_t chroma_pred_mode; +uint8_t segment; +uint8_t intra4x4_pred_mode_mb[16]; +uint8_t intra4x4_pred_mode_top[4]; VP56mv mv; VP56mv bmv[16]; } VP8Macroblock; @@ -97,8 +101,6 @@ typedef struct { uint8_t keyframe;
[libav-devel] [PATCH 1/3] VP8: Refactor decoding a single mb_row.
This is in preperation for sliced threading. --- Splitting long lines to please Diego. --- libavcodec/vp8.c | 164 -- 1 files changed, 86 insertions(+), 78 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 94200f6..8ebc445 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -1574,11 +1574,95 @@ static void release_queued_segmaps(VP8Context *s, int is_close) s-maps_are_invalid = 0; } +#define MARGIN (16 2) +static void vp8_decode_mb_row(AVCodecContext *avctx, AVFrame *curframe, + AVFrame *prev_frame, int mb_y) +{ +VP8Context *s = avctx-priv_data; +VP56RangeCoder *c = s-coeff_partition[mb_y (s-num_coeff_partitions-1)]; +VP8Macroblock *mb = s-macroblocks + (s-mb_height - mb_y - 1)*2; +int i, y, mb_x, mb_xy = mb_y*s-mb_width; +uint8_t *dst[3] = { +curframe-data[0] + 16*mb_y*s-linesize, +curframe-data[1] + 8*mb_y*s-uvlinesize, +curframe-data[2] + 8*mb_y*s-uvlinesize +}; + +memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock +memset(s-left_nnz, 0, sizeof(s-left_nnz)); +AV_WN32A(s-intra4x4_pred_mode_left, DC_PRED*0x01010101); + +// left edge of 129 for intra prediction +if (!(avctx-flags CODEC_FLAG_EMU_EDGE)) { +for (i = 0; i 3; i++) +for (y = 0; y 16!!i; y++) +dst[i][y*curframe-linesize[i]-1] = 129; +if (mb_y == 1) // top left edge is also 129 +s-top_border[0][15] = s-top_border[0][23] = s-top_border[0][31] = 129; +} + +s-mv_min.x = -MARGIN; +s-mv_max.x = ((s-mb_width - 1) 6) + MARGIN; + +for (mb_x = 0; mb_x s-mb_width; mb_x++, mb_xy++, mb++) { +/* Prefetch the current frame, 4 MBs ahead */ +s-dsp.prefetch(dst[0] + (mb_x3)*4*s-linesize + 64, s-linesize, 4); +s-dsp.prefetch(dst[1] + (mb_x7)*s-uvlinesize + 64, dst[2] - dst[1], 2); + +decode_mb_mode(s, mb, mb_x, mb_y, curframe-ref_index[0] + mb_xy, + prev_frame prev_frame-ref_index[0] ? 
prev_frame-ref_index[0] + mb_xy : NULL); + +prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS); + +if (!mb-skip) +decode_mb_coeffs(s, c, mb, s-top_nnz[mb_x], s-left_nnz); + +if (mb-mode = MODE_I4x4) +intra_predict(s, dst, mb, mb_x, mb_y); +else +inter_predict(s, dst, mb, mb_x, mb_y); + +prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN); + +if (!mb-skip) { +idct_mb(s, dst, mb); +} else { +AV_ZERO64(s-left_nnz); +AV_WN64(s-top_nnz[mb_x], 0); // array of 9, so unaligned + +// Reset DC block predictors if they would exist if the mb had coefficients +if (mb-mode != MODE_I4x4 mb-mode != VP8_MVMODE_SPLIT) { +s-left_nnz[8] = 0; +s-top_nnz[mb_x][8] = 0; +} +} + +if (s-deblock_filter) +filter_level_for_mb(s, mb, s-filter_strength[mb_x]); + +prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2); + +dst[0] += 16; +dst[1] += 8; +dst[2] += 8; +s-mv_min.x -= 64; +s-mv_max.x -= 64; +} +if (s-deblock_filter) { +if (s-filter.simple) +filter_mb_row_simple(s, curframe, mb_y); +else +filter_mb_row(s, curframe, mb_y); +} +s-mv_min.y -= 64; +s-mv_max.y -= 64; +} + static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPacket *avpkt) { VP8Context *s = avctx-priv_data; -int ret, mb_x, mb_y, i, y, referenced; +int ret, mb_y, i, referenced; enum AVDiscard skip_thresh; AVFrame *av_uninit(curframe), *prev_frame; @@ -1686,90 +1770,14 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, if (s-keyframe) memset(s-intra4x4_pred_mode_top, DC_PRED, s-mb_width*4); -#define MARGIN (16 2) s-mv_min.y = -MARGIN; s-mv_max.y = ((s-mb_height - 1) 6) + MARGIN; for (mb_y = 0; mb_y s-mb_height; mb_y++) { -VP56RangeCoder *c = s-coeff_partition[mb_y (s-num_coeff_partitions-1)]; -VP8Macroblock *mb = s-macroblocks + (s-mb_height - mb_y - 1)*2; -int mb_xy = mb_y*s-mb_width; -uint8_t *dst[3] = { -curframe-data[0] + 16*mb_y*s-linesize, -curframe-data[1] + 8*mb_y*s-uvlinesize, -curframe-data[2] + 8*mb_y*s-uvlinesize -}; - -memset(mb - 
1, 0, sizeof(*mb)); // zero left macroblock -memset(s-left_nnz, 0, sizeof(s-left_nnz)); -AV_WN32A(s-intra4x4_pred_mode_left, DC_PRED*0x01010101); - -// left edge of 129 for intra prediction -if (!(avctx-flags CODEC_FLAG_EMU_EDGE)) { -for (i = 0; i 3; i++) -for (y = 0; y 16!!i; y++) -
[libav-devel] [PATCH 3/3] VP8: Implement sliced threading.
Testing gives 25-30% gain on HD clips with two threads and up to 50% gain with eight threads. Sliced threading uses more memory than single or frame threading. --- Some cosmetic changes to please Diego. --- libavcodec/pthread.c| 11 + libavcodec/thread.h | 11 + libavcodec/vp8.c| 522 +-- libavcodec/vp8.h| 61 --- libavcodec/x86/thread.h | 24 +++ 5 files changed, 452 insertions(+), 177 deletions(-) create mode 100644 libavcodec/x86/thread.h diff --git a/libavcodec/pthread.c b/libavcodec/pthread.c index c7edb9e..a7aff31 100644 --- a/libavcodec/pthread.c +++ b/libavcodec/pthread.c @@ -1053,3 +1053,14 @@ void ff_thread_free(AVCodecContext *avctx) else thread_free(avctx); } + +void ff_thread_sleep(int nms) +{ +#if defined(_WIN32) +Sleep(nms); +#elif defined(__OS2__) +DosSleep(nms); +#else // If it's not Windows, give up and say it's pthreads. +sched_yield(); +#endif +} diff --git a/libavcodec/thread.h b/libavcodec/thread.h index 7f018fc..d037ea3 100644 --- a/libavcodec/thread.h +++ b/libavcodec/thread.h @@ -29,6 +29,15 @@ #include config.h #include avcodec.h +#if ARCH_X86 +#include libavcodec/x86/thread.h +#endif + +#if ARCH_X86 +#define pause_hint() x86_pause_hint() +#else +#define pause_hint() +#endif /** * Wait for decoding threads to finish and reset internal state. @@ -113,4 +122,6 @@ void ff_thread_release_buffer(AVCodecContext *avctx, AVFrame *f); int ff_thread_init(AVCodecContext *s); void ff_thread_free(AVCodecContext *s); +void ff_thread_sleep(int nms); + #endif /* AVCODEC_THREAD_H */ diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 2181976..4e2a9e8 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -4,6 +4,7 @@ * Copyright (C) 2010 David Conrad * Copyright (C) 2010 Ronald S. Bultje * Copyright (C) 2010 Jason Garrett-Glaser + * Copyright (C) 2012 Daniel Kang * * This file is part of Libav. 
* @@ -36,11 +37,16 @@ static void free_buffers(VP8Context *s) { +int i; +if (s-thread_data) +for (i = 0; i MAX_THREADS; i++) { +av_freep(s-thread_data[i].filter_strength); +av_freep(s-thread_data[i].edge_emu_buffer); +} +av_freep(s-thread_data); av_freep(s-macroblocks_base); -av_freep(s-filter_strength); av_freep(s-intra4x4_pred_mode_top); av_freep(s-top_nnz); -av_freep(s-edge_emu_buffer); av_freep(s-top_border); s-macroblocks = NULL; @@ -108,6 +114,9 @@ static void vp8_decode_flush(AVCodecContext *avctx) static int update_dimensions(VP8Context *s, int width, int height) { +AVCodecContext *avctx = s-avctx; +int i; + if (width != s-avctx-width || height != s-avctx-height) { if (av_image_check_size(width, height, 0, s-avctx)) @@ -121,14 +130,25 @@ static int update_dimensions(VP8Context *s, int width, int height) s-mb_width = (s-avctx-coded_width +15) / 16; s-mb_height = (s-avctx-coded_height+15) / 16; -s-macroblocks_base= av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks)); -s-filter_strength = av_mallocz(s-mb_width*sizeof(*s-filter_strength)); -s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4); -s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz)); -s-top_border = av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); +s-mlayout = (avctx-active_thread_type == FF_THREAD_SLICE) (FFMIN(s-num_coeff_partitions, avctx-thread_count) 1); +if (s-mlayout == 0) { // Frame threading and one thread +s-macroblocks_base = av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks)); +s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4); +} +else // Sliced threading +s-macroblocks_base = av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks)); +s-top_nnz= av_mallocz(s-mb_width*sizeof(*s-top_nnz)); +s-top_border = av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); +s-thread_data= av_mallocz(MAX_THREADS*sizeof(VP8ThreadData)); + +for (i = 0; i MAX_THREADS; i++) { +s-thread_data[i].filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-thread_data[0].filter_strength)); +//pthread_mutex_init(s-thread_data[i]-lock, NULL); +//pthread_cond_init(s-thread_data[i]-cond, NULL); +} -if (!s-macroblocks_base || !s-filter_strength || !s-intra4x4_pred_mode_top || -!s-top_nnz || !s-top_border) +if (!s-macroblocks_base || !s-top_nnz || !s-top_border || +(!s-intra4x4_pred_mode_top s-mlayout == 0)) return AVERROR(ENOMEM); s-macroblocks= s-macroblocks_base + 1; @@ -332,12 +352,6 @@ static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size) memset(s-segmentation, 0, sizeof(s-segmentation)); } -if (!s-macroblocks_base || /* first
[libav-devel] [PATCH 3/5] VP8: Move data from VP8Context->VP8Macroblock
In preparation for sliced threading. --- libavcodec/vp8.c | 46 +- libavcodec/vp8.h |8 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 6ab4b26..9d10827 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -38,7 +38,6 @@ static void free_buffers(VP8Context *s) { av_freep(s-macroblocks_base); av_freep(s-filter_strength); -av_freep(s-intra4x4_pred_mode_top); av_freep(s-top_nnz); av_freep(s-edge_emu_buffer); av_freep(s-top_border); @@ -123,15 +122,14 @@ static int update_dimensions(VP8Context *s, int width, int height) s-macroblocks_base= av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks)); s-filter_strength = av_mallocz(s-mb_width*sizeof(*s-filter_strength)); -s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4); s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz)); s-top_border = av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); -if (!s-macroblocks_base || !s-filter_strength || !s-intra4x4_pred_mode_top || +if (!s-macroblocks_base || !s-filter_strength || !s-top_nnz || !s-top_border) return AVERROR(ENOMEM); -s-macroblocks= s-macroblocks_base + s-mb_width + 2; +s-macroblocks= s-macroblocks_base + s-mb_width + 1; return 0; } @@ -622,14 +620,17 @@ void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y) } static av_always_inline -void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, +void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int mb_x, int keyframe) { -uint8_t *intra4x4 = s-intra4x4_pred_mode_mb; +VP8Macroblock *mb_top = mb - s-mb_width - 1; +uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb; + +memcpy(mb-intra4x4_pred_mode_top, mb_top-intra4x4_pred_mode_top, 4); if (keyframe) { int x, y; -uint8_t* const top = s-intra4x4_pred_mode_top + 4 * mb_x; uint8_t* const left = s-intra4x4_pred_mode_left; +uint8_t* const top = mb-intra4x4_pred_mode_top; for (y = 0; y 4; y++) { for (x = 0; x 4; x++) { const uint8_t *ctx; @@ -655,7 +656,7 @@ void 
decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s-prob-segmentid); else if (s-segmentation.enabled) *segment = ref ? *ref : *segment; -s-segment = *segment; +mb-segment = *segment; mb-skip = s-mbskip_enabled ? vp56_rac_get_prob(c, s-prob-mbskip) : 0; @@ -663,14 +664,14 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra); if (mb-mode == MODE_I4x4) { -decode_intra4x4_modes(s, c, mb_x, 1); +decode_intra4x4_modes(s, c, mb, mb_x, 1); } else { const uint32_t modes = vp8_pred4x4_mode[mb-mode] * 0x01010101u; -AV_WN32A(s-intra4x4_pred_mode_top + 4 * mb_x, modes); AV_WN32A(s-intra4x4_pred_mode_left, modes); +AV_WN32A(mb-intra4x4_pred_mode_top, modes); } -s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra); +mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra); mb-ref_frame = VP56_FRAME_CURRENT; } else if (vp56_rac_get_prob_branchy(c, s-prob-intra)) { // inter MB, 16.2 @@ -688,9 +689,9 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s-prob-pred16x16); if (mb-mode == MODE_I4x4) -decode_intra4x4_modes(s, c, mb_x, 0); +decode_intra4x4_modes(s, c, mb, mb_x, 0); -s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s-prob-pred8x8c); +mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s-prob-pred8x8c); mb-ref_frame = VP56_FRAME_CURRENT; mb-partitioning = VP8_SPLITMVMODE_NONE; AV_ZERO32(mb-bmv[0]); @@ -791,7 +792,7 @@ void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, { int i, x, y, luma_start = 0, luma_ctx = 3; int nnz_pred, nnz, nnz_total = 0; -int segment = s-segment; +int segment = mb-segment; int block_dc = 0; if (mb-mode != MODE_I4x4 mb-mode != VP8_MVMODE_SPLIT) { @@ -1002,7 +1003,7 @@ void 
intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb, s-hpc.pred16x16[mode](dst[0], s-linesize); } else { uint8_t *ptr = dst[0]; -uint8_t *intra4x4 = s-intra4x4_pred_mode_mb; +uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb; uint8_t tr_top[4] = { 127, 127, 127, 127
[libav-devel] [PATCH 5/5] VP8: Implement sliced threading.
Testing gives 25-30% gain on HD clips with two threads and up to 50% gain with eight threads. Also allow frame/single-thread to use less memory than sliced threading. --- libavcodec/pthread.c| 11 + libavcodec/thread.h | 11 + libavcodec/vp8.c| 498 --- libavcodec/vp8.h| 62 --- libavcodec/x86/thread.h | 24 +++ 5 files changed, 422 insertions(+), 184 deletions(-) create mode 100644 libavcodec/x86/thread.h diff --git a/libavcodec/pthread.c b/libavcodec/pthread.c index c7edb9e..a7aff31 100644 --- a/libavcodec/pthread.c +++ b/libavcodec/pthread.c @@ -1053,3 +1053,14 @@ void ff_thread_free(AVCodecContext *avctx) else thread_free(avctx); } + +void ff_thread_sleep(int nms) +{ +#if defined(_WIN32) +Sleep(nms); +#elif defined(__OS2__) +DosSleep(nms); +#else // If it's not Windows, give up and say it's pthreads. +sched_yield(); +#endif +} diff --git a/libavcodec/thread.h b/libavcodec/thread.h index 7f018fc..d037ea3 100644 --- a/libavcodec/thread.h +++ b/libavcodec/thread.h @@ -29,6 +29,15 @@ #include config.h #include avcodec.h +#if ARCH_X86 +#include libavcodec/x86/thread.h +#endif + +#if ARCH_X86 +#define pause_hint() x86_pause_hint() +#else +#define pause_hint() +#endif /** * Wait for decoding threads to finish and reset internal state. @@ -113,4 +122,6 @@ void ff_thread_release_buffer(AVCodecContext *avctx, AVFrame *f); int ff_thread_init(AVCodecContext *s); void ff_thread_free(AVCodecContext *s); +void ff_thread_sleep(int nms); + #endif /* AVCODEC_THREAD_H */ diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 0d845d0..5696898 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -4,6 +4,7 @@ * Copyright (C) 2010 David Conrad * Copyright (C) 2010 Ronald S. Bultje * Copyright (C) 2010 Jason Garrett-Glaser + * Copyright (C) 2012 Daniel Kang * * This file is part of Libav. 
* @@ -36,10 +37,16 @@ static void free_buffers(VP8Context *s) { +int i; +if (s-thread_data) +for (i = 0; i MAX_THREADS; i++) { +av_freep(s-thread_data[i].filter_strength); +av_freep(s-thread_data[i].edge_emu_buffer); +} +av_freep(s-thread_data); av_freep(s-macroblocks_base); -av_freep(s-filter_strength); +av_freep(s-intra4x4_pred_mode_top); av_freep(s-top_nnz); -av_freep(s-edge_emu_buffer); av_freep(s-top_border); s-macroblocks = NULL; @@ -107,6 +114,9 @@ static void vp8_decode_flush(AVCodecContext *avctx) static int update_dimensions(VP8Context *s, int width, int height) { +AVCodecContext *avctx = s-avctx; +int i; + if (width != s-avctx-width || height != s-avctx-height) { if (av_image_check_size(width, height, 0, s-avctx)) @@ -120,16 +130,28 @@ static int update_dimensions(VP8Context *s, int width, int height) s-mb_width = (s-avctx-coded_width +15) / 16; s-mb_height = (s-avctx-coded_height+15) / 16; -s-macroblocks_base= av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks)); -s-filter_strength = av_mallocz(s-mb_width*sizeof(*s-filter_strength)); -s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz)); -s-top_border = av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); +s-mlayout = (avctx-active_thread_type == FF_THREAD_SLICE) (FFMIN(s-num_coeff_partitions, avctx-thread_count) 1); +if (s-mlayout == 0) { // Frame threading and one thread +s-macroblocks_base = av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks)); +s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4); +} +else // Sliced threading +s-macroblocks_base = av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks)); +s-top_nnz= av_mallocz(s-mb_width*sizeof(*s-top_nnz)); +s-top_border = av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); +s-thread_data= av_mallocz(MAX_THREADS*sizeof(VP8ThreadData)); + +for (i = 0; i MAX_THREADS; i++) { +s-thread_data[i].filter_strength = av_mallocz(s-mb_width*sizeof(*s-thread_data[0].filter_strength)); +//pthread_mutex_init(s-thread_data[i]-lock, 
NULL); +//pthread_cond_init(s-thread_data[i]-cond, NULL); +} -if (!s-macroblocks_base || !s-filter_strength || -!s-top_nnz || !s-top_border) +if (!s-macroblocks_base || !s-top_nnz || !s-top_border || +(!s-intra4x4_pred_mode_top s-mlayout == 0)) return AVERROR(ENOMEM); -s-macroblocks= s-macroblocks_base + s-mb_width + 1; +s-macroblocks= s-macroblocks_base + 1; return 0; } @@ -330,12 +352,6 @@ static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size) memset(s-segmentation, 0, sizeof(s-segmentation)); } -if (!s-macroblocks_base || /* first frame */ -width != s-avctx-width
[libav-devel] [PATCH 1/5] VP8: Refactor decoding a single mb_row.
This is in preperation for sliced threading. --- libavcodec/vp8.c | 162 -- 1 files changed, 84 insertions(+), 78 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 94200f6..7a8a0c6 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -1574,11 +1574,93 @@ static void release_queued_segmaps(VP8Context *s, int is_close) s-maps_are_invalid = 0; } +#define MARGIN (16 2) +static void vp8_decode_mb_row(AVCodecContext *avctx, AVFrame *curframe, AVFrame *prev_frame, int mb_y) { +VP8Context *s = avctx-priv_data; +VP56RangeCoder *c = s-coeff_partition[mb_y (s-num_coeff_partitions-1)]; +VP8Macroblock *mb = s-macroblocks + (s-mb_height - mb_y - 1)*2; +int i, y, mb_x, mb_xy = mb_y*s-mb_width; +uint8_t *dst[3] = { +curframe-data[0] + 16*mb_y*s-linesize, +curframe-data[1] + 8*mb_y*s-uvlinesize, +curframe-data[2] + 8*mb_y*s-uvlinesize +}; + +memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock +memset(s-left_nnz, 0, sizeof(s-left_nnz)); +AV_WN32A(s-intra4x4_pred_mode_left, DC_PRED*0x01010101); + +// left edge of 129 for intra prediction +if (!(avctx-flags CODEC_FLAG_EMU_EDGE)) { +for (i = 0; i 3; i++) +for (y = 0; y 16!!i; y++) +dst[i][y*curframe-linesize[i]-1] = 129; +if (mb_y == 1) // top left edge is also 129 +s-top_border[0][15] = s-top_border[0][23] = s-top_border[0][31] = 129; +} + +s-mv_min.x = -MARGIN; +s-mv_max.x = ((s-mb_width - 1) 6) + MARGIN; + +for (mb_x = 0; mb_x s-mb_width; mb_x++, mb_xy++, mb++) { +/* Prefetch the current frame, 4 MBs ahead */ +s-dsp.prefetch(dst[0] + (mb_x3)*4*s-linesize + 64, s-linesize, 4); +s-dsp.prefetch(dst[1] + (mb_x7)*s-uvlinesize + 64, dst[2] - dst[1], 2); + +decode_mb_mode(s, mb, mb_x, mb_y, curframe-ref_index[0] + mb_xy, + prev_frame prev_frame-ref_index[0] ? 
prev_frame-ref_index[0] + mb_xy : NULL); + +prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS); + +if (!mb-skip) +decode_mb_coeffs(s, c, mb, s-top_nnz[mb_x], s-left_nnz); + +if (mb-mode = MODE_I4x4) +intra_predict(s, dst, mb, mb_x, mb_y); +else +inter_predict(s, dst, mb, mb_x, mb_y); + +prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN); + +if (!mb-skip) { +idct_mb(s, dst, mb); +} else { +AV_ZERO64(s-left_nnz); +AV_WN64(s-top_nnz[mb_x], 0); // array of 9, so unaligned + +// Reset DC block predictors if they would exist if the mb had coefficients +if (mb-mode != MODE_I4x4 mb-mode != VP8_MVMODE_SPLIT) { +s-left_nnz[8] = 0; +s-top_nnz[mb_x][8] = 0; +} +} + +if (s-deblock_filter) +filter_level_for_mb(s, mb, s-filter_strength[mb_x]); + +prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2); + +dst[0] += 16; +dst[1] += 8; +dst[2] += 8; +s-mv_min.x -= 64; +s-mv_max.x -= 64; +} +if (s-deblock_filter) { +if (s-filter.simple) +filter_mb_row_simple(s, curframe, mb_y); +else +filter_mb_row(s, curframe, mb_y); +} +s-mv_min.y -= 64; +s-mv_max.y -= 64; +} + static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPacket *avpkt) { VP8Context *s = avctx-priv_data; -int ret, mb_x, mb_y, i, y, referenced; +int ret, mb_y, i, referenced; enum AVDiscard skip_thresh; AVFrame *av_uninit(curframe), *prev_frame; @@ -1686,90 +1768,14 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, if (s-keyframe) memset(s-intra4x4_pred_mode_top, DC_PRED, s-mb_width*4); -#define MARGIN (16 2) s-mv_min.y = -MARGIN; s-mv_max.y = ((s-mb_height - 1) 6) + MARGIN; for (mb_y = 0; mb_y s-mb_height; mb_y++) { -VP56RangeCoder *c = s-coeff_partition[mb_y (s-num_coeff_partitions-1)]; -VP8Macroblock *mb = s-macroblocks + (s-mb_height - mb_y - 1)*2; -int mb_xy = mb_y*s-mb_width; -uint8_t *dst[3] = { -curframe-data[0] + 16*mb_y*s-linesize, -curframe-data[1] + 8*mb_y*s-uvlinesize, -curframe-data[2] + 8*mb_y*s-uvlinesize -}; - -memset(mb - 
1, 0, sizeof(*mb)); // zero left macroblock -memset(s-left_nnz, 0, sizeof(s-left_nnz)); -AV_WN32A(s-intra4x4_pred_mode_left, DC_PRED*0x01010101); - -// left edge of 129 for intra prediction -if (!(avctx-flags CODEC_FLAG_EMU_EDGE)) { -for (i = 0; i 3; i++) -for (y = 0; y 16!!i; y++) -dst[i][y*curframe-linesize[i]-1] = 129; -if (mb_y == 1) // top left edge is also
[libav-devel] [PATCH 2/3] VP8: Move data from VP8Context->VP8Macroblock
In preparation for sliced threading. --- libavcodec/vp8.c | 25 + libavcodec/vp8.h |7 --- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 7a8a0c6..b70e87e 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -622,10 +622,11 @@ void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y) } static av_always_inline -void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, +void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int mb_x, int keyframe) { -uint8_t *intra4x4 = s-intra4x4_pred_mode_mb; +uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb; + if (keyframe) { int x, y; uint8_t* const top = s-intra4x4_pred_mode_top + 4 * mb_x; @@ -655,7 +656,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s-prob-segmentid); else if (s-segmentation.enabled) *segment = ref ? *ref : *segment; -s-segment = *segment; +mb-segment = *segment; mb-skip = s-mbskip_enabled ? 
vp56_rac_get_prob(c, s-prob-mbskip) : 0; @@ -663,14 +664,14 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra); if (mb-mode == MODE_I4x4) { -decode_intra4x4_modes(s, c, mb_x, 1); +decode_intra4x4_modes(s, c, mb, mb_x, 1); } else { const uint32_t modes = vp8_pred4x4_mode[mb-mode] * 0x01010101u; AV_WN32A(s-intra4x4_pred_mode_top + 4 * mb_x, modes); AV_WN32A(s-intra4x4_pred_mode_left, modes); } -s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra); +mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra); mb-ref_frame = VP56_FRAME_CURRENT; } else if (vp56_rac_get_prob_branchy(c, s-prob-intra)) { // inter MB, 16.2 @@ -688,9 +689,9 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s-prob-pred16x16); if (mb-mode == MODE_I4x4) -decode_intra4x4_modes(s, c, mb_x, 0); +decode_intra4x4_modes(s, c, mb, mb_x, 0); -s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s-prob-pred8x8c); +mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s-prob-pred8x8c); mb-ref_frame = VP56_FRAME_CURRENT; mb-partitioning = VP8_SPLITMVMODE_NONE; AV_ZERO32(mb-bmv[0]); @@ -791,7 +792,7 @@ void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, { int i, x, y, luma_start = 0, luma_ctx = 3; int nnz_pred, nnz, nnz_total = 0; -int segment = s-segment; +int segment = mb-segment; int block_dc = 0; if (mb-mode != MODE_I4x4 mb-mode != VP8_MVMODE_SPLIT) { @@ -1002,7 +1003,7 @@ void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb, s-hpc.pred16x16[mode](dst[0], s-linesize); } else { uint8_t *ptr = dst[0]; -uint8_t *intra4x4 = s-intra4x4_pred_mode_mb; +uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb; uint8_t tr_top[4] = { 127, 127, 127, 127 }; // all blocks on the right edge of the macroblock use 
bottom edge @@ -1087,9 +1088,9 @@ void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb, } if (avctx-flags CODEC_FLAG_EMU_EDGE) { -mode = check_intra_pred8x8_mode_emuedge(s-chroma_pred_mode, mb_x, mb_y); +mode = check_intra_pred8x8_mode_emuedge(mb-chroma_pred_mode, mb_x, mb_y); } else { -mode = check_intra_pred8x8_mode(s-chroma_pred_mode, mb_x, mb_y); +mode = check_intra_pred8x8_mode(mb-chroma_pred_mode, mb_x, mb_y); } s-hpc.pred8x8[mode](dst[1], s-uvlinesize); s-hpc.pred8x8[mode](dst[2], s-uvlinesize); @@ -1408,7 +1409,7 @@ static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *m int interior_limit, filter_level; if (s-segmentation.enabled) { -filter_level = s-segmentation.filter_level[s-segment]; +filter_level = s-segmentation.filter_level[mb-segment]; if (!s-segmentation.absolute_vals) filter_level += s-filter.level; } else diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h index a738cb7..2f2cb80 100644 --- a/libavcodec/vp8.h +++ b/libavcodec/vp8.h @@ -79,6 +79,10 @@ typedef struct { uint8_t mode; uint8_t ref_frame; uint8_t partitioning; +uint8_t chroma_pred_mode; +uint8_t segment; +uint8_t intra4x4_pred_mode_mb[16]; +uint8_t intra4x4_pred_mode_top[4]; VP56mv mv; VP56mv bmv[16]; } VP8Macroblock; @@ -97,8 +101,6 @@ typedef struct { uint8_t keyframe;
[libav-devel] [PATCH 3/3] VP8: Implement sliced threading.
Testing gives 25-30% gain on HD clips with two threads and up to 50% gain with eight threads. Also allow frame/single-thread to use less memory than sliced threading. --- libavcodec/pthread.c| 11 + libavcodec/thread.h | 11 + libavcodec/vp8.c| 514 +-- libavcodec/vp8.h| 61 --- libavcodec/x86/thread.h | 24 +++ 5 files changed, 445 insertions(+), 176 deletions(-) create mode 100644 libavcodec/x86/thread.h diff --git a/libavcodec/pthread.c b/libavcodec/pthread.c index c7edb9e..a7aff31 100644 --- a/libavcodec/pthread.c +++ b/libavcodec/pthread.c @@ -1053,3 +1053,14 @@ void ff_thread_free(AVCodecContext *avctx) else thread_free(avctx); } + +void ff_thread_sleep(int nms) +{ +#if defined(_WIN32) +Sleep(nms); +#elif defined(__OS2__) +DosSleep(nms); +#else // If it's not Windows, give up and say it's pthreads. +sched_yield(); +#endif +} diff --git a/libavcodec/thread.h b/libavcodec/thread.h index 7f018fc..d037ea3 100644 --- a/libavcodec/thread.h +++ b/libavcodec/thread.h @@ -29,6 +29,15 @@ #include config.h #include avcodec.h +#if ARCH_X86 +#include libavcodec/x86/thread.h +#endif + +#if ARCH_X86 +#define pause_hint() x86_pause_hint() +#else +#define pause_hint() +#endif /** * Wait for decoding threads to finish and reset internal state. @@ -113,4 +122,6 @@ void ff_thread_release_buffer(AVCodecContext *avctx, AVFrame *f); int ff_thread_init(AVCodecContext *s); void ff_thread_free(AVCodecContext *s); +void ff_thread_sleep(int nms); + #endif /* AVCODEC_THREAD_H */ diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index b70e87e..4233eda 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -4,6 +4,7 @@ * Copyright (C) 2010 David Conrad * Copyright (C) 2010 Ronald S. Bultje * Copyright (C) 2010 Jason Garrett-Glaser + * Copyright (C) 2012 Daniel Kang * * This file is part of Libav. 
* @@ -36,11 +37,16 @@ static void free_buffers(VP8Context *s) { +int i; +if (s-thread_data) +for (i = 0; i MAX_THREADS; i++) { +av_freep(s-thread_data[i].filter_strength); +av_freep(s-thread_data[i].edge_emu_buffer); +} +av_freep(s-thread_data); av_freep(s-macroblocks_base); -av_freep(s-filter_strength); av_freep(s-intra4x4_pred_mode_top); av_freep(s-top_nnz); -av_freep(s-edge_emu_buffer); av_freep(s-top_border); s-macroblocks = NULL; @@ -108,6 +114,9 @@ static void vp8_decode_flush(AVCodecContext *avctx) static int update_dimensions(VP8Context *s, int width, int height) { +AVCodecContext *avctx = s-avctx; +int i; + if (width != s-avctx-width || height != s-avctx-height) { if (av_image_check_size(width, height, 0, s-avctx)) @@ -121,14 +130,25 @@ static int update_dimensions(VP8Context *s, int width, int height) s-mb_width = (s-avctx-coded_width +15) / 16; s-mb_height = (s-avctx-coded_height+15) / 16; -s-macroblocks_base= av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks)); -s-filter_strength = av_mallocz(s-mb_width*sizeof(*s-filter_strength)); -s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4); -s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz)); -s-top_border = av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); +s-mlayout = (avctx-active_thread_type == FF_THREAD_SLICE) (FFMIN(s-num_coeff_partitions, avctx-thread_count) 1); +if (s-mlayout == 0) { // Frame threading and one thread +s-macroblocks_base = av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks)); +s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4); +} +else // Sliced threading +s-macroblocks_base = av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks)); +s-top_nnz= av_mallocz(s-mb_width*sizeof(*s-top_nnz)); +s-top_border = av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); +s-thread_data= av_mallocz(MAX_THREADS*sizeof(VP8ThreadData)); + +for (i = 0; i MAX_THREADS; i++) { +s-thread_data[i].filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-thread_data[0].filter_strength)); +//pthread_mutex_init(s-thread_data[i]-lock, NULL); +//pthread_cond_init(s-thread_data[i]-cond, NULL); +} -if (!s-macroblocks_base || !s-filter_strength || !s-intra4x4_pred_mode_top || -!s-top_nnz || !s-top_border) +if (!s-macroblocks_base || !s-top_nnz || !s-top_border || +(!s-intra4x4_pred_mode_top s-mlayout == 0)) return AVERROR(ENOMEM); s-macroblocks= s-macroblocks_base + 1; @@ -332,12 +352,6 @@ static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size) memset(s-segmentation, 0, sizeof(s-segmentation)); } -if (!s-macroblocks_base || /* first frame */ -width != s-avctx
Re: [libav-devel] [PATCH 3/5] VP8: Move data from VP8Context->VP8Macroblock
On Fri, Jun 22, 2012 at 3:33 PM, Luca Barbato lu_z...@gentoo.org wrote: I am testing Jason's idea, which only need a little more memory. However, from the timings and preliminary tests I have done, I don't think it scales nearly as well. I see I've decided to stick with this approach for now, since it scales better at lower number of threads. A modification of Jason's idea might scale better, but it will require as much memory. Could you try to swap layouts depending on which threading system is used? Tried this, and the speed hit is now ~1.5% for frame and single threaded. I'll try profiling and seeing where the issues are. Apparently I'm not allowed to attached large files, so here's my WIP patch if anyone has suggestions, etc.: http://privatepaste.com/dbeafd7e68 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 3/5] VP8: Move data from VP8Context->VP8Macroblock
On Fri, Jun 22, 2012 at 12:26 AM, Luca Barbato lu_z...@gentoo.org wrote: On 06/22/2012 04:19 AM, Daniel Kang wrote: In preparation for sliced threading. --- libavcodec/vp8.c | 54 ++ libavcodec/vp8.h | 11 +-- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 6ab4b26..bc2476e 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -38,7 +38,6 @@ static void free_buffers(VP8Context *s) { av_freep(s-macroblocks_base); av_freep(s-filter_strength); -av_freep(s-intra4x4_pred_mode_top); av_freep(s-top_nnz); av_freep(s-edge_emu_buffer); av_freep(s-top_border); @@ -123,15 +122,14 @@ static int update_dimensions(VP8Context *s, int width, int height) s-macroblocks_base= av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks)); s-filter_strength = av_mallocz(s-mb_width*sizeof(*s-filter_strength)); -s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4); s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz)); s-top_border = av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); -if (!s-macroblocks_base || !s-filter_strength || !s-intra4x4_pred_mode_top || +if (!s-macroblocks_base || !s-filter_strength || !s-top_nnz || !s-top_border) return AVERROR(ENOMEM); -s-macroblocks= s-macroblocks_base + s-mb_width + 2; +s-macroblocks= s-macroblocks_base + s-mb_width + 1; return 0; } @@ -622,14 +620,19 @@ void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y) } static av_always_inline -void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, +void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int mb_x, int keyframe) { -uint8_t *intra4x4 = s-intra4x4_pred_mode_mb; +VP8Macroblock *mb_top = mb - s-mb_width - 1; +VP8Macroblock *mb_left = mb - 1; +uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb; + +memcpy(mb-intra4x4_pred_mode_left, mb_left-intra4x4_pred_mode_left, 4); +memcpy(mb-intra4x4_pred_mode_top, mb_top-intra4x4_pred_mode_top, 4); if (keyframe) { int x, y; -uint8_t* const top = 
s-intra4x4_pred_mode_top + 4 * mb_x; -uint8_t* const left = s-intra4x4_pred_mode_left; +uint8_t* const top = mb-intra4x4_pred_mode_top; +uint8_t* const left = mb-intra4x4_pred_mode_left; for (y = 0; y 4; y++) { for (x = 0; x 4; x++) { const uint8_t *ctx; @@ -655,7 +658,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s-prob-segmentid); else if (s-segmentation.enabled) *segment = ref ? *ref : *segment; -s-segment = *segment; +mb-segment = *segment; mb-skip = s-mbskip_enabled ? vp56_rac_get_prob(c, s-prob-mbskip) : 0; @@ -663,14 +666,14 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra); if (mb-mode == MODE_I4x4) { -decode_intra4x4_modes(s, c, mb_x, 1); +decode_intra4x4_modes(s, c, mb, mb_x, 1); } else { const uint32_t modes = vp8_pred4x4_mode[mb-mode] * 0x01010101u; -AV_WN32A(s-intra4x4_pred_mode_top + 4 * mb_x, modes); -AV_WN32A(s-intra4x4_pred_mode_left, modes); +AV_WN32A(mb-intra4x4_pred_mode_top, modes); +AV_WN32A(mb-intra4x4_pred_mode_left, modes); } -s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra); +mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra); mb-ref_frame = VP56_FRAME_CURRENT; } else if (vp56_rac_get_prob_branchy(c, s-prob-intra)) { // inter MB, 16.2 @@ -688,9 +691,9 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s-prob-pred16x16); if (mb-mode == MODE_I4x4) -decode_intra4x4_modes(s, c, mb_x, 0); +decode_intra4x4_modes(s, c, mb, mb_x, 0); -s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s-prob-pred8x8c); +mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s-prob-pred8x8c); mb-ref_frame = VP56_FRAME_CURRENT; mb-partitioning = VP8_SPLITMVMODE_NONE; AV_ZERO32(mb-bmv[0]); @@ 
-791,7 +794,7 @@ void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, { int i, x, y, luma_start = 0
[libav-devel] [PATCH 2/4] VP8: Move data from VP8Context->VP8Macroblock
In preparation for sliced threading. --- libavcodec/vp8.c | 54 ++ libavcodec/vp8.h | 11 +-- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 6ab4b26..bc2476e 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -38,7 +38,6 @@ static void free_buffers(VP8Context *s) { av_freep(s-macroblocks_base); av_freep(s-filter_strength); -av_freep(s-intra4x4_pred_mode_top); av_freep(s-top_nnz); av_freep(s-edge_emu_buffer); av_freep(s-top_border); @@ -123,15 +122,14 @@ static int update_dimensions(VP8Context *s, int width, int height) s-macroblocks_base= av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks)); s-filter_strength = av_mallocz(s-mb_width*sizeof(*s-filter_strength)); -s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4); s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz)); s-top_border = av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); -if (!s-macroblocks_base || !s-filter_strength || !s-intra4x4_pred_mode_top || +if (!s-macroblocks_base || !s-filter_strength || !s-top_nnz || !s-top_border) return AVERROR(ENOMEM); -s-macroblocks= s-macroblocks_base + s-mb_width + 2; +s-macroblocks= s-macroblocks_base + s-mb_width + 1; return 0; } @@ -622,14 +620,19 @@ void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y) } static av_always_inline -void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, +void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int mb_x, int keyframe) { -uint8_t *intra4x4 = s-intra4x4_pred_mode_mb; +VP8Macroblock *mb_top = mb - s-mb_width - 1; +VP8Macroblock *mb_left = mb - 1; +uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb; + +memcpy(mb-intra4x4_pred_mode_left, mb_left-intra4x4_pred_mode_left, 4); +memcpy(mb-intra4x4_pred_mode_top, mb_top-intra4x4_pred_mode_top, 4); if (keyframe) { int x, y; -uint8_t* const top = s-intra4x4_pred_mode_top + 4 * mb_x; -uint8_t* const left = s-intra4x4_pred_mode_left; +uint8_t* const top = 
mb-intra4x4_pred_mode_top; +uint8_t* const left = mb-intra4x4_pred_mode_left; for (y = 0; y 4; y++) { for (x = 0; x 4; x++) { const uint8_t *ctx; @@ -655,7 +658,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s-prob-segmentid); else if (s-segmentation.enabled) *segment = ref ? *ref : *segment; -s-segment = *segment; +mb-segment = *segment; mb-skip = s-mbskip_enabled ? vp56_rac_get_prob(c, s-prob-mbskip) : 0; @@ -663,14 +666,14 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra); if (mb-mode == MODE_I4x4) { -decode_intra4x4_modes(s, c, mb_x, 1); +decode_intra4x4_modes(s, c, mb, mb_x, 1); } else { const uint32_t modes = vp8_pred4x4_mode[mb-mode] * 0x01010101u; -AV_WN32A(s-intra4x4_pred_mode_top + 4 * mb_x, modes); -AV_WN32A(s-intra4x4_pred_mode_left, modes); +AV_WN32A(mb-intra4x4_pred_mode_top, modes); +AV_WN32A(mb-intra4x4_pred_mode_left, modes); } -s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra); +mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra); mb-ref_frame = VP56_FRAME_CURRENT; } else if (vp56_rac_get_prob_branchy(c, s-prob-intra)) { // inter MB, 16.2 @@ -688,9 +691,9 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s-prob-pred16x16); if (mb-mode == MODE_I4x4) -decode_intra4x4_modes(s, c, mb_x, 0); +decode_intra4x4_modes(s, c, mb, mb_x, 0); -s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s-prob-pred8x8c); +mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s-prob-pred8x8c); mb-ref_frame = VP56_FRAME_CURRENT; mb-partitioning = VP8_SPLITMVMODE_NONE; AV_ZERO32(mb-bmv[0]); @@ -791,7 +794,7 @@ void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, { int i, x, y, 
luma_start = 0, luma_ctx = 3; int nnz_pred, nnz, nnz_total = 0; -int segment = s-segment; +int segment = mb-segment; int block_dc = 0; if (mb-mode != MODE_I4x4 mb-mode != VP8_MVMODE_SPLIT) { @@ -1002,7 +1005,7 @@ void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
[libav-devel] [PATCH 3/4] VP8: Decode mvs and mb modes separately.
In preparation for sliced threading. --- libavcodec/vp8.c | 37 - 1 files changed, 32 insertions(+), 5 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index bc2476e..db4a875 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -1578,6 +1578,32 @@ static void release_queued_segmaps(VP8Context *s, int is_close) } #define MARGIN (16 2) +static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, AVFrame *curframe, AVFrame *prev_frame) { +VP8Context *s = avctx-priv_data; +int mb_x, mb_y; + +s-mv_min.y = -MARGIN; +s-mv_max.y = ((s-mb_height - 1) 6) + MARGIN; +for (mb_y = 0; mb_y s-mb_height; mb_y++) { +VP8Macroblock *mb = s-macroblocks_base + ((s-mb_width+1)*(mb_y + 1) + 1); +int mb_xy = mb_y*s-mb_width; + +memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock +AV_WN32A((mb-1)-intra4x4_pred_mode_left, DC_PRED*0x01010101); + +s-mv_min.x = -MARGIN; +s-mv_max.x = ((s-mb_width - 1) 6) + MARGIN; +for (mb_x = 0; mb_x s-mb_width; mb_x++, mb_xy++, mb++) { +decode_mb_mode(s, mb, mb_x, mb_y, curframe-ref_index[0] + mb_xy, + prev_frame prev_frame-ref_index[0] ? 
prev_frame-ref_index[0] + mb_xy : NULL); +s-mv_min.x -= 64; +s-mv_max.x -= 64; +} +s-mv_min.y -= 64; +s-mv_max.y -= 64; +} +} + static void vp8_decode_mb_row(AVCodecContext *avctx, AVFrame *curframe, AVFrame *prev_frame, int mb_y) { VP8Context *s = avctx-priv_data; VP56RangeCoder *c = s-coeff_partition[mb_y (s-num_coeff_partitions-1)]; @@ -1589,9 +1615,7 @@ static void vp8_decode_mb_row(AVCodecContext *avctx, AVFrame *curframe, AVFrame curframe-data[2] + 8*mb_y*s-uvlinesize }; -memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock memset(s-left_nnz, 0, sizeof(s-left_nnz)); -AV_WN32A((mb-1)-intra4x4_pred_mode_left, DC_PRED*0x01010101); // left edge of 129 for intra prediction if (!(avctx-flags CODEC_FLAG_EMU_EDGE)) { @@ -1610,9 +1634,6 @@ static void vp8_decode_mb_row(AVCodecContext *avctx, AVFrame *curframe, AVFrame s-dsp.prefetch(dst[0] + (mb_x3)*4*s-linesize + 64, s-linesize, 4); s-dsp.prefetch(dst[1] + (mb_x7)*s-uvlinesize + 64, dst[2] - dst[1], 2); -decode_mb_mode(s, mb, mb_x, mb_y, curframe-ref_index[0] + mb_xy, - prev_frame prev_frame-ref_index[0] ? prev_frame-ref_index[0] + mb_xy : NULL); - prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS); if (!mb-skip) @@ -1774,6 +1795,12 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, memset(mb-intra4x4_pred_mode_top, DC_PRED, 4); } +// Make sure the previous frame has read its segmentation map, +// if we re-use the same map. +if (prev_frame s-segmentation.enabled !s-segmentation.update_map) +ff_thread_await_progress(prev_frame, 1, 0); +vp8_decode_mv_mb_modes(avctx, curframe, prev_frame); + s-mv_min.y = -MARGIN; s-mv_max.y = ((s-mb_height - 1) 6) + MARGIN; -- 1.7.7.3 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 1/4] VP8: Change mb memory layout for sliced threading.
--- libavcodec/vp8.c | 14 +++--- 1 files changed, 7 insertions(+), 7 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index 7a8a0c6..6ab4b26 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -121,7 +121,7 @@ static int update_dimensions(VP8Context *s, int width, int height) s-mb_width = (s-avctx-coded_width +15) / 16; s-mb_height = (s-avctx-coded_height+15) / 16; -s-macroblocks_base= av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks)); +s-macroblocks_base= av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks)); s-filter_strength = av_mallocz(s-mb_width*sizeof(*s-filter_strength)); s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4); s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz)); @@ -131,7 +131,7 @@ static int update_dimensions(VP8Context *s, int width, int height) !s-top_nnz || !s-top_border) return AVERROR(ENOMEM); -s-macroblocks= s-macroblocks_base + 1; +s-macroblocks= s-macroblocks_base + s-mb_width + 2; return 0; } @@ -472,7 +472,7 @@ int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb) { int part_idx; int n, num; -VP8Macroblock *top_mb = mb[2]; +VP8Macroblock *top_mb = mb[-s-mb_width-1]; VP8Macroblock *left_mb = mb[-1]; const uint8_t *mbsplits_left = vp8_mbsplits[left_mb-partitioning], *mbsplits_top = vp8_mbsplits[top_mb-partitioning], @@ -534,9 +534,9 @@ int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb) static av_always_inline void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y) { -VP8Macroblock *mb_edge[3] = { mb + 2 /* top */, +VP8Macroblock *mb_edge[3] = { mb - s-mb_width-1 /* top */, mb - 1 /* left */, - mb + 1 /* top-left */ }; + mb - s-mb_width-2 /* top-left */ }; enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV }; enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT }; int idx = CNT_ZERO; @@ -1578,7 +1578,7 @@ static void release_queued_segmaps(VP8Context *s, int is_close) static void vp8_decode_mb_row(AVCodecContext *avctx, AVFrame 
*curframe, AVFrame *prev_frame, int mb_y) { VP8Context *s = avctx-priv_data; VP56RangeCoder *c = s-coeff_partition[mb_y (s-num_coeff_partitions-1)]; -VP8Macroblock *mb = s-macroblocks + (s-mb_height - mb_y - 1)*2; +VP8Macroblock *mb = s-macroblocks + ((s-mb_width+1)*(mb_y + 1) + 1); int i, y, mb_x, mb_xy = mb_y*s-mb_width; uint8_t *dst[3] = { curframe-data[0] + 16*mb_y*s-linesize, @@ -1757,7 +1757,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, memset(s-top_nnz, 0, s-mb_width*sizeof(*s-top_nnz)); /* Zero macroblock structures for top/top-left prediction from outside the frame. */ -memset(s-macroblocks + s-mb_height*2 - 1, 0, (s-mb_width+1)*sizeof(*s-macroblocks)); +memset(s-macroblocks_base, 0, (s-mb_width+1)*sizeof(*s-macroblocks)); // top edge of 127 for intra prediction if (!(avctx-flags CODEC_FLAG_EMU_EDGE)) { -- 1.7.7.3 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 4/4] [WIP] VP8: Implement sliced threading.
--- Jason has some ideas to improve speed. This hurts frame and single threaded by ~0.5% (expected). With two threads, I get ~30% speed increase with 4, ~45%, and with 8 ~50%. --- libavcodec/vp8.c | 394 +++--- libavcodec/vp8.h | 56 + 2 files changed, 288 insertions(+), 162 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index db4a875..56e40ea 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -4,6 +4,7 @@ * Copyright (C) 2010 David Conrad * Copyright (C) 2010 Ronald S. Bultje * Copyright (C) 2010 Jason Garrett-Glaser + * Copyright (C) 2012 Daniel Kang * * This file is part of Libav. * @@ -22,6 +23,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include pthread.h + #include libavutil/imgutils.h #include avcodec.h #include internal.h @@ -36,10 +39,13 @@ static void free_buffers(VP8Context *s) { +int i; +for (i = 0; i MAX_THREADS; i++) { +av_freep(s-thread_data[i]); +} av_freep(s-macroblocks_base); av_freep(s-filter_strength); av_freep(s-top_nnz); -av_freep(s-edge_emu_buffer); av_freep(s-top_border); s-macroblocks = NULL; @@ -107,6 +113,8 @@ static void vp8_decode_flush(AVCodecContext *avctx) static int update_dimensions(VP8Context *s, int width, int height) { +int i; + if (width != s-avctx-width || height != s-avctx-height) { if (av_image_check_size(width, height, 0, s-avctx)) @@ -125,8 +133,14 @@ static int update_dimensions(VP8Context *s, int width, int height) s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz)); s-top_border = av_mallocz((s-mb_width+1)*sizeof(*s-top_border)); -if (!s-macroblocks_base || !s-filter_strength || -!s-top_nnz || !s-top_border) +for (i = 0; i MAX_THREADS; i++) { +s-thread_data[i] = av_mallocz(sizeof(VP8ThreadData)); +s-thread_data[i]-filter_strength = av_mallocz(s-mb_width*sizeof(*s-thread_data[0]-filter_strength)); +//pthread_mutex_init(s-thread_data[i]-lock, NULL); +//pthread_cond_init(s-thread_data[i]-cond, NULL); +} + +if (!s-macroblocks_base || !s-top_nnz || 
!s->top_border) return AVERROR(ENOMEM); s->macroblocks= s->macroblocks_base + s->mb_width + 1; @@ -624,15 +638,13 @@ void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int mb_x, int keyframe) { VP8Macroblock *mb_top = mb - s->mb_width - 1; -VP8Macroblock *mb_left = mb - 1; uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb; -memcpy(mb->intra4x4_pred_mode_left, mb_left->intra4x4_pred_mode_left, 4); memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4); if (keyframe) { int x, y; uint8_t* const top = mb->intra4x4_pred_mode_top; -uint8_t* const left = mb->intra4x4_pred_mode_left; +uint8_t* const left = s->intra4x4_pred_mode_left; for (y = 0; y < 4; y++) { for (x = 0; x < 4; x++) { const uint8_t *ctx; @@ -670,7 +682,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_ } else { const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u; AV_WN32A(mb->intra4x4_pred_mode_top, modes); -AV_WN32A(mb->intra4x4_pred_mode_left, modes); +AV_WN32A( s->intra4x4_pred_mode_left, modes); } mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra); @@ -789,7 +801,7 @@ int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16], } static av_always_inline -void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, +void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c, VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9]) { int i, x, y, luma_start = 0, luma_ctx = 3; @@ -801,16 +813,16 @@ void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, nnz_pred = t_nnz[8] + l_nnz[8]; // decode DC values and do hadamard -nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred, +nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0, nnz_pred, s->qmat[segment].luma_dc_qmul); l_nnz[8] = t_nnz[8] = !!nnz; if (nnz) { nnz_total += nnz; block_dc = 1; if (nnz == 1) -s->vp8dsp.vp8_luma_dc_wht_dc(s->block, s->block_dc); 
+s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc); else -s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc); +s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc); } luma_start = 1; luma_ctx = 0; @@ -820,10 +832,10 @@ void decode_mb_coeffs(VP8Context *s
Re: [libav-devel] [PATCH 1/4] VP8: Change mb memory layout for sliced threading.
On Thu, Jun 21, 2012 at 6:52 PM, Daniel Kang <daniel.d.k...@gmail.com> wrote: --- libavcodec/vp8.c | 14 +++--- 1 files changed, 7 insertions(+), 7 deletions(-) Oops I accidentally a patch -_- Will re-send later. ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel