On 5/1/2018 5:02 AM, Paul B Mahol wrote:
> Specifically for yuv444, yuv422, yuv420 format when main stream has no alpha,
> and alpha is straight.
>
> Signed-off-by: Paul B Mahol <one...@gmail.com>
> ---
>  libavfilter/vf_overlay.c          | 75 +++++------------
>  libavfilter/vf_overlay.h          | 85 +++++++++++++++++++++
>  libavfilter/x86/Makefile          | 2 +
>  libavfilter/x86/vf_overlay.asm    | 157 ++++++++++++++++++++++++++++++++++++++
>  libavfilter/x86/vf_overlay_init.c | 63 +++++++++++++++
>  5 files changed, 326 insertions(+), 56 deletions(-)
>  create mode 100644 libavfilter/vf_overlay.h
>  create mode 100644 libavfilter/x86/vf_overlay.asm
>  create mode 100644 libavfilter/x86/vf_overlay_init.c
[...] > diff --git a/libavfilter/x86/vf_overlay.asm b/libavfilter/x86/vf_overlay.asm > new file mode 100644 > index 0000000000..d639cce9e5 > --- /dev/null > +++ b/libavfilter/x86/vf_overlay.asm > @@ -0,0 +1,157 @@ > +;***************************************************************************** > +;* x86-optimized functions for overlay filter > +;* > +;* Copyright (C) 2018 Paul B Mahol > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > +;***************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION_RODATA > + > +pw_128: times 8 dw 128 > +pw_255: times 8 dw 255 > +pw_257: times 8 dw 257 > +pw_65280: times 8 dw 65280 > + > +SECTION .text > + > +INIT_XMM sse4 > +cglobal overlay_row_44, 6, 8, 6, 0, d, da, s, a, w, alinesize, r, x You're not using the alinesize parameter here. Make this 5, 7, 8 and use that reg for r. That way this can work on x86_32. Also, pointless 0 after xmm reg amount. Just remove it. 
> + xor xq, xq > + movsxdifnidn wq, wd > + mov rq, wq > + and rq, mmsize/2 - 1 > + cmp wq, mmsize/2 > + jl .end > + sub wq, rq > + mova m3, [pw_255] > + mova m4, [pw_128] > + mova m5, [pw_257] > + .loop0: > + pmovzxbw m0, [sq+xq] > + pmovzxbw m2, [aq+xq] > + pmovzxbw m1, [dq+xq] > + pmullw m0, m2 > + pxor m2, m3 > + pmullw m1, m2 > + paddw m0, m4 > + paddw m0, m1 > + pmulhuw m0, m5 > + packuswb m0, m0 > + movq [dq+xq], m0 > + add xq, mmsize/2 > + cmp xq, wq > + jl .loop0 > + > + .end: > + mov eax, xd > + RET > + > +INIT_XMM sse4 > +cglobal overlay_row_22, 6, 8, 8, 0, d, da, s, a, w, al, r, x Same here with al. > + xor xq, xq > + movsxdifnidn wq, wd > + sub wq, 1 > + mov rq, wq > + and rq, mmsize/2 - 1 > + cmp wq, mmsize/2 > + jl .end > + sub wq, rq > + mova m3, [pw_255] > + mova m4, [pw_128] > + mova m5, [pw_257] > + mova m7, [pw_65280] > + .loop0: > + pmovzxbw m0, [sq+xq] > + movu m2, [aq+2*xq] > + pand m2, m3 > + movu m6, [aq+2*xq] > + pand m6, m7 > + psrlw m6, 8 > + paddw m2, m6 > + psrlw m2, 1 > + movu m6, [aq+2*xq] > + pand m6, m3 > + paddw m2, m6 > + psrlw m2, 1 > + pmovzxbw m1, [dq+xq] > + pmullw m0, m2 > + pxor m2, m3 > + pmullw m1, m2 > + paddw m0, m4 > + paddw m0, m1 > + pmulhuw m0, m5 > + packuswb m0, m0 > + movq [dq+xq], m0 > + add xq, mmsize/2 > + cmp xq, wq > + jl .loop0 > + > + .end: > + mov eax, xd > + RET > + > +INIT_XMM sse4 > +cglobal overlay_row_20, 6, 8, 8, 0, d, da, s, a, w, al, r, x > + xor xq, xq > + movsxdifnidn wq, wd > + sub wq, 1 > + mov rq, wq > + and rq, mmsize/2 - 1 > + cmp wq, mmsize/2 > + jl .end > + sub wq, rq > + mov daq, aq > + add daq, alq Use al straight from memory here, and use the gpr for r, much like above. 
> + mova m3, [pw_255] > + mova m4, [pw_128] > + mova m5, [pw_257] > + mova m7, [pw_65280] > + .loop0: > + pmovzxbw m0, [sq+xq] > + movu m2, [aq+2*xq] > + pand m2, m3 > + movu m6, [aq+2*xq] > + pand m6, m7 > + psrlw m6, 8 > + paddw m2, m6 > + movu m6, [daq+2*xq] > + pand m6, m3 > + paddw m2, m6 > + movu m6, [daq+2*xq] > + pand m6, m7 > + psrlw m6, 8 > + paddw m2, m6 > + psrlw m2, 2 > + pmovzxbw m1, [dq+xq] > + pmullw m0, m2 > + pxor m2, m3 > + pmullw m1, m2 > + paddw m0, m4 > + paddw m0, m1 > + pmulhuw m0, m5 > + packuswb m0, m0 > + movq [dq+xq], m0 > + add xq, mmsize/2 > + cmp xq, wq > + jl .loop0 > + > + .end: > + mov eax, xd > + RET > diff --git a/libavfilter/x86/vf_overlay_init.c > b/libavfilter/x86/vf_overlay_init.c > new file mode 100644 > index 0000000000..865fd035f6 > --- /dev/null > +++ b/libavfilter/x86/vf_overlay_init.c > @@ -0,0 +1,63 @@ > +/* > + * Copyright (c) 2018 Paul B Mahol > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#include "libavutil/attributes.h" > +#include "libavutil/cpu.h" > +#include "libavutil/x86/cpu.h" > +#include "libavfilter/vf_overlay.h" > + > +int ff_overlay_row_44_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a, > + int w, ptrdiff_t alinesize); > + > +int ff_overlay_row_20_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a, > + int w, ptrdiff_t alinesize); > + > +int ff_overlay_row_22_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a, > + int w, ptrdiff_t alinesize); > + > +av_cold void ff_overlay_init_x86(OverlayContext *s, int format, int > alpha_format, int main_has_alpha) > +{ > + int cpu_flags = av_get_cpu_flags(); > + > + if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) && > + (format == OVERLAY_FORMAT_YUV444 || > + format == OVERLAY_FORMAT_GBRP) && > + alpha_format == 0 && main_has_alpha == 0) { > + s->blend_row[0] = ff_overlay_row_44_sse4; > + s->blend_row[1] = ff_overlay_row_44_sse4; > + s->blend_row[2] = ff_overlay_row_44_sse4; > + } > + > + if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) && > + (format == OVERLAY_FORMAT_YUV420) && > + alpha_format == 0 && main_has_alpha == 0) { > + s->blend_row[0] = ff_overlay_row_44_sse4; > + s->blend_row[1] = ff_overlay_row_20_sse4; > + s->blend_row[2] = ff_overlay_row_20_sse4; > + } > + > + if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) && > + (format == OVERLAY_FORMAT_YUV422) && > + alpha_format == 0 && main_has_alpha == 0) { > + s->blend_row[0] = ff_overlay_row_44_sse4; > + s->blend_row[1] = ff_overlay_row_22_sse4; > + s->blend_row[2] = ff_overlay_row_22_sse4; > + } You can remove all the x86_64 checks after the changes described above. 
> +} > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel