On 28/05/15 2:39 PM, Stefano Sabatini wrote: > From f3b4e77dd9dd299aba8f4fa83625d2b61b243c3c Mon Sep 17 00:00:00 2001 > From: Stefano Sabatini <stefa...@gmail.com> > Date: Fri, 15 May 2015 18:58:17 +0200 > Subject: [PATCH] lavu/imgutils: add av_image_copy_plane_from_uswc() function. > > This function allows support to optimized GPU to CPU. > > Based on code from vlc dxva2.c, commit 62107e56 by Laurent Aimar > <fen...@videolan.org>. > > TODO: fix integration with the build system, bump micro > > Signed-off-by: Stefano Sabatini <stefa...@gmail.com> > --- > libavutil/imgutils.c | 14 ++++++ > libavutil/imgutils.h | 18 +++++++ > libavutil/imgutils_internal.h | 29 +++++++++++ > libavutil/x86/Makefile | 1 + > libavutil/x86/imgutils.c | 109 > ++++++++++++++++++++++++++++++++++++++++++ > 5 files changed, 171 insertions(+) > create mode 100644 libavutil/imgutils_internal.h > create mode 100644 libavutil/x86/imgutils.c > > diff --git a/libavutil/imgutils.c b/libavutil/imgutils.c > index ef0e671..e538c75 100644 > --- a/libavutil/imgutils.c > +++ b/libavutil/imgutils.c > @@ -30,6 +30,7 @@ > #include "mathematics.h" > #include "pixdesc.h" > #include "rational.h" > +#include "imgutils_internal.h" > > void av_image_fill_max_pixsteps(int max_pixsteps[4], int > max_pixstep_comps[4], > const AVPixFmtDescriptor *pixdesc) > @@ -405,3 +406,16 @@ int av_image_copy_to_buffer(uint8_t *dst, int dst_size, > > return size; > } > + > +void av_image_copy_plane_from_uswc(uint8_t *dst, size_t dst_linesize, > + const uint8_t *src, size_t src_linesize, > + unsigned bytewidth, unsigned height, > + unsigned cpu_flags) > +{ > +#ifndef HAVE_SSSE3
All HAVE_ are always defined to either 0 or 1. Nonetheless, this kind of check does not belong outside of arch folders. You should check for ARCH_X86 to call functions in the x86/ folder. See lavc/lavfi for examples. > + av_unused(cpu_flags); > + av_image_copy_plane(dst, dst_linesize, src, src_linesize, bytewidth, > height); > +#else > + ff_image_copy_plane_from_uswc_x86(dst, dst_linesize, src, src_linesize, > bytewidth, height, cpu_flags); > +#endif > +} > diff --git a/libavutil/imgutils.h b/libavutil/imgutils.h > index 23282a3..82c3826 100644 > --- a/libavutil/imgutils.h > +++ b/libavutil/imgutils.h > @@ -111,6 +111,24 @@ void av_image_copy_plane(uint8_t *dst, int > dst_linesize, > int bytewidth, int height); > > /** > + * Copy image plane from src to dst, similar to av_image_copy_plane(). > + * src must be an USWC buffer. > + * It performs optimized copy from "Uncacheable Speculative Write > + * Combining" memory as used by some video surface. > + * It is really efficient only when SSE4.1 is available. > + * > + * In case the target CPU does not support USWC caching this function > + * will be equivalent to av_image_copy_plane(). > + * > + * @param cpu_flags as returned by av_get_cpu_flags() > + * @see av_image_copy_plane() > + */ > +void av_image_copy_plane_from_uswc(uint8_t *dst, size_t dst_linesize, > + const uint8_t *src, size_t src_linesize, > + unsigned bytewidth, unsigned height, > + unsigned cpu_flags); > + > +/** > * Copy image in src_data to dst_data. > * > * @param dst_linesizes linesizes for the image in dst_data > diff --git a/libavutil/imgutils_internal.h b/libavutil/imgutils_internal.h > new file mode 100644 > index 0000000..16ed977 > --- /dev/null > +++ b/libavutil/imgutils_internal.h > @@ -0,0 +1,29 @@ > +/* > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#ifndef AVUTIL_IMGUTILS_INTERNAL_H > +#define AVUTIL_IMGUTILS_INTERNAL_H > + > +#include "imgutils.h" > + > +void ff_image_copy_plane_from_uswc_x86(uint8_t *dst, size_t dst_linesize, > + const uint8_t *src, size_t src_linesize, > + unsigned bytewidth, unsigned height, > + unsigned cpu_flags); > + > +#endif /* AVUTIL_IMGUTILS_INTERNAL_H */ > diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile > index eb70a62..a719c00 100644 > --- a/libavutil/x86/Makefile > +++ b/libavutil/x86/Makefile > @@ -1,5 +1,6 @@ > OBJS += x86/cpu.o \ > x86/float_dsp_init.o \ > + x86/imgutils.o \ > x86/lls_init.o \ > > OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o \ > diff --git a/libavutil/x86/imgutils.c b/libavutil/x86/imgutils.c > new file mode 100644 > index 0000000..91c7a42 > --- /dev/null > +++ b/libavutil/x86/imgutils.c > @@ -0,0 +1,109 @@ > +/* > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#include <inttypes.h> > +#include "config.h" > +#include "libavutil/attributes.h" > +#include "libavutil/avassert.h" > +#include "libavutil/intreadwrite.h" > +#include "libavutil/x86/asm.h" > +#include "libavutil/x86/cpu.h" > +#include "libavutil/cpu.h" > +#include "libavutil/pixdesc.h" > + > +#include "libavutil/avassert.h" > +#include "libavutil/x86/asm.h" > +#include "libavutil/imgutils.h" > +#include "libavutil/imgutils_internal.h" > + > +#ifdef HAVE_SSE2 > +/* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 > instruction > + * load and storing data with the SSE>=2 instruction store. > + */ > +#define COPY16(dstp, srcp, load, store) \ > + __asm__ volatile ( \ > + load " 0(%[src]), %%xmm1\n" \ > + store " %%xmm1, 0(%[dst])\n" \ > + : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1") > + > +#define COPY64(dstp, srcp, load, store) \ > + __asm__ volatile ( \ > + load " 0(%[src]), %%xmm1\n" \ > + load " 16(%[src]), %%xmm2\n" \ > + load " 32(%[src]), %%xmm3\n" \ > + load " 48(%[src]), %%xmm4\n" \ > + store " %%xmm1, 0(%[dst])\n" \ > + store " %%xmm2, 16(%[dst])\n" \ > + store " %%xmm3, 32(%[dst])\n" \ > + store " %%xmm4, 48(%[dst])\n" \ > + : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", > "xmm3", "xmm4") > +#endif As already mentioned, this should be done in nasm/yasm syntax. Also, any reason you're not using more xmm registers to reduce the amount of loops? or just unroll things a bit even if you still use only four (you skipped xmm0 for that matter). > + > +#define AV_CPU_SSE4() ((cpu_flags & AV_CPU_FLAG_SSE4) != 0) "!= 0" is unnecessary. And you could use the helper macros from libavutil/x86/cpu.h. > +#define AV_CPU_SSSE3() ((cpu_flags & AV_CPU_FLAG_SSSE3) != 0) > +#define AV_CPU_SSE2() ((cpu_flags & AV_CPU_FLAG_SSE2) != 0) > + > +void ff_image_copy_plane_from_uswc_x86(uint8_t *dst, size_t dst_linesize, > + const uint8_t *src, size_t src_linesize, > + unsigned bytewidth, unsigned height, > + unsigned cpu_flags) > +{ > +#ifndef HAVE_SSSE3 > + av_unused(cpu_flags); > + return av_copy_plane(dst, dst_linesize, src, src_linesize, bytewidth, > height); > +#endif > + > + av_assert0(((intptr_t)dst & 0x0f) == 0 && (dst_linesize & 0x0f) == 0); > + > + __asm__ volatile ("mfence"); > + > + for (unsigned y = 0; y < height; y++) { > + const unsigned unaligned = (-(uintptr_t)src) & 0x0f; > + unsigned x = unaligned; > + > +#ifdef HAVE_SSE42 > + if (AV_CPU_SSE4()) { > + if (!unaligned) { > + for (; x+63 < bytewidth; x += 64) > + COPY64(&dst[x], &src[x], "movntdqa", "movdqa"); > + } else { > + COPY16(dst, src, "movdqu", "movdqa"); > + for (; x+63 < bytewidth; x += 64) > + COPY64(&dst[x], &src[x], "movntdqa", "movdqu"); > + } > + } else > +#endif > + { > + if (!unaligned) { > + for (; x+63 < bytewidth; x += 64) > + COPY64(&dst[x], &src[x], "movdqa", "movdqa"); > + } else { > + COPY16(dst, src, "movdqu", "movdqa"); > + for (; x+63 < bytewidth; x += 64) > + COPY64(&dst[x], &src[x], "movdqa", "movdqu"); > + } > + } > + > + for (; x < bytewidth; x++) > + dst[x] = src[x]; > + > + src += src_linesize; > + dst += dst_linesize; > + } > + __asm__ volatile ("mfence"); > +} > -- 1.9.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel