On date Monday 2015-05-18 13:26:56 +0200, Stefano Sabatini encoded:
> On Mon, May 18, 2015 at 1:17 PM, Hendrik Leppkes <h.lepp...@gmail.com>
> wrote:
> 
> > On Mon, May 18, 2015 at 12:37 PM, Stefano Sabatini <stefa...@gmail.com>
> > wrote:
> >
> [...]
> 
> > >
> > > I have a first hackish patch, performed some tests and I got some
> > > significant performance gains, on my iCore5 with Intel Graphics HD4000 I
> > > have now the same performance as the software decoder using DXVA2 for
> > > decoding a H.264 1920x1080 video, but using only a single thread. The
> > patch
> > > as is is a hack, since I had to modify the compilation flags to enable
> > > assembly compilation in the ffmpeg_dxva2.c file. I should probably create
> > > an optimized copy function in libavutil, comments are welcome.
> >
> > FWIW, I never saw any benefits from using a small cache over simply
> > copying directly to the destination memory, that could potentially
> > simplify this a bit.
> >
> 
> 
> > And yeah, its a huge hack, we don't want new inline assembly.
> >
> 
> The sanest approach is probably to add a function to libavutil. The
> optimized copy would then be accessible to third-party library users, with
> no assembly hacks involved.

New patch attached. It's still somewhat hackish; please advise whether you
consider this approach acceptable.
-- 
FFmpeg = Formidable and Friendly MultiPurpose Explosive Game
>From f3b4e77dd9dd299aba8f4fa83625d2b61b243c3c Mon Sep 17 00:00:00 2001
From: Stefano Sabatini <stefa...@gmail.com>
Date: Fri, 15 May 2015 18:58:17 +0200
Subject: [PATCH] lavu/imgutils: add av_image_copy_plane_from_uswc() function.

This function adds support for optimized copies from GPU to CPU memory.

Based on code from vlc dxva2.c, commit 62107e56 by Laurent Aimar
<fen...@videolan.org>.

TODO: fix integration with the build system, bump micro

Signed-off-by: Stefano Sabatini <stefa...@gmail.com>
---
 libavutil/imgutils.c          |  14 ++++++
 libavutil/imgutils.h          |  18 +++++++
 libavutil/imgutils_internal.h |  29 +++++++++++
 libavutil/x86/Makefile        |   1 +
 libavutil/x86/imgutils.c      | 109 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 171 insertions(+)
 create mode 100644 libavutil/imgutils_internal.h
 create mode 100644 libavutil/x86/imgutils.c

diff --git a/libavutil/imgutils.c b/libavutil/imgutils.c
index ef0e671..e538c75 100644
--- a/libavutil/imgutils.c
+++ b/libavutil/imgutils.c
@@ -30,6 +30,7 @@
 #include "mathematics.h"
 #include "pixdesc.h"
 #include "rational.h"
+#include "imgutils_internal.h"
 
 void av_image_fill_max_pixsteps(int max_pixsteps[4], int max_pixstep_comps[4],
                                 const AVPixFmtDescriptor *pixdesc)
@@ -405,3 +406,16 @@ int av_image_copy_to_buffer(uint8_t *dst, int dst_size,
 
     return size;
 }
+
+void av_image_copy_plane_from_uswc(uint8_t *dst, size_t dst_linesize,
+                                   const uint8_t *src, size_t src_linesize,
+                                   unsigned bytewidth, unsigned height,
+                                   unsigned cpu_flags)
+{
+    /* config.h defines HAVE_* as 0 or 1, so test the value, not definedness */
+#if HAVE_SSSE3
+    ff_image_copy_plane_from_uswc_x86(dst, dst_linesize, src, src_linesize, bytewidth, height, cpu_flags);
+#else
+    av_image_copy_plane(dst, dst_linesize, src, src_linesize, bytewidth, height);
+#endif
+}
diff --git a/libavutil/imgutils.h b/libavutil/imgutils.h
index 23282a3..82c3826 100644
--- a/libavutil/imgutils.h
+++ b/libavutil/imgutils.h
@@ -111,6 +111,24 @@ void av_image_copy_plane(uint8_t       *dst, int dst_linesize,
                          int bytewidth, int height);
 
 /**
+ * Copy image plane from src to dst, similar to av_image_copy_plane().
+ * src is expected to be a USWC ("Uncacheable Speculative Write
+ * Combining") buffer, as used by some video surfaces. The copy is
+ * really efficient only when SSE4.1 is available.
+ *
+ * The x86 path asserts that dst is 16-byte aligned and that dst_linesize
+ * is a multiple of 16. Without SSSE3 support compiled in, this function
+ * is equivalent to av_image_copy_plane().
+ *
+ * @param cpu_flags as returned by av_get_cpu_flags()
+ * @see av_image_copy_plane()
+ */
+void av_image_copy_plane_from_uswc(uint8_t *dst, size_t dst_linesize,
+                                   const uint8_t *src, size_t src_linesize,
+                                   unsigned bytewidth, unsigned height,
+                                   unsigned cpu_flags);
+
+/**
  * Copy image in src_data to dst_data.
  *
  * @param dst_linesizes linesizes for the image in dst_data
diff --git a/libavutil/imgutils_internal.h b/libavutil/imgutils_internal.h
new file mode 100644
index 0000000..16ed977
--- /dev/null
+++ b/libavutil/imgutils_internal.h
@@ -0,0 +1,29 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_IMGUTILS_INTERNAL_H
+#define AVUTIL_IMGUTILS_INTERNAL_H
+
+#include "imgutils.h"
+
+void ff_image_copy_plane_from_uswc_x86(uint8_t *dst, size_t dst_linesize,
+				       const uint8_t *src, size_t src_linesize,
+				       unsigned bytewidth, unsigned height,
+				       unsigned cpu_flags);
+
+#endif /* AVUTIL_IMGUTILS_INTERNAL_H */
diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
index eb70a62..a719c00 100644
--- a/libavutil/x86/Makefile
+++ b/libavutil/x86/Makefile
@@ -1,5 +1,6 @@
 OBJS += x86/cpu.o                                                       \
         x86/float_dsp_init.o                                            \
+        x86/imgutils.o                                                  \
         x86/lls_init.o                                                  \
 
 OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o                      \
diff --git a/libavutil/x86/imgutils.c b/libavutil/x86/imgutils.c
new file mode 100644
index 0000000..91c7a42
--- /dev/null
+++ b/libavutil/x86/imgutils.c
@@ -0,0 +1,109 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
+
+#include "libavutil/avassert.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/imgutils_internal.h"
+
+#ifdef HAVE_SSE2
+/* Copy 16 (COPY16) or 64 (COPY64) bytes from srcp to dstp through XMM
+ * registers. load and store are the SSE2 (or later) move mnemonics to
+ * emit, passed as string literals, e.g. "movdqu" or "movntdqa". */
+#define COPY16(dstp, srcp, load, store) \
+    __asm__ volatile (                      \
+        load "  0(%[src]), %%xmm1\n"    \
+        store " %%xmm1,    0(%[dst])\n" \
+        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")
+
+#define COPY64(dstp, srcp, load, store) \
+    __asm__ volatile (                      \
+        load "  0(%[src]), %%xmm1\n"    \
+        load " 16(%[src]), %%xmm2\n"    \
+        load " 32(%[src]), %%xmm3\n"    \
+        load " 48(%[src]), %%xmm4\n"    \
+        store " %%xmm1,    0(%[dst])\n" \
+        store " %%xmm2,   16(%[dst])\n" \
+        store " %%xmm3,   32(%[dst])\n" \
+        store " %%xmm4,   48(%[dst])\n" \
+        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
+#endif
+
+/* Copy a plane out of USWC memory with SSE2 moves (SSE4.1 streaming loads if
+ * available); uses av_image_copy_plane() when SSE2 is absent or disabled. */
+void ff_image_copy_plane_from_uswc_x86(uint8_t *dst, size_t dst_linesize,
+                                       const uint8_t *src, size_t src_linesize,
+                                       unsigned bytewidth, unsigned height,
+                                       unsigned cpu_flags)
+{
+#if HAVE_INLINE_ASM && HAVE_SSE2
+    if (!(cpu_flags & AV_CPU_FLAG_SSE2)) {
+        av_image_copy_plane(dst, dst_linesize, src, src_linesize, bytewidth, height);
+        return;
+    }
+    /* The aligned stores (movdqa) into dst rely on this. */
+    av_assert0(((intptr_t)dst & 0x0f) == 0 && (dst_linesize & 0x0f) == 0);
+
+    __asm__ volatile ("mfence");
+    for (unsigned y = 0; y < height; y++) {
+        /* Bytes from src up to its next 16-byte boundary. */
+        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
+        unsigned x = 0;
+
+        if (unaligned && bytewidth >= 16) { /* head must fit in the row */
+            COPY16(dst, src, "movdqu", "movdqa");
+            x = unaligned; /* src + x is 16-byte aligned from here on */
+        }
+#if HAVE_SSE4
+        if (cpu_flags & AV_CPU_FLAG_SSE4) { /* movntdqa needs SSE4.1 */
+            if (!unaligned) {
+                for (; x + 63 < bytewidth; x += 64)
+                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
+            } else {
+                for (; x + 63 < bytewidth; x += 64)
+                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
+            }
+        } else
+#endif
+        if (!unaligned) {
+            for (; x + 63 < bytewidth; x += 64)
+                COPY64(&dst[x], &src[x], "movdqa", "movdqa");
+        } else {
+            for (; x + 63 < bytewidth; x += 64)
+                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
+        }
+        for (; x < bytewidth; x++) /* scalar tail */
+            dst[x] = src[x];
+        src += src_linesize;
+        dst += dst_linesize;
+    }
+    __asm__ volatile ("mfence");
+#else
+    av_image_copy_plane(dst, dst_linesize, src, src_linesize, bytewidth, height);
+#endif
+}
-- 
1.9.1

>From 337a3ecc47873a6e7a279509f2f5a00c7cc5726f Mon Sep 17 00:00:00 2001
From: Stefano Sabatini <stefa...@gmail.com>
Date: Thu, 28 May 2015 19:26:16 +0200
Subject: [PATCH] ffmpeg_dxva2: use optimized copy function from GPU to CPU

This should allow significantly improved performance.
---
 ffmpeg_dxva2.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/ffmpeg_dxva2.c b/ffmpeg_dxva2.c
index 6b20195..34e4643 100644
--- a/ffmpeg_dxva2.c
+++ b/ffmpeg_dxva2.c
@@ -258,7 +258,9 @@ static int dxva2_retrieve_data(AVCodecContext *s, AVFrame *frame)
     D3DLOCKED_RECT     LockedRect;
     HRESULT            hr;
     int                ret;
+    int cpu_flags;
 
+    cpu_flags = av_get_cpu_flags();
     IDirect3DSurface9_GetDesc(surface, &surfaceDesc);
 
     ctx->tmp_frame->width  = frame->width;
@@ -275,14 +277,15 @@ static int dxva2_retrieve_data(AVCodecContext *s, AVFrame *frame)
         return AVERROR_UNKNOWN;
     }
 
-    av_image_copy_plane(ctx->tmp_frame->data[0], ctx->tmp_frame->linesize[0],
-                        (uint8_t*)LockedRect.pBits,
-                        LockedRect.Pitch, frame->width, frame->height);
-
-    av_image_copy_plane(ctx->tmp_frame->data[1], ctx->tmp_frame->linesize[1],
-                        (uint8_t*)LockedRect.pBits + LockedRect.Pitch * surfaceDesc.Height,
-                        LockedRect.Pitch, frame->width, frame->height / 2);
+   av_image_copy_plane_from_uswc(ctx->tmp_frame->data[0], ctx->tmp_frame->linesize[0],
+                                 (uint8_t*)LockedRect.pBits,
+                                 LockedRect.Pitch, frame->width, frame->height,
+                                 cpu_flags);
 
+    av_image_copy_plane_from_uswc(ctx->tmp_frame->data[1], ctx->tmp_frame->linesize[1],
+                                  (uint8_t*)LockedRect.pBits + LockedRect.Pitch * surfaceDesc.Height,
+                                  LockedRect.Pitch, frame->width, frame->height / 2,
+                                cpu_flags);
     IDirect3DSurface9_UnlockRect(surface);
 
     ret = av_frame_copy_props(ctx->tmp_frame, frame);
-- 
1.9.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to