---
 libavfilter/x86/Makefile            |    1 +
 libavfilter/x86/vf_gradfun.c        |  168 +++++------------------------------
 libavfilter/x86/vf_gradfun_yasm.asm |  144 ++++++++++++++++++++++++++++++
 3 files changed, 165 insertions(+), 148 deletions(-)
 create mode 100644 libavfilter/x86/vf_gradfun_yasm.asm

diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 59cefe8..b50b373 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -3,6 +3,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER)                 += x86/vf_hqdn3d_init.o
 OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o
 OBJS-$(CONFIG_YADIF_FILTER)                  += x86/vf_yadif_init.o
 
+YASM-OBJS-$(CONFIG_GRADFUN_FILTER)           += x86/vf_gradfun_yasm.o
 YASM-OBJS-$(CONFIG_HQDN3D_FILTER)            += x86/vf_hqdn3d.o
 YASM-OBJS-$(CONFIG_VOLUME_FILTER)            += x86/af_volume.o
 YASM-OBJS-$(CONFIG_YADIF_FILTER)             += x86/vf_yadif.o
diff --git a/libavfilter/x86/vf_gradfun.c b/libavfilter/x86/vf_gradfun.c
index e571af7..3bac16d 100644
--- a/libavfilter/x86/vf_gradfun.c
+++ b/libavfilter/x86/vf_gradfun.c
@@ -24,12 +24,10 @@
 #include "libavutil/x86/asm.h"
 #include "libavfilter/gradfun.h"
 
-#if HAVE_INLINE_ASM
-
-DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = 
{0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F};
-DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = 
{0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
-
-#if HAVE_MMXEXT_INLINE
+#if HAVE_YASM
+void ff_gradfun_filter_line_mmxext(intptr_t x, uint8_t *dst, uint8_t *src,
+                                   uint16_t *dc, int thresh,
+                                   const uint16_t *dithers);
 static void gradfun_filter_line_mmxext(uint8_t *dst, uint8_t *src, uint16_t 
*dc,
                                        int width, int thresh,
                                        const uint16_t *dithers)
@@ -41,72 +39,13 @@ static void gradfun_filter_line_mmxext(uint8_t *dst, uint8_t *src, uint16_t *dc,
         width = x;
     }
     x = -width;
-    __asm__ volatile(
-        "movd          %4, %%mm5 \n"
-        "pxor       %%mm7, %%mm7 \n"
-        "pshufw $0, %%mm5, %%mm5 \n"
-        "movq          %6, %%mm6 \n"
-        "movq          (%5), %%mm3 \n"
-        "movq         8(%5), %%mm4 \n"
-
-        "1: \n"
-        "movd     (%2,%0), %%mm0 \n"
-        "movd     (%3,%0), %%mm1 \n"
-        "punpcklbw  %%mm7, %%mm0 \n"
-        "punpcklwd  %%mm1, %%mm1 \n"
-        "psllw         $7, %%mm0 \n"
-        "pxor       %%mm2, %%mm2 \n"
-        "psubw      %%mm0, %%mm1 \n" // delta = dc - pix
-        "psubw      %%mm1, %%mm2 \n"
-        "pmaxsw     %%mm1, %%mm2 \n"
-        "pmulhuw    %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
-        "psubw      %%mm6, %%mm2 \n"
-        "pminsw     %%mm7, %%mm2 \n" // m = -max(0, 127-m)
-        "pmullw     %%mm2, %%mm2 \n"
-        "paddw      %%mm3, %%mm0 \n" // pix += dither
-        "psllw         $2, %%mm1 \n" // m = m*m*delta >> 14
-        "pmulhw     %%mm2, %%mm1 \n"
-        "paddw      %%mm1, %%mm0 \n" // pix += m
-        "psraw         $7, %%mm0 \n"
-        "packuswb   %%mm0, %%mm0 \n"
-        "movd       %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
-        "add           $4, %0 \n"
-        "jnl 2f \n"
-
-        "movd     (%2,%0), %%mm0 \n"
-        "movd     (%3,%0), %%mm1 \n"
-        "punpcklbw  %%mm7, %%mm0 \n"
-        "punpcklwd  %%mm1, %%mm1 \n"
-        "psllw         $7, %%mm0 \n"
-        "pxor       %%mm2, %%mm2 \n"
-        "psubw      %%mm0, %%mm1 \n" // delta = dc - pix
-        "psubw      %%mm1, %%mm2 \n"
-        "pmaxsw     %%mm1, %%mm2 \n"
-        "pmulhuw    %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
-        "psubw      %%mm6, %%mm2 \n"
-        "pminsw     %%mm7, %%mm2 \n" // m = -max(0, 127-m)
-        "pmullw     %%mm2, %%mm2 \n"
-        "paddw      %%mm4, %%mm0 \n" // pix += dither
-        "psllw         $2, %%mm1 \n" // m = m*m*delta >> 14
-        "pmulhw     %%mm2, %%mm1 \n"
-        "paddw      %%mm1, %%mm0 \n" // pix += m
-        "psraw         $7, %%mm0 \n"
-        "packuswb   %%mm0, %%mm0 \n"
-        "movd       %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
-        "add           $4, %0 \n"
-        "jl 1b \n"
-
-        "2: \n"
-        "emms \n"
-        :"+r"(x)
-        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
-         "rm"(thresh), "r"(dithers), "m"(*pw_7f)
-        :"memory"
-    );
+    ff_gradfun_filter_line_mmxext(x, dst+width, src+width, dc+width/2,
+                                  thresh, dithers);
 }
-#endif
 
-#if HAVE_SSSE3_INLINE
+void ff_gradfun_filter_line_ssse3(intptr_t x, uint8_t *dst, uint8_t *src,
+                                  uint16_t *dc, int thresh,
+                                  const uint16_t *dithers);
 static void gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t 
*dc, int width, int thresh, const uint16_t *dithers)
 {
     intptr_t x;
@@ -117,100 +56,33 @@ static void gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc,
         width = x;
     }
     x = -width;
-    __asm__ volatile(
-        "movd           %4, %%xmm5 \n"
-        "pxor       %%xmm7, %%xmm7 \n"
-        "pshuflw $0,%%xmm5, %%xmm5 \n"
-        "movdqa         %6, %%xmm6 \n"
-        "punpcklqdq %%xmm5, %%xmm5 \n"
-        "movdqa         %5, %%xmm4 \n"
-        "1: \n"
-        "movq      (%2,%0), %%xmm0 \n"
-        "movq      (%3,%0), %%xmm1 \n"
-        "punpcklbw  %%xmm7, %%xmm0 \n"
-        "punpcklwd  %%xmm1, %%xmm1 \n"
-        "psllw          $7, %%xmm0 \n"
-        "psubw      %%xmm0, %%xmm1 \n" // delta = dc - pix
-        "pabsw      %%xmm1, %%xmm2 \n"
-        "pmulhuw    %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16
-        "psubw      %%xmm6, %%xmm2 \n"
-        "pminsw     %%xmm7, %%xmm2 \n" // m = -max(0, 127-m)
-        "pmullw     %%xmm2, %%xmm2 \n"
-        "psllw          $2, %%xmm1 \n"
-        "paddw      %%xmm4, %%xmm0 \n" // pix += dither
-        "pmulhw     %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14
-        "paddw      %%xmm1, %%xmm0 \n" // pix += m
-        "psraw          $7, %%xmm0 \n"
-        "packuswb   %%xmm0, %%xmm0 \n"
-        "movq       %%xmm0, (%1,%0) \n" // dst = clip(pix>>7)
-        "add            $8, %0 \n"
-        "jl 1b \n"
-        :"+&r"(x)
-        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
-         "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
-        :"memory"
-    );
+    ff_gradfun_filter_line_ssse3(x, dst+width, src+width, dc+width/2,
+                                 thresh, dithers);
 }
-#endif /* HAVE_SSSE3_INLINE */
 
-#if HAVE_SSE2_INLINE
+void ff_gradfun_blur_line_movdqa_sse2(intptr_t x, uint16_t *buf, uint16_t 
*buf1, uint16_t *dc, uint8_t *src1, uint8_t *src2);
+void ff_gradfun_blur_line_movdqu_sse2(intptr_t x, uint16_t *buf, uint16_t 
*buf1, uint16_t *dc, uint8_t *src1, uint8_t *src2);
 static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t 
*buf1, uint8_t *src, int src_linesize, int width)
 {
-#define BLURV(load)\
-    intptr_t x = -2*width;\
-    __asm__ volatile(\
-        "movdqa %6, %%xmm7 \n"\
-        "1: \n"\
-        load"   (%4,%0), %%xmm0 \n"\
-        load"   (%5,%0), %%xmm1 \n"\
-        "movdqa  %%xmm0, %%xmm2 \n"\
-        "movdqa  %%xmm1, %%xmm3 \n"\
-        "psrlw       $8, %%xmm0 \n"\
-        "psrlw       $8, %%xmm1 \n"\
-        "pand    %%xmm7, %%xmm2 \n"\
-        "pand    %%xmm7, %%xmm3 \n"\
-        "paddw   %%xmm1, %%xmm0 \n"\
-        "paddw   %%xmm3, %%xmm2 \n"\
-        "paddw   %%xmm2, %%xmm0 \n"\
-        "paddw  (%2,%0), %%xmm0 \n"\
-        "movdqa (%1,%0), %%xmm1 \n"\
-        "movdqa  %%xmm0, (%1,%0) \n"\
-        "psubw   %%xmm1, %%xmm0 \n"\
-        "movdqa  %%xmm0, (%3,%0) \n"\
-        "add        $16, %0 \n"\
-        "jl 1b \n"\
-        :"+&r"(x)\
-        :"r"(buf+width),\
-         "r"(buf1+width),\
-         "r"(dc+width),\
-         "r"(src+width*2),\
-         "r"(src+width*2+src_linesize),\
-         "m"(*pw_ff)\
-        :"memory"\
-    );
+    intptr_t x = -2*width; /* negative byte offset from the end-of-line pointers passed below; the asm adds 16 per iteration and loops while it is < 0 */
     if (((intptr_t) src | src_linesize) & 15) {
-        BLURV("movdqu");
+        ff_gradfun_blur_line_movdqu_sse2(x, buf+width, buf1+width, dc+width, /* variant that loads the two source rows with movdqu */
+                                         src+width*2, 
src+width*2+src_linesize);
     } else {
-        BLURV("movdqa");
+        ff_gradfun_blur_line_movdqa_sse2(x, buf+width, buf1+width, dc+width, /* variant that loads the two source rows with movdqa */
+                                         src+width*2, 
src+width*2+src_linesize);
     }
 }
-#endif /* HAVE_SSE2_INLINE */
-
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_YASM */
 
 av_cold void ff_gradfun_init_x86(GradFunContext *gf)
 {
-#if HAVE_MMXEXT_INLINE
+#if HAVE_YASM
     int cpu_flags = av_get_cpu_flags();
-
     if (cpu_flags & AV_CPU_FLAG_MMXEXT)
         gf->filter_line = gradfun_filter_line_mmxext;
-#endif
-#if HAVE_SSSE3_INLINE
     if (cpu_flags & AV_CPU_FLAG_SSSE3)
         gf->filter_line = gradfun_filter_line_ssse3;
-#endif
-#if HAVE_SSE2_INLINE
     if (cpu_flags & AV_CPU_FLAG_SSE2)
         gf->blur_line = gradfun_blur_line_sse2;
 #endif
diff --git a/libavfilter/x86/vf_gradfun_yasm.asm b/libavfilter/x86/vf_gradfun_yasm.asm
new file mode 100644
index 0000000..e1737dd
--- /dev/null
+++ b/libavfilter/x86/vf_gradfun_yasm.asm
@@ -0,0 +1,144 @@
+;*****************************************************************************
+;* x86-optimized functions for gradfun filter
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with Libav; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_7f: times 8 dw 0x7F
+pw_ff: times 8 dw 0xFF
+
+SECTION .text
+
+INIT_MMX mmxext
+cglobal gradfun_filter_line, 6, 6 ; args in C prototype order: r0=x (negative offset), r1=dst, r2=src, r3=dc, r4=thresh, r5=dithers
+    movh      m5, r4d
+    pxor      m7, m7 ; m7 = 0 (unpack helper and clamp bound)
+    pshufw    m5, m5,0 ; broadcast thresh to all four words
+    mova      m6, [pw_7f] ; four words of 127
+    mova      m3, [r5] ; dither words 0-3, used by the first half of the loop
+    mova      m4, [r5+8] ; dither words 4-7, used by the second half of the loop
+.loop:
+    movh      m0, [r2+r0] ; load 4 source pixels
+    movh      m1, [r3+r0] ; load 2 dc words
+    punpcklbw m0, m7 ; widen pixels to words
+    punpcklwd m1, m1 ; duplicate each dc word so one value pairs with two pixels
+    psllw     m0, 7 ; pix <<= 7
+    pxor      m2, m2
+    psubw     m1, m0 ; delta = dc - pix
+    psubw     m2, m1 ; m2 = -delta
+    pmaxsw    m2, m1 ; m2 = max(-delta, delta) = abs(delta)
+    pmulhuw   m2, m5 ; m = abs(delta) * thresh >> 16
+    psubw     m2, m6
+    pminsw    m2, m7 ; m = -max(0, 127-m)
+    pmullw    m2, m2 ; m = m*m
+    paddw     m0, m3 ; pix += dither
+    psllw     m1, 2
+    pmulhw    m1, m2 ; m = m*m*delta >> 14
+    paddw     m0, m1 ; pix += m
+    psraw     m0, 7
+    packuswb  m0, m0
+    movh [r1+r0], m0 ; dst = clip(pix>>7)
+    add       r0, 4
+    jge .end ; offset reached 0 after the first group of 4 pixels: done
+    movh      m0, [r2+r0] ; second group of 4 pixels: same steps, but with dither words 4-7 (m4)
+    movh      m1, [r3+r0]
+    punpcklbw m0, m7
+    punpcklwd m1, m1
+    psllw     m0, 7
+    pxor      m2, m2
+    psubw     m1, m0 ; delta = dc - pix
+    psubw     m2, m1
+    pmaxsw    m2, m1 ; abs(delta)
+    pmulhuw   m2, m5 ; m = abs(delta) * thresh >> 16
+    psubw     m2, m6
+    pminsw    m2, m7 ; m = -max(0, 127-m)
+    pmullw    m2, m2
+    paddw     m0, m4 ; pix += dither
+    psllw     m1, 2
+    pmulhw    m1, m2 ; m = m*m*delta >> 14
+    paddw     m0, m1 ; pix += m
+    psraw     m0, 7
+    packuswb  m0, m0
+    movh [r1+r0], m0 ; dst = clip(pix>>7)
+    add       r0, 4
+    jl .loop ; continue while the offset is still negative
+.end:
+    REP_RET ; NOTE(review): the inline asm this replaces executed emms before returning; nothing here clears MMX state - confirm the caller does (e.g. emms_c())
+
+INIT_XMM ssse3
+cglobal gradfun_filter_line, 6, 6, 8 ; args in C prototype order: r0=x (negative offset), r1=dst, r2=src, r3=dc, r4=thresh, r5=dithers
+    movd       m5, r4d
+    pxor       m7, m7 ; m7 = 0 (unpack helper and clamp bound)
+    pshuflw    m5, m5, 0 ; broadcast thresh across the low four words
+    mova       m6, [pw_7f] ; eight words of 127
+    punpcklqdq m5, m5 ; ...and into the high four words
+    mova       m4, [r5] ; eight dither words (aligned load)
+.loop:
+    movh       m0, [r2+r0] ; load 8 source pixels
+    movh       m1, [r3+r0] ; load 4 dc words
+    punpcklbw  m0, m7 ; widen pixels to words
+    punpcklwd  m1, m1 ; duplicate each dc word so one value pairs with two pixels
+    psllw      m0, 7 ; pix <<= 7
+    psubw      m1, m0 ; delta = dc - pix
+    pabsw      m2, m1 ; abs(delta)
+    pmulhuw    m2, m5 ; m = abs(delta) * thresh >> 16
+    psubw      m2, m6
+    pminsw     m2, m7 ; m = -max(0, 127-m)
+    pmullw     m2, m2 ; m = m*m
+    psllw      m1, 2
+    paddw      m0, m4 ; pix += dither
+    pmulhw     m1, m2 ; m = m*m*delta >> 14
+    paddw      m0, m1 ; pix += m
+    psraw      m0, 7
+    packuswb   m0, m0
+    movh  [r1+r0], m0 ; dst = clip(pix>>7)
+    add        r0, 8
+    jl .loop ; continue while the offset is still negative
+    REP_RET
+
+%macro BLUR_LINE 1 ; %1 = instruction used to load the two source rows (movdqa or movdqu); it also becomes part of the function name
+cglobal gradfun_blur_line_%1, 6, 6, 8 ; args in C prototype order: r0=x (negative byte offset), r1=buf, r2=buf1, r3=dc, r4=src1, r5=src2
+    mova        m7, [pw_ff] ; mask keeping the low byte of every word
+.loop:
+    %1          m0, [r4+r0] ; 16 bytes from the first source row
+    %1          m1, [r5+r0] ; 16 bytes from the second source row
+    mova        m2, m0
+    mova        m3, m1
+    psrlw       m0, 8 ; high byte of each word (odd-indexed pixels), row 1
+    psrlw       m1, 8 ; same for row 2
+    pand        m2, m7 ; low byte of each word (even-indexed pixels), row 1
+    pand        m3, m7 ; same for row 2
+    paddw       m0, m1
+    paddw       m2, m3
+    paddw       m0, m2 ; each word = sum of a 2x2 group of source bytes
+    paddw       m0, [r2+r0] ; add the corresponding buf1 words
+    mova        m1, [r1+r0] ; previous contents of buf
+    mova   [r1+r0], m0 ; buf = new sum
+    psubw       m0, m1 ; new sum - previous buf value
+    mova   [r3+r0], m0 ; dc = that difference
+    add         r0, 16
+    jl .loop ; continue while the offset is still negative
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+BLUR_LINE movdqa
+BLUR_LINE movdqu
-- 
1.7.10.4

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to