From: Kieran Kunhya <[email protected]>

Signed-off-by: Ronald S. Bultje <[email protected]>
---
 libswscale/x86/scale.asm     |   61 ++++++++++++++++++++++++++++++++++++++++++
 libswscale/x86/swscale_mmx.c |   15 ++++++++++
 2 files changed, 76 insertions(+), 0 deletions(-)

diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index c74a2b2..ec8f4ec 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -34,6 +34,8 @@ pw_16:         times 8 dw 16
 pw_32:         times 8 dw 32
 pw_512:        times 8 dw 512
 pw_1024:       times 8 dw 1024
+yuv2yuvX_10_start:  times 4 dd 0x10000   ; initial value of each 32-bit accumulator in yuv2planeX10: 1 << 16, i.e. half of the later >> 17
+yuv2yuvX_10_upper:  times 8 dw 0x3ff     ; per-word upper clamp applied with pminsw before the store in yuv2planeX10
 
 SECTION .text
 
@@ -569,3 +571,93 @@ yuv2plane1_fn 10, sse2, 5, 3
 yuv2plane1_fn 16, sse2, 6, 3
 
 yuv2plane1_fn 16, sse4, 5, 3
+
+;-----------------------------------------------------------------------------
+; void ff_yuv2planeX10_<opt>(const int16_t *filter, int filterSize,
+;                            const int16_t **src, uint8_t *dest, int dstW,
+;                            const uint8_t *dither, int offset);
+;
+; Vertical multi-tap filter, mmsize/2 pixels per outer iteration:
+;   dest[i] = clip((0x10000 + sum_j(src[j][i] * filter[j])) >> 17, 0, 0x3ff)
+; dest is written as 16-bit words. The dither and offset arguments are not
+; read; their registers (r5, r6) are reused as scratch.
+;
+; Registers: r0 = filter, r1 = filterSize, r2 = src, r3 = dest, r4 = dstW,
+;            r5 = byte offset into each src row and into dest,
+;            r6 = current src row pointer, cntr_reg = taps left to process.
+;
+; Requirements visible from the code: filterSize must be even and >= 2 (two
+; taps are consumed per pass and the loop body runs before the test); the src
+; rows and dest are accessed with aligned full-vector loads/stores, and dstW
+; is processed in whole groups of mmsize/2 pixels.
+;-----------------------------------------------------------------------------
+%macro yuv2planeX10 1
+
+%ifdef ARCH_X86_32
+%define cntr_reg r1
+%else
+%define cntr_reg r11
+%endif
+
+; Exported as ff_yuv2planeX10_<opt>, the name swscale_mmx.c declares.
+cglobal yuv2planeX10_%1, 7, 7
+    xor      r5, r5
+.pixelloop:
+    mova     m1, [yuv2yuvX_10_start]   ; both accumulators start at the rounding bias
+    mova     m2, m1
+    ; Reload the tap count for every pixel group. On x86-32 cntr_reg aliases
+    ; r1 and has been counted down by the previous group, so fetch the
+    ; untouched stack copy of the argument instead.
+%ifdef ARCH_X86_32
+    mov      cntr_reg, r1m
+%else
+    movsxd   cntr_reg, r1d
+%endif
+.filterloop:
+    mov      r6, [r2+gprsize*cntr_reg-2*gprsize]   ; r6 = src[cntr-2]
+    mova     m3, [r6+r5]
+
+    mov      r6, [r2+gprsize*cntr_reg-gprsize]     ; r6 = src[cntr-1]
+    mova     m4, [r6+r5]
+
+    punpcklwd m5, m3, m4               ; interleave the two rows so pmaddwd
+    punpckhwd m3, m4                   ; can apply both taps in one step
+
+    movd     m0, [r0+2*cntr_reg-4]     ; filter[cntr-2], filter[cntr-1]
+    SPLATD   m0, m0
+
+    pmaddwd  m5, m0
+    pmaddwd  m3, m0
+
+    paddd    m2, m5                    ; m2 = sums for the low half of the group
+    paddd    m1, m3                    ; m1 = sums for the high half of the group
+
+    sub      cntr_reg, 2
+    jg .filterloop
+
+    psrad    m2, 17
+    psrad    m1, 17
+
+%ifidn %1, sse2
+    ; packusdw is SSE4.1 and must not appear in the SSE2 build. After the
+    ; shift every dword fits in int16, so a signed pack followed by a clamp
+    ; at zero yields the same words.
+    packssdw m2, m1
+    pxor     m0, m0
+    pmaxsw   m2, m0
+%else
+    packusdw m2, m1
+%endif
+    pminsw   m2, [yuv2yuvX_10_upper]
+    mova     [r3+r5], m2
+
+    add      r5, mmsize
+    sub      r4d, mmsize/2
+    jg .pixelloop
+    REP_RET
+%endmacro
+
+INIT_XMM
+yuv2planeX10 sse2
+INIT_AVX
+yuv2planeX10 avx
diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c
index 009d5fd..c2cbdb6 100644
--- a/libswscale/x86/swscale_mmx.c
+++ b/libswscale/x86/swscale_mmx.c
@@ -226,6 +226,14 @@ VSCALE_FUNCS(mmx, mmx2);
 VSCALE_FUNCS(sse2, sse2);
 VSCALE_FUNC(16, sse4);
 
+extern void ff_yuv2planeX10_sse2(const int16_t *filter, int filterSize,
+                                 const int16_t **src, uint8_t *dest, int dstW,
+                                 const uint8_t *dither, int offset); /* SSE2 variant; expected from the yuv2planeX10 macro in x86/scale.asm -- NOTE(review): this name must match the symbol its cglobal emits */
+
+extern void ff_yuv2planeX10_avx(const int16_t *filter, int filterSize,
+                                const int16_t **src, uint8_t *dest, int dstW,
+                                const uint8_t *dither, int offset); /* AVX variant of the same routine; same naming caveat applies */
+
+
 void ff_sws_init_swScale_mmx(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -285,6 +293,8 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
         ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
         ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
         ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
+        if (c->dstBpc == 10 && !isBE(c->dstFormat) && !(c->vChrFilterSize&1))
+            c->yuv2planeX_chroma = ff_yuv2planeX10_sse2;
     }
     if (cpu_flags & AV_CPU_FLAG_SSSE3) {
         ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
@@ -297,5 +307,10 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
         if (c->dstBpc == 16 && !isBE(c->dstFormat))
             c->yuv2plane1 = ff_yuv2plane1_16_sse4;
     }
+
+    if (cpu_flags & AV_CPU_FLAG_AVX) {
+        if (c->dstBpc == 10 && !isBE(c->dstFormat) && !(c->vChrFilterSize&1))
+            c->yuv2planeX_chroma = ff_yuv2planeX10_avx;
+    }
 #endif
 }
-- 
1.7.2.1

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to