I made no changes to how the stack is used so that's all from gcc. I'll work on it. Perhaps something like what idct8_ssse3 uses?

On 11/19/2014 08:46 PM, chen wrote:
You never allocate stack space, yet the code uses it, e.g. "[rsp - 72]".

At 2014-11-20 11:48:01,dave <[email protected]> wrote:

    How does it fail?  Are you getting a segmentation fault?  It works
    fine on Debian/gcc, but it is dependent on the stack being aligned
    at 8.  I don't have any Windows environment.

    I also replaced all the registers and mov instructions with the
    defines of x86inc.asm for x86_64.  Perhaps I missed something that
    Windows needs.

    On 11/19/2014 07:22 PM, chen wrote:
    The code can't pass the testbench on my PC.
    VS2008, Win7 x64
    At 2014-11-20 11:13:31,[email protected]  wrote:
    ># HG changeset patch
    ># User David T Yuen<[email protected]>
    ># Date 1416451149 28800
    ># Node ID 37392ba74268210aafa8123d9f7c12d46a22c152
    ># Parent  d059cfa88f1ac79b319bd8a05bc70704d454f0ba
    >idct8 sse2
    >
    >Based on the gcc of Debian 4.7.2-5 by the following command
    >
    >c++ -S -masm=intel -DX265_ARCH_X86=1 -DX86_64=1 -DHAVE_INT_TYPES_H=1 
-D__STDC_LIMIT_MACROS=1 -DHIGH_BIT_DEPTH=0 -O3 -DNDEBUG 
-I/home/shakezula/Development/x265/source/. 
-I/home/shakezula/Development/x265/source/Lib 
-I/home/shakezula/Development/x265/source/common 
-I/home/shakezula/Development/x265/source/encoder 
-I/home/shakezula/Development/x265/build/linux    -Wall -Wextra -Wshadow -fPIC 
-ffast-math -mstackrealign -fno-exceptions -Wno-unused-parameter -msse3 -o 
~/Development/dct-sse3.asm -c 
/home/shakezula/Development/x265/source/common/vec/dct-sse3.cpp
    >
    >It has been tweaked for better register usage with fewer values written to 
the stack and better setup of r2 indexing for write
    >
    >diff -r d059cfa88f1a -r 37392ba74268 source/common/x86/asm-primitives.cpp
    >--- a/source/common/x86/asm-primitives.cpp      Tue Nov 18 14:11:12 2014 
-0600
    >+++ b/source/common/x86/asm-primitives.cpp      Wed Nov 19 18:39:09 2014 
-0800
    >@@ -1377,6 +1377,7 @@
    >         p.dct[DCT_4x4] = x265_dct4_sse2;
    >         p.idct[IDCT_4x4] = x265_idct4_sse2;
    >         p.idct[IDST_4x4] = x265_idst4_sse2;
    >+        p.idct[IDCT_8x8] = x265_idct8_sse2;
    >
    >         LUMA_SS_FILTERS(_sse2);
    >     }
    >@@ -1567,6 +1568,7 @@
    >         p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
    >         p.dct[DCT_4x4] = x265_dct4_sse2;
    >         p.idct[IDCT_4x4] = x265_idct4_sse2;
    >+        p.idct[IDCT_8x8] = x265_idct8_sse2;
    >         p.idct[IDST_4x4] = x265_idst4_sse2;
    >         p.planecopy_sp = x265_downShift_16_sse2;
    >         p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2;
    >diff -r d059cfa88f1a -r 37392ba74268 source/common/x86/dct8.asm
    >--- a/source/common/x86/dct8.asm        Tue Nov 18 14:11:12 2014 -0600
    >+++ b/source/common/x86/dct8.asm        Wed Nov 19 18:39:09 2014 -0800
    >@@ -302,6 +302,19 @@
    >
    > pb_idct8odd:    db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15
    >
    >+tab_idct8:  times 4 dw  89,  75
    >+            times 4 dw  50,  18
    >+            times 4 dw  75, -18
    >+            times 4 dw  -89, -50
    >+            times 4 dw  50, -89
    >+            times 4 dw  18,  75
    >+            times 4 dw  18, -50
    >+            times 4 dw  75, -89
    >+            times 4 dw  64,  64
    >+            times 4 dw  64, -64
    >+            times 4 dw  83,  36
    >+            times 4 dw  36, -83
    >+
    > SECTION .text
    > cextern pd_1
    > cextern pd_2
    >@@ -976,6 +989,387 @@
    > ;-------------------------------------------------------
    > ; void idct8(int32_t *src, int16_t *dst, intptr_t stride)
    > ;-------------------------------------------------------
    >+INIT_XMM sse2
    >+
    >+%if BIT_DEPTH == 10
    >+    %define     IDCT_SHIFT 10
    >+    %define     IDCT_ADD pd_512
    >+%elif BIT_DEPTH == 8
    >+    %define     IDCT_SHIFT 12
    >+    %define     IDCT_ADD pd_2048
    >+%else
    >+    %error Unsupported BIT_DEPTH!
    >+%endif
    >+
    >+cglobal idct8, 3,7, 16
    >+    lea         r2, [r2 + r2]                           ;set r2 to index 
of 1
    >+    lea         r4, [r2 + r2]                           ;set r4 to index 
of 2
    >+    lea         r3, [r4 + r2]                           ;set r3 to index 
of 3
    >+    lea         r4, [r4 + r3]                           ;set r4 to index 
of 5
    >+    mova        m9, [r0 + 32]
    >+    packssdw    m9, [r0 + 48]
    >+    movu        m1, [r0 + 96]
    >+    packssdw    m1, [r0 + 112]
    >+    mova        m7, m9
    >+    punpcklwd   m7, m1
    >+    punpckhwd   m9, m1
    >+    mova        m14, [tab_idct8]
    >+    mova        m3, m14
    >+    pmaddwd     m14, m7
    >+    pmaddwd     m3, m9
    >+    mova        m0, [r0 + 160]
    >+    packssdw    m0, [r0 + 176]
    >+    mova        m10, [r0 + 224]
    >+    packssdw    m10, [r0 + 240]
    >+    mova        m2, m0
    >+    punpcklwd   m2, m10
    >+    punpckhwd   m0, m10
    >+    mova        m15, [tab_idct8 + 16]
    >+    mova        m11, [tab_idct8 + 16]
    >+    pmaddwd     m15, m2
    >+    mova        m4, [tab_idct8 + 32]
    >+    pmaddwd     m11, m0
    >+    mova        m1, [tab_idct8 + 32]
    >+    paddd       m15, m14
    >+    mova        m5, [tab_idct8 + 64]
    >+    mova        m12, [tab_idct8 + 64]
    >+    paddd       m11, m3
    >+    mova        [rsp - 72], m11
    >+    mova        [rsp - 88], m15
    >+    pmaddwd     m4, m7
    >+    pmaddwd     m1, m9
    >+    mova        m14, [tab_idct8 + 48]
    >+    mova        m3, [tab_idct8 + 48]
    >+    pmaddwd     m14, m2
    >+    pmaddwd     m3, m0
    >+    paddd       m14, m4
    >+    paddd       m3, m1
    >+    mova        [rsp - 40], m3
    >+    pmaddwd     m5, m9
    >+    pmaddwd     m9, [tab_idct8 + 96]
    >+    mova        m6, [tab_idct8 + 80]
    >+    pmaddwd     m12, m7
    >+    pmaddwd     m7, [tab_idct8 + 96]
    >+    mova        m4, [tab_idct8 + 80]
    >+    pmaddwd     m6, m2
    >+    paddd       m6, m12
    >+    pmaddwd     m2, [tab_idct8 + 112]
    >+    paddd       m7, m2
    >+    mova        [rsp - 24], m6
    >+    pmaddwd     m4, m0
    >+    pmaddwd     m0, [tab_idct8 + 112]
    >+    paddd       m9, m0
    >+    paddd       m5, m4
    >+    mova        m6, [r0]
    >+    packssdw    m6, [r0 + 16]
    >+    mova        m0, [r0 + 128]
    >+    packssdw    m0, [r0 + 144]
    >+    mova        m4, m6
    >+    mova        m12, [r0 + 64]
    >+    punpcklwd   m4, m0
    >+    punpckhwd   m6, m0
    >+    packssdw    m12, [r0 + 80]
    >+    mova        m0, [r0 + 192]
    >+    packssdw    m0, [r0 + 208]
    >+    mova        m13, m12
    >+    mova        m8, [tab_idct8 + 128]
    >+    punpcklwd   m13, m0
    >+    mova        m10, [tab_idct8 + 128]
    >+    punpckhwd   m12, m0
    >+    pmaddwd     m8, m4
    >+    mova        m3, m8
    >+    pmaddwd     m4, [tab_idct8 + 144]
    >+    pmaddwd     m10, m6
    >+    mova        m2, [tab_idct8 + 160]
    >+    mova        m1, m10
    >+    pmaddwd     m6, [tab_idct8 + 144]
    >+    mova        m0, [tab_idct8 + 160]
    >+    pmaddwd     m2, m13
    >+    paddd       m3, m2
    >+    psubd       m8, m2
    >+    mova        m2, m6
    >+    pmaddwd     m13, [tab_idct8 + 176]
    >+    pmaddwd     m0, m12
    >+    paddd       m1, m0
    >+    psubd       m10, m0
    >+    mova        m0, m4
    >+    pmaddwd     m12, [tab_idct8 + 176]
    >+    paddd       m3, [pd_64]
    >+    paddd       m1, [pd_64]
    >+    paddd       m8, [pd_64]
    >+    paddd       m10, [pd_64]
    >+    paddd       m0, m13
    >+    paddd       m2, m12
    >+    paddd       m0, [pd_64]
    >+    paddd       m2, [pd_64]
    >+    psubd       m4, m13
    >+    psubd       m6, m12
    >+    paddd       m4, [pd_64]
    >+    paddd       m6, [pd_64]
    >+    mova        m12, m8
    >+    psubd       m8, m7
    >+    psrad       m8, 7
    >+    paddd       m15, m3
    >+    psubd       m3, [rsp - 88]
    >+    psrad       m15, 7
    >+    paddd       m12, m7
    >+    psrad       m12, 7
    >+    paddd       m11, m1
    >+    mova        m13, m14
    >+    psrad       m11, 7
    >+    packssdw    m15, m11
    >+    psubd       m1, [rsp - 72]
    >+    psrad       m1, 7
    >+    mova        m11, [rsp - 40]
    >+    paddd       m14, m0
    >+    psrad       m14, 7
    >+    psubd       m0, m13
    >+    psrad       m0, 7
    >+    paddd       m11, m2
    >+    mova        m13, [rsp - 24]
    >+    psrad       m11, 7
    >+    packssdw    m14, m11
    >+    mova        m11, m6
    >+    psubd       m6, m5
    >+    paddd       m13, m4
    >+    psrad       m13, 7
    >+    psrad       m6, 7
    >+    paddd       m11, m5
    >+    psrad       m11, 7
    >+    packssdw    m13, m11
    >+    mova        m11, m10
    >+    psubd       m4, [rsp - 24]
    >+    psubd       m10, m9
    >+    psrad       m4, 7
    >+    psrad       m10, 7
    >+    packssdw    m4, m6
    >+    packssdw    m8, m10
    >+    paddd       m11, m9
    >+    psrad       m11, 7
    >+    packssdw    m12, m11
    >+    psubd       m2, [rsp - 40]
    >+    mova        m5, m15
    >+    psrad       m2, 7
    >+    packssdw    m0, m2
    >+    mova        m2, m14
    >+    psrad       m3, 7
    >+    packssdw    m3, m1
    >+    mova        m6, m13
    >+    punpcklwd   m5, m8
    >+    punpcklwd   m2, m4
    >+    mova        m1, m12
    >+    punpcklwd   m6, m0
    >+    punpcklwd   m1, m3
    >+    mova        m9, m5
    >+    punpckhwd   m13, m0
    >+    mova        m0, m2
    >+    punpcklwd   m9, m6
    >+    punpckhwd   m5, m6
    >+    punpcklwd   m0, m1
    >+    punpckhwd   m2, m1
    >+    punpckhwd   m15, m8
    >+    mova        m1, m5
    >+    punpckhwd   m14, m4
    >+    punpckhwd   m12, m3
    >+    mova        m6, m9
    >+    punpckhwd   m9, m0
    >+    punpcklwd   m1, m2
    >+    mova        m4, [tab_idct8]
    >+    punpckhwd   m5, m2
    >+    punpcklwd   m6, m0
    >+    mova        m2, m15
    >+    mova        m0, m14
    >+    mova        m7, m9
    >+    punpcklwd   m2, m13
    >+    punpcklwd   m0, m12
    >+    punpcklwd   m7, m5
    >+    punpckhwd   m14, m12
    >+    mova        m10, m2
    >+    punpckhwd   m15, m13
    >+    punpckhwd   m9, m5
    >+    pmaddwd     m4, m7
    >+    mova        m13, m1
    >+    punpckhwd   m2, m0
    >+    punpcklwd   m10, m0
    >+    mova        m0, m15
    >+    punpckhwd   m15, m14
    >+    mova        m12, m1
    >+    mova        m3, [tab_idct8]
    >+    punpcklwd   m0, m14
    >+    pmaddwd     m3, m9
    >+    mova        m11, m2
    >+    punpckhwd   m2, m15
    >+    punpcklwd   m11, m15
    >+    mova        m8, [tab_idct8 + 16]
    >+    punpcklwd   m13, m0
    >+    punpckhwd   m12, m0
    >+    pmaddwd     m8, m11
    >+    paddd       m8, m4
    >+    mova        [rsp - 88], m8
    >+    mova        m4, [tab_idct8 + 32]
    >+    pmaddwd     m4, m7
    >+    mova        m15, [tab_idct8 + 32]
    >+    mova        m5, [tab_idct8 + 16]
    >+    pmaddwd     m15, m9
    >+    pmaddwd     m5, m2
    >+    paddd       m5, m3
    >+    mova        [rsp - 72], m5
    >+    mova        m14, [tab_idct8 + 48]
    >+    mova        m5, [tab_idct8 + 48]
    >+    pmaddwd     m14, m11
    >+    paddd       m14, m4
    >+    mova        [rsp - 56], m14
    >+    pmaddwd     m5, m2
    >+    paddd       m5, m15
    >+    mova        [rsp - 40], m5
    >+    mova        m15, [tab_idct8 + 64]
    >+    mova        m5, [tab_idct8 + 64]
    >+    pmaddwd     m15, m7
    >+    pmaddwd     m7, [tab_idct8 + 96]
    >+    pmaddwd     m5, m9
    >+    pmaddwd     m9, [tab_idct8 + 96]
    >+    mova        m4, [tab_idct8 + 80]
    >+    pmaddwd     m4, m2
    >+    paddd       m5, m4
    >+    mova        m4, m6
    >+    mova        m8, [tab_idct8 + 80]
    >+    punpckhwd   m6, m10
    >+    pmaddwd     m2, [tab_idct8 + 112]
    >+    punpcklwd   m4, m10
    >+    paddd       m9, m2
    >+    pmaddwd     m8, m11
    >+    mova        m10, [tab_idct8 + 128]
    >+    paddd       m8, m15
    >+    pmaddwd     m11, [tab_idct8 + 112]
    >+    paddd       m7, m11
    >+    mova        [rsp - 24], m8
    >+    pmaddwd     m10, m6
    >+    pmaddwd     m6, [tab_idct8 + 144]
    >+    mova        m1, m10
    >+    mova        m8, [tab_idct8 + 128]
    >+    mova        m3, [tab_idct8 + 160]
    >+    pmaddwd     m8, m4
    >+    pmaddwd     m4, [tab_idct8 + 144]
    >+    mova        m0, m8
    >+    mova        m2, [tab_idct8 + 160]
    >+    pmaddwd     m3, m13
    >+    psubd       m8, m3
    >+    paddd       m0, m3
    >+    mova        m3, m6
    >+    pmaddwd     m13, [tab_idct8 + 176]
    >+    pmaddwd     m2, m12
    >+    paddd       m1, m2
    >+    psubd       m10, m2
    >+    mova        m2, m4
    >+    pmaddwd     m12, [tab_idct8 + 176]
    >+    paddd       m0, [IDCT_ADD]
    >+    paddd       m1, [IDCT_ADD]
    >+    paddd       m8, [IDCT_ADD]
    >+    paddd       m10, [IDCT_ADD]
    >+    paddd       m2, m13
    >+    paddd       m3, m12
    >+    paddd       m2, [IDCT_ADD]
    >+    paddd       m3, [IDCT_ADD]
    >+    psubd       m4, m13
    >+    psubd       m6, m12
    >+    paddd       m4, [IDCT_ADD]
    >+    paddd       m6, [IDCT_ADD]
    >+    mova        m15, [rsp - 88]
    >+    mova        m12, m8
    >+    psubd       m8, m7
    >+    psrad       m8, IDCT_SHIFT
    >+    mova        m11, [rsp - 72]
    >+    paddd       m15, m0
    >+    psrad       m15, IDCT_SHIFT
    >+    psubd       m0, [rsp - 88]
    >+    psrad       m0, IDCT_SHIFT
    >+    paddd       m12, m7
    >+    paddd       m11, m1
    >+    mova        m14, [rsp - 56]
    >+    psrad       m11, IDCT_SHIFT
    >+    packssdw    m15, m11
    >+    psubd       m1, [rsp - 72]
    >+    psrad       m1, IDCT_SHIFT
    >+    mova        m11, [rsp - 40]
    >+    paddd       m14, m2
    >+    psrad       m14, IDCT_SHIFT
    >+    packssdw    m0, m1
    >+    psrad       m12, IDCT_SHIFT
    >+    psubd       m2, [rsp - 56]
    >+    paddd       m11, m3
    >+    mova        m13, [rsp - 24]
    >+    psrad       m11, IDCT_SHIFT
    >+    packssdw    m14, m11
    >+    mova        m11, m6
    >+    psubd       m6, m5
    >+    paddd       m13, m4
    >+    psrad       m13, IDCT_SHIFT
    >+    mova        m1, m15
    >+    paddd       m11, m5
    >+    psrad       m11, IDCT_SHIFT
    >+    packssdw    m13, m11
    >+    mova        m11, m10
    >+    psubd       m10, m9
    >+    psrad       m10, IDCT_SHIFT
    >+    packssdw    m8, m10
    >+    psrad       m6, IDCT_SHIFT
    >+    psubd       m4, [rsp - 24]
    >+    paddd       m11, m9
    >+    psrad       m11, IDCT_SHIFT
    >+    packssdw    m12, m11
    >+    punpcklwd   m1, m14
    >+    mova        m5, m13
    >+    psrad       m4, IDCT_SHIFT
    >+    packssdw    m4, m6
    >+    psubd       m3, [rsp - 40]
    >+    psrad       m2, IDCT_SHIFT
    >+    mova        m6, m8
    >+    psrad       m3, IDCT_SHIFT
    >+    punpcklwd   m5, m12
    >+    packssdw    m2, m3
    >+    punpcklwd   m6, m4
    >+    punpckhwd   m8, m4
    >+    mova        m4, m1
    >+    mova        m3, m2
    >+    punpckhdq   m1, m5
    >+    punpckldq   m4, m5
    >+    punpcklwd   m3, m0
    >+    punpckhwd   m2, m0
    >+    mova        m0, m6
    >+    lea         r0, [r4 + r2 * 2]                      ;set r0 to index 
of 7
    >+    movq        [r1], m4
    >+    punpckhwd   m15, m14
    >+    movhps      [r1 + r2], m4
    >+    punpckhdq   m0, m3
    >+    movq        [r1 + r2 * 2], m1
    >+    punpckhwd   m13, m12
    >+    movhps      [r1 + r3], m1
    >+    mova        m1, m6
    >+    punpckldq   m1, m3
    >+    movq        [r1 + 8], m1
    >+    movhps      [r1 + r2 + 8], m1
    >+    movq        [r1 + r2 * 2 + 8], m0
    >+    movhps      [r1 + r3 + 8], m0
    >+    mova        m0, m15
    >+    punpckhdq   m15, m13
    >+    punpckldq   m0, m13
    >+    movq        [r1 + r2 * 4], m0
    >+    movhps      [r1 + r4], m0
    >+    mova        m0, m8
    >+    punpckhdq   m8, m2
    >+    movq        [r1 + r3 * 2], m15
    >+    punpckldq   m0, m2
    >+    movhps      [r1 + r0], m15
    >+    movq        [r1 + r2 * 4 + 8], m0
    >+    movhps      [r1 + r4 + 8], m0
    >+    movq        [r1 + r3 * 2 + 8], m8
    >+    movhps      [r1 + r0 + 8], m8
    >+    RET
    >+%undef IDCT_SHIFT
    >+
    >+;-------------------------------------------------------
    >+; void idct8(int32_t *src, int16_t *dst, intptr_t stride)
    >+;-------------------------------------------------------
    > INIT_XMM ssse3
    >
    > cglobal patial_butterfly_inverse_internal_pass1
    >diff -r d059cfa88f1a -r 37392ba74268 source/common/x86/dct8.h
    >--- a/source/common/x86/dct8.h  Tue Nov 18 14:11:12 2014 -0600
    >+++ b/source/common/x86/dct8.h  Wed Nov 19 18:39:09 2014 -0800
    >@@ -35,6 +35,7 @@
    > void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
    > void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
    > void x265_idct4_avx2(int32_t *src, int16_t *dst, intptr_t stride);
    >+void x265_idct8_sse2(int32_t *src, int16_t *dst, intptr_t stride);
    > void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
    > void x265_idct8_avx2(int32_t *src, int16_t *dst, intptr_t stride);
    > void x265_idct16_avx2(int32_t *src, int16_t *dst, intptr_t stride);
    >_______________________________________________
    >x265-devel mailing list
    >[email protected]
    >https://mailman.videolan.org/listinfo/x265-devel


    _______________________________________________
    x265-devel mailing list
    [email protected]
    https://mailman.videolan.org/listinfo/x265-devel



_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel

_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel

Reply via email to