From: "Ronald S. Bultje" <rbul...@chromium.org> Use this in VP8/H264-8bit loopfilter functions so they can be used if there is no aligned stack (e.g. MSVC 32bit or ICC 10.x). --- libavcodec/x86/h264_deblock.asm | 27 ++----- libavcodec/x86/h264dsp_init.c | 4 +- libavcodec/x86/vp8dsp.asm | 68 +++++++++--------- libavcodec/x86/vp8dsp_init.c | 8 --- libavutil/x86/x86inc.asm | 156 +++++++++++++++++++++++++++++++++------- 5 files changed, 173 insertions(+), 90 deletions(-)
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 6abdba2..6137678 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -401,14 +401,12 @@ DEBLOCK_LUMA

 ;-----------------------------------------------------------------------------
 ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal deblock_%1_luma_8, 5,5
+cglobal deblock_%1_luma_8, 5, 6 - HAVE_ALIGNED_STACK, 0, 2 * %2
     lea     r4, [r1*3]
     dec     r2     ; alpha-1
     neg     r4
     dec     r3     ; beta-1
     add     r4, r0 ; pix-3*stride
-    %assign pad 2*%2+12-(stack_offset&15)
-    SUB     esp, pad
     mova    m0, [r4+r1]   ; p1
     mova    m1, [r4+2*r1] ; p0
@@ -446,22 +444,19 @@ cglobal deblock_%1_luma_8, 5,5
     DEBLOCK_P0_Q0
     mova    [r4+2*r1], m1
     mova    [r0], m2
-    ADD     esp, pad
     RET

 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX cpuname
-cglobal deblock_h_luma_8, 0,5
+cglobal deblock_h_luma_8, 0,6 - HAVE_ALIGNED_STACK, 0, 0x70
     mov     r0, r0mp
     mov     r3, r1m
     lea     r4, [r3*3]
     sub     r0, 4
     lea     r1, [r0+r4]
-    %assign pad 0x78-(stack_offset&15)
-    SUB     esp, pad
-%define pix_tmp esp+12
+%define pix_tmp esp+16

 ; transpose 6x16 -> tmp space
     TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
@@ -503,7 +498,6 @@ cglobal deblock_h_luma_8, 0,5
     movq    m3, [pix_tmp+0x48]
     TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
-    ADD     esp, pad
     RET
 %endmacro ; DEBLOCK_LUMA

@@ -636,7 +630,7 @@ DEBLOCK_LUMA v, 16
     %define mpb_0 m14
     %define mpb_1 m15
 %else
-    %define spill(x) [esp+16*x+((stack_offset+4)&15)]
+    %define spill(x) [esp+16*x]
     %define p2 [r4+r1]
     %define q2 [r0+2*r1]
     %define t4 spill(0)
@@ -651,10 +645,7 @@ DEBLOCK_LUMA v, 16
 ;-----------------------------------------------------------------------------
 ; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_%1_luma_intra_8, 4,6,16
-%if ARCH_X86_64 == 0
-    sub     esp, 0x60
-%endif
+cglobal deblock_%1_luma_intra_8, 4, 6, 16, ARCH_X86_32 * 0x50
     lea     r4, [r1*4]
     lea     r5, [r1*3] ; 3*stride
     dec     r2d        ; alpha-1
@@ -703,9 +694,6 @@ cglobal deblock_%1_luma_intra_8, 4,6,16
     LUMA_INTRA_SWAP_PQ
     LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
 .end:
-%if ARCH_X86_64 == 0
-    add     esp, 0x60
-%endif
     RET

 INIT_MMX cpuname
@@ -742,12 +730,10 @@ cglobal deblock_h_luma_intra_8, 4,9
     add     rsp, 0x88
     RET
 %else
-cglobal deblock_h_luma_intra_8, 2,4
+cglobal deblock_h_luma_intra_8, 2, 5 - HAVE_ALIGNED_STACK, 0, 0x80
     lea     r3, [r1*3]
     sub     r0, 4
     lea     r2, [r0+r3]
-%assign pad 0x8c-(stack_offset&15)
-    SUB     rsp, pad
 %define pix_tmp rsp

 ; transpose 8x16 -> tmp space
@@ -778,7 +764,6 @@ cglobal deblock_h_luma_intra_8, 2,4
     lea     r0, [r0+r1*8]
     lea     r2, [r2+r1*8]
     TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
-    ADD     rsp, pad
     RET
 %endif ; ARCH_X86_64
 %endmacro ; DEBLOCK_LUMA_INTRA
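Note the register counts in the deblock functions above: on targets where
HAVE_ALIGNED_STACK is 0 (e.g. MSVC 32bit), "6 - HAVE_ALIGNED_STACK"
evaluates to 6, i.e. one more GPR than the function actually uses. That
spare register becomes rSTK in ALLOC_STACK further down, a copy of the
original stack pointer, so stack arguments stay addressable after rsp has
been re-aligned. A stack-argument load then expands to roughly the
following (the register and the offset 4 are made up for illustration):

    mov r1, [rSTK + stack_offset + 4] ; was [esp + stack_offset + 4]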
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index bb48867..f84e3e8 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -276,18 +276,16 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
             c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;

-#if HAVE_ALIGNED_STACK
             c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_sse2;
             c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_sse2;
             c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
             c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
-#endif /* HAVE_ALIGNED_STACK */
         }

         if (EXTERNAL_SSSE3(mm_flags)) {
             c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
             c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
         }

-        if (EXTERNAL_AVX(mm_flags) && HAVE_ALIGNED_STACK) {
+        if (EXTERNAL_AVX(mm_flags)) {
             c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_avx;
             c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_avx;
             c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 19853c4..32f1c25 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -1632,28 +1632,31 @@ SIMPLE_LOOPFILTER h, 5
 ;-----------------------------------------------------------------------------

 %macro INNER_LOOPFILTER 2
+%define stack_size 0
+%ifndef m8   ; stack layout: [0]=E, [1]=I, [2]=hev_thr
+%ifidn %1, v ;               [3]=hev() result
+%define stack_size mmsize * 4
+%else ; h    ;               extra storage space for transposes
+%define stack_size mmsize * 5
+%endif
+%endif
+
 %if %2 == 8 ; chroma
-cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, dst, dst8, stride, flimE, flimI, hevthr
+cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
 %else ; luma
-cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
+cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
 %endif

 %if cpuflag(ssse3)
     pxor             m7, m7
 %endif
-%ifndef m8   ; stack layout: [0]=E, [1]=I, [2]=hev_thr
-%ifidn %1, v ;               [3]=hev() result
-%assign pad 16 + mmsize * 4 - gprsize - (stack_offset & 15)
-%else ; h    ;               extra storage space for transposes
-%assign pad 16 + mmsize * 5 - gprsize - (stack_offset & 15)
-%endif
+
+%ifndef m8 ; splat function arguments
     SPLATB_REG       m0, flimEq, m7     ; E
     SPLATB_REG       m1, flimIq, m7     ; I
     SPLATB_REG       m2, hevthrq, m7    ; hev_thresh
-    SUB            rsp, pad
-
 %define m_flimE    [rsp]
 %define m_flimI    [rsp+mmsize]
 %define m_hevthr   [rsp+mmsize*2]
@@ -2083,12 +2086,10 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
             dec          cntrq
             jg .next8px
 %endif
-%endif
-
-%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
-    ADD            rsp, pad
-%endif
+    REP_RET
+%else ; mmsize == 16
     RET
+%endif
 %endmacro

 %if ARCH_X86_32
@@ -2123,31 +2124,34 @@ INNER_LOOPFILTER h,  8
 ;-----------------------------------------------------------------------------

 %macro MBEDGE_LOOPFILTER 2
-%if %2 == 8 ; chroma
-cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, dst1, dst8, stride, flimE, flimI, hevthr
-%else ; luma
-cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevthr
-%endif
-
-%if cpuflag(ssse3)
-    pxor             m7, m7
-%endif
+%define stack_size 0
 %ifndef m8       ; stack layout: [0]=E, [1]=I, [2]=hev_thr
 %if mmsize == 16 ;               [3]=hev() result
                  ;               [4]=filter tmp result
                  ;               [5]/[6] = p2/q2 backup
                  ;               [7]=lim_res sign result
-%assign pad 16 + mmsize * 7 - gprsize - (stack_offset & 15)
+%define stack_size mmsize * 7
 %else ; 8        ; extra storage space for transposes
-%assign pad 16 + mmsize * 8 - gprsize - (stack_offset & 15)
+%define stack_size mmsize * 8
+%endif
 %endif
+
+%if %2 == 8 ; chroma
+cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr
+%else ; luma
+cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr
+%endif
+
+%if cpuflag(ssse3)
+    pxor             m7, m7
+%endif
+
+%ifndef m8 ; splat function arguments
     SPLATB_REG       m0, flimEq, m7     ; E
     SPLATB_REG       m1, flimIq, m7     ; I
     SPLATB_REG       m2, hevthrq, m7    ; hev_thresh
-    SUB            rsp, pad
-
 %define m_flimE    [rsp]
 %define m_flimI    [rsp+mmsize]
 %define m_hevthr   [rsp+mmsize*2]
@@ -2741,12 +2745,10 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevt
             dec          cntrq
             jg .next8px
 %endif
-%endif
-
-%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
-    ADD            rsp, pad
-%endif
+    REP_RET
+%else ; mmsize == 16
     RET
+%endif
 %endmacro

 %if ARCH_X86_32
diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c
index a9f2740..1556133 100644
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -390,13 +390,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)

         c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;

-#if ARCH_X86_64 || HAVE_ALIGNED_STACK
         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
         c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
         c->vp8_v_loop_filter16y      = ff_vp8_v_loop_filter16y_mbedge_sse2;
         c->vp8_v_loop_filter8uv      = ff_vp8_v_loop_filter8uv_mbedge_sse2;
-#endif
     }

     if (mm_flags & AV_CPU_FLAG_SSE2) {
@@ -404,13 +402,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)

         c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;

-#if ARCH_X86_64 || HAVE_ALIGNED_STACK
         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
         c->vp8_h_loop_filter16y      = ff_vp8_h_loop_filter16y_mbedge_sse2;
         c->vp8_h_loop_filter8uv      = ff_vp8_h_loop_filter8uv_mbedge_sse2;
-#endif
     }

     if (mm_flags & AV_CPU_FLAG_SSSE3) {
@@ -424,7 +420,6 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
         c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
         c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;

-#if ARCH_X86_64 || HAVE_ALIGNED_STACK
         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
         c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
@@ -434,17 +429,14 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
         c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
         c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
         c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
-#endif
     }

     if (mm_flags & AV_CPU_FLAG_SSE4) {
         c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;

         c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;

-#if ARCH_X86_64 || HAVE_ALIGNED_STACK
         c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
         c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
-#endif
     }
 #endif /* HAVE_YASM */
 }
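The core of the change is the ALLOC_STACK macro added to x86inc.asm below.
As a worked example of its re-alignment sequence, assume x86-32
(gprsize == 4), mmsize == 16 (so stack_size_alignment == 16) and a
hypothetical incoming rsp of 0x103c; all values are for illustration only:

    lea  rSTK, [rsp-4]          ; rSTK = 0x1038
    and  rSTK, 15               ; rSTK = 0x8, misalignment of rsp-gprsize
    sub  rsp, rSTK              ; rsp  = 0x1034
    add  rSTK, rsp              ; rSTK = 0x103c, the original rsp
    push rSTK                   ; rsp  = 0x1030, now 16-byte aligned
    sub  rsp, stack_size_padded ; aligned scratch; old rsp at [rsp+stack_size_padded]

which is why the epilogue can restore everything with a single
mov rsp, [rsp+stack_size_padded].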
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index c1827fb..0284821 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -164,10 +164,10 @@ CPUNOP amdnop
         %define r%1m %2d
         %define r%1mp %2
     %elif ARCH_X86_64 ; memory
-        %define r%1m [rsp + stack_offset + %3]
+        %define r%1m [rSTK + stack_offset + %3]
         %define r%1mp qword r %+ %1 %+ m
     %else ; memory
-        %define r%1m [esp + stack_offset + %3]
+        %define r%1m [rSTK + stack_offset + %3]
         %define r%1mp dword r %+ %1 %+ m
     %endif
     %define r%1 %2
@@ -229,12 +229,16 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

 %macro PUSH 1
     push %1
-    %assign stack_offset stack_offset+gprsize
+    %ifidn rSTK, rsp
+        %assign stack_offset stack_offset+gprsize
+    %endif
 %endmacro

 %macro POP 1
     pop %1
-    %assign stack_offset stack_offset-gprsize
+    %ifidn rSTK, rsp
+        %assign stack_offset stack_offset-gprsize
+    %endif
 %endmacro

 %macro PUSH_IF_USED 1-*
@@ -267,15 +271,19 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 %macro SUB 2
     sub %1, %2
     %ifidn %1, rsp
+    %ifidn rSTK, rsp
         %assign stack_offset stack_offset+(%2)
     %endif
+    %endif
 %endmacro

 %macro ADD 2
     add %1, %2
     %ifidn %1, rsp
+    %ifidn rSTK, rsp
         %assign stack_offset stack_offset-(%2)
     %endif
+    %endif
 %endmacro

 %macro movifnidn 2
@@ -331,6 +339,46 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
     %assign n_arg_names %0
 %endmacro

+%macro ALLOC_STACK 1-2 ; stack_size, n_xmm_regs (for win64 only)
+    ASSERT %1 > 0
+    %assign stack_size_alignment ((mmsize + 8) & ~8)
+    %assign stack_size_aligned (%1 + stack_size_alignment - 1) & ~(stack_size_alignment - 1)
+    %if %0 == 2
+        %assign xmm_regs_used %2
+    %else
+        %assign xmm_regs_used 0
+    %endif
+    %if mmsize <= 16 && HAVE_ALIGNED_STACK
+        %assign stack_size_padded stack_size_aligned + stack_size_alignment - gprsize - (stack_offset & (stack_size_alignment - 1))
+        %if xmm_regs_used > 6
+            %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
+        %endif
+        SUB rsp, stack_size_padded
+        %if xmm_regs_used > 6
+            WIN64_PUSH_XMM
+        %endif
+    %else
+        %assign reg_num (regs_used - 1)
+        %xdefine rSTK r %+ reg_num
+        ; align stack, and save original stack location directly above it, i.e.
+        ; in [rsp+stack_size_padded], so we can restore the stack in a single
+        ; instruction (i.e. mov rsp, [rsp+stack_size_padded])
+        lea  rSTK, [rsp-gprsize]
+        and  rSTK, stack_size_alignment - 1
+        sub  rsp, rSTK
+        add  rSTK, rsp
+        push rSTK
+        %assign stack_size_padded stack_size_aligned
+        %if xmm_regs_used > 6
+            %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
+        %endif
+        sub rsp, stack_size_padded
+        %if xmm_regs_used > 6
+            WIN64_PUSH_XMM
+        %endif
+    %endif
+%endmacro
+
 %if WIN64   ; Windows x64 ;=================================================

 DECLARE_REG 0,  rcx
@@ -349,31 +397,46 @@ DECLARE_REG 12, R13, 104
 DECLARE_REG 13, R14, 112
 DECLARE_REG 14, R15, 120

-%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
     %assign regs_used %2
     ASSERT regs_used >= num_args
     ASSERT regs_used <= 15
     PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
-    %if mmsize == 8
-        %assign xmm_regs_used 0
-    %else
+    %assign xmm_regs_used 0
+    %ifnum %4
+        %if %4 > 0
+            ALLOC_STACK %4, %3
+        %endif
+    %endif
+    %if mmsize != 8 && stack_size_aligned == 0
         WIN64_SPILL_XMM %3
     %endif
     LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
-    DEFINE_ARGS %4
+    %ifnum %4
+        DEFINE_ARGS %5
+    %elif %0 == 4
+        DEFINE_ARGS %4
+    %elif %0 > 4
+        DEFINE_ARGS %4, %5
+    %endif
+%endmacro
+
+%macro WIN64_PUSH_XMM 0
+    %assign %%i xmm_regs_used
+    %rep (xmm_regs_used-6)
+        %assign %%i %%i-1
+        movdqa [rsp + (%%i-6)*16 + stack_size_aligned], xmm %+ %%i
+    %endrep
 %endmacro

 %macro WIN64_SPILL_XMM 1
     %assign xmm_regs_used %1
     ASSERT xmm_regs_used <= 16
     %if xmm_regs_used > 6
-        SUB rsp, (xmm_regs_used-6)*16+16
-        %assign %%i xmm_regs_used
-        %rep (xmm_regs_used-6)
-            %assign %%i %%i-1
-            movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
-        %endrep
+        %assign stack_size_padded (xmm_regs_used-6)*16+16-gprsize-(stack_offset&15)
+        SUB rsp, stack_size_padded
+        WIN64_PUSH_XMM
     %endif
 %endmacro

@@ -382,19 +445,23 @@
         %assign %%i xmm_regs_used
         %rep (xmm_regs_used-6)
             %assign %%i %%i-1
-            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
+            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+stack_size_aligned]
         %endrep
-        add %1, (xmm_regs_used-6)*16+16
+    %endif
+    %if mmsize == 32 || HAVE_ALIGNED_STACK == 0
+        mov rsp, [rsp+stack_size_padded]
+    %else
+        add %1, stack_size_padded
     %endif
 %endmacro

 %macro WIN64_RESTORE_XMM 1
     WIN64_RESTORE_XMM_INTERNAL %1
-    %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
+    %assign stack_offset (stack_offset-stack_size_padded)
     %assign xmm_regs_used 0
 %endmacro

-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size_aligned > 0

 %macro RET 0
     WIN64_RESTORE_XMM_INTERNAL rsp
@@ -423,19 +490,37 @@ DECLARE_REG 12, R13, 56
 DECLARE_REG 13, R14, 64
 DECLARE_REG 14, R15, 72

-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
     %assign regs_used %2
     ASSERT regs_used >= num_args
     ASSERT regs_used <= 15
     PUSH_IF_USED 9, 10, 11, 12, 13, 14
+    %ifnum %4
+        %if %4 > 0
+            ALLOC_STACK %4
+        %endif
+    %endif
     LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
-    DEFINE_ARGS %4
+    %ifnum %4
+        DEFINE_ARGS %5
+    %elif %0 == 4
+        DEFINE_ARGS %4
+    %elif %0 > 4
+        DEFINE_ARGS %4, %5
+    %endif
 %endmacro

-%define has_epilogue regs_used > 9 || mmsize == 32
+%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size_aligned > 0

 %macro RET 0
+%if stack_size_aligned > 0
+%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
+    mov rsp, [rsp+stack_size_padded]
+%else
+    add rsp, stack_size_padded
+%endif
+%endif
     POP_IF_USED 14, 13, 12, 11, 10, 9
 %if mmsize == 32
     vzeroupper
@@ -464,7 +549,7 @@ DECLARE_REG 6, ebp, 28

 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
     %assign regs_used %2
     %if num_args > 7
@@ -475,13 +560,31 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
     %endif
     ASSERT regs_used >= num_args
     PUSH_IF_USED 3, 4, 5, 6
+    %ifnum %4
+        %if %4 > 0
+            ALLOC_STACK %4
+        %endif
+    %endif
     LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
-    DEFINE_ARGS %4
+    %ifnum %4
+        DEFINE_ARGS %5
+    %elif %0 == 4
+        DEFINE_ARGS %4
+    %elif %0 > 4
+        DEFINE_ARGS %4, %5
+    %endif
 %endmacro

-%define has_epilogue regs_used > 3 || mmsize == 32
+%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size_aligned > 0

 %macro RET 0
+%if stack_size_aligned > 0
+%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
+    mov rsp, [rsp+stack_size_padded]
+%else
+    add rsp, stack_size_padded
+%endif
+%endif
     POP_IF_USED 6, 5, 4, 3
 %if mmsize == 32
     vzeroupper
@@ -543,7 +646,10 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
     align function_align
     %1:
     RESET_MM_PERMUTATION        ; not really needed, but makes disassembly somewhat nicer
+    %xdefine rSTK rsp
     %assign stack_offset 0
+    %assign stack_size_aligned 0
+    %assign stack_size_padded 0
     %ifnidn %2, ""
         PROLOGUE %2
     %endif
-- 
1.7.11.5

_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel