On 10/30/2011 01:29 PM, Ronald S. Bultje wrote: > From: Loren Merritt <[email protected]> > > But keep INIT_AVX (for backwards compatibility). > --- > libavutil/x86/x86inc.asm | 182 > +++++++++++++++++++++++++++++++++++++++------- > 1 files changed, 154 insertions(+), 28 deletions(-) > > diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm > index c84d556..d7a3b3c 100644 > --- a/libavutil/x86/x86inc.asm > +++ b/libavutil/x86/x86inc.asm > @@ -1,5 +1,5 @@ > > ;***************************************************************************** > -;* x86inc.asm > +;* x86inc.asm: x264asm abstraction layer > > ;***************************************************************************** > ;* Copyright (C) 2005-2011 x264 project > ;*
OK, I suppose, if we want a cleaner diff. > @@ -112,7 +112,7 @@ > ; we need more flexible macro. > > ; RET: > -; Pops anything that was pushed by PROLOGUE > +; Pops anything that was pushed by PROLOGUE, and returns. > > ; REP_RET: > ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons OK. > @@ -297,6 +297,9 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + > 56] > > %macro WIN64_SPILL_XMM 1 > %assign xmm_regs_used %1 > + %if mmsize == 8 > + %assign xmm_regs_used 0 > + %endif > ASSERT xmm_regs_used <= 16 > %if xmm_regs_used > 6 > sub rsp, (xmm_regs_used-6)*16+16 Great. With the mmsize == 8 check there is no need for a separate xmm_reg setting to avoid spilling XMM registers in MMX functions. > @@ -459,10 +462,24 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset > + 28] > > %assign function_align 16 > > -; Symbol prefix for C linkage > -%macro cglobal 1-2+ > - %xdefine %1 mangle(program_name %+ _ %+ %1) > - %xdefine %1.skip_prologue %1 %+ .skip_prologue > +; Begin a function. > +; Applies any symbol mangling needed for C linkage, and sets up a define > such that > +; subsequent uses of the function name automatically refer to the mangled > version. > +; Appends cpuflags to the function name if cpuflags has been specified. 
> +%macro cglobal 1-2+ ; name, [PROLOGUE args] > +%if %0 == 1 > + cglobal_internal %1 %+ SUFFIX > +%else > + cglobal_internal %1 %+ SUFFIX, %2 > +%endif > +%endmacro > +%macro cglobal_internal 1-2+ > + %ifndef cglobaled_%1 > + %xdefine %1 mangle(program_name %+ _ %+ %1) > + %xdefine %1.skip_prologue %1 %+ .skip_prologue > + CAT_XDEFINE cglobaled_, %1, 1 > + %endif > + %xdefine current_function %1 > %ifidn __OUTPUT_FORMAT__,elf > global %1:function hidden > %else > @@ -479,12 +496,14 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset > + 28] > > %macro cextern 1 > %xdefine %1 mangle(program_name %+ _ %+ %1) > + CAT_XDEFINE cglobaled_, %1, 1 > extern %1 > %endmacro > > -;like cextern, but without the prefix > +; like cextern, but without the prefix > %macro cextern_naked 1 > %xdefine %1 mangle(%1) > + CAT_XDEFINE cglobaled_, %1, 1 > extern %1 > %endmacro do the above changes do anything functional other than adding the cpuflags suffix? > @@ -500,6 +519,58 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + > 28] > SECTION .note.GNU-stack noalloc noexec nowrite progbits > %endif > > +; cpuflags > + > +%assign cpuflags_mmx (1<<0) > +%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx > +%assign cpuflags_sse (1<<2) | cpuflags_mmx2 > +%assign cpuflags_sse2 (1<<3) | cpuflags_sse > +%assign cpuflags_sse2slow (1<<4) | cpuflags_sse2 > +%assign cpuflags_sse3 (1<<5) | cpuflags_sse2 > +%assign cpuflags_ssse3 (1<<6) | cpuflags_sse3 > +%assign cpuflags_sse4 (1<<7) | cpuflags_ssse3 > +%assign cpuflags_sse42 (1<<8) | cpuflags_sse4 > +%assign cpuflags_avx (1<<9) | cpuflags_sse42 > +%assign cpuflags_xop (1<<10)| cpuflags_avx > +%assign cpuflags_fma4 (1<<11)| cpuflags_avx > + > +%assign cpuflags_cache32 (1<<16) > +%assign cpuflags_cache64 (1<<17) > +%assign cpuflags_slowctz (1<<18) > +%assign cpuflags_lzcnt (1<<19) > +%assign cpuflags_misalign (1<<20) > +%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant we also have 3dnow, 3dnow2, and atom > + > 
+%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) > +%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) > + > +; Takes up to 2 cpuflags from the above list. > +; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the > specified cpu. > +; You shouldn't need to invoke this macro directly, it's a subroutine for > INIT_MMX &co. > +%macro INIT_CPUFLAGS 0-2 > + %if %0 >= 1 > + %xdefine cpuname %1 > + %assign cpuflags cpuflags_%1 > + %if %0 >= 2 > + %xdefine cpuname %1_%2 > + %assign cpuflags cpuflags | cpuflags_%2 > + %endif > + %xdefine SUFFIX _ %+ cpuname > + %if cpuflag(avx) > + %assign avx_enabled 1 > + %endif > + %if cpuflag(aligned) > + %define movu mova > + %elifidn %1, sse3 > + %define movu lddqu > + %endif > + %else > + %xdefine SUFFIX > + %undef cpuname > + %undef cpuflags > + %endif > +%endmacro > + > ; merge mmx and sse* > > %macro CAT_XDEFINE 3 > @@ -510,9 +581,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits > %undef %1%2 > %endmacro > > -%macro INIT_MMX 0 > +%macro INIT_MMX 0-1+ > %assign avx_enabled 0 > - %define RESET_MM_PERMUTATION INIT_MMX > + %define RESET_MM_PERMUTATION INIT_MMX %1 > %define mmsize 8 > %define num_mmregs 8 > %define mova movq > @@ -530,11 +601,12 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits > CAT_UNDEF nmm, %%i > %assign %%i %%i+1 > %endrep > + INIT_CPUFLAGS %1 > %endmacro > > -%macro INIT_XMM 0 > +%macro INIT_XMM 0-1+ > %assign avx_enabled 0 > - %define RESET_MM_PERMUTATION INIT_XMM > + %define RESET_MM_PERMUTATION INIT_XMM %1 > %define mmsize 16 > %define num_mmregs 8 > %ifdef ARCH_X86_64 > @@ -550,6 +622,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits > CAT_XDEFINE nxmm, %%i, %%i > %assign %%i %%i+1 > %endrep > + INIT_CPUFLAGS %1 > %endmacro > > %macro INIT_AVX 0 > @@ -559,9 +632,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits > %define RESET_MM_PERMUTATION INIT_AVX > %endmacro > > -%macro INIT_YMM 0 > +%macro 
INIT_YMM 0-1+ > %assign avx_enabled 1 > - %define RESET_MM_PERMUTATION INIT_YMM > + %define RESET_MM_PERMUTATION INIT_YMM %1 > %define mmsize 32 > %define num_mmregs 8 > %ifdef ARCH_X86_64 > @@ -569,15 +642,18 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits > %endif > %define mova vmovaps > %define movu vmovups > + %undef movh > + %undef movnta What are these two %undef lines (movh and movnta) for? > %assign %%i 0 > %rep num_mmregs > CAT_XDEFINE m, %%i, ymm %+ %%i > CAT_XDEFINE nymm, %%i, %%i > %assign %%i %%i+1 > %endrep > + INIT_CPUFLAGS %1 > %endmacro > > -INIT_MMX > +INIT_XMM This changes the default from INIT_MMX to INIT_XMM. Do we have any code that relies on INIT_MMX being the default? > > ; I often want to use macros that permute their arguments. e.g. there's no > ; efficient way to implement butterfly or transpose or dct without swapping > some > @@ -633,31 +709,46 @@ INIT_MMX > %endrep > %endmacro > > -; If SAVE_MM_PERMUTATION is placed at the end of a function and given the > -; function name, then any later calls to that function will automatically > -; load the permutation, so values can be returned in mmregs. > -%macro SAVE_MM_PERMUTATION 1 ; name to save as > +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later > +; calls to that function will automatically load the permutation, so values > can > +; be returned in mmregs. 
> +%macro SAVE_MM_PERMUTATION 0-1 > + %if %0 > + %xdefine %%f %1_m > + %else > + %xdefine %%f current_function %+ _m > + %endif > %assign %%i 0 > %rep num_mmregs > - CAT_XDEFINE %1_m, %%i, m %+ %%i > + CAT_XDEFINE %%f, %%i, m %+ %%i > %assign %%i %%i+1 > %endrep > %endmacro > > %macro LOAD_MM_PERMUTATION 1 ; name to load from > - %assign %%i 0 > - %rep num_mmregs > - CAT_XDEFINE m, %%i, %1_m %+ %%i > - CAT_XDEFINE n, m %+ %%i, %%i > - %assign %%i %%i+1 > - %endrep > + %ifdef %1_m0 > + %assign %%i 0 > + %rep num_mmregs > + CAT_XDEFINE m, %%i, %1_m %+ %%i > + CAT_XDEFINE n, m %+ %%i, %%i > + %assign %%i %%i+1 > + %endrep > + %endif > %endmacro ok > > +; Append cpuflags to the callee's name iff the appended name is known and > the plain name isn't > %macro call 1 > - call %1 > - %ifdef %1_m0 > - LOAD_MM_PERMUTATION %1 > + call_internal %1, %1 %+ SUFFIX > +%endmacro > +%macro call_internal 2 > + %xdefine %%i %1 > + %ifndef cglobaled_%1 > + %ifdef cglobaled_%2 > + %xdefine %%i %2 > + %endif > %endif > + call %%i > + LOAD_MM_PERMUTATION %%i > %endmacro looks ok > > ; Substitutions that reduce instruction size but are functionally equivalent > @@ -789,6 +880,8 @@ AVX_INSTR minpd, 1, 0 > AVX_INSTR minps, 1, 0 > AVX_INSTR minsd, 1, 0 > AVX_INSTR minss, 1, 0 > +AVX_INSTR movsd, 1, 0 > +AVX_INSTR movss, 1, 0 > AVX_INSTR mpsadbw, 0, 1 > AVX_INSTR mulpd, 1, 0 > AVX_INSTR mulps, 1, 0 ok > @@ -903,3 +996,36 @@ AVX_INSTR xorps, 1, 0 > AVX_INSTR pfadd, 1, 0 > AVX_INSTR pfsub, 1, 0 > AVX_INSTR pfmul, 1, 0 > + > +; base-4 constants for shuffles > +%assign i 0 > +%rep 256 > + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) > + %if j < 10 > + CAT_XDEFINE q000, j, i > + %elif j < 100 > + CAT_XDEFINE q00, j, i > + %elif j < 1000 > + CAT_XDEFINE q0, j, i > + %else > + CAT_XDEFINE q, j, i > + %endif > +%assign i i+1 > +%endrep > +%undef i > +%undef j interesting. how does one use this? 
> + > +%macro FMA_INSTR 3 > + %macro %1 4-7 %1, %2, %3 > + %if cpuflag(xop) > + v%5 %1, %2, %3, %4 > + %else > + %6 %1, %2, %3 > + %7 %1, %4 > + %endif > + %endmacro > +%endmacro > + > +FMA_INSTR pmacsdd, pmulld, paddd > +FMA_INSTR pmacsww, pmullw, paddw > +FMA_INSTR pmadcswd, pmaddwd, paddd fma emulation. neat. Thanks, Justin _______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
