On Tue, 1 Nov 2011, Justin Ruggles wrote:
> On 10/30/2011 01:29 PM, Ronald S. Bultje wrote:
>
>> @@ -459,10 +462,24 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset
>> + 28]
>>
>> %assign function_align 16
>>
>> -; Symbol prefix for C linkage
>> -%macro cglobal 1-2+
>> - %xdefine %1 mangle(program_name %+ _ %+ %1)
>> - %xdefine %1.skip_prologue %1 %+ .skip_prologue
>> +; Begin a function.
>> +; Applies any symbol mangling needed for C linkage, and sets up a define
>> such that
>> +; subsequent uses of the function name automatically refer to the mangled
>> version.
>> +; Appends cpuflags to the function name if cpuflags has been specified.
>> +%macro cglobal 1-2+ ; name, [PROLOGUE args]
>> +%if %0 == 1
>> + cglobal_internal %1 %+ SUFFIX
>> +%else
>> + cglobal_internal %1 %+ SUFFIX, %2
>> +%endif
>> +%endmacro
>> +%macro cglobal_internal 1-2+
>> + %ifndef cglobaled_%1
>> + %xdefine %1 mangle(program_name %+ _ %+ %1)
>> + %xdefine %1.skip_prologue %1 %+ .skip_prologue
>> + CAT_XDEFINE cglobaled_, %1, 1
>> + %endif
>> + %xdefine current_function %1
>> %ifidn __OUTPUT_FORMAT__,elf
>> global %1:function hidden
>> %else
>> @@ -479,12 +496,14 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset
>> + 28]
>>
>> %macro cextern 1
>> %xdefine %1 mangle(program_name %+ _ %+ %1)
>> + CAT_XDEFINE cglobaled_, %1, 1
>> extern %1
>> %endmacro
>>
>> -;like cextern, but without the prefix
>> +; like cextern, but without the prefix
>> %macro cextern_naked 1
>> %xdefine %1 mangle(%1)
>> + CAT_XDEFINE cglobaled_, %1, 1
>> extern %1
>> %endmacro
>
> do the above changes do anything functional other than adding the
> cpuflags suffix?
They also make multiple functions of the same name into a fatal error,
instead of silently renaming one of them. This detected the issue fixed
in patch4 in this thread.
Btw, the avx symmetry patch fixes another fatal error also introduced by
x86inc. If you don't want any uncompilable intermediate revisions, you
have to squash avx symmetry into x86inc and move patch4 first.
>> @@ -500,6 +519,58 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset
>> + 28]
>> SECTION .note.GNU-stack noalloc noexec nowrite progbits
>> %endif
>>
>> +; cpuflags
>> +
>> +%assign cpuflags_mmx (1<<0)
>> +%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
>> +%assign cpuflags_sse (1<<2) | cpuflags_mmx2
>> +%assign cpuflags_sse2 (1<<3) | cpuflags_sse
>> +%assign cpuflags_sse2slow (1<<4) | cpuflags_sse2
>> +%assign cpuflags_sse3 (1<<5) | cpuflags_sse2
>> +%assign cpuflags_ssse3 (1<<6) | cpuflags_sse3
>> +%assign cpuflags_sse4 (1<<7) | cpuflags_ssse3
>> +%assign cpuflags_sse42 (1<<8) | cpuflags_sse4
>> +%assign cpuflags_avx (1<<9) | cpuflags_sse42
>> +%assign cpuflags_xop (1<<10)| cpuflags_avx
>> +%assign cpuflags_fma4 (1<<11)| cpuflags_avx
>> +
>> +%assign cpuflags_cache32 (1<<16)
>> +%assign cpuflags_cache64 (1<<17)
>> +%assign cpuflags_slowctz (1<<18)
>> +%assign cpuflags_lzcnt (1<<19)
>> +%assign cpuflags_misalign (1<<20)
>> +%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function
>> variant
>
> we also have 3dnow, 3dnow2, and atom
Added.
>> @@ -569,15 +642,18 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
>> %endif
>> %define mova vmovaps
>> %define movu vmovups
>> + %undef movh
>> + %undef movnta
>
> what is this about?
I don't normally implement things until they're needed.
movnta is easy enough, but I don't think a ymm version of movh could be a
1-line %define.
>> %assign %%i 0
>> %rep num_mmregs
>> CAT_XDEFINE m, %%i, ymm %+ %%i
>> CAT_XDEFINE nymm, %%i, %%i
>> %assign %%i %%i+1
>> %endrep
>> + INIT_CPUFLAGS %1
>> %endmacro
>>
>> -INIT_MMX
>> +INIT_XMM
>
> do we have any code that assumes INIT_MMX as the default?
no
>> @@ -903,3 +996,36 @@ AVX_INSTR xorps, 1, 0
>> AVX_INSTR pfadd, 1, 0
>> AVX_INSTR pfsub, 1, 0
>> AVX_INSTR pfmul, 1, 0
>> +
>> +; base-4 constants for shuffles
>> +%assign i 0
>> +%rep 256
>> + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
>> + %if j < 10
>> + CAT_XDEFINE q000, j, i
>> + %elif j < 100
>> + CAT_XDEFINE q00, j, i
>> + %elif j < 1000
>> + CAT_XDEFINE q0, j, i
>> + %else
>> + CAT_XDEFINE q, j, i
>> + %endif
>> +%assign i i+1
>> +%endrep
>> +%undef i
>> +%undef j
>
> interesting. how does one use this?
All of the following are equivalent, and swap pairs of dwords.
pshufd m0, m1, q2301
pshufd m0, m1, 0xb1
pshufd m0, m1, 10110001b
pshufd m0, m1, 177
I think the base-4 one is by far the most readable. I've also modified my
local version of objdump to use it.
--Loren Merritt
From c9c5041e1d909d1e084f70473b2ec49646fe7396 Mon Sep 17 00:00:00 2001
From: Loren Merritt <[email protected]>
Date: Wed, 2 Nov 2011 20:13:26 +0000
Subject: [PATCH] Update x86inc.asm to latest x264 version
But keep INIT_AVX (for backwards compatibility).
---
libavutil/x86/x86inc.asm | 186 +++++++++++++++++++++++++++++++++++++++-------
1 files changed, 158 insertions(+), 28 deletions(-)
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index c84d556..48e649b 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1,5 +1,5 @@
;*****************************************************************************
-;* x86inc.asm
+;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
@@ -112,7 +112,7 @@
; we need more flexible macro.
; RET:
-; Pops anything that was pushed by PROLOGUE
+; Pops anything that was pushed by PROLOGUE, and returns.
; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
@@ -297,6 +297,9 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
%macro WIN64_SPILL_XMM 1
%assign xmm_regs_used %1
+ %if mmsize == 8
+ %assign xmm_regs_used 0
+ %endif
ASSERT xmm_regs_used <= 16
%if xmm_regs_used > 6
sub rsp, (xmm_regs_used-6)*16+16
@@ -459,10 +462,24 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%assign function_align 16
-; Symbol prefix for C linkage
-%macro cglobal 1-2+
- %xdefine %1 mangle(program_name %+ _ %+ %1)
- %xdefine %1.skip_prologue %1 %+ .skip_prologue
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
+%macro cglobal 1-2+ ; name, [PROLOGUE args]
+%if %0 == 1
+ cglobal_internal %1 %+ SUFFIX
+%else
+ cglobal_internal %1 %+ SUFFIX, %2
+%endif
+%endmacro
+%macro cglobal_internal 1-2+
+ %ifndef cglobaled_%1
+ %xdefine %1 mangle(program_name %+ _ %+ %1)
+ %xdefine %1.skip_prologue %1 %+ .skip_prologue
+ CAT_XDEFINE cglobaled_, %1, 1
+ %endif
+ %xdefine current_function %1
%ifidn __OUTPUT_FORMAT__,elf
global %1:function hidden
%else
@@ -479,12 +496,14 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%macro cextern 1
%xdefine %1 mangle(program_name %+ _ %+ %1)
+ CAT_XDEFINE cglobaled_, %1, 1
extern %1
%endmacro
-;like cextern, but without the prefix
+; like cextern, but without the prefix
%macro cextern_naked 1
%xdefine %1 mangle(%1)
+ CAT_XDEFINE cglobaled_, %1, 1
extern %1
%endmacro
@@ -500,6 +519,61 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif
+; cpuflags
+
+%assign cpuflags_mmx (1<<0)
+%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
+%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
+%assign cpuflags_3dnow2 (1<<3) | cpuflags_3dnow
+%assign cpuflags_sse (1<<4) | cpuflags_mmx2
+%assign cpuflags_sse2 (1<<5) | cpuflags_sse
+%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
+%assign cpuflags_sse3 (1<<7) | cpuflags_sse2
+%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3
+%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3
+%assign cpuflags_sse42 (1<<10)| cpuflags_sse4
+%assign cpuflags_avx (1<<11)| cpuflags_sse42
+%assign cpuflags_xop (1<<12)| cpuflags_avx
+%assign cpuflags_fma4 (1<<13)| cpuflags_avx
+
+%assign cpuflags_cache32 (1<<16)
+%assign cpuflags_cache64 (1<<17)
+%assign cpuflags_slowctz (1<<18)
+%assign cpuflags_lzcnt (1<<19)
+%assign cpuflags_misalign (1<<20)
+%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<22)
+
+%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
+%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
+
+; Takes up to 2 cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly; it's a subroutine for INIT_MMX &co.
+%macro INIT_CPUFLAGS 0-2
+ %if %0 >= 1
+ %xdefine cpuname %1
+ %assign cpuflags cpuflags_%1
+ %if %0 >= 2
+ %xdefine cpuname %1_%2
+ %assign cpuflags cpuflags | cpuflags_%2
+ %endif
+ %xdefine SUFFIX _ %+ cpuname
+ %if cpuflag(avx)
+ %assign avx_enabled 1
+ %endif
+ %if cpuflag(aligned)
+ %define movu mova
+ %elifidn %1, sse3
+ %define movu lddqu
+ %endif
+ %else
+ %xdefine SUFFIX
+ %undef cpuname
+ %undef cpuflags
+ %endif
+%endmacro
+
; merge mmx and sse*
%macro CAT_XDEFINE 3
@@ -510,9 +584,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%undef %1%2
%endmacro
-%macro INIT_MMX 0
+%macro INIT_MMX 0-1+
%assign avx_enabled 0
- %define RESET_MM_PERMUTATION INIT_MMX
+ %define RESET_MM_PERMUTATION INIT_MMX %1
%define mmsize 8
%define num_mmregs 8
%define mova movq
@@ -530,11 +604,12 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
CAT_UNDEF nmm, %%i
%assign %%i %%i+1
%endrep
+ INIT_CPUFLAGS %1
%endmacro
-%macro INIT_XMM 0
+%macro INIT_XMM 0-1+
%assign avx_enabled 0
- %define RESET_MM_PERMUTATION INIT_XMM
+ %define RESET_MM_PERMUTATION INIT_XMM %1
%define mmsize 16
%define num_mmregs 8
%ifdef ARCH_X86_64
@@ -550,8 +625,10 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
CAT_XDEFINE nxmm, %%i, %%i
%assign %%i %%i+1
%endrep
+ INIT_CPUFLAGS %1
%endmacro
+; FIXME: INIT_AVX can be replaced by INIT_XMM avx
%macro INIT_AVX 0
INIT_XMM
%assign avx_enabled 1
@@ -559,9 +636,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%define RESET_MM_PERMUTATION INIT_AVX
%endmacro
-%macro INIT_YMM 0
+%macro INIT_YMM 0-1+
%assign avx_enabled 1
- %define RESET_MM_PERMUTATION INIT_YMM
+ %define RESET_MM_PERMUTATION INIT_YMM %1
%define mmsize 32
%define num_mmregs 8
%ifdef ARCH_X86_64
@@ -569,15 +646,18 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif
%define mova vmovaps
%define movu vmovups
+ %undef movh
+ %define movnta vmovntps
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE m, %%i, ymm %+ %%i
CAT_XDEFINE nymm, %%i, %%i
%assign %%i %%i+1
%endrep
+ INIT_CPUFLAGS %1
%endmacro
-INIT_MMX
+INIT_XMM
; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
@@ -633,31 +713,46 @@ INIT_MMX
%endrep
%endmacro
-; If SAVE_MM_PERMUTATION is placed at the end of a function and given the
-; function name, then any later calls to that function will automatically
-; load the permutation, so values can be returned in mmregs.
-%macro SAVE_MM_PERMUTATION 1 ; name to save as
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 0-1
+ %if %0
+ %xdefine %%f %1_m
+ %else
+ %xdefine %%f current_function %+ _m
+ %endif
%assign %%i 0
%rep num_mmregs
- CAT_XDEFINE %1_m, %%i, m %+ %%i
+ CAT_XDEFINE %%f, %%i, m %+ %%i
%assign %%i %%i+1
%endrep
%endmacro
%macro LOAD_MM_PERMUTATION 1 ; name to load from
- %assign %%i 0
- %rep num_mmregs
- CAT_XDEFINE m, %%i, %1_m %+ %%i
- CAT_XDEFINE n, m %+ %%i, %%i
- %assign %%i %%i+1
- %endrep
+ %ifdef %1_m0
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, %1_m %+ %%i
+ CAT_XDEFINE n, m %+ %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
%endmacro
+; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
- call %1
- %ifdef %1_m0
- LOAD_MM_PERMUTATION %1
+ call_internal %1, %1 %+ SUFFIX
+%endmacro
+%macro call_internal 2
+ %xdefine %%i %1
+ %ifndef cglobaled_%1
+ %ifdef cglobaled_%2
+ %xdefine %%i %2
+ %endif
%endif
+ call %%i
+ LOAD_MM_PERMUTATION %%i
%endmacro
; Substitutions that reduce instruction size but are functionally equivalent
@@ -789,6 +884,8 @@ AVX_INSTR minpd, 1, 0
AVX_INSTR minps, 1, 0
AVX_INSTR minsd, 1, 0
AVX_INSTR minss, 1, 0
+AVX_INSTR movsd, 1, 0
+AVX_INSTR movss, 1, 0
AVX_INSTR mpsadbw, 0, 1
AVX_INSTR mulpd, 1, 0
AVX_INSTR mulps, 1, 0
@@ -903,3 +1000,36 @@ AVX_INSTR xorps, 1, 0
AVX_INSTR pfadd, 1, 0
AVX_INSTR pfsub, 1, 0
AVX_INSTR pfmul, 1, 0
+
+; base-4 constants for shuffles
+%assign i 0
+%rep 256
+ %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
+ %if j < 10
+ CAT_XDEFINE q000, j, i
+ %elif j < 100
+ CAT_XDEFINE q00, j, i
+ %elif j < 1000
+ CAT_XDEFINE q0, j, i
+ %else
+ CAT_XDEFINE q, j, i
+ %endif
+%assign i i+1
+%endrep
+%undef i
+%undef j
+
+%macro FMA_INSTR 3
+ %macro %1 4-7 %1, %2, %3
+ %if cpuflag(xop)
+ v%5 %1, %2, %3, %4
+ %else
+ %6 %1, %2, %3
+ %7 %1, %4
+ %endif
+ %endmacro
+%endmacro
+
+FMA_INSTR pmacsdd, pmulld, paddd
+FMA_INSTR pmacsww, pmullw, paddw
+FMA_INSTR pmadcswd, pmaddwd, paddd
--
1.7.4.1
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel