Use 16 xmmregs instead of spills, and transpose in pass5.
125->104 cycles on Penryn x86_64. (Take the numbers with a grain of salt, though: the timing is sensitive to code alignment, and it was before the patch too.)
Doesn't touch avx; I don't know if the same strategy would help there.

I modified the x86_32 version too, but it doesn't get any speedup. My approach is more regular than the giant list of unstructured scalar math in PASS6_AND_PERMUTE. If this method can be applied to AVX (and thus remove PASS6_AND_PERMUTE), then that's a simplification; if it can't, then the extra version is a complication and should be reverted.

--Loren Merritt
From 701f40aef4de4c001f619db20fecddaf8d1348af Mon Sep 17 00:00:00 2001
From: Loren Merritt <lor...@u.washington.edu>
Date: Tue, 17 May 2011 08:51:10 +0000
Subject: [PATCH 1/2] s/xmm/m/

---
 libavcodec/x86/dct32_sse.asm |  151 +++++++++++++++++++++---------------------
 1 files changed, 76 insertions(+), 75 deletions(-)

diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
index 4df89fe..16a6dd5 100644
--- a/libavcodec/x86/dct32_sse.asm
+++ b/libavcodec/x86/dct32_sse.asm
@@ -264,6 +264,7 @@ cglobal dct32_float_avx, 2,3,8, out, in, tmp
     RET
 %endif
 
+INIT_XMM
 %define BUTTERFLY  BUTTERFLY_SSE
 %define BUTTERFLY0 BUTTERFLY0_SSE
 
@@ -271,116 +272,116 @@ cglobal dct32_float_avx, 2,3,8, out, in, tmp
 cglobal dct32_float_sse, 2,3,8, out, in, tmp
     ; pass 1
 
-    movaps      xmm0, [inq+0]
-    movaps      xmm1, [inq+112]
-    shufps      xmm1, xmm1, 0x1b
-    BUTTERFLY   xmm0, xmm1, [ps_cos_vec], xmm3
+    movaps      m0, [inq+0]
+    movaps      m1, [inq+112]
+    shufps      m1, m1, 0x1b
+    BUTTERFLY   m0, m1, [ps_cos_vec], m3
 
-    movaps      xmm7, [inq+64]
-    movaps      xmm4, [inq+48]
-    shufps      xmm4, xmm4, 0x1b
-    BUTTERFLY   xmm7, xmm4, [ps_cos_vec+32], xmm3
+    movaps      m7, [inq+64]
+    movaps      m4, [inq+48]
+    shufps      m4, m4, 0x1b
+    BUTTERFLY   m7, m4, [ps_cos_vec+32], m3
 
     ; pass 2
-    movaps      xmm2, [ps_cos_vec+64]
-    BUTTERFLY   xmm1, xmm4, xmm2, xmm3
-    movaps      [outq+48], xmm1
-    movaps      [outq+0], xmm4
+    movaps      m2, [ps_cos_vec+64]
+    BUTTERFLY   m1, m4, m2, m3
+    movaps      [outq+48], m1
+    movaps      [outq+0], m4
 
     ; pass 1
-    movaps      xmm1, [inq+16]
-    movaps      xmm6, [inq+96]
-    shufps      xmm6, xmm6, 0x1b
-    BUTTERFLY   xmm1, xmm6, [ps_cos_vec+16], xmm3
+    movaps      m1, [inq+16]
+    movaps      m6, [inq+96]
+    shufps      m6, m6, 0x1b
+    BUTTERFLY   m1, m6, [ps_cos_vec+16], m3
 
-    movaps      xmm4, [inq+80]
-    movaps      xmm5, [inq+32]
-    shufps      xmm5, xmm5, 0x1b
-    BUTTERFLY   xmm4, xmm5, [ps_cos_vec+48], xmm3
+    movaps      m4, [inq+80]
+    movaps      m5, [inq+32]
+    shufps      m5, m5, 0x1b
+    BUTTERFLY   m4, m5, [ps_cos_vec+48], m3
 
     ; pass 2
-    BUTTERFLY   xmm0, xmm7, xmm2, xmm3
+    BUTTERFLY   m0, m7, m2, m3
 
-    movaps      xmm2, [ps_cos_vec+80]
-    BUTTERFLY   xmm6, xmm5, xmm2, xmm3
+    movaps      m2, [ps_cos_vec+80]
+    BUTTERFLY   m6, m5, m2, m3
 
-    BUTTERFLY   xmm1, xmm4, xmm2, xmm3
+    BUTTERFLY   m1, m4, m2, m3
 
     ; pass 3
-    movaps      xmm2, [ps_cos_vec+96]
-    shufps      xmm1, xmm1, 0x1b
-    BUTTERFLY   xmm0, xmm1, xmm2, xmm3
-    movaps      [outq+112], xmm0
-    movaps      [outq+96], xmm1
+    movaps      m2, [ps_cos_vec+96]
+    shufps      m1, m1, 0x1b
+    BUTTERFLY   m0, m1, m2, m3
+    movaps      [outq+112], m0
+    movaps      [outq+96], m1
 
-    movaps      xmm0, [outq+0]
-    shufps      xmm5, xmm5, 0x1b
-    BUTTERFLY   xmm0, xmm5, xmm2, xmm3
+    movaps      m0, [outq+0]
+    shufps      m5, m5, 0x1b
+    BUTTERFLY   m0, m5, m2, m3
 
-    movaps      xmm1, [outq+48]
-    shufps      xmm6, xmm6, 0x1b
-    BUTTERFLY   xmm1, xmm6, xmm2, xmm3
-    movaps      [outq+48], xmm1
+    movaps      m1, [outq+48]
+    shufps      m6, m6, 0x1b
+    BUTTERFLY   m1, m6, m2, m3
+    movaps      [outq+48], m1
 
-    shufps      xmm4, xmm4, 0x1b
-    BUTTERFLY   xmm7, xmm4, xmm2, xmm3
+    shufps      m4, m4, 0x1b
+    BUTTERFLY   m7, m4, m2, m3
 
     ; pass 4
-    movaps      xmm3, [ps_p1p1m1m1+0]
-    movaps      xmm2, [ps_cos_vec+128]
+    movaps      m3, [ps_p1p1m1m1+0]
+    movaps      m2, [ps_cos_vec+128]
 
-    BUTTERFLY2  xmm5, xmm3, xmm2, xmm1
+    BUTTERFLY2  m5, m3, m2, m1
 
-    BUTTERFLY2  xmm0, xmm3, xmm2, xmm1
-    movaps      [outq+16], xmm0
+    BUTTERFLY2  m0, m3, m2, m1
+    movaps      [outq+16], m0
 
-    BUTTERFLY2  xmm6, xmm3, xmm2, xmm1
-    movaps      [outq+32], xmm6
+    BUTTERFLY2  m6, m3, m2, m1
+    movaps      [outq+32], m6
 
-    movaps      xmm0, [outq+48]
-    BUTTERFLY2  xmm0, xmm3, xmm2, xmm1
-    movaps      [outq+48], xmm0
+    movaps      m0, [outq+48]
+    BUTTERFLY2  m0, m3, m2, m1
+    movaps      [outq+48], m0
 
-    BUTTERFLY2  xmm4, xmm3, xmm2, xmm1
+    BUTTERFLY2  m4, m3, m2, m1
 
-    BUTTERFLY2  xmm7, xmm3, xmm2, xmm1
+    BUTTERFLY2  m7, m3, m2, m1
 
-    movaps      xmm6, [outq+96]
-    BUTTERFLY2  xmm6, xmm3, xmm2, xmm1
+    movaps      m6, [outq+96]
+    BUTTERFLY2  m6, m3, m2, m1
 
-    movaps      xmm0, [outq+112]
-    BUTTERFLY2  xmm0, xmm3, xmm2, xmm1
+    movaps      m0, [outq+112]
+    BUTTERFLY2  m0, m3, m2, m1
 
     ; pass 5
-    movaps      xmm2, [ps_cos_vec+160]
-    shufps      xmm3, xmm3, 0xcc
+    movaps      m2, [ps_cos_vec+160]
+    shufps      m3, m3, 0xcc
 
-    BUTTERFLY3  xmm5, xmm3, xmm2, xmm1
-    movaps      [outq+0], xmm5
+    BUTTERFLY3  m5, m3, m2, m1
+    movaps      [outq+0], m5
 
-    movaps      xmm1, [outq+16]
-    BUTTERFLY3  xmm1, xmm3, xmm2, xmm5
-    movaps      [outq+96], xmm1
+    movaps      m1, [outq+16]
+    BUTTERFLY3  m1, m3, m2, m5
+    movaps      [outq+96], m1
 
-    BUTTERFLY3  xmm4, xmm3, xmm2, xmm5
-    movaps      [outq+64], xmm4
+    BUTTERFLY3  m4, m3, m2, m5
+    movaps      [outq+64], m4
 
-    BUTTERFLY3  xmm7, xmm3, xmm2, xmm5
-    movaps      [outq+80], xmm7
+    BUTTERFLY3  m7, m3, m2, m5
+    movaps      [outq+80], m7
 
-    movaps      xmm5, [outq+32]
-    BUTTERFLY3  xmm5, xmm3, xmm2, xmm7
-    movaps      [outq+32], xmm5
+    movaps      m5, [outq+32]
+    BUTTERFLY3  m5, m3, m2, m7
+    movaps      [outq+32], m5
 
-    movaps      xmm4, [outq+48]
-    BUTTERFLY3  xmm4, xmm3, xmm2, xmm7
-    movaps      [outq+48], xmm4
+    movaps      m4, [outq+48]
+    BUTTERFLY3  m4, m3, m2, m7
+    movaps      [outq+48], m4
 
-    BUTTERFLY3  xmm6, xmm3, xmm2, xmm7
-    movaps      [outq+16], xmm6
+    BUTTERFLY3  m6, m3, m2, m7
+    movaps      [outq+16], m6
 
-    BUTTERFLY3  xmm0, xmm3, xmm2, xmm7
-    movaps      [outq+112], xmm0
+    BUTTERFLY3  m0, m3, m2, m7
+    movaps      [outq+112], m0
 
 
     ;    pass 6, no SIMD...
-- 
1.7.4.1


From 0e3bff6d0e70555b213bfcad0de3212617f2702f Mon Sep 17 00:00:00 2001
From: Loren Merritt <lor...@u.washington.edu>
Date: Tue, 17 May 2011 09:00:52 +0000
Subject: [PATCH 2/2] dct32_sse: eliminate some spills
 125->104 cycles on penryn x86_64
 no speedup on x86_32

---
 libavcodec/x86/dct32_sse.asm |  239 +++++++++++++++++++++++++++++++++---------
 libavcodec/x86/fft.c         |    5 +-
 libavcodec/x86/fft.h         |    1 +
 libavcodec/x86/x86util.asm   |   20 ++++
 4 files changed, 214 insertions(+), 51 deletions(-)

diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
index 16a6dd5..20e2ee4 100644
--- a/libavcodec/x86/dct32_sse.asm
+++ b/libavcodec/x86/dct32_sse.asm
@@ -20,6 +20,7 @@
 ;******************************************************************************
 
 %include "x86inc.asm"
+%include "x86util.asm"
 %include "config.asm"
 
 SECTION_RODATA 32
@@ -37,8 +38,9 @@ ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043
             dd   1.000000,  1.000000,  1.306563,  0.541196
             dd   1.000000,  0.707107,  1.000000, -0.707107
             dd   1.000000,  0.707107,  1.000000, -0.707107
+            dd   0.707107,  0.707107,  0.707107,  0.707107
 
-
+align 32
 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
 
 %macro BUTTERFLY_SSE 4
@@ -77,6 +79,18 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
     BUTTERFLY0 %1, %2, %3, %4, 0xb1
 %endmacro
 
+%macro BUTTERFLY3V 5
+    movaps m%5, m%1
+    addps  m%1, m%2
+    subps  m%5, m%2
+    SWAP %2, %5
+    mulps  m%2, [ps_cos_vec+192]
+    movaps m%5, m%3
+    addps  m%3, m%4
+    subps  m%4, m%5
+    mulps  m%4, [ps_cos_vec+192]
+%endmacro
+
 %macro PASS6_AND_PERMUTE 0
     mov         tmpd, [outd+4]
     movss       xmm7, [outd+72]
@@ -268,8 +282,159 @@ INIT_XMM
 %define BUTTERFLY  BUTTERFLY_SSE
 %define BUTTERFLY0 BUTTERFLY0_SSE
 
+%macro SHUFL1_SSE2 2
+    pshuflw %1, %2, 0xe
+%endmacro
+
+%macro SHUFL1_SSE1 2 ; assumes args are the same
+    shufps %1, %1, 1
+%endmacro
+
+%macro PASS6A 16
+    movss [outq+0x00], %9
+    SHUFL1  %1, %9
+    movss [outq+0x10], %10
+    SHUFL1  %2, %10
+    movss [outq+0x20], %11
+    SHUFL1  %3, %11
+    movss [outq+0x30], %12
+    SHUFL1  %4, %12
+    movss [outq+0x40], %13
+    SHUFL1  %5, %13
+    movss [outq+0x50], %14
+    SHUFL1  %6, %14
+    movss [outq+0x60], %15
+    SHUFL1  %7, %15
+    movaps [outq+0x70], %16
+    SHUFL1  %8, %16
+    addss   %1, %2
+    addss   %2, %3
+    movss [outq+0x08], %1
+    addss   %3, %4
+    movss [outq+0x18], %2
+    addss   %4, %5
+    movss [outq+0x28], %3
+    addss   %5, %6
+    movss [outq+0x38], %4
+    addss   %6, %7
+    movss [outq+0x48], %5
+    addss   %7, %8
+    movss [outq+0x58], %6
+    movss [outq+0x68], %7
+    movss [outq+0x78], %8
+%endmacro
+
+%ifdef ARCH_X86_64
+%define SPILL SWAP
+%define UNSPILL SWAP
+
+%macro PASS5 0
+    nop ; FIXME code alignment
+    SWAP 5, 8
+    SWAP 4, 12
+    SWAP 6, 14
+    SWAP 7, 13
+    SWAP 0, 15
+    PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
+    TRANSPOSE4x4PS 8, 9, 10, 11, 0
+    BUTTERFLY3V    8, 9, 10, 11, 0
+    addps   m10, m11
+    TRANSPOSE4x4PS 12, 13, 14, 15, 0
+    BUTTERFLY3V    12, 13, 14, 15, 0
+    addps   m14, m15
+    addps   m12, m14
+    addps   m14, m13
+    addps   m13, m15
+%endmacro
+
+%macro PASS6 0
+    SWAP 9, 12
+    SWAP 11, 14
+    PASS6A m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15
+    PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
+    movhlps m0, m1
+    pshufd  m1, m1, 3
+    SWAP 0, 2, 4, 6, 8, 10, 12, 14
+    SWAP 1, 3, 5, 7, 9, 11, 13, 15
+%rep 7
+    movhlps m0, m1
+    pshufd  m1, m1, 3
+    addss   m15, m1
+    SWAP 0, 2, 4, 6, 8, 10, 12, 14
+    SWAP 1, 3, 5, 7, 9, 11, 13, 15
+%endrep
+%assign i 4
+%rep 15
+    addss m0, m1
+    movss [outq+i], m0
+    SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    %assign i i+8
+%endrep
+%endmacro
+
+%else ; ARCH_X86_32
+%macro SPILL 2 ; xmm#, mempos
+    movaps [outq+(%2-8)*16], m%1
+%endmacro
+%macro UNSPILL 2
+    movaps m%1, [outq+(%2-8)*16]
+%endmacro
+
+%macro PASS5 0
+    %assign pad 8*mmsize+(16-gprsize)-(stack_offset&15)
+    SUB rsp, pad
+    PERMUTE 0,5, 1,1, 2,4, 3,6, 4,2, 5,3, 6,7, 7,0
+    UNSPILL 1, 10
+    TRANSPOSE4x4PS 0, 1, 2, 3, 4
+    BUTTERFLY3V    0, 1, 2, 3, 4
+    addps   m2, m3
+    movaps [rsp+0x00], m0
+    movaps [rsp+0x40], m1
+    movaps [rsp+0x20], m2
+    movaps [rsp+0x60], m3
+    UNSPILL 4, 9
+    UNSPILL 5, 11
+    TRANSPOSE4x4PS 4, 5, 6, 7, 3
+    BUTTERFLY3V    4, 5, 6, 7, 3
+    addps   m6, m7
+    addps   m4, m6
+    addps   m6, m5
+    addps   m5, m7
+    movaps m3, [rsp+0x60]
+    movaps [rsp+0x10], m4
+    movaps [rsp+0x50], m5
+    movaps [rsp+0x30], m6
+    movaps [rsp+0x70], m7
+%endmacro
+
+%macro PASS6 0
+    SWAP 1, 4
+    SWAP 3, 6
+    PASS6A m0, m1, m2, m3, m4, m5, m6, m7, m0, m1, m2, m3, m4, m5, m6, m7
+    movss   m1, [rsp+0x0c]
+    movss   m0, [rsp+0x08]
+%assign i 0
+%rep 7
+    movss   m3, [rsp+0x1c+i]
+    movss   m2, [rsp+0x18+i]
+    addss   m1, m3
+    addss   m0, m1
+    addss   m1, m2
+    movss [outq+0x04+i], m0
+    movss [outq+0x0c+i], m1
+    SWAP 0, 2
+    SWAP 1, 3
+    %assign i i+16
+%endrep
+    addss   m0, m1
+    movss [outq+0x74], m0
+    ADD rsp, pad
+%endmacro
+%endif ; ARCH
+
+%macro DCT32_SSE 1
 ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
-cglobal dct32_float_sse, 2,3,8, out, in, tmp
+cglobal dct32_float_%1, 2,3,16, out, in, tmp
     ; pass 1
 
     movaps      m0, [inq+0]
@@ -285,8 +450,8 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
     ; pass 2
     movaps      m2, [ps_cos_vec+64]
     BUTTERFLY   m1, m4, m2, m3
-    movaps      [outq+48], m1
-    movaps      [outq+0], m4
+    SPILL 1, 11
+    SPILL 4, 8
 
     ; pass 1
     movaps      m1, [inq+16]
@@ -311,17 +476,17 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
     movaps      m2, [ps_cos_vec+96]
     shufps      m1, m1, 0x1b
     BUTTERFLY   m0, m1, m2, m3
-    movaps      [outq+112], m0
-    movaps      [outq+96], m1
+    SPILL 0, 15
+    SPILL 1, 14
 
-    movaps      m0, [outq+0]
+    UNSPILL 0, 8
     shufps      m5, m5, 0x1b
     BUTTERFLY   m0, m5, m2, m3
 
-    movaps      m1, [outq+48]
+    UNSPILL 1, 11
     shufps      m6, m6, 0x1b
     BUTTERFLY   m1, m6, m2, m3
-    movaps      [outq+48], m1
+    SPILL 1, 11
 
     shufps      m4, m4, 0x1b
     BUTTERFLY   m7, m4, m2, m3
@@ -333,57 +498,33 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
     BUTTERFLY2  m5, m3, m2, m1
 
     BUTTERFLY2  m0, m3, m2, m1
-    movaps      [outq+16], m0
+    SPILL 0, 9
 
     BUTTERFLY2  m6, m3, m2, m1
-    movaps      [outq+32], m6
+    SPILL 6, 10
 
-    movaps      m0, [outq+48]
+    UNSPILL 0, 11
     BUTTERFLY2  m0, m3, m2, m1
-    movaps      [outq+48], m0
+    SPILL 0, 11
 
     BUTTERFLY2  m4, m3, m2, m1
 
     BUTTERFLY2  m7, m3, m2, m1
 
-    movaps      m6, [outq+96]
+    UNSPILL 6, 14
     BUTTERFLY2  m6, m3, m2, m1
 
-    movaps      m0, [outq+112]
+    UNSPILL 0, 15
     BUTTERFLY2  m0, m3, m2, m1
 
-    ; pass 5
-    movaps      m2, [ps_cos_vec+160]
-    shufps      m3, m3, 0xcc
-
-    BUTTERFLY3  m5, m3, m2, m1
-    movaps      [outq+0], m5
-
-    movaps      m1, [outq+16]
-    BUTTERFLY3  m1, m3, m2, m5
-    movaps      [outq+96], m1
-
-    BUTTERFLY3  m4, m3, m2, m5
-    movaps      [outq+64], m4
-
-    BUTTERFLY3  m7, m3, m2, m5
-    movaps      [outq+80], m7
-
-    movaps      m5, [outq+32]
-    BUTTERFLY3  m5, m3, m2, m7
-    movaps      [outq+32], m5
-
-    movaps      m4, [outq+48]
-    BUTTERFLY3  m4, m3, m2, m7
-    movaps      [outq+48], m4
-
-    BUTTERFLY3  m6, m3, m2, m7
-    movaps      [outq+16], m6
-
-    BUTTERFLY3  m0, m3, m2, m7
-    movaps      [outq+112], m0
-
-
-    ;    pass 6, no SIMD...
-    PASS6_AND_PERMUTE
+    PASS5
+    PASS6
     RET
+%endmacro
+
+%ifndef ARCH_X86_64
+%define SHUFL1 SHUFL1_SSE1
+DCT32_SSE sse
+%endif
+%define SHUFL1 SHUFL1_SSE2
+DCT32_SSE sse2
diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
index 8eef421..f6d44d7 100644
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@@ -59,8 +59,9 @@ av_cold void ff_dct_init_mmx(DCTContext *s)
     int has_vectors = av_get_cpu_flags();
     if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX)
         s->dct32 = ff_dct32_float_avx;
-    else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE)
+    else if (has_vectors & AV_CPU_FLAG_SSE2 && HAVE_SSE)
+        s->dct32 = ff_dct32_float_sse2;
+    else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE && ARCH_X86_32)
         s->dct32 = ff_dct32_float_sse;
 }
 #endif
-
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index c714185..10feb89 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -35,6 +35,7 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
 void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
+void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
 void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
 
 #endif
diff --git a/libavcodec/x86/x86util.asm b/libavcodec/x86/x86util.asm
index 8c3fc87..c43e711 100644
--- a/libavcodec/x86/x86util.asm
+++ b/libavcodec/x86/x86util.asm
@@ -41,6 +41,13 @@
     SWAP %2, %4, %3
 %endmacro
 
+%macro SBUTTERFLYPS 3
+    movaps   m%3, m%1
+    unpcklps m%1, m%2
+    unpckhps m%3, m%2
+    SWAP %2, %3
+%endmacro
+
 %macro TRANSPOSE4x4B 5
     SBUTTERFLY bw, %1, %2, %5
     SBUTTERFLY bw, %3, %4, %5
@@ -74,6 +81,19 @@
     SWAP %2, %3
 %endmacro
 
+; identical behavior to TRANSPOSE4x4D, but using SSE1 float ops
+%macro TRANSPOSE4x4PS 5
+    SBUTTERFLYPS %1, %2, %5
+    SBUTTERFLYPS %3, %4, %5
+    movaps  m%5, m%1
+    movlhps m%1, m%3
+    movhlps m%3, m%5
+    movaps  m%5, m%2
+    movlhps m%2, m%4
+    movhlps m%4, m%5
+    SWAP %2, %3
+%endmacro
+
 %macro TRANSPOSE8x8W 9-11
 %ifdef ARCH_X86_64
     SBUTTERFLY wd,  %1, %2, %9
-- 
1.7.4.1

_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to