ac3_extract_exponents: 6316 -> 5677 cycles on Penryn.
--Loren Merritt
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index 4dcfe50..be3bccd 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -35,7 +35,7 @@ pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
; used in ff_ac3_extract_exponents()
pd_1: times 4 dd 1
pd_151: times 4 dd 151
-pb_shuf_4dwb: db 0, 4, 8, 12, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+pb_shuf_4dwb: db 0, 4, 8, 12
SECTION .text
@@ -369,15 +369,18 @@ cglobal ac3_compute_mantissa_size_sse2, 1,2,4, mant_cnt, sum
%macro AC3_EXTRACT_EXPONENTS 1
cglobal ac3_extract_exponents_%1, 3,3,5, exp, coef, len
+ add expq, lenq
+ lea coefq, [coefq+4*lenq]
+ neg lenq
mova m2, [pd_1]
mova m3, [pd_151]
%ifidn %1, ssse3 ;
- mova m4, [pb_shuf_4dwb]
+ movd m4, [pb_shuf_4dwb]
%endif
ALIGN 16
.loop:
; move 4 32-bit coefs to xmm0
- mova m0, [coefq]
+ mova m0, [coefq+4*lenq]
; absolute value
PABSD m0, m1
; convert to float and extract exponents
@@ -394,13 +397,11 @@ cglobal ac3_extract_exponents_%1, 3,3,5, exp, coef, len
packssdw m0, m0
packuswb m0, m0
%endif
- movd [expq], m0
+ movd [expq+lenq], m0
- add coefq, 16
- add expq, 4
- sub lend, 4
- jg .loop
- RET
+ add lenq, 4
+ jl .loop
+ REP_RET
%endmacro
INIT_XMM
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel