On Sun, 24 Jul 2011, Ronald S. Bultje wrote:

> +%macro SCALE_16BITS_FUNC 2
> +INIT_XMM
> +cglobal hscale%1_%2, 7, 7, 6
> +%ifdef ARCH_X86_64
> +    movsxd r2, r2d
> +%endif
> +%ifidn %2, sse4
> +    movdqa m2, [max_19bit_int]
> +%else
> +    movdqa m2, [max_19bit_flt]
> +%endif
> +    pxor m3, m3
> +    cmp r6d, 8
> +    je .scale8
> +    jg .scaleX
> +
> +    ; filterSize == 4 scaling
> +    lea r1, [r1+r2*4]
> +    lea r4, [r4+r2*8]
> +    lea r5, [r5+r2*2]
> +    neg r2
> +.loop4:
> +    movsx r0, word [r5+r2*2+0] ; filterPos[0]
> +    movsx r6, word [r5+r2*2+2] ; filterPos[2]
> +    movq m0, [r3+r0*2] ; src[filterPos[0] + {0,1,2,3}]
> +    movhps m0, [r3+r6*2] ; src[filterPos[2] + {0,1,2,3}]
> +    movsx r0, word [r5+r2*2+4] ; filterPos[1]
> +    movsx r6, word [r5+r2*2+6] ; filterPos[3]
> +    movq m1, [r3+r0*2] ; src[filterPos[1] + {0,1,2,3}]
> +    movhps m1, [r3+r6*2] ; src[filterPos[3] + {0,1,2,3}]
> +    pmaddwd m0, [r4+r2*8+ 0] ; *= filter[{0,1,..,6,7}]
> +    pmaddwd m1, [r4+r2*8+16] ; *= filter[{8,9,..,14,15}]
> +%ifidn %2, sse2
> +    pshufd m4, m0, 00001101b
> +    pshufd m0, m0, 00001000b
> +    pshufd m5, m1, 00001101b
> +    pshufd m1, m1, 00001000b
> +    paddd m0, m4
> +    paddd m1, m5
> +    movlhps m0, m1

mova m4, m0
shufps m0, m1, 10001000b
shufps m4, m1, 11011101b
paddd m0, m4

Might also be faster than phadd on conroe.
Likewise for all the other cases of sse2 vs phadd.

> +%else ; ssse3/sse4
> +    phaddd m0, m1 ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
> +                  ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
> +                  ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
> +                  ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
> +%endif
> +    psrad m0, %1 - 5
> +    CLIPD m0, m3, m2
> +    movdqu [r1+r2*4], m0
> +    add r2, 4
> +    jl .loop4
> +    REP_RET
> +
> +.scale8:
> +    shl r2, 1 ; this allows *16 (i.e. now *8) in lea instructions
> +    lea r1, [r1+r2*2]
> +    lea r4, [r4+r2*8]
> +    lea r5, [r5+r2*1]
> +    neg r2
> +.loop8:
> +    movsx r0, word [r5+r2*1+0] ; filterPos[0]
> +    movsx r6, word [r5+r2*1+2] ; filterPos[1]
> +    movdqu m0, [r3+r0*2] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
> +    movdqu m1, [r3+r6*2] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
> +    pmaddwd m0, [r4+r2*8 ] ; *= filter[{0,1,..,6,7}]
> +    pmaddwd m1, [r4+r2*8+16] ; *= filter[{8,9,..,14,15}]
> +%ifidn %2, sse2
> +    movhlps m4, m0
> +    paddd m0, m4
> +    movhlps m5, m1
> +    paddd m1, m5
> +    movlhps m0, m1
> +    pshufd m1, m0, 00001101b
> +    pshufd m0, m0, 00001000b
> +    paddd m0, m1

mova m4, m0
punpckldq m0, m1
punpckhdq m4, m1
paddd m0, m4
movhlps m1, m0
paddd m0, m1

> +%else ; ssse3/sse4
> +    phaddd m0, m1
> +    phaddd m0, m0 ; filter[{0,1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
> +                  ; filter[{8,9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
> +%endif
> +    psrad m0, %1 - 5
> +    CLIPD m0, m3, m2
> +    movq [r1+r2*2], m0
> +    add r2, 4 ; it really only does 2px, see "shl r2,1" above
> +    jl .loop8
> +    REP_RET
> +
> +.scaleX:
> +%ifdef ARCH_X86_64
> +    push r12
> +    movsxd r6, r6d
> +    lea r12, [r3+r6*2] ; &src[filterSize]
> +%define src_reg r11
> +%define r1x r10
> +%define filter2 r12
> +%else
> +    lea r0, [r3+r6*2] ; &src[filterSize]
> +    mov r6m, r0
> +%define src_reg r3
> +%define r1x r1
> +%define filter2 r6m
>
+%endif
> +    lea r5, [r5+r2*2]
> +    lea r1, [r1+r2*4]
> +%ifdef ARCH_X86_32
> +    mov r1m, r1
> +%endif

movifnidn r1mp, r1

> +    neg r2
> +.loopX:
> +    movsx r0, word [r5+r2*2+0] ; filterPos[0]
> +    movsx r1x, word [r5+r2*2+2] ; filterPos[1]
> +    pxor m4, m4
> +%ifdef ARCH_X86_64
> +    mov src_reg, r3
> +%else
> +    mov src_reg, r3m
> +%endif

r3mp covers both cases

--Loren Merritt
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel