On Sun, 24 Jul 2011, Ronald S. Bultje wrote:

> +%macro SCALE_16BITS_FUNC 2
> +INIT_XMM
> +cglobal hscale%1_%2, 7, 7, 6
> +%ifdef ARCH_X86_64
> +    movsxd        r2, r2d
> +%endif
> +%ifidn %2, sse4
> +    movdqa        m2, [max_19bit_int]
> +%else
> +    movdqa        m2, [max_19bit_flt]
> +%endif
> +    pxor          m3, m3
> +    cmp          r6d, 8
> +    je .scale8
> +    jg .scaleX
> +
> +    ; filterSize == 4 scaling
> +    lea           r1, [r1+r2*4]
> +    lea           r4, [r4+r2*8]
> +    lea           r5, [r5+r2*2]
> +    neg           r2
> +.loop4:
> +    movsx         r0, word [r5+r2*2+0]  ; filterPos[0]
> +    movsx         r6, word [r5+r2*2+2]  ; filterPos[1]
> +    movq          m0, [r3+r0*2]         ; src[filterPos[0] + {0,1,2,3}]
> +    movhps        m0, [r3+r6*2]         ; src[filterPos[1] + {0,1,2,3}]
> +    movsx         r0, word [r5+r2*2+4]  ; filterPos[2]
> +    movsx         r6, word [r5+r2*2+6]  ; filterPos[3]
> +    movq          m1, [r3+r0*2]         ; src[filterPos[2] + {0,1,2,3}]
> +    movhps        m1, [r3+r6*2]         ; src[filterPos[3] + {0,1,2,3}]
> +    pmaddwd       m0, [r4+r2*8+ 0]      ; *= filter[{0,1,..,6,7}]
> +    pmaddwd       m1, [r4+r2*8+16]      ; *= filter[{8,9,..,14,15}]
> +%ifidn %2, sse2
> +    pshufd        m4, m0, 00001101b
> +    pshufd        m0, m0, 00001000b
> +    pshufd        m5, m1, 00001101b
> +    pshufd        m1, m1, 00001000b
> +    paddd         m0, m4
> +    paddd         m1, m5
> +    movlhps       m0, m1

mova   m4, m0
shufps m0, m1, 10001000b
shufps m4, m1, 11011101b
paddd  m0, m4

This might also be faster than phaddd on Conroe. The same applies to all
the other cases of sse2 vs. phaddd.

> +%else ; ssse3/sse4
> +    phaddd        m0, m1                ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
> +                                        ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
> +                                        ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
> +                                        ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
> +%endif
> +    psrad         m0, %1 - 5
> +    CLIPD         m0, m3, m2
> +    movdqu [r1+r2*4], m0
> +    add           r2, 4
> +    jl .loop4
> +    REP_RET
> +
> +.scale8:
> +    shl           r2, 1                 ; this allows *16 (i.e. now *8) in lea instructions
> +    lea           r1, [r1+r2*2]
> +    lea           r4, [r4+r2*8]
> +    lea           r5, [r5+r2*1]
> +    neg           r2
> +.loop8:
> +    movsx         r0, word [r5+r2*1+0]  ; filterPos[0]
> +    movsx         r6, word [r5+r2*1+2]  ; filterPos[1]
> +    movdqu        m0, [r3+r0*2]         ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
> +    movdqu        m1, [r3+r6*2]         ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
> +    pmaddwd       m0, [r4+r2*8   ]      ; *= filter[{0,1,..,6,7}]
> +    pmaddwd       m1, [r4+r2*8+16]      ; *= filter[{8,9,..,14,15}]
> +%ifidn %2, sse2
> +    movhlps       m4, m0
> +    paddd         m0, m4
> +    movhlps       m5, m1
> +    paddd         m1, m5
> +    movlhps       m0, m1
> +    pshufd        m1, m0, 00001101b
> +    pshufd        m0, m0, 00001000b
> +    paddd         m0, m1

mova      m4, m0
punpckldq m0, m1
punpckhdq m4, m1
paddd     m0, m4
movhlps   m1, m0
paddd     m0, m1

> +%else ; ssse3/sse4
> +    phaddd        m0, m1
> +    phaddd        m0, m0                ; filter[{0,1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
> +                                        ; filter[{8,9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
> +%endif
> +    psrad         m0, %1 - 5
> +    CLIPD         m0, m3, m2
> +    movq   [r1+r2*2], m0
> +    add           r2, 4                 ; it really only does 2px, see "shl r2,1" above
> +    jl .loop8
> +    REP_RET
> +
> +.scaleX:
> +%ifdef ARCH_X86_64
> +    push         r12
> +    movsxd        r6, r6d
> +    lea          r12, [r3+r6*2]         ; &src[filterSize]
> +%define src_reg r11
> +%define r1x     r10
> +%define filter2 r12
> +%else
> +    lea           r0, [r3+r6*2]         ; &src[filterSize]
> +    mov          r6m, r0
> +%define src_reg r3
> +%define r1x     r1
> +%define filter2 r6m
> +%endif
> +    lea           r5, [r5+r2*2]
> +    lea           r1, [r1+r2*4]
> +%ifdef ARCH_X86_32
> +    mov          r1m, r1
> +%endif

movifnidn r1mp, r1

> +    neg           r2
> +.loopX:
> +    movsx         r0, word [r5+r2*2+0]  ; filterPos[0]
> +    movsx        r1x, word [r5+r2*2+2]  ; filterPos[1]
> +    pxor          m4, m4
> +%ifdef ARCH_X86_64
> +    mov      src_reg, r3
> +%else
> +    mov      src_reg, r3m
> +%endif

"mov src_reg, r3mp" covers both cases, so the %ifdef is not needed.

--Loren Merritt
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to