---------- Forwarded message ----------
From: <aasaipr...@multicorewareinc.com>
Date: Mon, Jun 29, 2015 at 4:51 PM
Subject: [x265] [PATCH] asm: avx2 code for weight_sp() 16bpp
To: x265-devel@videolan.org
# HG changeset patch
# User Aasaipriya Chandran <aasaipr...@multicorewareinc.com>
# Date 1435562395 -19800
#      Mon Jun 29 12:49:55 2015 +0530
# Node ID bebe4e496a432608cf0a9c495debd1970caa387e
# Parent  9feee64efa440c25f016d15ae982789e5393a77e
asm: avx2 code for weight_sp() 16bpp

avx2: weight_sp  11.37x   4496.63   51139.20
sse4: weight_sp   6.48x   8163.87   52870.36

diff -r 9feee64efa44 -r bebe4e496a43 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp      Fri Jun 26 15:29:51 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp      Mon Jun 29 12:49:55 2015 +0530
@@ -1517,6 +1517,7 @@
         p.scale1D_128to64 = PFX(scale1D_128to64_avx2);
         p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
         p.weight_pp = PFX(weight_pp_avx2);
+        p.weight_sp = PFX(weight_sp_avx2);
         p.sign = PFX(calSign_avx2);
         p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);

diff -r 9feee64efa44 -r bebe4e496a43 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Fri Jun 26 15:29:51 2015 +0530
+++ b/source/common/x86/pixel-util8.asm Mon Jun 29 12:49:55 2015 +0530
@@ -1674,8 +1674,128 @@
     dec             r5d
     jnz             .loopH
     RET
-
-%if ARCH_X86_64
+%endif
+
+
+%if HIGH_BIT_DEPTH
+INIT_YMM avx2
+cglobal weight_sp, 6,7,9
+    mova            m1, [pw_1023]
+    mova            m2, [pw_1]
+    mov             r6d, r7m

r7 is the 8th register (GPRs are numbered 0-7), so this should be
"cglobal weight_sp, 6, 8, 9" and ARCH_X86_64-only code.

+    shl             r6d, 16
+    or              r6d, r6m
+    vpbroadcastd    m3, r6d            ; m3 = [round w0]
+    movd            xm4, r8m           ; m4 = [shift]
+    vpbroadcastd    m5, r9m            ; m5 = [offset]
+
+    ; correct row stride
+    add             r3d, r3d
+    add             r2d, r2d
+    mov             r6d, r4d
+    and             r6d, ~(mmsize / SIZEOF_PIXEL - 1)
+    sub             r3d, r6d
+    sub             r3d, r6d
+    sub             r2d, r6d
+    sub             r2d, r6d
+
+    ; generate partial width mask (MUST BE IN YMM0)
+    mov             r6d, r4d
+    and             r6d, (mmsize / SIZEOF_PIXEL - 1)
+    movd            xm0, r6d
+    pshuflw         m0, m0, 0
+    punpcklqdq      m0, m0
+    vinserti128     m0, m0, xm0, 1
+    pcmpgtw         m0, [pw_0_15]      ; word i = -1 iff (width % 16) > i
+
+.loopH:
+    mov             r6d, r4d
+
+.loopW:
+    movu            m6, [r0]
+    paddw           m6, [pw_2000]      ; bias the signed 16-bit input by 0x2000
+
+    punpcklwd       m7, m6, m2
+    pmaddwd         m7, m3             ; (round w0)
+    psrad           m7, xm4            ; (shift)
+    paddd           m7, m5             ; (offset)
+
+    punpckhwd       m6, m2
+    pmaddwd         m6, m3
+    psrad           m6, xm4
+    paddd           m6, m5
+
+    packusdw        m7, m6
+    pminuw          m7, m1             ; clamp to the 10-bit maximum (1023)
+
+    sub             r6d, (mmsize / SIZEOF_PIXEL)
+    jl              .width14
+    movu            [r1], m7
+    lea             r0, [r0 + mmsize]
+    lea             r1, [r1 + mmsize]
+    je              .nextH
+    jmp             .loopW
+
+.width14:
+    add             r6d, 16
+    cmp             r6d, 14
+    jl              .width12
+    movu            [r1], xm7
+    vextracti128    xm8, m7, 1
+    movq            [r1 + 16], xm8
+    pextrd          [r1 + 24], xm8, 2
+    je              .nextH
+
+.width12:
+    cmp             r6d, 12
+    jl              .width10
+    movu            [r1], xm7
+    vextracti128    xm8, m7, 1
+    movq            [r1 + 16], xm8
+    je              .nextH
+
+.width10:
+    cmp             r6d, 10
+    jl              .width8
+    movu            [r1], xm7
+    vextracti128    xm8, m7, 1
+    movd            [r1 + 16], xm8
+    je              .nextH
+
+.width8:
+    cmp             r6d, 8
+    jl              .width6
+    movu            [r1], xm7
+    je              .nextH
+
+.width6:
+    cmp             r6d, 6
+    jl              .width4
+    movq            [r1], xm7
+    pextrd          [r1 + 8], xm7, 2
+    je              .nextH
+
+.width4:
+    cmp             r6d, 4
+    jl              .width2
+    movq            [r1], xm7
+    je              .nextH
+    add             r1, 4
+    pshufd          m6, m6, 1
+    je              .nextH
+
+.width2:
+    movd            [r1], xm7
+
+.nextH:
+    add             r0, r2
+    add             r1, r3
+
+    dec             r5d
+    jnz             .loopH
+    RET
+
+%else
 INIT_YMM avx2
 cglobal weight_sp, 6, 9, 7
     mov             r7d, r7m
@@ -1752,8 +1872,6 @@
     jnz             .loopH
     RET
 %endif
-%endif ; end of (HIGH_BIT_DEPTH == 0)
-
 ;-----------------------------------------------------------------
 ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
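
For readers who don't parse AVX2 easily, here is a scalar C++ model of what
the kernel above computes, reconstructed from the vector ops. This is a
sketch, not the x265 C reference; the names are illustrative, and the 1023
clip bound follows the pw_1023 constant, i.e. a 10-bit HIGH_BIT_DEPTH build.

    #include <algorithm>
    #include <cstdint>

    typedef uint16_t pixel;   // 16bpp output samples (HIGH_BIT_DEPTH)

    // Sketch of the per-pixel math in the avx2 loop. Strides here are in
    // elements; the asm doubles r2/r3 to convert them to byte strides.
    static void weight_sp_model(const int16_t* src, pixel* dst,
                                intptr_t srcStride, intptr_t dstStride,
                                int width, int height,
                                int w0, int round, int shift, int offset)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                // paddw m6, [pw_2000]: bias the signed 16-bit input by 0x2000
                int t = src[x] + 0x2000;

                // punpck?wd + pmaddwd with m3 = [round w0] pairs (t, 1) with
                // (w0, round), giving t * w0 + round; psrad/paddd then apply
                // the arithmetic shift and the offset
                int d = ((t * w0 + round) >> shift) + offset;

                // packusdw saturates below at 0; pminuw [pw_1023] caps at 1023
                dst[x] = (pixel)std::min(std::max(d, 0), 1023);
            }
            src += srcStride;
            dst += dstStride;
        }
    }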
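
The "partial width mask" setup broadcasts the width remainder to all 16 word
lanes and compares it against a lane-index table, so lane i is all-ones
exactly when it belongs to the partial column. A sketch of the equivalent
logic, assuming pw_0_15 holds the word sequence {0, 1, ..., 15}:

    #include <cstdint>

    // Equivalent of the movd/pshuflw/punpcklqdq/vinserti128 broadcast
    // followed by pcmpgtw m0, [pw_0_15].
    void build_partial_width_mask(uint16_t mask[16], int width)
    {
        int rem = width & 15;                 // and r6d, (mmsize / SIZEOF_PIXEL - 1)
        for (int i = 0; i < 16; i++)
            mask[i] = (rem > i) ? 0xFFFF : 0; // pcmpgtw: -1 where rem > lane index
    }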
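
Finally, the .width14 through .width2 ladder is a remainder-store dispatcher:
after adding 16 back to the (negative) counter, it writes the last few pixels
of the row with progressively narrower stores (movu/movq/movd/pextrd) for the
even remainders 14, 12, 10, 8, 6, 4, 2. Conceptually it amounts to this sketch
(names are illustrative):

    #include <cstdint>
    #include <cstring>

    // 'out' holds the 16 clipped results of the final vector iteration;
    // only the leading 'rem' pixels belong to this row, so the asm picks
    // a store sequence that writes exactly rem * 2 bytes.
    static void store_tail(uint16_t* dst, const uint16_t out[16], int rem)
    {
        memcpy(dst, out, (size_t)rem * sizeof(uint16_t));
    }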
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel