On Thu, Jun 8, 2017 at 6:51 PM, <jayas...@multicorewareinc.com> wrote:
> # HG changeset patch
> # User Jayashri Murugan <jayas...@multicorewareinc.com>
> # Date 1496062876 -19800
> # Mon May 29 18:31:16 2017 +0530
> # Node ID dd8b50b2d936a5f21972563e117101f82a1ae9c4
> # Parent e75d5f5eeae3413057437af9f7d3ba9bc10fa3fa
> avx2: 'integral4h' asm code -> 6.01x faster than 'C' version
>
> integral_init4h 6.01x 353.40 2122.37
>
Pushed to default branch of x265.
>
> diff -r e75d5f5eeae3 -r dd8b50b2d936 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Mon Jun 05 15:20:44 2017
> +0530
> +++ b/source/common/x86/asm-primitives.cpp Mon May 29 18:31:16 2017
> +0530
> @@ -3709,6 +3709,7 @@
> p.integral_initv[INTEGRAL_16] = PFX(integral16v_avx2);
> p.integral_initv[INTEGRAL_24] = PFX(integral24v_avx2);
> p.integral_initv[INTEGRAL_32] = PFX(integral32v_avx2);
> + p.integral_inith[INTEGRAL_4] = PFX(integral4h_avx2);
>
> }
> #endif
> diff -r e75d5f5eeae3 -r dd8b50b2d936 source/common/x86/seaintegral.asm
> --- a/source/common/x86/seaintegral.asm Mon Jun 05 15:20:44 2017 +0530
> +++ b/source/common/x86/seaintegral.asm Mon May 29 18:31:16 2017 +0530
> @@ -152,8 +152,67 @@
> ;static void integral_init4h_c(uint32_t *sum, pixel *pix, intptr_t stride)
> ;-----------------------------------------------------------
> ------------------
> INIT_YMM avx2
> -cglobal integral4h, 3, 3, 0
> -
> +
> +%macro INTEGRAL_FOUR_HORIZONTAL_16 0
> + pmovzxbw m0, [r1]
> + pmovzxbw m1, [r1 + 1]
> + paddw m0, m1
> + pmovzxbw m1, [r1 + 2]
> + paddw m0, m1
> + pmovzxbw m1, [r1 + 3]
> + paddw m0, m1
> +%endmacro
> +
> +cglobal integral4h, 3, 5, 3
> + lea r3, [4 * r2]
> + sub r0, r3
> + sub r2, 4 ;stride - 4
> + mov r4, r2
> + shr r4, 4
> +
> +.loop_16:
> + INTEGRAL_FOUR_HORIZONTAL_16
> + vperm2i128 m2, m0, m0, 1
> + pmovzxwd m2, xm2
> + pmovzxwd m0, xm0
> + movu m1, [r0]
> + paddd m0, m1
> + movu [r0 + r3], m0
> + movu m1, [r0 + 32]
> + paddd m2, m1
> + movu [r0 + r3 + 32], m2
> + add r1, 16
> + add r0, 64
> + sub r2, 16
> + sub r4, 1
> + jnz .loop_16
> + cmp r2, 12
> + je .loop_12
> + cmp r2, 4
> + je .loop_4
> +
> +.loop_12:
> + INTEGRAL_FOUR_HORIZONTAL_16
> + vperm2i128 m2, m0, m0, 1
> + pmovzxwd xm2, xm2
> + pmovzxwd m0, xm0
> + movu m1, [r0]
> + paddd m0, m1
> + movu [r0 + r3], m0
> + movu xm1, [r0 + 32]
> + paddd xm2, xm1
> + movu [r0 + r3 + 32], xm2
> + jmp .end
> +
> +.loop_4:
> + INTEGRAL_FOUR_HORIZONTAL_16
> + pmovzxwd xm0, xm0
> + movu xm1, [r0]
> + paddd xm0, xm1
> + movu [r0 + r3], xm0
> + jmp .end
> +
> +.end
> RET
>
> ;-----------------------------------------------------------
> ------------------
> _______________________________________________
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel