# HG changeset patch # User Jayashree # Date 1517283539 28800 # Mon Jan 29 19:38:59 2018 -0800 # Node ID 3c6e5ce07dbca7f967e4b5b62fe450979da3bf81 # Parent 624c83571d1df840e1206c46e589044fbf87ff32 x86: AVX512 'count_nonzero_16x16' avx-512 kernel, 22% speedup over avx2
count_nonzero[16x16] 18.88x -> 23.04x diff -r 624c83571d1d -r 3c6e5ce07dbc source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jan 12 12:40:16 2018 -0800 +++ b/source/common/x86/asm-primitives.cpp Mon Jan 29 19:38:59 2018 -0800 @@ -5375,6 +5375,7 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512); p.planecopy_sp_shl = PFX(upShift_16_avx512); + p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16x16_avx512); } #endif diff -r 624c83571d1d -r 3c6e5ce07dbc source/common/x86/pixel-util.h --- a/source/common/x86/pixel-util.h Fri Jan 12 12:40:16 2018 -0800 +++ b/source/common/x86/pixel-util.h Mon Jan 29 19:38:59 2018 -0800 @@ -61,4 +61,6 @@ uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)); uint32_t PFX(costCoeffNxN_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)); +int PFX(count_nonzero_16x16_avx512(const int16_t* quantCoeff)); + #endif // ifndef X265_PIXEL_UTIL_H diff -r 624c83571d1d -r 3c6e5ce07dbc source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Fri Jan 12 12:40:16 2018 -0800 +++ b/source/common/x86/pixel-util8.asm Mon Jan 29 19:38:59 2018 -0800 @@ -4,6 +4,7 @@ ;* Authors: Min Chen <chenm...@163.com> <min.c...@multicorewareinc.com> ;* Nabajit Deka <naba...@multicorewareinc.com> ;* Rajesh Paulraj <raj...@multicorewareinc.com> +;* Praveen Kumar Tiwari <prav...@multicorewareinc.com> ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -1857,6 +1858,30 @@ movd eax, xm0 RET 
+;-----------------------------------------------------------------------------
+; int x265_count_nonzero_16x16_avx512(const int16_t *quantCoeff);
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal count_nonzero_16x16, 1,4,2          ; counts the nonzero int16 coefficients of a 16x16 block; r0 is presumably quantCoeff (x86inc first argument) - confirm
+    mov             r1, 0xFFFFFFFFFFFFFFFF  ; all 64 bits set
+    kmovq           k2, r1                  ; k2 = all-ones write mask; NOTE(review): with every bit set it filters nothing, so the {k2} below looks redundant - confirm
+    xor             r3, r3                  ; r3 = running count of nonzero coefficients
+    pxor            m0, m0                  ; m0 = all-zero vector used as the comparison operand
+
+%assign x 0
+%rep 4
+    movu            m1, [r0 + x]            ; x = byte offset of the current 128-byte chunk; 4 chunks = 512 bytes = 256 int16 values
+    vpacksswb       m1, [r0 + x + 64]       ; pack words from both halves of the chunk into signed-saturated bytes; saturation keeps a nonzero word nonzero
+%assign x x+128
+    vpcmpb          k1 {k2}, m1, m0, 00000100b ; predicate 4 = not-equal: sets one k1 bit per byte that differs from zero
+    kmovq           r1, k1                  ; copy the 64-bit compare mask into a general-purpose register
+    popcnt          r2, r1                  ; number of set bits = nonzero coefficients in this chunk
+    add             r3d, r2d                ; accumulate into the running count
+%endrep
+    mov             eax, r3d                ; place the total in eax as the int return value
+
+    RET
+
 ;-----------------------------------------------------------------------------
 ; int x265_count_nonzero_32x32_sse2(const int16_t *quantCoeff);
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel