# HG changeset patch
# User Akil Ayyappan<[email protected]>
# Date 1551693998 -19800
#      Mon Mar 04 15:36:38 2019 +0530
# Node ID 19f27e0c8a6f8250af5e2f7c7984dc3b57bea7e4
# Parent  d12a4caf7963fd47d646040689ad5f02754ad879
x86: normFactor primitive
This patch adds AVX2 assembly for this primitive. Pushed to default branch of x265 repo Thanks & Regards, Dinesh On Tue, Mar 5, 2019 at 10:04 AM Akil <[email protected]> wrote: > # HG changeset patch > # User Akil Ayyappan<[email protected]> > # Date 1551693998 -19800 > # Mon Mar 04 15:36:38 2019 +0530 > # Node ID 19f27e0c8a6f8250af5e2f7c7984dc3b57bea7e4 > # Parent d12a4caf7963fd47d646040689ad5f02754ad879 > x86: normFactor primitive > > This patch adds AVX2 assembly for this primitive. > > |---------|-----------|-----------------|-----------------| > | Size |Performance|AVX2 clock cycles|CPP clock cycles | > |---------|-----------|-----------------|-----------------| > | [8x8] | 7.65x | 312.90 | 2394.83 | > | [16x16] | 8.42x | 1157.14 | 9741.56 | > | [32x32] | 9.56x | 3942.18 | 37692.20 | > | [64x64] | 8.96x | 15388.24 | 137889.28 | > |---------|-----------|-----------------|-----------------| > > diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/pixel.cpp > --- a/source/common/pixel.cpp Wed Feb 27 12:35:02 2019 +0530 > +++ b/source/common/pixel.cpp Mon Mar 04 15:36:38 2019 +0530 > @@ -959,6 +959,19 @@ > } > } > > +static void normFact_c(const pixel* src, uint32_t blockSize, int shift, > uint64_t *z_k) > +{ > + *z_k = 0; > + for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 1) > + { > + for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 1) > + { > + uint32_t temp = src[block_yy * blockSize + block_xx] >> shift; > + *z_k += temp * temp; > + } > + } > +} > + > #if HIGH_BIT_DEPTH > static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, > int height, uint64_t *outsum, > const pixel minPix, const pixel maxPix) > @@ -1314,5 +1327,10 @@ > p.cu[BLOCK_16x16].ssimDist = ssimDist_c<4>; > p.cu[BLOCK_32x32].ssimDist = ssimDist_c<5>; > p.cu[BLOCK_64x64].ssimDist = ssimDist_c<6>; > + > + p.cu[BLOCK_8x8].normFact = normFact_c; > + p.cu[BLOCK_16x16].normFact = normFact_c; > + p.cu[BLOCK_32x32].normFact = normFact_c; > + 
p.cu[BLOCK_64x64].normFact = normFact_c; > } > } > diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/primitives.h > --- a/source/common/primitives.h Wed Feb 27 12:35:02 2019 +0530 > +++ b/source/common/primitives.h Mon Mar 04 15:36:38 2019 +0530 > @@ -228,6 +228,7 @@ > typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff, int64_t > *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost,uint32_t > blkPos); > typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t > *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t > *totalRdCost, int64_t *psyScale, uint32_t blkPos); > typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, > const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, > uint64_t *ac_k); > +typedef void(*normFactor_t)(const pixel *src, uint32_t blockSize, int > shift, uint64_t *z_k); > /* Function pointers to optimized encoder primitives. Each pointer can > reference > * either an assembly routine, a SIMD intrinsic primitive, or a C > function */ > struct EncoderPrimitives > @@ -305,6 +306,7 @@ > psyRdoQuant_t1 psyRdoQuant_1p; > psyRdoQuant_t2 psyRdoQuant_2p; > ssimDistortion_t ssimDist; > + normFactor_t normFact; > } > cu[NUM_CU_SIZES]; > /* These remaining primitives work on either fixed block sizes or take > diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/x86/asm-primitives.cpp > --- a/source/common/x86/asm-primitives.cpp Wed Feb 27 12:35:02 2019 +0530 > +++ b/source/common/x86/asm-primitives.cpp Mon Mar 04 15:36:38 2019 +0530 > @@ -2325,6 +2325,11 @@ > p.cu[BLOCK_32x32].ssimDist = PFX(ssimDist32_avx2); > p.cu[BLOCK_64x64].ssimDist = PFX(ssimDist64_avx2); > > + p.cu[BLOCK_8x8].normFact = PFX(normFact8_avx2); > + p.cu[BLOCK_16x16].normFact = PFX(normFact16_avx2); > + p.cu[BLOCK_32x32].normFact = PFX(normFact32_avx2); > + p.cu[BLOCK_64x64].normFact = PFX(normFact64_avx2); > + > /* TODO: This kernel needs to be modified to work with > HIGH_BIT_DEPTH only > p.planeClipAndMax = 
PFX(planeClipAndMax_avx2); */ > > @@ -4718,6 +4723,11 @@ > p.cu[BLOCK_32x32].ssimDist = PFX(ssimDist32_avx2); > p.cu[BLOCK_64x64].ssimDist = PFX(ssimDist64_avx2); > > + p.cu[BLOCK_8x8].normFact = PFX(normFact8_avx2); > + p.cu[BLOCK_16x16].normFact = PFX(normFact16_avx2); > + p.cu[BLOCK_32x32].normFact = PFX(normFact32_avx2); > + p.cu[BLOCK_64x64].normFact = PFX(normFact64_avx2); > + > } > if (cpuMask & X265_CPU_AVX512) > { > diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/x86/pixel-a.asm > --- a/source/common/x86/pixel-a.asm Wed Feb 27 12:35:02 2019 +0530 > +++ b/source/common/x86/pixel-a.asm Mon Mar 04 15:36:38 2019 +0530 > @@ -388,6 +388,16 @@ > vpaddq m7, m6 > %endmacro > > +%macro NORM_FACT_COL 1 > + vpsrld m1, m0, SSIMRD_SHIFT > + vpmuldq m2, m1, m1 > + vpsrldq m1, m1, 4 > + vpmuldq m1, m1, m1 > + > + vpaddq m1, m2 > + vpaddq m3, m1 > +%endmacro > + > ; FIXME avoid the spilling of regs to hold 3*stride. > ; for small blocks on x86_32, modify pixel pointer instead. > > @@ -16303,3 +16313,266 @@ > movq [r4], xm4 > movq [r6], xm7 > RET > + > + > +;static void normFact_c(const pixel* src, uint32_t blockSize, int shift, > uint64_t *z_k) > +;{ > +; *z_k = 0; > +; for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 1) > +; { > +; for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 1) > +; { > +; uint32_t temp = src[block_yy * blockSize + block_xx] >> > shift; > +; *z_k += temp * temp; > +; } > +; } > +;} > > +;-------------------------------------------------------------------------------------- > +; void normFact_c(const pixel* src, uint32_t blockSize, int shift, > uint64_t *z_k) > > +;-------------------------------------------------------------------------------------- > +INIT_YMM avx2 > +cglobal normFact8, 4, 5, 6 > + mov r4d, 8 > + vpxor m3, m3 ;z_k > + vpxor m5, m5 > +.row: > +%if HIGH_BIT_DEPTH > + vpmovzxwd m0, [r0] ;src > +%elif BIT_DEPTH == 8 > + vpmovzxbd m0, [r0] > +%else > + %error Unsupported BIT_DEPTH! 
> +%endif > + > + NORM_FACT_COL m0 > + > +%if HIGH_BIT_DEPTH > + lea r0, [r0 + 2 * r1] > +%else > + lea r0, [r0 + r1] > +%endif > + dec r4d > + jnz .row > + vextracti128 xm4, m3, 1 > + vpaddq xm3, xm4 > + punpckhqdq xm2, xm3, xm5 > + paddq xm3, xm2 > + movq [r3], xm3 > + RET > + > + > +INIT_YMM avx2 > +cglobal normFact16, 4, 5, 6 > + mov r4d, 16 > + vpxor m3, m3 ;z_k > + vpxor m5, m5 > +.row: > +;Col 1-8 > +%if HIGH_BIT_DEPTH > + vpmovzxwd m0, [r0] ;src > +%elif BIT_DEPTH == 8 > + vpmovzxbd m0, [r0] > +%else > + %error Unsupported BIT_DEPTH! > +%endif > + > + NORM_FACT_COL m0 > + > +;Col 9-16 > +%if HIGH_BIT_DEPTH > + vpmovzxwd m0, [r0 + 16] ;src > +%elif BIT_DEPTH == 8 > + vpmovzxbd m0, [r0 + 8] > +%else > + %error Unsupported BIT_DEPTH! > +%endif > + > + NORM_FACT_COL m0 > + > +%if HIGH_BIT_DEPTH > + lea r0, [r0 + 2 * r1] > +%else > + lea r0, [r0 + r1] > +%endif > + dec r4d > + jnz .row > + vextracti128 xm4, m3, 1 > + vpaddq xm3, xm4 > + punpckhqdq xm2, xm3, xm5 > + paddq xm3, xm2 > + movq [r3], xm3 > + RET > + > + > +INIT_YMM avx2 > +cglobal normFact32, 4, 5, 6 > + mov r4d, 32 > + vpxor m3, m3 ;z_k > + vpxor m5, m5 > +.row: > +;Col 1-8 > +%if HIGH_BIT_DEPTH > + vpmovzxwd m0, [r0] ;src > +%elif BIT_DEPTH == 8 > + vpmovzxbd m0, [r0] > +%else > + %error Unsupported BIT_DEPTH! > +%endif > + > + NORM_FACT_COL m0 > + > +;Col 9-16 > +%if HIGH_BIT_DEPTH > + vpmovzxwd m0, [r0 + 16] ;src > +%elif BIT_DEPTH == 8 > + vpmovzxbd m0, [r0 + 8] > +%else > + %error Unsupported BIT_DEPTH! > +%endif > + > + NORM_FACT_COL m0 > + > +;Col 17-24 > +%if HIGH_BIT_DEPTH > + vpmovzxwd m0, [r0 + 32] ;src > +%elif BIT_DEPTH == 8 > + vpmovzxbd m0, [r0 + 16] > +%else > + %error Unsupported BIT_DEPTH! > +%endif > + > + NORM_FACT_COL m0 > + > +;Col 25-32 > +%if HIGH_BIT_DEPTH > + vpmovzxwd m0, [r0 + 48] ;src > +%elif BIT_DEPTH == 8 > + vpmovzxbd m0, [r0 + 24] > +%else > + %error Unsupported BIT_DEPTH! 
> +%endif > + > + NORM_FACT_COL m0 > + > +%if HIGH_BIT_DEPTH > + lea r0, [r0 + 2 * r1] > +%else > + lea r0, [r0 + r1] > +%endif > + dec r4d > + jnz .row > + vextracti128 xm4, m3, 1 > + vpaddq xm3, xm4 > + punpckhqdq xm2, xm3, xm5 > + paddq xm3, xm2 > + movq [r3], xm3 > + RET > + > + > +INIT_YMM avx2 > +cglobal normFact64, 4, 5, 6 > + mov r4d, 64 > + vpxor m3, m3 ;z_k > + vpxor m5, m5 > +.row: > +;Col 1-8 > +%if HIGH_BIT_DEPTH > + vpmovzxwd m0, [r0] ;src > +%elif BIT_DEPTH == 8 > + vpmovzxbd m0, [r0] > +%else > + %error Unsupported BIT_DEPTH! > +%endif > + > + NORM_FACT_COL m0 > + > +;Col 9-16 > +%if HIGH_BIT_DEPTH > + vpmovzxwd m0, [r0 + 16] ;src > +%elif BIT_DEPTH == 8 > + vpmovzxbd m0, [r0 + 8] > +%else > + %error Unsupported BIT_DEPTH! > +%endif > + > + NORM_FACT_COL m0 > + > +;Col 17-24 > +%if HIGH_BIT_DEPTH > + vpmovzxwd m0, [r0 + 32] ;src > +%elif BIT_DEPTH == 8 > + vpmovzxbd m0, [r0 + 16] > +%else > + %error Unsupported BIT_DEPTH! > +%endif > + > + NORM_FACT_COL m0 > + > +;Col 25-32 > +%if HIGH_BIT_DEPTH > + vpmovzxwd m0, [r0 + 48] ;src > +%elif BIT_DEPTH == 8 > + vpmovzxbd m0, [r0 + 24] > +%else > + %error Unsupported BIT_DEPTH! > +%endif > + > + NORM_FACT_COL m0 > + > +;Col 33-40 > +%if HIGH_BIT_DEPTH > + vpmovzxwd m0, [r0 + 64] ;src > +%elif BIT_DEPTH == 8 > + vpmovzxbd m0, [r0 + 32] > +%else > + %error Unsupported BIT_DEPTH! > +%endif > + > + NORM_FACT_COL m0 > + > +;Col 41-48 > +%if HIGH_BIT_DEPTH > + vpmovzxwd m0, [r0 + 80] ;src > +%elif BIT_DEPTH == 8 > + vpmovzxbd m0, [r0 + 40] > +%else > + %error Unsupported BIT_DEPTH! > +%endif > + > + NORM_FACT_COL m0 > + > +;Col 49-56 > +%if HIGH_BIT_DEPTH > + vpmovzxwd m0, [r0 + 96] ;src > +%elif BIT_DEPTH == 8 > + vpmovzxbd m0, [r0 + 48] > +%else > + %error Unsupported BIT_DEPTH! > +%endif > + > + NORM_FACT_COL m0 > + > +;Col 57-64 > +%if HIGH_BIT_DEPTH > + vpmovzxwd m0, [r0 + 112] ;src > +%elif BIT_DEPTH == 8 > + vpmovzxbd m0, [r0 + 56] > +%else > + %error Unsupported BIT_DEPTH! 
> +%endif > + > + NORM_FACT_COL m0 > + > +%if HIGH_BIT_DEPTH > + lea r0, [r0 + 2 * r1] > +%else > + lea r0, [r0 + r1] > +%endif > + dec r4d > + jnz .row > + vextracti128 xm4, m3, 1 > + vpaddq xm3, xm4 > + punpckhqdq xm2, xm3, xm5 > + paddq xm3, xm2 > + movq [r3], xm3 > + RET > diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/x86/pixel.h > --- a/source/common/x86/pixel.h Wed Feb 27 12:35:02 2019 +0530 > +++ b/source/common/x86/pixel.h Mon Mar 04 15:36:38 2019 +0530 > @@ -61,7 +61,8 @@ > FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \ > FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t > sstride, const pixel* recon, intptr_t rstride); \ > FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t > sstride, const int16_t* recon, intptr_t rstride); \ > - FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t > fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int > shift, uint64_t *ac_k) > + FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t > fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int > shift, uint64_t *ac_k); \ > + FUNCDEF_TU_S2(void, normFact, cpu, const pixel *src, uint32_t > blockSize, int shift, uint64_t *z_k) > > DECL_PIXELS(mmx); > DECL_PIXELS(mmx2); > diff -r d12a4caf7963 -r 19f27e0c8a6f source/encoder/analysis.cpp > --- a/source/encoder/analysis.cpp Wed Feb 27 12:35:02 2019 +0530 > +++ b/source/encoder/analysis.cpp Mon Mar 04 15:36:38 2019 +0530 > @@ -3696,14 +3696,8 @@ > > // 2. 
Calculate ac component > uint64_t z_k = 0; > - for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 1) > - { > - for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 1) > - { > - uint32_t temp = src[block_yy * blockSize + block_xx] >> shift; > - z_k += temp * temp; > - } > - } > + int block = (int)((log(blockSize) / log(2)) - 2); > + primitives.cu[block].normFact(src, blockSize, shift, &z_k); > > // Remove the DC part > z_k -= z_o; > diff -r d12a4caf7963 -r 19f27e0c8a6f source/test/pixelharness.cpp > --- a/source/test/pixelharness.cpp Wed Feb 27 12:35:02 2019 +0530 > +++ b/source/test/pixelharness.cpp Mon Mar 04 15:36:38 2019 +0530 > @@ -2296,6 +2296,30 @@ > return true; > } > > +bool PixelHarness::check_normFact(normFactor_t ref, normFactor_t opt, int > block) > +{ > + int shift = X265_DEPTH - 8; > + uint64_t opt_dest = 0, ref_dest = 0; > + int j = 0; > + int blockSize = 4 << block; > + > + for (int i = 0; i < ITERS; i++) > + { > + int index = i % TEST_CASES; > + ref(pixel_test_buff[index] + j, blockSize, shift, &ref_dest); > + opt(pixel_test_buff[index] + j, blockSize, shift, &opt_dest); > + > + if (opt_dest != ref_dest) > + { > + return false; > + } > + > + reportfail() > + j += INCR; > + } > + return true; > +} > + > bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const > EncoderPrimitives& opt) > { > if (opt.pu[part].satd) > @@ -3129,6 +3153,18 @@ > } > } > > + for (int i = BLOCK_8x8; i < NUM_CU_SIZES; i++) > + { > + if (opt.cu[i].normFact) > + { > + if (!check_normFact(ref.cu[i].normFact, opt.cu[i].normFact, > i)) > + { > + printf("\nnormFact[%dx%d] failed!\n", 4 << i, 4 << i); > + return false; > + } > + } > + } > + > return true; > } > > @@ -3769,4 +3805,16 @@ > REPORT_SPEEDUP(opt.integral_inith[k], ref.integral_inith[k], > dst_buf, pbuf1, STRIDE); > } > } > + > + for (int i = BLOCK_8x8; i < NUM_CU_SIZES; i++) > + { > + if (opt.cu[i].normFact) > + { > + uint64_t dst = 0; > + int blockSize = 4 << i; > + int shift = 
X265_DEPTH - 8; > + printf("normFact[%dx%d]", blockSize, blockSize); > + REPORT_SPEEDUP(opt.cu[i].normFact, ref.cu[i].normFact, > pixel_test_buff[0], blockSize, shift, &dst); > + } > + } > } > diff -r d12a4caf7963 -r 19f27e0c8a6f source/test/pixelharness.h > --- a/source/test/pixelharness.h Wed Feb 27 12:35:02 2019 +0530 > +++ b/source/test/pixelharness.h Mon Mar 04 15:36:38 2019 +0530 > @@ -137,6 +137,7 @@ > bool check_integral_initv(integralv_t ref, integralv_t opt); > bool check_integral_inith(integralh_t ref, integralh_t opt); > bool check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt); > + bool check_normFact(normFactor_t ref, normFactor_t opt, int block); > > public: > > > > -- > *Regards,* > *Akil R* > _______________________________________________ > x265-devel mailing list > [email protected] > https://mailman.videolan.org/listinfo/x265-devel >
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel
