# HG changeset patch # User Jayashri Murugan <jayas...@multicorewareinc.com> # Date 1513143310 -19800 # Wed Dec 13 11:05:10 2017 +0530 # Node ID 458b708e6d17aafb49a5fd369b2e9540d0268726 # Parent ab5b1becd807647d5264381c1fb74750c20fdfae x86: Aligned routine implementation for cpy1Dto2D_shl primitive
1. cpy1Dto2D_shl optimization 2. Aligned code implementation 3. Linking with encoder diff -r ab5b1becd807 -r 458b708e6d17 source/common/pixel.cpp --- a/source/common/pixel.cpp Fri Dec 08 14:00:59 2017 +0530 +++ b/source/common/pixel.cpp Wed Dec 13 11:05:10 2017 +0530 @@ -1004,7 +1004,8 @@ p.cu[BLOCK_ ## W ## x ## H].blockfill_s[ALIGNED] = blockfill_s_c<W>; \ p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl<W>; \ p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shr = cpy2Dto1D_shr<W>; \ - p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl = cpy1Dto2D_shl<W>; \ + p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl<W>; \ + p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl<W>; \ p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shr = cpy1Dto2D_shr<W>; \ p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp = psyCost_pp<BLOCK_ ## W ## x ## H>; \ p.cu[BLOCK_ ## W ## x ## H].transpose = transpose<W>; \ diff -r ab5b1becd807 -r 458b708e6d17 source/common/primitives.h --- a/source/common/primitives.h Fri Dec 08 14:00:59 2017 +0530 +++ b/source/common/primitives.h Wed Dec 13 11:05:10 2017 +0530 @@ -280,9 +280,8 @@ count_nonzero_t count_nonzero; cpy2Dto1D_shl_t cpy2Dto1D_shl; cpy2Dto1D_shr_t cpy2Dto1D_shr; - cpy1Dto2D_shl_t cpy1Dto2D_shl; + cpy1Dto2D_shl_t cpy1Dto2D_shl[NUM_ALIGNMENT_TYPES]; cpy1Dto2D_shr_t cpy1Dto2D_shr; - copy_sp_t copy_sp; copy_ps_t copy_ps; copy_ss_t copy_ss; diff -r ab5b1becd807 -r 458b708e6d17 source/common/quant.cpp --- a/source/common/quant.cpp Fri Dec 08 14:00:59 2017 +0530 +++ b/source/common/quant.cpp Wed Dec 13 11:05:10 2017 +0530 @@ -560,13 +560,11 @@ uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig) { const uint32_t sizeIdx = log2TrSize - 2; - if (cu.m_tqBypass[0]) { - primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, coeff, resiStride, 0); + primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, coeff, resiStride, 0); return; } - // Values need to pass as input parameter in dequant int rem =
m_qpParam[ttype].rem; int per = m_qpParam[ttype].per; @@ -595,7 +593,7 @@ if (transformShift > 0) primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift); else - primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, m_resiDctCoeff, resiStride, -transformShift); + primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, m_resiDctCoeff, resiStride, -transformShift); #endif } else diff -r ab5b1becd807 -r 458b708e6d17 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Dec 08 14:00:59 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Dec 13 11:05:10 2017 +0530 @@ -989,7 +989,7 @@ ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2); ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, sse2); ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2); - ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2); + ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, sse2); ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2); ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2); #if X86_64 @@ -1692,11 +1692,9 @@ ASSIGN2(p.cu[BLOCK_16x16].blockfill_s, blockfill_s_16x16_avx2); ASSIGN2(p.cu[BLOCK_32x32].blockfill_s, blockfill_s_32x32_avx2); - ALL_LUMA_TU(count_nonzero, count_nonzero, avx2); - ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2); + ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, avx2); ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2); - p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_avx2); p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx2); p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx2); @@ -2526,10 +2524,10 @@ p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512); p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512); p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx512); - p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512); p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); - p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); + 
p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32_avx512); + p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_aligned_32_avx512); p.weight_pp = PFX(weight_pp_avx512); p.weight_sp = PFX(weight_sp_avx512); p.dequant_normal = PFX(dequant_normal_avx512); @@ -3196,10 +3194,9 @@ ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2); ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2); ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2); - ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2); + ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, sse2); ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2); ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2); - ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse2); ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2); @@ -3794,12 +3791,9 @@ p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_avx2); p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx2); p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx2); - ASSIGN2(p.cu[BLOCK_16x16].blockfill_s, blockfill_s_16x16_avx2); - - ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2); + ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, avx2); ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2); - p.cu[BLOCK_8x8].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_8_avx2); p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx2); p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx2); @@ -4861,7 +4855,8 @@ p.cu[BLOCK_32x32].calcresidual[ALIGNED] = PFX(getResidual_aligned32_avx512); p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512); p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); - p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); + p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32_avx512); + p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_aligned_32_avx512); p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512); p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512); diff -r ab5b1becd807 -r 458b708e6d17 
source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Fri Dec 08 14:00:59 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Wed Dec 13 11:05:10 2017 +0530 @@ -5599,17 +5599,58 @@ add r2d, r2d movd xm0, r3d lea r3, [3 * r2] - +%rep 3 PROCESS_CPY1Dto2D_SHL_32x8_AVX512 add r1, 4 * mmsize lea r0, [r0 + r2 * 4] +%endrep PROCESS_CPY1Dto2D_SHL_32x8_AVX512 + RET + +%macro PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512 0 + mova m1, [r1 + 0 * mmsize] + mova m2, [r1 + 1 * mmsize] + mova m3, [r1 + 2 * mmsize] + mova m4, [r1 + 3 * mmsize] + psllw m1, xm0 + psllw m2, xm0 + psllw m3, xm0 + psllw m4, xm0 + mova [r0], m1 + mova [r0 + r2], m2 + mova [r0 + 2 * r2], m3 + mova [r0 + r3], m4 + + add r1, 4 * mmsize + lea r0, [r0 + r2 * 4] + + mova m1, [r1 + 0 * mmsize] + mova m2, [r1 + 1 * mmsize] + mova m3, [r1 + 2 * mmsize] + mova m4, [r1 + 3 * mmsize] + psllw m1, xm0 + psllw m2, xm0 + psllw m3, xm0 + psllw m4, xm0 + mova [r0], m1 + mova [r0 + r2], m2 + mova [r0 + 2 * r2], m3 + mova [r0 + r3], m4 +%endmacro +;-------------------------------------------------------------------------------------- +; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) +;-------------------------------------------------------------------------------------- +INIT_ZMM avx512 +cglobal cpy1Dto2D_shl_aligned_32, 4, 4, 5 + add r2d, r2d + movd xm0, r3d + lea r3, [3 * r2] +%rep 3 + PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512 add r1, 4 * mmsize lea r0, [r0 + r2 * 4] - PROCESS_CPY1Dto2D_SHL_32x8_AVX512 - add r1, 4 * mmsize - lea r0, [r0 + r2 * 4] - PROCESS_CPY1Dto2D_SHL_32x8_AVX512 +%endrep + PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512 RET ;-------------------------------------------------------------------------------------- ; copy_cnt avx512 code end diff -r ab5b1becd807 -r 458b708e6d17 source/common/x86/blockcopy8.h --- a/source/common/x86/blockcopy8.h Fri Dec 08 14:00:59 2017 +0530 +++ b/source/common/x86/blockcopy8.h Wed Dec 13 11:05:10 2017 +0530 @@ -38,7 +38,7 @@ 
FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); - +FUNCDEF_TU_S(void, cpy1Dto2D_shl_aligned, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); FUNCDEF_TU_S(void, cpy1Dto2D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); diff -r ab5b1becd807 -r 458b708e6d17 source/test/pixelharness.cpp --- a/source/test/pixelharness.cpp Fri Dec 08 14:00:59 2017 +0530 +++ b/source/test/pixelharness.cpp Wed Dec 13 11:05:10 2017 +0530 @@ -469,12 +469,10 @@ return true; } - bool PixelHarness::check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt) { - ALIGN_VAR_16(int16_t, ref_dest[64 * 64]); - ALIGN_VAR_16(int16_t, opt_dest[64 * 64]); - + ALIGN_VAR_64(int16_t, ref_dest[64 * 64]); + ALIGN_VAR_64(int16_t, opt_dest[64 * 64]); memset(ref_dest, 0xCD, sizeof(ref_dest)); memset(opt_dest, 0xCD, sizeof(opt_dest)); @@ -497,6 +495,33 @@ return true; } +bool PixelHarness::check_cpy1Dto2D_shl_aligned_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt) +{ + ALIGN_VAR_64(int16_t, ref_dest[64 * 64]); + ALIGN_VAR_64(int16_t, opt_dest[64 * 64]); + + memset(ref_dest, 0xCD, sizeof(ref_dest)); + memset(opt_dest, 0xCD, sizeof(opt_dest)); + + int j = 0; + intptr_t stride = STRIDE; + for (int i = 0; i < ITERS; i++) + { + int shift = (rand() % 7 + 1); + + int index = i % TEST_CASES; + checked(opt, opt_dest, short_test_buff[index] + j, stride, shift); + ref(ref_dest, short_test_buff[index] + j, stride, shift); + + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) + return 
false; + + reportfail(); + j += INCR + 32; + } + + return true; +} bool PixelHarness::check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt) { @@ -2597,15 +2622,22 @@ return false; } } - - if (opt.cu[i].cpy1Dto2D_shl) + if (opt.cu[i].cpy1Dto2D_shl[NONALIGNED]) { - if (!check_cpy1Dto2D_shl_t(ref.cu[i].cpy1Dto2D_shl, opt.cu[i].cpy1Dto2D_shl)) + if (!check_cpy1Dto2D_shl_t(ref.cu[i].cpy1Dto2D_shl[NONALIGNED], opt.cu[i].cpy1Dto2D_shl[NONALIGNED])) { printf("cpy1Dto2D_shl[%dx%d] failed!\n", 4 << i, 4 << i); return false; } } + if (opt.cu[i].cpy1Dto2D_shl[ALIGNED]) + { + if (!check_cpy1Dto2D_shl_aligned_t(ref.cu[i].cpy1Dto2D_shl[ALIGNED], opt.cu[i].cpy1Dto2D_shl[ALIGNED])) + { + printf("cpy1Dto2D_shl_aligned[%dx%d] failed!\n", 4 << i, 4 << i); + return false; + } + } if (opt.cu[i].cpy1Dto2D_shr) { @@ -3270,13 +3302,17 @@ HEADER("cpy2Dto1D_shr[%dx%d]", 4 << i, 4 << i); REPORT_SPEEDUP(opt.cu[i].cpy2Dto1D_shr, ref.cu[i].cpy2Dto1D_shr, sbuf1, sbuf2, STRIDE, 3); } - - if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shl) + if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shl[NONALIGNED]) { HEADER("cpy1Dto2D_shl[%dx%d]", 4 << i, 4 << i); - REPORT_SPEEDUP(opt.cu[i].cpy1Dto2D_shl, ref.cu[i].cpy1Dto2D_shl, sbuf1, sbuf2, STRIDE, 64); + REPORT_SPEEDUP(opt.cu[i].cpy1Dto2D_shl[NONALIGNED], ref.cu[i].cpy1Dto2D_shl[NONALIGNED], sbuf1, sbuf2, STRIDE, 64); } + if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shl[ALIGNED]) + { + HEADER("cpy1Dto2D_shl_aligned[%dx%d]", 4 << i, 4 << i); + REPORT_SPEEDUP(opt.cu[i].cpy1Dto2D_shl[ALIGNED], ref.cu[i].cpy1Dto2D_shl[ALIGNED], sbuf1, sbuf2, STRIDE, 64); + } if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shr) { HEADER("cpy1Dto2D_shr[%dx%d]", 4 << i, 4 << i); diff -r ab5b1becd807 -r 458b708e6d17 source/test/pixelharness.h --- a/source/test/pixelharness.h Fri Dec 08 14:00:59 2017 +0530 +++ b/source/test/pixelharness.h Wed Dec 13 11:05:10 2017 +0530 @@ -97,6 +97,7 @@ bool check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt); bool
check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt); bool check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt); + bool check_cpy1Dto2D_shl_aligned_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt); bool check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt); bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt); bool check_pixel_var(var_t ref, var_t opt); _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel