Re: [x265] [PATCH x265] Add AVX2 assembly code for ssimDistortion primitive.
# HG changeset patch # User Akil Ayyappan # Date 1551251102 -19800 # Wed Feb 27 12:35:02 2019 +0530 # Node ID d12a4caf7963fd47d646040689ad5f02754ad879 # Parent cb3e172a5f51c6a4bf8adb7953fe53277f5a1979 x86: ssimDistortion primitive This patch adds AVX2 assembly for this primitive. Pushed patch to default branch of x265 repo. Thanks & Regards, Dinesh On Tue, Mar 5, 2019 at 10:03 AM Akil wrote: > > # HG changeset patch > # User Akil Ayyappan > # Date 1551251102 -19800 > # Wed Feb 27 12:35:02 2019 +0530 > # Node ID d12a4caf7963fd47d646040689ad5f02754ad879 > # Parent cb3e172a5f51c6a4bf8adb7953fe53277f5a1979 > x86: ssimDistortion primitive > > This patch adds AVX2 assembly for this primitive. > > |-|---|-|-| > | Size |Performance|AVX2 clock cycles|CPP clock cycles | > |-|---|-|-| > | [4x4] | 3.52x | 264.43 | 932.05 | > | [8x8] | 5.11x | 619.24 | 3163.56 | > | [16x16] | 5.44x | 2114.00 | 11490.52| > | [32x32] | 6.01x | 7589.70 | 45608.01| > | [64x64] | 6.70x | 27859.21| 186634.25 | > |-|---|-|-| > > diff -r cb3e172a5f51 -r d12a4caf7963 source/common/pixel.cpp > --- a/source/common/pixel.cpp Tue Feb 19 20:20:35 2019 +0530 > +++ b/source/common/pixel.cpp Wed Feb 27 12:35:02 2019 +0530 > @@ -934,6 +934,31 @@ > } > } > > +template<int log2TrSize> > +static void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel* > recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k) > +{ > +*ssBlock = 0; > +const uint32_t trSize = 1 << log2TrSize; > +for (int y = 0; y < trSize; y++) > +{ > +for (int x = 0; x < trSize; x++) > +{ > +int temp = fenc[y * fStride + x] - recon[y * rstride + x]; // > copy of residual coeff > +*ssBlock += temp * temp; > +} > +} > + > +*ac_k = 0; > +for (int block_yy = 0; block_yy < trSize; block_yy += 1) > +{ > +for (int block_xx = 0; block_xx < trSize; block_xx += 1) > +{ > +uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift; > +*ac_k += temp * temp; > +} > +} > +} > + > #if HIGH_BIT_DEPTH > static pixel planeClipAndMax_c(pixel *src, intptr_t 
stride, int width, > int height, uint64_t *outsum, > const pixel minPix, const pixel maxPix) > @@ -1283,5 +1308,11 @@ > p.propagateCost = estimateCUPropagateCost; > p.fix8Unpack = cuTreeFix8Unpack; > p.fix8Pack = cuTreeFix8Pack; > + > +p.cu[BLOCK_4x4].ssimDist = ssimDist_c<2>; > +p.cu[BLOCK_8x8].ssimDist = ssimDist_c<3>; > +p.cu[BLOCK_16x16].ssimDist = ssimDist_c<4>; > +p.cu[BLOCK_32x32].ssimDist = ssimDist_c<5>; > +p.cu[BLOCK_64x64].ssimDist = ssimDist_c<6>; > } > } > diff -r cb3e172a5f51 -r d12a4caf7963 source/common/primitives.h > --- a/source/common/primitives.h Tue Feb 19 20:20:35 2019 +0530 > +++ b/source/common/primitives.h Wed Feb 27 12:35:02 2019 +0530 > @@ -227,6 +227,7 @@ > typedef void(*psyRdoQuant_t)(int16_t *m_resiDctCoeff, int16_t > *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t > *totalRdCost, int64_t *psyScale, uint32_t blkPos); > typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff, int64_t > *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost,uint32_t > blkPos); > typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t > *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t > *totalRdCost, int64_t *psyScale, uint32_t blkPos); > +typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, > const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, > uint64_t *ac_k); > /* Function pointers to optimized encoder primitives. 
Each pointer can > reference > * either an assembly routine, a SIMD intrinsic primitive, or a C > function */ > struct EncoderPrimitives > @@ -303,6 +304,7 @@ > psyRdoQuant_t psyRdoQuant; > psyRdoQuant_t1 psyRdoQuant_1p; > psyRdoQuant_t2 psyRdoQuant_2p; > +ssimDistortion_t ssimDist; > } > cu[NUM_CU_SIZES]; > /* These remaining primitives work on either fixed block sizes or take > diff -r cb3e172a5f51 -r d12a4caf7963 source/common/quant.cpp > --- a/source/common/quant.cpp Tue Feb 19 20:20:35 2019 +0530 > +++ b/source/common/quant.cpp Wed Feb 27 12:35:02 2019 +0530 > @@ -501,15 +501,8 @@ > > // Calculation of (X(k) - Y(k)) * (X(k) - Y(k)), AC > ssBlock = 0; > -for (int y = 0; y < trSize; y++) > -{ > -for (int x = 0; x < trSize; x++) > -{ > -int temp = fenc[y * fStride + x] - recon[y * rstride + x]; // > copy of residual coeff > -ssBlock += temp * temp; > -} > -} > - > +uint64_t ac_k = 0; > +primitives.cu[log2TrSize - 2].ssimDist(fenc,
[x265] [PATCH x265] Add AVX2 assembly code for ssimDistortion primitive.
# HG changeset patch # User Akil Ayyappan # Date 1551251102 -19800 # Wed Feb 27 12:35:02 2019 +0530 # Node ID d12a4caf7963fd47d646040689ad5f02754ad879 # Parent cb3e172a5f51c6a4bf8adb7953fe53277f5a1979 x86: ssimDistortion primitive This patch adds AVX2 assembly for this primitive. |-|---|-|-| | Size |Performance|AVX2 clock cycles|CPP clock cycles | |-|---|-|-| | [4x4] | 3.52x | 264.43 | 932.05 | | [8x8] | 5.11x | 619.24 | 3163.56 | | [16x16] | 5.44x | 2114.00 | 11490.52| | [32x32] | 6.01x | 7589.70 | 45608.01| | [64x64] | 6.70x | 27859.21| 186634.25 | |-|---|-|-| diff -r cb3e172a5f51 -r d12a4caf7963 source/common/pixel.cpp --- a/source/common/pixel.cpp Tue Feb 19 20:20:35 2019 +0530 +++ b/source/common/pixel.cpp Wed Feb 27 12:35:02 2019 +0530 @@ -934,6 +934,31 @@ } } +template<int log2TrSize> +static void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k) +{ +*ssBlock = 0; +const uint32_t trSize = 1 << log2TrSize; +for (int y = 0; y < trSize; y++) +{ +for (int x = 0; x < trSize; x++) +{ +int temp = fenc[y * fStride + x] - recon[y * rstride + x]; // copy of residual coeff +*ssBlock += temp * temp; +} +} + +*ac_k = 0; +for (int block_yy = 0; block_yy < trSize; block_yy += 1) +{ +for (int block_xx = 0; block_xx < trSize; block_xx += 1) +{ +uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift; +*ac_k += temp * temp; +} +} +} + #if HIGH_BIT_DEPTH static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix) @@ -1283,5 +1308,11 @@ p.propagateCost = estimateCUPropagateCost; p.fix8Unpack = cuTreeFix8Unpack; p.fix8Pack = cuTreeFix8Pack; + +p.cu[BLOCK_4x4].ssimDist = ssimDist_c<2>; +p.cu[BLOCK_8x8].ssimDist = ssimDist_c<3>; +p.cu[BLOCK_16x16].ssimDist = ssimDist_c<4>; +p.cu[BLOCK_32x32].ssimDist = ssimDist_c<5>; +p.cu[BLOCK_64x64].ssimDist = ssimDist_c<6>; } } diff -r cb3e172a5f51 -r d12a4caf7963 source/common/primitives.h --- 
a/source/common/primitives.h Tue Feb 19 20:20:35 2019 +0530 +++ b/source/common/primitives.h Wed Feb 27 12:35:02 2019 +0530 @@ -227,6 +227,7 @@ typedef void(*psyRdoQuant_t)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos); typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost,uint32_t blkPos); typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos); +typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k); /* Function pointers to optimized encoder primitives. Each pointer can reference * either an assembly routine, a SIMD intrinsic primitive, or a C function */ struct EncoderPrimitives @@ -303,6 +304,7 @@ psyRdoQuant_t psyRdoQuant; psyRdoQuant_t1 psyRdoQuant_1p; psyRdoQuant_t2 psyRdoQuant_2p; +ssimDistortion_t ssimDist; } cu[NUM_CU_SIZES]; /* These remaining primitives work on either fixed block sizes or take diff -r cb3e172a5f51 -r d12a4caf7963 source/common/quant.cpp --- a/source/common/quant.cpp Tue Feb 19 20:20:35 2019 +0530 +++ b/source/common/quant.cpp Wed Feb 27 12:35:02 2019 +0530 @@ -501,15 +501,8 @@ // Calculation of (X(k) - Y(k)) * (X(k) - Y(k)), AC ssBlock = 0; -for (int y = 0; y < trSize; y++) -{ -for (int x = 0; x < trSize; x++) -{ -int temp = fenc[y * fStride + x] - recon[y * rstride + x]; // copy of residual coeff -ssBlock += temp * temp; -} -} - +uint64_t ac_k = 0; +primitives.cu[log2TrSize - 2].ssimDist(fenc, fStride, recon, rstride, &ssBlock, shift, &ac_k); ssAc = ssBlock - ssDc; // 1. Calculation of fdc' @@ -535,15 +528,6 @@ uint64_t fAc_num = 0; // 2. 
Calculate ac component -uint64_t ac_k = 0; -for (int block_yy = 0; block_yy < trSize; block_yy += 1) -{ -for (int block_xx = 0; block_xx < trSize; block_xx += 1) -{ -uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift; -ac_k += temp * temp; -} -} ac_k -= dc_k; double s = 1 + 0.005 * cu.m_qp[absPartIdx]; diff -r cb3e172a5f51 -r d12a4caf7963 source/common/x86/asm-primitives.cpp ---