Re: [x265] [PATCH x265] Add AVX2 assembly code for ssimDistortion primitive.

2019-03-07 Thread Dinesh Kumar Reddy
# HG changeset patch
# User Akil Ayyappan
# Date 1551251102 -19800
#  Wed Feb 27 12:35:02 2019 +0530
# Node ID d12a4caf7963fd47d646040689ad5f02754ad879
# Parent  cb3e172a5f51c6a4bf8adb7953fe53277f5a1979
x86: ssimDistortion primitive

This patch adds AVX2 assembly for this primitive.

Pushed patch to default branch of x265 repo.

Thanks & Regards,
Dinesh

On Tue, Mar 5, 2019 at 10:03 AM Akil wrote:

>
> # HG changeset patch
> # User Akil Ayyappan
> # Date 1551251102 -19800
> #  Wed Feb 27 12:35:02 2019 +0530
> # Node ID d12a4caf7963fd47d646040689ad5f02754ad879
> # Parent  cb3e172a5f51c6a4bf8adb7953fe53277f5a1979
> x86: ssimDistortion primitive
>
> This patch adds AVX2 assembly for this primitive.
>
> |---------|-----------|-----------------|-----------------|
> |  Size   |Performance|AVX2 clock cycles|CPP clock cycles |
> |---------|-----------|-----------------|-----------------|
> | [4x4]   |   3.52x   | 264.43          | 932.05          |
> | [8x8]   |   5.11x   | 619.24          | 3163.56         |
> | [16x16] |   5.44x   | 2114.00         | 11490.52        |
> | [32x32] |   6.01x   | 7589.70         | 45608.01        |
> | [64x64] |   6.70x   | 27859.21        | 186634.25       |
> |---------|-----------|-----------------|-----------------|
>
> diff -r cb3e172a5f51 -r d12a4caf7963 source/common/pixel.cpp
> --- a/source/common/pixel.cpp Tue Feb 19 20:20:35 2019 +0530
> +++ b/source/common/pixel.cpp Wed Feb 27 12:35:02 2019 +0530
> @@ -934,6 +934,31 @@
>  }
>  }
>
> +template<int log2TrSize>
> +static void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel*
> recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
> +{
> +*ssBlock = 0;
> +const uint32_t trSize = 1 << log2TrSize;
> +for (int y = 0; y < trSize; y++)
> +{
> +for (int x = 0; x < trSize; x++)
> +{
> +int temp = fenc[y * fStride + x] - recon[y * rstride + x]; //
> copy of residual coeff
> +*ssBlock += temp * temp;
> +}
> +}
> +
> +*ac_k = 0;
> +for (int block_yy = 0; block_yy < trSize; block_yy += 1)
> +{
> +for (int block_xx = 0; block_xx < trSize; block_xx += 1)
> +{
> +uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift;
> +*ac_k += temp * temp;
> +}
> +}
> +}
> +
>  #if HIGH_BIT_DEPTH
>  static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width,
> int height, uint64_t *outsum,
> const pixel minPix, const pixel maxPix)
> @@ -1283,5 +1308,11 @@
>  p.propagateCost = estimateCUPropagateCost;
>  p.fix8Unpack = cuTreeFix8Unpack;
>  p.fix8Pack = cuTreeFix8Pack;
> +
> +p.cu[BLOCK_4x4].ssimDist = ssimDist_c<2>;
> +p.cu[BLOCK_8x8].ssimDist = ssimDist_c<3>;
> +p.cu[BLOCK_16x16].ssimDist = ssimDist_c<4>;
> +p.cu[BLOCK_32x32].ssimDist = ssimDist_c<5>;
> +p.cu[BLOCK_64x64].ssimDist = ssimDist_c<6>;
>  }
>  }
> diff -r cb3e172a5f51 -r d12a4caf7963 source/common/primitives.h
> --- a/source/common/primitives.h Tue Feb 19 20:20:35 2019 +0530
> +++ b/source/common/primitives.h Wed Feb 27 12:35:02 2019 +0530
> @@ -227,6 +227,7 @@
>  typedef void(*psyRdoQuant_t)(int16_t *m_resiDctCoeff, int16_t
> *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t
> *totalRdCost, int64_t *psyScale, uint32_t blkPos);
>  typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff, int64_t
> *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost,uint32_t
> blkPos);
>  typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t
> *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t
> *totalRdCost, int64_t *psyScale, uint32_t blkPos);
> +typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride,
> const pixel *recon,  intptr_t rstride, uint64_t *ssBlock, int shift,
> uint64_t *ac_k);
>  /* Function pointers to optimized encoder primitives. Each pointer can
> reference
>   * either an assembly routine, a SIMD intrinsic primitive, or a C
> function */
>  struct EncoderPrimitives
> @@ -303,6 +304,7 @@
>  psyRdoQuant_t    psyRdoQuant;
>   psyRdoQuant_t1   psyRdoQuant_1p;
>   psyRdoQuant_t2   psyRdoQuant_2p;
> +ssimDistortion_t ssimDist;
>  }
>  cu[NUM_CU_SIZES];
>  /* These remaining primitives work on either fixed block sizes or take
> diff -r cb3e172a5f51 -r d12a4caf7963 source/common/quant.cpp
> --- a/source/common/quant.cpp Tue Feb 19 20:20:35 2019 +0530
> +++ b/source/common/quant.cpp Wed Feb 27 12:35:02 2019 +0530
> @@ -501,15 +501,8 @@
>
>  // Calculation of (X(k) - Y(k)) * (X(k) - Y(k)), AC
>  ssBlock = 0;
> -for (int y = 0; y < trSize; y++)
> -{
> -for (int x = 0; x < trSize; x++)
> -{
> -int temp = fenc[y * fStride + x] - recon[y * rstride + x]; //
> copy of residual coeff
> -ssBlock += temp * temp;
> -}
> -}
> -
> +uint64_t ac_k = 0;
> +primitives.cu[log2TrSize - 2].ssimDist(fenc, 

[x265] [PATCH x265] Add AVX2 assembly code for ssimDistortion primitive.

2019-03-04 Thread Akil
# HG changeset patch
# User Akil Ayyappan
# Date 1551251102 -19800
#  Wed Feb 27 12:35:02 2019 +0530
# Node ID d12a4caf7963fd47d646040689ad5f02754ad879
# Parent  cb3e172a5f51c6a4bf8adb7953fe53277f5a1979
x86: ssimDistortion primitive

This patch adds AVX2 assembly for this primitive.

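|---------|-----------|-----------------|-----------------|
|  Size   |Performance|AVX2 clock cycles|CPP clock cycles |
|---------|-----------|-----------------|-----------------|
| [4x4]   |   3.52x   | 264.43          | 932.05          |
| [8x8]   |   5.11x   | 619.24          | 3163.56         |
| [16x16] |   5.44x   | 2114.00         | 11490.52        |
| [32x32] |   6.01x   | 7589.70         | 45608.01        |
| [64x64] |   6.70x   | 27859.21        | 186634.25       |
|---------|-----------|-----------------|-----------------|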

diff -r cb3e172a5f51 -r d12a4caf7963 source/common/pixel.cpp
--- a/source/common/pixel.cpp Tue Feb 19 20:20:35 2019 +0530
+++ b/source/common/pixel.cpp Wed Feb 27 12:35:02 2019 +0530
@@ -934,6 +934,31 @@
 }
 }

+template<int log2TrSize>
+static void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel*
recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
+{
+*ssBlock = 0;
+const uint32_t trSize = 1 << log2TrSize;
+for (int y = 0; y < trSize; y++)
+{
+for (int x = 0; x < trSize; x++)
+{
+int temp = fenc[y * fStride + x] - recon[y * rstride + x]; //
copy of residual coeff
+*ssBlock += temp * temp;
+}
+}
+
+*ac_k = 0;
+for (int block_yy = 0; block_yy < trSize; block_yy += 1)
+{
+for (int block_xx = 0; block_xx < trSize; block_xx += 1)
+{
+uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift;
+*ac_k += temp * temp;
+}
+}
+}
+
 #if HIGH_BIT_DEPTH
 static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int
height, uint64_t *outsum,
const pixel minPix, const pixel maxPix)
@@ -1283,5 +1308,11 @@
 p.propagateCost = estimateCUPropagateCost;
 p.fix8Unpack = cuTreeFix8Unpack;
 p.fix8Pack = cuTreeFix8Pack;
+
+p.cu[BLOCK_4x4].ssimDist = ssimDist_c<2>;
+p.cu[BLOCK_8x8].ssimDist = ssimDist_c<3>;
+p.cu[BLOCK_16x16].ssimDist = ssimDist_c<4>;
+p.cu[BLOCK_32x32].ssimDist = ssimDist_c<5>;
+p.cu[BLOCK_64x64].ssimDist = ssimDist_c<6>;
 }
 }
diff -r cb3e172a5f51 -r d12a4caf7963 source/common/primitives.h
--- a/source/common/primitives.h Tue Feb 19 20:20:35 2019 +0530
+++ b/source/common/primitives.h Wed Feb 27 12:35:02 2019 +0530
@@ -227,6 +227,7 @@
 typedef void(*psyRdoQuant_t)(int16_t *m_resiDctCoeff, int16_t
*m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t
*totalRdCost, int64_t *psyScale, uint32_t blkPos);
 typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff, int64_t
*costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost,uint32_t
blkPos);
 typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t
*m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t
*totalRdCost, int64_t *psyScale, uint32_t blkPos);
+typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, const
pixel *recon,  intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t
*ac_k);
 /* Function pointers to optimized encoder primitives. Each pointer can
reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function
*/
 struct EncoderPrimitives
@@ -303,6 +304,7 @@
 psyRdoQuant_t    psyRdoQuant;
  psyRdoQuant_t1   psyRdoQuant_1p;
  psyRdoQuant_t2   psyRdoQuant_2p;
+ssimDistortion_t ssimDist;
 }
 cu[NUM_CU_SIZES];
 /* These remaining primitives work on either fixed block sizes or take
diff -r cb3e172a5f51 -r d12a4caf7963 source/common/quant.cpp
--- a/source/common/quant.cpp Tue Feb 19 20:20:35 2019 +0530
+++ b/source/common/quant.cpp Wed Feb 27 12:35:02 2019 +0530
@@ -501,15 +501,8 @@

 // Calculation of (X(k) - Y(k)) * (X(k) - Y(k)), AC
 ssBlock = 0;
-for (int y = 0; y < trSize; y++)
-{
-for (int x = 0; x < trSize; x++)
-{
-int temp = fenc[y * fStride + x] - recon[y * rstride + x]; //
copy of residual coeff
-ssBlock += temp * temp;
-}
-}
-
+uint64_t ac_k = 0;
+primitives.cu[log2TrSize - 2].ssimDist(fenc, fStride, recon, rstride,
&ssBlock, shift, &ac_k);
 ssAc = ssBlock - ssDc;

 // 1. Calculation of fdc'
@@ -535,15 +528,6 @@
 uint64_t fAc_num = 0;

 // 2. Calculate ac component
-uint64_t ac_k = 0;
-for (int block_yy = 0; block_yy < trSize; block_yy += 1)
-{
-for (int block_xx = 0; block_xx < trSize; block_xx += 1)
-{
-uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift;
-ac_k += temp * temp;
-}
-}
 ac_k -= dc_k;

 double s = 1 + 0.005 * cu.m_qp[absPartIdx];
diff -r cb3e172a5f51 -r d12a4caf7963 source/common/x86/asm-primitives.cpp
---