[x265] [PATCH 2 of 4] limitTU : use spatial and temporal CUs' TU depth to limit recursion

2016-11-18 Thread bhavna
# HG changeset patch
# User Bhavna Hariharan 
# Date 1479450968 -19800
#  Fri Nov 18 12:06:08 2016 +0530
# Node ID 40a0a322b26fc0516a72d4de9a941e18b5bb97b9
# Parent  c5295126f248411481a8361acfd2bc8b0636cedc
limitTU : use spatial and temporal CUs' TU depth to limit recursion

diff -r c5295126f248 -r 40a0a322b26f doc/reST/cli.rst
--- a/doc/reST/cli.rst  Fri Nov 18 11:49:05 2016 +0530
+++ b/doc/reST/cli.rst  Fri Nov 18 12:06:08 2016 +0530
@@ -869,13 +869,15 @@
partitions, in which case a TU split is implied and thus the
residual quad-tree begins one layer below the CU quad-tree.
 
-.. option:: --limit-tu <0|1|2>
+.. option:: --limit-tu <0..3>
 
Enables early exit from TU depth recursion, for inter coded blocks.
Level 1 - decides to recurse to next higher depth based on cost 
comparison of full size TU and split TU.
Level 2 - based on first split subTU's depth, limits recursion of
other split subTUs.
+   Level 3 - based on the average TU depth of the co-located and neighboring
+   CUs, limits recursion of the current CU.
 
Default: 0
 
diff -r c5295126f248 -r 40a0a322b26f source/common/cudata.cpp
--- a/source/common/cudata.cpp  Fri Nov 18 11:49:05 2016 +0530
+++ b/source/common/cudata.cpp  Fri Nov 18 12:06:08 2016 +0530
@@ -296,6 +296,9 @@
 /* initialize the remaining CU data in one memset */
 memset(m_cuDepth, 0, (frame.m_param->internalCsp == X265_CSP_I400 ? 
BytesPerPartition - 11 : BytesPerPartition - 7) * m_numPartitions);
 
+for (int8_t i = 0; i < NUM_TU_DEPTH; i++)
+m_refTuDepth[i] = -1;
+
 uint32_t widthInCU = m_slice->m_sps->numCuInWidth;
 m_cuLeft = (m_cuAddr % widthInCU) ? m_encData->getPicCTU(m_cuAddr - 1) : 
NULL;
 m_cuAbove = (m_cuAddr >= widthInCU) && !m_bFirstRowInSlice ? 
m_encData->getPicCTU(m_cuAddr - widthInCU) : NULL;
diff -r c5295126f248 -r 40a0a322b26f source/common/cudata.h
--- a/source/common/cudata.h  Fri Nov 18 11:49:05 2016 +0530
+++ b/source/common/cudata.h  Fri Nov 18 12:06:08 2016 +0530
@@ -28,6 +28,8 @@
 #include "slice.h"
 #include "mv.h"
 
+#define NUM_TU_DEPTH 21
+
 namespace X265_NS {
 // private namespace
 
@@ -204,6 +206,7 @@
 enum { BytesPerPartition = 21 };  // combined sizeof() of all per-part data
 
 coeff_t*  m_trCoeff[3];   // transformed coefficient buffer per 
plane
+int8_t    m_refTuDepth[NUM_TU_DEPTH];   // TU depth of CU at depths 0, 
1 and 2
 
 MV*   m_mv[2];// array of motion vectors per list
 MV*   m_mvd[2];   // array of coded motion vector deltas 
per list
diff -r c5295126f248 -r 40a0a322b26f source/common/param.cpp
--- a/source/common/param.cpp   Fri Nov 18 11:49:05 2016 +0530
+++ b/source/common/param.cpp   Fri Nov 18 12:06:08 2016 +0530
@@ -1126,7 +1126,7 @@
   "QuadtreeTUMaxDepthInter must be less than or equal to the 
difference between log2(maxCUSize) and QuadtreeTULog2MinSize plus 1");
 CHECK((param->maxTUSize != 32 && param->maxTUSize != 16 && 
param->maxTUSize != 8 && param->maxTUSize != 4),
   "max TU size must be 4, 8, 16, or 32");
-CHECK(param->limitTU > 2, "Invalid limit-tu option, limit-TU must be 0, 1 
or 2");
+CHECK(param->limitTU > 3, "Invalid limit-tu option, limit-TU must be 
between 0 and 3");
 CHECK(param->maxNumMergeCand < 1, "MaxNumMergeCand must be 1 or greater.");
 CHECK(param->maxNumMergeCand > 5, "MaxNumMergeCand must be 5 or smaller.");
 
diff -r c5295126f248 -r 40a0a322b26f source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp   Fri Nov 18 11:49:05 2016 +0530
+++ b/source/encoder/analysis.cpp   Fri Nov 18 12:06:08 2016 +0530
@@ -203,6 +203,57 @@
 return *m_modeDepth[0].bestMode;
 }
 
+int32_t Analysis::loadTUDepth(CUGeom cuGeom, CUData parentCTU)
+{
+float predDepth = 0;
+CUData* neighbourCU;
+uint8_t count = 0;
+int32_t maxTUDepth = -1;
+neighbourCU = m_slice->m_refFrameList[0][0]->m_encData->m_picCTU;
+predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
+count++;
+if (m_slice->isInterB())
+{
+neighbourCU = m_slice->m_refFrameList[1][0]->m_encData->m_picCTU;
+predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
+count++;
+}
+if (parentCTU.m_cuAbove)
+{
+predDepth += parentCTU.m_cuAbove->m_refTuDepth[cuGeom.geomRecurId];
+count++;
+if (parentCTU.m_cuAboveLeft)
+{
+predDepth += 
parentCTU.m_cuAboveLeft->m_refTuDepth[cuGeom.geomRecurId];
+count++;
+}
+if (parentCTU.m_cuAboveRight)
+{
+predDepth += 
parentCTU.m_cuAboveRight->m_refTuDepth[cuGeom.geomRecurId];
+count++;
+}
+}
+if (parentCTU.m_cuLeft)
+{
+predDepth += parentCTU.m_cuLeft->m_refTuDepth[cuGeom.geomRecurId];
+count++;
+}
+predDepth /= count;
+
+if (predDepth == 0)
+

[x265] [PATCH 2 of 4] limitTU : use spatial and temporal CUs' TU depth to limit recursion

2016-11-17 Thread bhavna
# HG changeset patch
# User Bhavna Hariharan 
# Date 1479365378 -19800
#  Thu Nov 17 12:19:38 2016 +0530
# Node ID 07a4e4d785a69f719922129ca5997b12552bb4ab
# Parent  da1c770fa6e905fe341705b3f95a201a1a31fcf9
limitTU : use spatial and temporal CUs' TU depth to limit recursion

diff -r da1c770fa6e9 -r 07a4e4d785a6 source/common/cudata.cpp
--- a/source/common/cudata.cpp  Tue Nov 15 11:34:06 2016 +0530
+++ b/source/common/cudata.cpp  Thu Nov 17 12:19:38 2016 +0530
@@ -295,6 +295,9 @@
 
 /* initialize the remaining CU data in one memset */
 memset(m_cuDepth, 0, (frame.m_param->internalCsp == X265_CSP_I400 ? 
BytesPerPartition - 11 : BytesPerPartition - 7) * m_numPartitions);
+
+for (int8_t i = 0; i < NUM_TU_DEPTH; i++)
+m_refTuDepth[i] = -1;
 
 uint32_t widthInCU = m_slice->m_sps->numCuInWidth;
 m_cuLeft = (m_cuAddr % widthInCU) ? m_encData->getPicCTU(m_cuAddr - 1) : 
NULL;
diff -r da1c770fa6e9 -r 07a4e4d785a6 source/common/cudata.h
--- a/source/common/cudata.h  Tue Nov 15 11:34:06 2016 +0530
+++ b/source/common/cudata.h  Thu Nov 17 12:19:38 2016 +0530
@@ -28,6 +28,8 @@
 #include "slice.h"
 #include "mv.h"
 
+#define NUM_TU_DEPTH 21
+
 namespace X265_NS {
 // private namespace
 
@@ -204,6 +206,7 @@
 enum { BytesPerPartition = 21 };  // combined sizeof() of all per-part data
 
 coeff_t*  m_trCoeff[3];   // transformed coefficient buffer per 
plane
+int8_t    m_refTuDepth[NUM_TU_DEPTH];   // TU depth of CU at depths 0, 
1 and 2
 
 MV*   m_mv[2];// array of motion vectors per list
 MV*   m_mvd[2];   // array of coded motion vector deltas 
per list
diff -r da1c770fa6e9 -r 07a4e4d785a6 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp   Tue Nov 15 11:34:06 2016 +0530
+++ b/source/encoder/analysis.cpp   Thu Nov 17 12:19:38 2016 +0530
@@ -203,6 +203,57 @@
 return *m_modeDepth[0].bestMode;
 }
 
+/* Predicts the TU depth limit of a CU from the mean m_refTuDepth[cuGeom.geomRecurId] of
+ * the co-located CTU in m_refFrameList[0][0] (and [1][0] when m_slice->isInterB()) and of
+ * the available neighbouring CTUs. Returns 0..3, or -1 when the mean is above 2.5. */
+int32_t Analysis::loadTUDepth(CUGeom cuGeom, CUData parentCTU)
+{
+    float predDepth = 0;
+    uint8_t count = 0;
+    /* Look the reference CTU up by the current CTU address, matching how m_refTuDepth is
+     * stored; the bare m_picCTU pointer is the same for every CTU of the picture. */
+    CUData* neighbourCU = m_slice->m_refFrameList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
+    predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
+    count++;
+    if (m_slice->isInterB())
+    {
+        neighbourCU = m_slice->m_refFrameList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
+        predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
+        count++;
+    }
+    if (parentCTU.m_cuAbove)
+    {
+        predDepth += parentCTU.m_cuAbove->m_refTuDepth[cuGeom.geomRecurId];
+        count++;
+        if (parentCTU.m_cuAboveLeft)
+        {
+            predDepth += parentCTU.m_cuAboveLeft->m_refTuDepth[cuGeom.geomRecurId];
+            count++;
+        }
+        if (parentCTU.m_cuAboveRight)
+        {
+            predDepth += parentCTU.m_cuAboveRight->m_refTuDepth[cuGeom.geomRecurId];
+            count++;
+        }
+    }
+    if (parentCTU.m_cuLeft)
+    {
+        predDepth += parentCTU.m_cuLeft->m_refTuDepth[cuGeom.geomRecurId];
+        count++;
+    }
+    predDepth /= count; /* count >= 1: the list-0 candidate is always added */
+
+    if (predDepth == 0)
+        return 0;
+    if (predDepth < 1) /* also taken for a negative mean (entries are initialised to -1) */
+        return 1;
+    if (predDepth <= 1.5)
+        return 2;
+    if (predDepth <= 2.5)
+        return 3;
+    return -1;
+}
+
 void Analysis::tryLossless(const CUGeom& cuGeom)
 {
 ModeDepth& md = m_modeDepth[cuGeom.depth];
@@ -326,6 +377,15 @@
 checkBestMode(md.pred[PRED_INTRA_NxN], depth);
 }
 
+if (limitTU == X265_TU_LIMIT_NEIGH && cuGeom.log2CUSize >= 4)
+{
+CUData* ctu = 
md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
+int8_t maxTUDepth = -1;
+for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
+maxTUDepth = X265_MAX(maxTUDepth, 
md.bestMode->cu.m_tuDepth[i]);
+ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
+}
+
 if (m_bTryLossless)
 tryLossless(cuGeom);
 
@@ -894,6 +954,9 @@
 bool skipRectAmp = false;
 bool chooseMerge = false;
 
+if (limitTU == X265_TU_LIMIT_NEIGH && cuGeom.log2CUSize >= 4)
+m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
+
 SplitData splitData[4];
 splitData[0].initSplitCUData();
 splitData[1].initSplitCUData();
@@ -1400,6 +1463,17 @@
 if (m_param->rdLevel)
 md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, 
cuGeom.absPartIdx);
 
+if (limitTU == X265_TU_LIMIT_NEIGH && cuGeom.log2CUSize >= 4)
+{
+if (mightNotSplit)
+{
+CUData* ctu = 
md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
+int8_t maxTUDepth = -1;
+for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
+maxTUDepth = X265_MAX(maxTUDepth, 
md.bestMode->cu.m_tuDepth[i]);
+