From a memory-usage point of view, approach #2 is better. By improving the storage based on the min and max CU sizes, we will also be optimizing the memory usage for other combinations. Approach #2 will reduce the output file size too.
| min cu | max cu | qg size | resolution | cutree offsets buf size (bytes) | low res stats buf size (bytes) |
|--------|--------|---------|------------|---------------------------------|--------------------------------|
| 8      | 64     | 32      | 1920x1080  | 43350                           | 65280                          |
| 8      | 64     | 8       | 1920x1080  | 43350                           | 261120                         |

I will work on the above and the other minor observations on the patch and get back. On Fri, Jan 22, 2021 at 10:55 PM Aruna Matheswaran < ar...@multicorewareinc.com> wrote: > > > On Thu, Jan 21, 2021 at 5:34 PM Srikanth Kurapati < > srikanth.kurap...@multicorewareinc.com> wrote: > >> >> Adding to my reply above. >> >> [AM] Why MAX_NUM_CU_GEOMS combinations? >> >> [KS] Will optimize storage based on min-cu-size configuration. >> >> On Thu, Jan 21, 2021 at 4:09 PM Srikanth Kurapati < >> srikanth.kurap...@multicorewareinc.com> wrote: >> >>> >>> [AM] Can't we share lowres cutree stats generated at qg size >>> granularity? Why MAX_NUM_CU_GEOMS combinations? >>> >>> [KS] If we share like that then we will have to calculate the dqp per cu >>> at analysis phase just like save encode and we will not get the savings in >>> cpu cycles there. Currently we are storing the final dqp derived from >>> lowres mv costs at qg size granularity by taking the difference between the >>> final qp and base qp per slice. >>> >> [AM] What is the memory footprint and performance impact of 1. Sharing > cutree offsets per qg and collating CU-level offsets from qg-level offsets, > and 2. Sharing cu-tree offsets of all partition sizes? I don't think #1 > will have a significant hit on performance as the partition evaluations in > load encode is restricted. > >> MAX_NUM_CU_GEOMS is 85 = ( 1 + 4 + 16 + 64 ) this is maximum number of >>> partitions at which qp can be computed and used in a ctu. >>> >>> [AM] Won't this implicitly turn OFF cutree at reuse-level 1? >>> >>> [KS] Agreed and addressed. 
>>> >>> >>> On Tue, Jan 19, 2021 at 11:12 PM Aruna Matheswaran < >>> ar...@multicorewareinc.com> wrote: >>> >>>> >>>> >>>> On Mon, Jan 11, 2021 at 8:08 PM Srikanth Kurapati < >>>> srikanth.kurap...@multicorewareinc.com> wrote: >>>> >>>>> From d516d0564888e154d88d89320302725d87bfab78 Mon Sep 17 00:00:00 2001 >>>>> From: Srikanth Kurapati <srikanth.kurap...@multicorewareinc.com> >>>>> Date: Wed, 30 Dec 2020 17:00:08 +0530 >>>>> Subject: [PATCH] fix: corrects output mismatch for cutree enabled >>>>> analysis >>>>> save/load enodes with reuse-levels in between 1 to 10 for similar >>>>> encoder >>>>> settings. >>>>> >>>>> --- >>>>> source/abrEncApp.cpp | 14 +++- >>>>> source/common/common.h | 3 +- >>>>> source/common/cudata.h | 2 +- >>>>> source/encoder/analysis.cpp | 31 ++++++++- >>>>> source/encoder/analysis.h | 1 + >>>>> source/encoder/api.cpp | 28 +++++++- >>>>> source/encoder/encoder.cpp | 123 ++++++++++++++++++++++++++--------- >>>>> source/encoder/slicetype.cpp | 2 +- >>>>> source/x265.h | 4 +- >>>>> 9 files changed, 166 insertions(+), 42 deletions(-) >>>>> >>>>> diff --git a/source/abrEncApp.cpp b/source/abrEncApp.cpp >>>>> index fa62ebf63..ea255e3f6 100644 >>>>> --- a/source/abrEncApp.cpp >>>>> +++ b/source/abrEncApp.cpp >>>>> @@ -340,7 +340,12 @@ namespace X265_NS { >>>>> memcpy(intraDst->partSizes, intraSrc->partSizes, >>>>> sizeof(char) * src->depthBytes); >>>>> memcpy(intraDst->chromaModes, intraSrc->chromaModes, >>>>> sizeof(uint8_t) * src->depthBytes); >>>>> if (m_param->rc.cuTree) >>>>> - memcpy(intraDst->cuQPOff, intraSrc->cuQPOff, >>>>> sizeof(int8_t) * src->depthBytes); >>>>> + { >>>>> + if (m_param->analysisSaveReuseLevel == 10) >>>>> + memcpy(intraDst->cuQPOff, intraSrc->cuQPOff, >>>>> sizeof(int8_t) * src->depthBytes); >>>>> + else >>>>> + memcpy(intraDst->cuQPOff, intraSrc->cuQPOff, >>>>> sizeof(int8_t) * (src->numCUsInFrame * MAX_NUM_CU_GEOMS)); >>>>> + } >>>>> } >>>>> else >>>>> { >>>>> @@ -355,7 +360,12 @@ namespace X265_NS { >>>>> 
memcpy(interDst->depth, interSrc->depth, sizeof(uint8_t) >>>>> * src->depthBytes); >>>>> memcpy(interDst->modes, interSrc->modes, sizeof(uint8_t) >>>>> * src->depthBytes); >>>>> if (m_param->rc.cuTree) >>>>> - memcpy(interDst->cuQPOff, interSrc->cuQPOff, >>>>> sizeof(int8_t) * src->depthBytes); >>>>> + { >>>>> + if (m_param->analysisReuseLevel == 10) >>>>> + memcpy(interDst->cuQPOff, interSrc->cuQPOff, >>>>> sizeof(int8_t) * src->depthBytes); >>>>> + else >>>>> + memcpy(interDst->cuQPOff, interSrc->cuQPOff, >>>>> sizeof(int8_t) * (src->numCUsInFrame * MAX_NUM_CU_GEOMS)); >>>>> + } >>>>> if (m_param->analysisSaveReuseLevel > 4) >>>>> { >>>>> memcpy(interDst->partSize, interSrc->partSize, >>>>> sizeof(uint8_t) * src->depthBytes); >>>>> diff --git a/source/common/common.h b/source/common/common.h >>>>> index 8c06cd79e..0ffbf17eb 100644 >>>>> --- a/source/common/common.h >>>>> +++ b/source/common/common.h >>>>> @@ -326,7 +326,8 @@ typedef int16_t coeff_t; // transform >>>>> coefficient >>>>> >>>>> #define CHROMA_H_SHIFT(x) (x == X265_CSP_I420 || x == X265_CSP_I422) >>>>> #define CHROMA_V_SHIFT(x) (x == X265_CSP_I420) >>>>> -#define X265_MAX_PRED_MODE_PER_CTU 85 * 2 * 8 >>>>> +#define MAX_NUM_CU_GEOMS 85 >>>>> +#define X265_MAX_PRED_MODE_PER_CTU MAX_NUM_CU_GEOMS * 2 * 8 >>>>> >>>>> #define MAX_NUM_TR_COEFFS MAX_TR_SIZE * MAX_TR_SIZE // >>>>> Maximum number of transform coefficients, for a 32x32 transform >>>>> #define MAX_NUM_TR_CATEGORIES 16 // 32, >>>>> 16, 8, 4 transform categories each for luma and chroma >>>>> diff --git a/source/common/cudata.h b/source/common/cudata.h >>>>> index 8397f0568..c7d9a1972 100644 >>>>> --- a/source/common/cudata.h >>>>> +++ b/source/common/cudata.h >>>>> @@ -371,7 +371,7 @@ struct CUDataMemPool >>>>> CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL) * >>>>> numInstances); >>>>> } >>>>> else >>>>> - { >>>>> + { >>>>> uint32_t sizeC = sizeL >> (CHROMA_H_SHIFT(csp) + >>>>> CHROMA_V_SHIFT(csp)); >>>>> CHECKED_MALLOC(trCoeffMemBlock, 
coeff_t, (sizeL + sizeC * >>>>> 2) * numInstances); >>>>> } >>>>> diff --git a/source/encoder/analysis.cpp b/source/encoder/analysis.cpp >>>>> index aabf386ca..22a4ba74f 100644 >>>>> --- a/source/encoder/analysis.cpp >>>>> +++ b/source/encoder/analysis.cpp >>>>> @@ -220,6 +220,9 @@ Mode& Analysis::compressCTU(CUData& ctu, Frame& >>>>> frame, const CUGeom& cuGeom, con >>>>> if (m_param->analysisSave && !m_param->analysisLoad) >>>>> for (int i = 0; i < X265_MAX_PRED_MODE_PER_CTU * >>>>> numPredDir; i++) >>>>> m_reuseRef[i] = -1; >>>>> + >>>>> + if (m_param->rc.cuTree) >>>>> + m_reuseQP = &m_reuseInterDataCTU->cuQPOff[ctu.m_cuAddr * >>>>> MAX_NUM_CU_GEOMS]; >>>>> } >>>>> ProfileCUScope(ctu, totalCTUTime, totalCTUs); >>>>> >>>>> @@ -233,6 +236,8 @@ Mode& Analysis::compressCTU(CUData& ctu, Frame& >>>>> frame, const CUGeom& cuGeom, con >>>>> memcpy(ctu.m_partSize, >>>>> &intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], sizeof(char) * >>>>> numPartition); >>>>> memcpy(ctu.m_chromaIntraDir, >>>>> &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * >>>>> numPartition); >>>>> } >>>>> + if (m_param->rc.cuTree && reuseLevel > 1 && reuseLevel < 10) >>>>> + m_reuseQP = &intraDataCTU->cuQPOff[ctu.m_cuAddr * >>>>> MAX_NUM_CU_GEOMS]; >>>>> compressIntraCU(ctu, cuGeom, qp); >>>>> } >>>>> else >>>>> @@ -520,6 +525,9 @@ uint64_t Analysis::compressIntraCU(const CUData& >>>>> parentCTU, const CUGeom& cuGeom >>>>> bool mightSplit = !(cuGeom.flags & CUGeom::LEAF); >>>>> bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); >>>>> >>>>> + if (m_param->rc.cuTree && m_param->analysisSaveReuseLevel > 1 && >>>>> m_param->analysisSaveReuseLevel < 10) >>>>> + m_reuseQP[cuGeom.geomRecurId] = (int8_t)qp; >>>>> >>>> + >>>>> bool bAlreadyDecided = m_param->intraRefine != 4 && >>>>> parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX && >>>>> !(m_param->bAnalysisType == HEVC_INFO); >>>>> bool bDecidedDepth = m_param->intraRefine != 4 && >>>>> 
parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth; >>>>> int split = 0; >>>>> @@ -870,6 +878,9 @@ uint32_t Analysis::compressInterCU_dist(const >>>>> CUData& parentCTU, const CUGeom& c >>>>> uint32_t minDepth = m_param->rdLevel <= 4 ? >>>>> topSkipMinDepth(parentCTU, cuGeom) : 0; >>>>> uint32_t splitRefs[4] = { 0, 0, 0, 0 }; >>>>> >>>>> + if (m_param->rc.cuTree && m_param->analysisSaveReuseLevel > 1 && >>>>> m_param->analysisSaveReuseLevel < 10) >>>>> + m_reuseQP[cuGeom.geomRecurId] = (int8_t)qp; >>>>> + >>>>> X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not >>>>> support RD 0 or 1\n"); >>>>> >>>>> PMODE pmode(*this, cuGeom); >>>>> @@ -1152,6 +1163,8 @@ SplitData Analysis::compressInterCU_rd0_4(const >>>>> CUData& parentCTU, const CUGeom& >>>>> uint32_t cuAddr = parentCTU.m_cuAddr; >>>>> ModeDepth& md = m_modeDepth[depth]; >>>>> >>>>> + if (m_param->rc.cuTree && m_param->analysisSaveReuseLevel > 1 && >>>>> m_param->analysisSaveReuseLevel < 10) >>>>> + m_reuseQP[cuGeom.geomRecurId] = (int8_t)qp; >>>>> >>>>> if (m_param->searchMethod == X265_SEA) >>>>> { >>>>> @@ -1856,6 +1869,9 @@ SplitData Analysis::compressInterCU_rd5_6(const >>>>> CUData& parentCTU, const CUGeom& >>>>> ModeDepth& md = m_modeDepth[depth]; >>>>> md.bestMode = NULL; >>>>> >>>>> + if (m_param->rc.cuTree && m_param->analysisSaveReuseLevel > 1 && >>>>> m_param->analysisSaveReuseLevel < 10) >>>>> + m_reuseQP[cuGeom.geomRecurId] = (int8_t)qp; >>>>> + >>>>> if (m_param->searchMethod == X265_SEA) >>>>> { >>>>> int numPredDir = m_slice->isInterP() ? 
1 : 2; >>>>> @@ -3647,11 +3663,20 @@ int Analysis::calculateQpforCuSize(const >>>>> CUData& ctu, const CUGeom& cuGeom, int3 >>>>> >>>>> if (m_param->analysisLoadReuseLevel >= 2 && m_param->rc.cuTree) >>>>> { >>>>> - int cuIdx = (ctu.m_cuAddr * ctu.m_numPartitions) + >>>>> cuGeom.absPartIdx; >>>>> + int cuIdx; >>>>> + int8_t cuQPOffSet = 0; >>>>> + >>>>> + if (m_param->scaleFactor == 2 || >>>>> m_param->analysisLoadReuseLevel == 10) >>>>> + cuIdx = (ctu.m_cuAddr * ctu.m_numPartitions) + >>>>> cuGeom.absPartIdx; >>>>> + else >>>>> + cuIdx = (ctu.m_cuAddr * MAX_NUM_CU_GEOMS) + >>>>> cuGeom.geomRecurId; >>>>> + >>>>> if (ctu.m_slice->m_sliceType == I_SLICE) >>>>> - return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, >>>>> (int32_t)(qp + 0.5 + >>>>> ((x265_analysis_intra_data*)m_frame->m_analysisData.intraData)->cuQPOff[cuIdx])); >>>>> + cuQPOffSet = >>>>> ((x265_analysis_intra_data*)m_frame->m_analysisData.intraData)->cuQPOff[cuIdx]; >>>>> else >>>>> - return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, >>>>> (int32_t)(qp + 0.5 + >>>>> ((x265_analysis_inter_data*)m_frame->m_analysisData.interData)->cuQPOff[cuIdx])); >>>>> + cuQPOffSet = >>>>> ((x265_analysis_inter_data*)m_frame->m_analysisData.interData)->cuQPOff[cuIdx]; >>>>> + >>>>> + return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, >>>>> (int32_t)(qp + 0.5 + cuQPOffSet)); >>>>> } >>>>> if (m_param->rc.hevcAq) >>>>> { >>>>> diff --git a/source/encoder/analysis.h b/source/encoder/analysis.h >>>>> index 3bcb56bc3..8d76d5c5e 100644 >>>>> --- a/source/encoder/analysis.h >>>>> +++ b/source/encoder/analysis.h >>>>> @@ -126,6 +126,7 @@ protected: >>>>> int32_t* m_reuseRef; >>>>> uint8_t* m_reuseDepth; >>>>> uint8_t* m_reuseModes; >>>>> + int8_t * m_reuseQP; // array of QP values for >>>>> analysis reuse at reuse levels > 1 and < 10 when cutree is enabled >>>>> uint8_t* m_reusePartSize; >>>>> uint8_t* m_reuseMergeFlag; >>>>> x265_analysis_MV* m_reuseMv[2]; >>>>> diff --git a/source/encoder/api.cpp 
b/source/encoder/api.cpp >>>>> index a986355e0..2c90fe8f2 100644 >>>>> --- a/source/encoder/api.cpp >>>>> +++ b/source/encoder/api.cpp >>>>> @@ -825,7 +825,16 @@ void x265_alloc_analysis_data(x265_param *param, >>>>> x265_analysis_data* analysis) >>>>> CHECKED_MALLOC_ZERO(intraData->partSizes, char, >>>>> analysis->numPartitions * analysis->numCUsInFrame); >>>>> CHECKED_MALLOC_ZERO(intraData->chromaModes, uint8_t, >>>>> analysis->numPartitions * analysis->numCUsInFrame); >>>>> if (param->rc.cuTree) >>>>> - CHECKED_MALLOC_ZERO(intraData->cuQPOff, int8_t, >>>>> analysis->numPartitions * analysis->numCUsInFrame); >>>>> + { >>>>> + if (maxReuseLevel == 10) >>>>> + { >>>>> + CHECKED_MALLOC_ZERO(intraData->cuQPOff, int8_t, >>>>> analysis->numPartitions * analysis->numCUsInFrame); >>>>> + } >>>>> + else >>>>> + { >>>>> + CHECKED_MALLOC_ZERO(intraData->cuQPOff, int8_t, >>>>> MAX_NUM_CU_GEOMS * analysis->numCUsInFrame); >>>>> + } >>>>> + } >>>>> } >>>>> analysis->intraData = intraData; >>>>> >>>>> @@ -837,7 +846,16 @@ void x265_alloc_analysis_data(x265_param *param, >>>>> x265_analysis_data* analysis) >>>>> CHECKED_MALLOC_ZERO(interData->modes, uint8_t, >>>>> analysis->numPartitions * analysis->numCUsInFrame); >>>>> >>>>> if (param->rc.cuTree && !isMultiPassOpt) >>>>> - CHECKED_MALLOC_ZERO(interData->cuQPOff, int8_t, >>>>> analysis->numPartitions * analysis->numCUsInFrame); >>>>> + { >>>>> + if (maxReuseLevel == 10) >>>>> + { >>>>> + CHECKED_MALLOC_ZERO(interData->cuQPOff, int8_t, >>>>> analysis->numPartitions * analysis->numCUsInFrame); >>>>> + } >>>>> + else >>>>> + { >>>>> + CHECKED_MALLOC_ZERO(interData->cuQPOff, int8_t, >>>>> MAX_NUM_CU_GEOMS * analysis->numCUsInFrame); >>>>> >>>> [AM] Can't we share lowres cutree stats generated at qg size >>>> granularity? Why MAX_NUM_CU_GEOMS combinations? 
>>>> >>>>> + } >>>>> + } >>>>> CHECKED_MALLOC_ZERO(interData->mvpIdx[0], uint8_t, >>>>> analysis->numPartitions * analysis->numCUsInFrame); >>>>> CHECKED_MALLOC_ZERO(interData->mvpIdx[1], uint8_t, >>>>> analysis->numPartitions * analysis->numCUsInFrame); >>>>> CHECKED_MALLOC_ZERO(interData->mv[0], x265_analysis_MV, >>>>> analysis->numPartitions * analysis->numCUsInFrame); >>>>> @@ -919,7 +937,9 @@ void x265_free_analysis_data(x265_param *param, >>>>> x265_analysis_data* analysis) >>>>> X265_FREE((analysis->intraData)->partSizes); >>>>> X265_FREE((analysis->intraData)->chromaModes); >>>>> if (param->rc.cuTree) >>>>> - X265_FREE((analysis->intraData)->cuQPOff); >>>>> + { >>>>> + X265_FREE_ZERO((analysis->intraData)->cuQPOff); >>>>> + } >>>>> } >>>>> X265_FREE(analysis->intraData); >>>>> analysis->intraData = NULL; >>>>> @@ -931,7 +951,9 @@ void x265_free_analysis_data(x265_param *param, >>>>> x265_analysis_data* analysis) >>>>> X265_FREE((analysis->interData)->depth); >>>>> X265_FREE((analysis->interData)->modes); >>>>> if (!isMultiPassOpt && param->rc.cuTree) >>>>> + { >>>>> X265_FREE((analysis->interData)->cuQPOff); >>>>> + } >>>>> X265_FREE((analysis->interData)->mvpIdx[0]); >>>>> X265_FREE((analysis->interData)->mvpIdx[1]); >>>>> X265_FREE((analysis->interData)->mv[0]); >>>>> diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp >>>>> index 1f710e1ce..5eb123d31 100644 >>>>> --- a/source/encoder/encoder.cpp >>>>> +++ b/source/encoder/encoder.cpp >>>>> @@ -4444,6 +4444,26 @@ void >>>>> Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const >>>>> x >>>>> } >>>>> } >>>>> } >>>>> + >>>>> + int8_t *cuQPBuf = NULL, *cuQPOffSets = NULL; >>>>> + uint32_t reuseBufSize = 0; >>>>> + >>>>> + if (m_param->rc.cuTree) >>>>> + { >>>>> + if (m_param->analysisLoadReuseLevel == 10) >>>>> + reuseBufSize = depthBytes; >>>>> + else if (m_param->analysisLoadReuseLevel > 1) >>>>> + reuseBufSize = MAX_NUM_CU_GEOMS * analysis->numCUsInFrame; >>>>> + 
cuQPBuf = X265_MALLOC(int8_t, reuseBufSize); >>>>> + if (!m_param->bUseAnalysisFile) >>>>> + { >>>>> + if (analysis->sliceType == X265_TYPE_IDR || >>>>> analysis->sliceType == X265_TYPE_I) >>>>> + cuQPOffSets = intraPic->cuQPOff; >>>>> + else >>>>> + cuQPOffSets = interPic->cuQPOff; >>>>> + } >>>>> + } >>>>> + >>>>> if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType >>>>> == X265_TYPE_I) >>>>> { >>>>> if (m_param->bAnalysisType == HEVC_INFO) >>>>> @@ -4452,19 +4472,21 @@ void >>>>> Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const >>>>> x >>>>> return; >>>>> >>>>> uint8_t *tempBuf = NULL, *depthBuf = NULL, *modeBuf = NULL, >>>>> *partSizes = NULL; >>>>> - int8_t *cuQPBuf = NULL; >>>>> >>>>> tempBuf = X265_MALLOC(uint8_t, depthBytes * 3); >>>>> depthBuf = tempBuf; >>>>> modeBuf = tempBuf + depthBytes; >>>>> partSizes = tempBuf + 2 * depthBytes; >>>>> - if (m_param->rc.cuTree) >>>>> - cuQPBuf = X265_MALLOC(int8_t, depthBytes); >>>>> >>>>> X265_FREAD(depthBuf, sizeof(uint8_t), depthBytes, >>>>> m_analysisFileIn, intraPic->depth); >>>>> X265_FREAD(modeBuf, sizeof(uint8_t), depthBytes, >>>>> m_analysisFileIn, intraPic->chromaModes); >>>>> X265_FREAD(partSizes, sizeof(uint8_t), depthBytes, >>>>> m_analysisFileIn, intraPic->partSizes); >>>>> - if (m_param->rc.cuTree) { X265_FREAD(cuQPBuf, sizeof(int8_t), >>>>> depthBytes, m_analysisFileIn, intraPic->cuQPOff); } >>>>> + if (m_param->rc.cuTree) >>>>> + { >>>>> + X265_FREAD(cuQPBuf, sizeof(int8_t), reuseBufSize, >>>>> m_analysisFileIn, cuQPOffSets); >>>>> + if (m_param->analysisLoadReuseLevel > 1 && >>>>> m_param->analysisLoadReuseLevel < 10) >>>>> + memcpy(analysis->intraData->cuQPOff, cuQPBuf, >>>>> sizeof(int8_t) * reuseBufSize); >>>>> + } >>>>> >>>>> size_t count = 0; >>>>> for (uint32_t d = 0; d < depthBytes; d++) >>>>> @@ -4480,7 +4502,7 @@ void >>>>> Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const >>>>> x >>>>> memset(&(analysis->intraData)->depth[count], 
depthBuf[d], >>>>> bytes); >>>>> memset(&(analysis->intraData)->chromaModes[count], >>>>> modeBuf[d], bytes); >>>>> memset(&(analysis->intraData)->partSizes[count], >>>>> partSizes[d], bytes); >>>>> - if (m_param->rc.cuTree) >>>>> + if (m_param->rc.cuTree && m_param->analysisLoadReuseLevel >>>>> == 10) >>>>> memset(&(analysis->intraData)->cuQPOff[count], >>>>> cuQPBuf[d], bytes); >>>>> count += bytes; >>>>> } >>>>> @@ -4515,7 +4537,6 @@ void >>>>> Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const >>>>> x >>>>> uint8_t *interDir = NULL, *chromaDir = NULL, *mvpIdx[2]; >>>>> MV* mv[2]; >>>>> int8_t* refIdx[2]; >>>>> - int8_t* cuQPBuf = NULL; >>>>> >>>>> int numBuf = m_param->analysisLoadReuseLevel > 4 ? 4 : 2; >>>>> bool bIntraInInter = false; >>>>> @@ -4535,12 +4556,15 @@ void >>>>> Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const >>>>> x >>>>> tempBuf = X265_MALLOC(uint8_t, depthBytes * numBuf); >>>>> depthBuf = tempBuf; >>>>> modeBuf = tempBuf + depthBytes; >>>>> - if (m_param->rc.cuTree) >>>>> - cuQPBuf = X265_MALLOC(int8_t, depthBytes); >>>>> >>>>> X265_FREAD(depthBuf, sizeof(uint8_t), depthBytes, >>>>> m_analysisFileIn, interPic->depth); >>>>> X265_FREAD(modeBuf, sizeof(uint8_t), depthBytes, >>>>> m_analysisFileIn, interPic->modes); >>>>> - if (m_param->rc.cuTree) { X265_FREAD(cuQPBuf, >>>>> sizeof(int8_t), depthBytes, m_analysisFileIn, interPic->cuQPOff); } >>>>> + if (m_param->rc.cuTree) >>>>> + { >>>>> + X265_FREAD(cuQPBuf, sizeof(int8_t), reuseBufSize, >>>>> m_analysisFileIn, cuQPOffSets); >>>>> + if (m_param->analysisLoadReuseLevel > 1 && >>>>> m_param->analysisLoadReuseLevel < 10) >>>>> + memcpy(analysis->interData->cuQPOff, cuQPBuf, >>>>> sizeof(int8_t) * reuseBufSize); >>>>> + } >>>>> >>>>> if (m_param->analysisLoadReuseLevel > 4) >>>>> { >>>>> @@ -4578,7 +4602,7 @@ void >>>>> Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const >>>>> x >>>>> depthBuf[d] = 1; >>>>> 
memset(&(analysis->interData)->depth[count], >>>>> depthBuf[d], bytes); >>>>> memset(&(analysis->interData)->modes[count], >>>>> modeBuf[d], bytes); >>>>> - if (m_param->rc.cuTree) >>>>> + if (m_param->rc.cuTree && >>>>> m_param->analysisLoadReuseLevel == 10) >>>>> memset(&(analysis->interData)->cuQPOff[count], >>>>> cuQPBuf[d], bytes); >>>>> if (m_param->analysisLoadReuseLevel > 4) >>>>> { >>>>> @@ -4736,7 +4760,7 @@ void >>>>> Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const >>>>> x >>>>> int numPartitions = analysis->numPartitions; >>>>> int numCUsInFrame = analysis->numCUsInFrame; >>>>> int numCuInHeight = analysis->numCuInHeight; >>>>> - /* Allocate memory for scaled resoultion's numPartitions and >>>>> numCUsInFrame*/ >>>>> + /* Allocate memory for scaled resolution's numPartitions and >>>>> numCUsInFrame */ >>>>> analysis->numPartitions = m_param->num4x4Partitions; >>>>> analysis->numCUsInFrame = cuLoc.heightInCU * cuLoc.widthInCU; >>>>> analysis->numCuInHeight = cuLoc.heightInCU; >>>>> @@ -4808,25 +4832,40 @@ void >>>>> Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const >>>>> x >>>>> X265_FREE(vbvCostBuf); >>>>> } >>>>> >>>>> + uint32_t reuseBufSize = 0; >>>>> + int8_t *cuQPOffSets = NULL, *cuQPBuf = NULL; >>>>> + if (m_param->rc.cuTree) >>>>> + { >>>>> + if (m_param->analysisLoadReuseLevel == 10) >>>>> + reuseBufSize = depthBytes; >>>>> + else if (m_param->analysisLoadReuseLevel > 1) >>>>> + reuseBufSize = (MAX_NUM_CU_GEOMS / factor) * >>>>> (analysis->numCUsInFrame); >>>>> + cuQPBuf = X265_MALLOC(int8_t, reuseBufSize); >>>>> + if (!m_param->bUseAnalysisFile) >>>>> + { >>>>> + if (analysis->sliceType == X265_TYPE_IDR || >>>>> analysis->sliceType == X265_TYPE_I) >>>>> + cuQPOffSets = intraPic->cuQPOff; >>>>> + else >>>>> + cuQPOffSets = interPic->cuQPOff; >>>>> + } >>>>> + } >>>>> + >>>>> if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType >>>>> == X265_TYPE_I) >>>>> { >>>>> if 
(m_param->analysisLoadReuseLevel < 2) >>>>> return; >>>>> >>>>> uint8_t *tempBuf = NULL, *depthBuf = NULL, *modeBuf = NULL, >>>>> *partSizes = NULL; >>>>> - int8_t *cuQPBuf = NULL; >>>>> >>>>> tempBuf = X265_MALLOC(uint8_t, depthBytes * 3); >>>>> depthBuf = tempBuf; >>>>> modeBuf = tempBuf + depthBytes; >>>>> partSizes = tempBuf + 2 * depthBytes; >>>>> - if (m_param->rc.cuTree) >>>>> - cuQPBuf = X265_MALLOC(int8_t, depthBytes); >>>>> >>>>> X265_FREAD(depthBuf, sizeof(uint8_t), depthBytes, >>>>> m_analysisFileIn, intraPic->depth); >>>>> X265_FREAD(modeBuf, sizeof(uint8_t), depthBytes, >>>>> m_analysisFileIn, intraPic->chromaModes); >>>>> X265_FREAD(partSizes, sizeof(uint8_t), depthBytes, >>>>> m_analysisFileIn, intraPic->partSizes); >>>>> - if (m_param->rc.cuTree) { X265_FREAD(cuQPBuf, sizeof(int8_t), >>>>> depthBytes, m_analysisFileIn, intraPic->cuQPOff); } >>>>> + if (m_param->rc.cuTree) { X265_FREAD(cuQPBuf, sizeof(int8_t), >>>>> reuseBufSize, m_analysisFileIn, cuQPOffSets); } >>>>> >>>>> uint32_t count = 0; >>>>> for (uint32_t d = 0; d < depthBytes; d++) >>>>> @@ -4848,7 +4887,7 @@ void >>>>> Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const >>>>> x >>>>> memset(&(analysis->intraData)->depth[count], >>>>> depthBuf[d], bytes); >>>>> memset(&(analysis->intraData)->chromaModes[count], >>>>> modeBuf[d], bytes); >>>>> memset(&(analysis->intraData)->partSizes[count], >>>>> partSizes[d], bytes); >>>>> - if (m_param->rc.cuTree) >>>>> + if (m_param->rc.cuTree && >>>>> m_param->analysisLoadReuseLevel == 10) >>>>> memset(&(analysis->intraData)->cuQPOff[count], >>>>> cuQPBuf[d], bytes); >>>>> count += bytes; >>>>> d += getCUIndex(&cuLoc, &count, bytes, 1); >>>>> @@ -4886,7 +4925,6 @@ void >>>>> Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const >>>>> x >>>>> uint8_t *interDir = NULL, *chromaDir = NULL, *mvpIdx[2]; >>>>> MV* mv[2]; >>>>> int8_t* refIdx[2]; >>>>> - int8_t* cuQPBuf = NULL; >>>>> >>>>> int numBuf = 
m_param->analysisLoadReuseLevel > 4 ? 4 : 2; >>>>> bool bIntraInInter = false; >>>>> @@ -4900,12 +4938,16 @@ void >>>>> Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const >>>>> x >>>>> tempBuf = X265_MALLOC(uint8_t, depthBytes * numBuf); >>>>> depthBuf = tempBuf; >>>>> modeBuf = tempBuf + depthBytes; >>>>> - if (m_param->rc.cuTree) >>>>> - cuQPBuf = X265_MALLOC(int8_t, depthBytes); >>>>> >>>>> X265_FREAD(depthBuf, sizeof(uint8_t), depthBytes, >>>>> m_analysisFileIn, interPic->depth); >>>>> X265_FREAD(modeBuf, sizeof(uint8_t), depthBytes, >>>>> m_analysisFileIn, interPic->modes); >>>>> - if (m_param->rc.cuTree) { X265_FREAD(cuQPBuf, sizeof(int8_t), >>>>> depthBytes, m_analysisFileIn, interPic->cuQPOff); } >>>>> + if (m_param->rc.cuTree) >>>>> + { >>>>> + X265_FREAD(cuQPBuf, sizeof(int8_t), reuseBufSize, >>>>> m_analysisFileIn, cuQPOffSets); >>>>> + if (m_param->analysisLoadReuseLevel > 1 && >>>>> m_param->analysisLoadReuseLevel < 10) >>>>> + memcpy(&(analysis->interData)->cuQPOff, cuQPBuf, >>>>> sizeof(int8_t) * reuseBufSize); >>>>> + } >>>>> + >>>>> if (m_param->analysisLoadReuseLevel > 4) >>>>> { >>>>> partSize = modeBuf + depthBytes; >>>>> @@ -4954,7 +4996,7 @@ void >>>>> Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const >>>>> x >>>>> { >>>>> memset(&(analysis->interData)->depth[count], >>>>> writeDepth, bytes); >>>>> memset(&(analysis->interData)->modes[count], >>>>> modeBuf[d], bytes); >>>>> - if (m_param->rc.cuTree) >>>>> + if (m_param->rc.cuTree && >>>>> m_param->analysisLoadReuseLevel == 10) >>>>> memset(&(analysis->interData)->cuQPOff[count], >>>>> cuQPBuf[d], bytes); >>>>> if (m_param->analysisLoadReuseLevel == 10 && >>>>> bIntraInInter) >>>>> >>>>> memset(&(analysis->intraData)->chromaModes[count], chromaDir[d], bytes); >>>>> @@ -5046,7 +5088,9 @@ void >>>>> Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const >>>>> x >>>>> } >>>>> } >>>>> else >>>>> + { >>>>> 
X265_FREAD((analysis->interData)->ref, sizeof(int32_t), >>>>> analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, >>>>> m_analysisFileIn, interPic->ref); >>>>> + } >>>>> >>>>> consumedBytes += frameRecordSize; >>>>> if (numDir == 1) >>>>> @@ -5510,9 +5554,10 @@ void >>>>> Encoder::writeAnalysisFile(x265_analysis_data* analysis, FrameData >>>>> &curEncD >>>>> analysis->frameRecordSize += analysis->numCUsInFrame * >>>>> sizeof(sse_t); >>>>> } >>>>> >>>>> + uint32_t reuseQPBufsize = 0; >>>>> if (m_param->analysisSaveReuseLevel > 1) >>>>> { >>>>> - >>>>> + reuseQPBufsize = MAX_NUM_CU_GEOMS * analysis->numCUsInFrame; >>>>> if (analysis->sliceType == X265_TYPE_IDR || >>>>> analysis->sliceType == X265_TYPE_I) >>>>> { >>>>> for (uint32_t cuAddr = 0; cuAddr < >>>>> analysis->numCUsInFrame; cuAddr++) >>>>> @@ -5536,12 +5581,21 @@ void >>>>> Encoder::writeAnalysisFile(x265_analysis_data* analysis, FrameData >>>>> &curEncD >>>>> partSize = ctu->m_partSize[absPartIdx]; >>>>> intraDataCTU->partSizes[depthBytes] = partSize; >>>>> >>>>> - if (m_param->rc.cuTree) >>>>> + if (m_param->rc.cuTree && >>>>> m_param->analysisSaveReuseLevel == 10) >>>>> intraDataCTU->cuQPOff[depthBytes] = >>>>> (int8_t)(ctu->m_qpAnalysis[absPartIdx] - baseQP); >>>>> absPartIdx += ctu->m_numPartitions >> (depth * 2); >>>>> } >>>>> + >>>>> + if (m_param->rc.cuTree && >>>>> m_param->analysisSaveReuseLevel < 10) >>>>> + { >>>>> + uint32_t nextCuIdx = (cuAddr + 1) * >>>>> MAX_NUM_CU_GEOMS; >>>>> + for (uint32_t i = cuAddr * MAX_NUM_CU_GEOMS; i < >>>>> nextCuIdx; i++) >>>>> + intraDataCTU->cuQPOff[i] = >>>>> (int8_t)(intraDataCTU->cuQPOff[i] - baseQP); >>>>> + } >>>>> memcpy(&intraDataCTU->modes[ctu->m_cuAddr * >>>>> ctu->m_numPartitions], ctu->m_lumaIntraDir, sizeof(uint8_t)* >>>>> ctu->m_numPartitions); >>>>> } >>>>> + if (m_param->rc.cuTree && m_param->analysisSaveReuseLevel >>>>> == 10) >>>>> + reuseQPBufsize = depthBytes; >>>>> } >>>>> else >>>>> { >>>>> @@ -5567,7 +5621,7 @@ void >>>>> 
Encoder::writeAnalysisFile(x265_analysis_data* analysis, FrameData >>>>> &curEncD >>>>> predMode = 4; // used as indicator if the >>>>> block is coded as bidir >>>>> >>>>> interDataCTU->modes[depthBytes] = predMode; >>>>> - if (m_param->rc.cuTree) >>>>> + if (m_param->rc.cuTree && >>>>> m_param->analysisSaveReuseLevel == 10) >>>>> interDataCTU->cuQPOff[depthBytes] = >>>>> (int8_t)(ctu->m_qpAnalysis[absPartIdx] - baseQP); >>>>> >>>>> if (m_param->analysisSaveReuseLevel > 4) >>>>> @@ -5599,13 +5653,23 @@ void >>>>> Encoder::writeAnalysisFile(x265_analysis_data* analysis, FrameData >>>>> &curEncD >>>>> } >>>>> absPartIdx += ctu->m_numPartitions >> (depth * 2); >>>>> } >>>>> + >>>>> + if (m_param->rc.cuTree && >>>>> m_param->analysisSaveReuseLevel < 10) >>>>> + { >>>>> + uint32_t nextCuIdx = (cuAddr + 1) * >>>>> MAX_NUM_CU_GEOMS; >>>>> + for (uint32_t i = cuAddr * MAX_NUM_CU_GEOMS; i < >>>>> nextCuIdx ; i++) >>>>> + interDataCTU->cuQPOff[i] = >>>>> (int8_t)(interDataCTU->cuQPOff[i] - baseQP); >>>>> + } >>>>> + >>>>> if (m_param->analysisSaveReuseLevel == 10 && >>>>> bIntraInInter) >>>>> memcpy(&intraDataCTU->modes[ctu->m_cuAddr * >>>>> ctu->m_numPartitions], ctu->m_lumaIntraDir, sizeof(uint8_t)* >>>>> ctu->m_numPartitions); >>>>> } >>>>> + if (m_param->rc.cuTree && m_param->analysisSaveReuseLevel >>>>> == 10) >>>>> + reuseQPBufsize = depthBytes; >>>>> } >>>>> >>>>> if ((analysis->sliceType == X265_TYPE_IDR || >>>>> analysis->sliceType == X265_TYPE_I) && m_param->rc.cuTree) >>>>> - analysis->frameRecordSize += sizeof(uint8_t)* >>>>> analysis->numCUsInFrame * analysis->numPartitions + depthBytes * 3 + >>>>> (sizeof(int8_t) * depthBytes); >>>>> + analysis->frameRecordSize += sizeof(uint8_t)* >>>>> analysis->numCUsInFrame * analysis->numPartitions + depthBytes * 3 + >>>>> (sizeof(int8_t) * reuseQPBufsize); >>>>> else if (analysis->sliceType == X265_TYPE_IDR || >>>>> analysis->sliceType == X265_TYPE_I) >>>>> analysis->frameRecordSize += sizeof(uint8_t)* >>>>> 
analysis->numCUsInFrame * analysis->numPartitions + depthBytes * 3; >>>>> else >>>>> @@ -5613,7 +5677,8 @@ void >>>>> Encoder::writeAnalysisFile(x265_analysis_data* analysis, FrameData >>>>> &curEncD >>>>> /* Add sizeof depth, modes, partSize, cuQPOffset, >>>>> mergeFlag */ >>>>> analysis->frameRecordSize += depthBytes * 2; >>>>> if (m_param->rc.cuTree) >>>>> - analysis->frameRecordSize += (sizeof(int8_t) * >>>>> depthBytes); >>>>> + analysis->frameRecordSize += (sizeof(int8_t) * >>>>> reuseQPBufsize); >>>>> + >>>>> if (m_param->analysisSaveReuseLevel > 4) >>>>> analysis->frameRecordSize += (depthBytes * 2); >>>>> >>>>> @@ -5669,7 +5734,7 @@ void >>>>> Encoder::writeAnalysisFile(x265_analysis_data* analysis, FrameData >>>>> &curEncD >>>>> X265_FWRITE((analysis->intraData)->chromaModes, >>>>> sizeof(uint8_t), depthBytes, m_analysisFileOut); >>>>> X265_FWRITE((analysis->intraData)->partSizes, sizeof(char), >>>>> depthBytes, m_analysisFileOut); >>>>> if (m_param->rc.cuTree) >>>>> - X265_FWRITE((analysis->intraData)->cuQPOff, >>>>> sizeof(int8_t), depthBytes, m_analysisFileOut); >>>>> + X265_FWRITE((analysis->intraData)->cuQPOff, >>>>> sizeof(int8_t), reuseQPBufsize, m_analysisFileOut); >>>>> X265_FWRITE((analysis->intraData)->modes, sizeof(uint8_t), >>>>> analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileOut); >>>>> } >>>>> else >>>>> @@ -5677,7 +5742,7 @@ void >>>>> Encoder::writeAnalysisFile(x265_analysis_data* analysis, FrameData >>>>> &curEncD >>>>> X265_FWRITE((analysis->interData)->depth, sizeof(uint8_t), >>>>> depthBytes, m_analysisFileOut); >>>>> X265_FWRITE((analysis->interData)->modes, sizeof(uint8_t), >>>>> depthBytes, m_analysisFileOut); >>>>> if (m_param->rc.cuTree) >>>>> - X265_FWRITE((analysis->interData)->cuQPOff, >>>>> sizeof(int8_t), depthBytes, m_analysisFileOut); >>>>> + X265_FWRITE((analysis->interData)->cuQPOff, >>>>> sizeof(int8_t), reuseQPBufsize, m_analysisFileOut); >>>>> if (m_param->analysisSaveReuseLevel > 4) >>>>> { >>>>> 
X265_FWRITE((analysis->interData)->partSize, >>>>> sizeof(uint8_t), depthBytes, m_analysisFileOut); >>>>> @@ -5762,7 +5827,7 @@ void >>>>> Encoder::writeAnalysisFileRefine(x265_analysis_data* analysis, FrameData >>>>> &c >>>>> interData->mv[1][depthBytes].word = >>>>> ctu->m_mv[1][absPartIdx].word; >>>>> interData->mvpIdx[1][depthBytes] = >>>>> ctu->m_mvpIdx[1][absPartIdx]; >>>>> ref[1][depthBytes] = ctu->m_refIdx[1][absPartIdx]; >>>>> - predMode = 4; // used as indiacator if the block >>>>> is coded as bidir >>>>> + predMode = 4; // used as indicator if the block >>>>> is coded as bidir >>>>> } >>>>> interData->modes[depthBytes] = predMode; >>>>> >>>>> diff --git a/source/encoder/slicetype.cpp >>>>> b/source/encoder/slicetype.cpp >>>>> index 0adb0d0db..3bc01268b 100644 >>>>> --- a/source/encoder/slicetype.cpp >>>>> +++ b/source/encoder/slicetype.cpp >>>>> @@ -1894,7 +1894,7 @@ void Lookahead::slicetypeAnalyse(Lowres >>>>> **frames, bool bKeyframe) >>>>> >>>>> if (!framecnt) >>>>> { >>>>> - if (m_param->rc.cuTree) >>>>> + if (m_param->rc.cuTree && !m_param->analysisLoad) >>>>> >>>> [AM] Won't this implicitly turn OFF cutree at reuse-level 1? 
>>>> >>>>> cuTree(frames, 0, bKeyframe); >>>>> return; >>>>> } >>>>> diff --git a/source/x265.h b/source/x265.h >>>>> index f44040ba7..8d7a75826 100644 >>>>> --- a/source/x265.h >>>>> +++ b/source/x265.h >>>>> @@ -144,7 +144,7 @@ typedef struct x265_analysis_intra_data >>>>> uint8_t* modes; >>>>> char* partSizes; >>>>> uint8_t* chromaModes; >>>>> - int8_t* cuQPOff; >>>>> + int8_t* cuQPOff; >>>>> }x265_analysis_intra_data; >>>>> >>>>> typedef struct x265_analysis_MV >>>>> @@ -167,7 +167,7 @@ typedef struct x265_analysis_inter_data >>>>> uint8_t* interDir; >>>>> uint8_t* mvpIdx[2]; >>>>> int8_t* refIdx[2]; >>>>> - x265_analysis_MV* mv[2]; >>>>> + x265_analysis_MV* mv[2]; >>>>> int64_t* sadCost; >>>>> int8_t* cuQPOff; >>>>> }x265_analysis_inter_data; >>>>> -- >>>>> 2.20.1.windows.1 >>>>> >>>>> >>>>> -- >>>>> *With Regards,* >>>>> *Srikanth Kurapati.* >>>>> _______________________________________________ >>>>> x265-devel mailing list >>>>> x265-devel@videolan.org >>>>> https://mailman.videolan.org/listinfo/x265-devel >>>>> >>>> >>>> >>>> -- >>>> Regards, >>>> *Aruna Matheswaran,* >>>> Video Codec Engineer, >>>> Media & AI analytics BU, >>>> >>>> >>>> >>>> _______________________________________________ >>>> x265-devel mailing list >>>> x265-devel@videolan.org >>>> https://mailman.videolan.org/listinfo/x265-devel >>>> >>> >>> >>> -- >>> *With Regards,* >>> *Srikanth Kurapati.* >>> >> >> >> -- >> *With Regards,* >> *Srikanth Kurapati.* >> _______________________________________________ >> x265-devel mailing list >> x265-devel@videolan.org >> https://mailman.videolan.org/listinfo/x265-devel >> > > > -- > Regards, > *Aruna Matheswaran,* > Video Codec Engineer, > Media & AI analytics BU, > > > > _______________________________________________ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > -- *With Regards,* *Srikanth Kurapati.*
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel