# HG changeset patch # User Bhavna Hariharan <bha...@multicorewareinc.com> # Date 1519796358 -19800 # Wed Feb 28 11:09:18 2018 +0530 # Node ID 55eb3992299530de882829de0d3c0fea6d58b70d # Parent 7219376de42a1cc378ec957c886b511139d3c201 remove maxCTU size restriction in scaled save/load encodes
The scaled save/load feature requires that the save encode has a maximum CTU size of 32. The 32x32 blocks are mapped to a 64x64 block in load encode. Due to this restriction we will be able to hierarchically encode only 3 resolutions. WxH - ctu 16 2Wx2H - ctu 32 4Wx4H - ctu 64 diff -r 7219376de42a -r 55eb39922995 source/encoder/encoder.cpp --- a/source/encoder/encoder.cpp Thu Feb 15 02:21:26 2018 -0800 +++ b/source/encoder/encoder.cpp Wed Feb 28 11:09:18 2018 +0530 @@ -3272,10 +3272,10 @@ #define X265_FREAD(val, size, readSize, fileOffset, src)\ if (!m_param->bUseAnalysisFile)\ - {\ + {\ memcpy(val, src, (size * readSize));\ - }\ - else if (fread(val, size, readSize, fileOffset) != readSize)\ + }\ + else if (fread(val, size, readSize, fileOffset) != readSize)\ {\ x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data\n");\ freeAnalysis(analysis);\ @@ -3334,10 +3334,37 @@ int scaledNumPartition = analysis->numPartitions; int factor = 1 << m_param->scaleFactor; + int numPartitions = analysis->numPartitions; + int numCUsInFrame = analysis->numCUsInFrame; + int extendedWidth, extendedHeight; + cuLocation cuLoc; + cuLoc.heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; + cuLoc.widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; + cuLoc.skipHeight = false; + cuLoc.skipWidth = false; + if (m_param->scaleFactor) - analysis->numPartitions *= factor; + { + /* Allocate memory for scaled resoultion's numPartitions and numCUsInFrame*/ + analysis->numPartitions = m_param->num4x4Partitions; + analysis->numCUsInFrame = cuLoc.heightInCU * cuLoc.widthInCU; + + /* Set skipWidth/skipHeight flags when the out of bound pixels in lowRes is greater than half of maxCUSize */ + extendedWidth = ((m_param->sourceWidth / 2 + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize) * m_param->maxCUSize; + extendedHeight = ((m_param->sourceHeight / 2 + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize) * m_param->maxCUSize; + 
uint32_t outOfBoundaryLowres = extendedWidth - m_param->sourceWidth / 2; + if (outOfBoundaryLowres * 2 >= m_param->maxCUSize) + cuLoc.skipWidth = true; + uint32_t outOfBoundaryLowresH = extendedHeight - m_param->sourceHeight / 2; + if (outOfBoundaryLowresH * 2 >= m_param->maxCUSize) + cuLoc.skipHeight = true; + } + /* Memory is allocated for inter and intra analysis data based on the slicetype */ allocAnalysis(analysis); + + analysis->numPartitions = numPartitions * factor; + analysis->numCUsInFrame = numCUsInFrame; if (m_param->bDisableLookahead && m_rateControl->m_isVbv) { X265_FREAD(analysis->lookahead.intraVbvCost, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFileIn, picData->lookahead.intraVbvCost); @@ -3345,6 +3372,11 @@ X265_FREAD(analysis->lookahead.satdForVbv, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFileIn, picData->lookahead.satdForVbv); X265_FREAD(analysis->lookahead.intraSatdForVbv, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFileIn, picData->lookahead.intraSatdForVbv); } + + cuLoc.evenRowIndex = 0; + cuLoc.oddRowIndex = m_param->num4x4Partitions * cuLoc.widthInCU; + cuLoc.switchCondition = 0; // To switch between odd and even rows + if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I) { if (m_param->analysisReuseLevel < 2) @@ -3365,17 +3397,30 @@ for (uint32_t d = 0; d < depthBytes; d++) { int bytes = analysis->numPartitions >> (depthBuf[d] * 2); + int numCTUCopied = 1; + if (m_param->scaleFactor) { - if (depthBuf[d] == 0) - depthBuf[d] = 1; + if (!depthBuf[d]) //copy data of one 64x64 to four scaled 64x64 CTUs. 
+ { + bytes /= 4; + numCTUCopied = 4; + } + if (partSizes[d] == SIZE_NxN) partSizes[d] = SIZE_2Nx2N; + if ((depthBuf[d] > 1 && m_param->maxCUSize == 64) || (depthBuf[d] && m_param->maxCUSize != 64)) + depthBuf[d]--; } - memset(&((analysis_intra_data *)analysis->intraData)->depth[count], depthBuf[d], bytes); - memset(&((analysis_intra_data *)analysis->intraData)->chromaModes[count], modeBuf[d], bytes); - memset(&((analysis_intra_data *)analysis->intraData)->partSizes[count], partSizes[d], bytes); - count += bytes; + for (int numCTU = 0; numCTU < numCTUCopied; numCTU++) + { + memset(&((analysis_intra_data *)analysis->intraData)->depth[count], depthBuf[d], bytes); + memset(&((analysis_intra_data *)analysis->intraData)->chromaModes[count], modeBuf[d], bytes); + memset(&((analysis_intra_data *)analysis->intraData)->partSizes[count], partSizes[d], bytes); + count += bytes; + if (m_param->scaleFactor) + d += getCUIndex(&cuLoc, &count, bytes, 1); + } } if (!m_param->scaleFactor) @@ -3384,10 +3429,18 @@ } else { + cuLoc.evenRowIndex = 0; + cuLoc.oddRowIndex = m_param->num4x4Partitions * cuLoc.widthInCU; + cuLoc.switchCondition = 0; uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition); X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFileIn, intraPic->modes); - for (uint32_t ctu32Idx = 0, cnt = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++, cnt += factor) + size_t cnt = 0; + for (uint32_t ctu32Idx = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++) + { memset(&((analysis_intra_data *)analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor); + cnt += factor; + ctu32Idx += getCUIndex(&cuLoc, &cnt, factor, 0); + } X265_FREE(tempLumaBuf); } X265_FREE(tempBuf); @@ -3452,43 +3505,95 @@ } size_t count = 0; + cuLoc.switchCondition = 0; for (uint32_t d = 0; d < depthBytes; d++) { int bytes = analysis->numPartitions >> (depthBuf[d] * 2); - if 
(m_param->scaleFactor && modeBuf[d] == MODE_INTRA && depthBuf[d] == 0) - depthBuf[d] = 1; - memset(&((analysis_inter_data *)analysis->interData)->depth[count], depthBuf[d], bytes); - memset(&((analysis_inter_data *)analysis->interData)->modes[count], modeBuf[d], bytes); - if (m_param->analysisReuseLevel > 4) + bool isScaledMaxCUSize = false; + int numCTUCopied = 1; + int writeDepth = depthBuf[d]; + if (m_param->scaleFactor) { - if (m_param->scaleFactor && modeBuf[d] == MODE_INTRA && partSize[d] == SIZE_NxN) - partSize[d] = SIZE_2Nx2N; - memset(&((analysis_inter_data *)analysis->interData)->partSize[count], partSize[d], bytes); - int numPU = (modeBuf[d] == MODE_INTRA) ? 1 : nbPartsTable[(int)partSize[d]]; - for (int pu = 0; pu < numPU; pu++) + if (!depthBuf[d]) //copy data of one 64x64 to four scaled 64x64 CTUs. { - if (pu) d++; - ((analysis_inter_data *)analysis->interData)->mergeFlag[count + pu] = mergeFlag[d]; - if (m_param->analysisReuseLevel == 10) + isScaledMaxCUSize = true; + bytes /= 4; + numCTUCopied = 4; + } + if ((modeBuf[d] != MODE_INTRA && depthBuf[d] != 0) || (modeBuf[d] == MODE_INTRA && depthBuf[d] > 1)) + writeDepth--; + } + + for (int numCTU = 0; numCTU < numCTUCopied; numCTU++) + { + memset(&((analysis_inter_data *)analysis->interData)->depth[count], writeDepth, bytes); + memset(&((analysis_inter_data *)analysis->interData)->modes[count], modeBuf[d], bytes); + if (m_param->analysisReuseLevel == 10 && bIntraInInter) + memset(&((analysis_intra_data *)analysis->intraData)->chromaModes[count], chromaDir[d], bytes); + + if (m_param->analysisReuseLevel > 4) + { + puOrientation puOrient; + puOrient.isRect = false; + puOrient.isAmp = false; + puOrient.isVert = false; + if (m_param->scaleFactor && modeBuf[d] == MODE_INTRA && partSize[d] == SIZE_NxN) + partSize[d] = SIZE_2Nx2N; + int partitionSize = partSize[d]; + if (isScaledMaxCUSize && partSize[d] != SIZE_2Nx2N) + partitionSize = getPuShape(&puOrient, partSize[d], numCTU); + memset(&((analysis_inter_data 
*)analysis->interData)->partSize[count], partitionSize, bytes); + int numPU = (modeBuf[d] == MODE_INTRA) ? 1 : nbPartsTable[(int)partSize[d]]; + for (int pu = 0; pu < numPU; pu++) { - ((analysis_inter_data *)analysis->interData)->interDir[count + pu] = interDir[d]; - for (uint32_t i = 0; i < numDir; i++) + if (!isScaledMaxCUSize && pu) + d++; + int restoreD = d; + /* Adjust d value when the current CTU takes data from 2nd PU */ + if (puOrient.isRect || (puOrient.isAmp && partitionSize == SIZE_2Nx2N)) { - ((analysis_inter_data *)analysis->interData)->mvpIdx[i][count + pu] = mvpIdx[i][d]; - ((analysis_inter_data *)analysis->interData)->refIdx[i][count + pu] = refIdx[i][d]; - if (m_param->scaleFactor) + if ((numCTU > 1 && !puOrient.isVert) || ((numCTU % 2 == 1) && puOrient.isVert)) + d++; + } + if (puOrient.isAmp && pu) + d++; + + ((analysis_inter_data *)analysis->interData)->mergeFlag[count + pu] = mergeFlag[d]; + if (m_param->analysisReuseLevel == 10) + { + ((analysis_inter_data *)analysis->interData)->interDir[count + pu] = interDir[d]; + for (uint32_t i = 0; i < numDir; i++) { - mv[i][d].x *= (int16_t)m_param->scaleFactor; - mv[i][d].y *= (int16_t)m_param->scaleFactor; + ((analysis_inter_data *)analysis->interData)->mvpIdx[i][count + pu] = mvpIdx[i][d]; + ((analysis_inter_data *)analysis->interData)->refIdx[i][count + pu] = refIdx[i][d]; + MV mvCopy[2]; + mvCopy[i].x = mv[i][d].x; + mvCopy[i].y = mv[i][d].y; + if (m_param->scaleFactor) + { + mvCopy[i].x = mv[i][d].x * (int16_t)m_param->scaleFactor; + mvCopy[i].y = mv[i][d].y * (int16_t)m_param->scaleFactor; + } + memcpy(&((analysis_inter_data *)analysis->interData)->mv[i][count + pu], &mvCopy[i], sizeof(MV)); } - memcpy(&((analysis_inter_data *)analysis->interData)->mv[i][count + pu], &mv[i][d], sizeof(MV)); + } + d = restoreD; // Restore d value after copying each of the 4 64x64 CTUs + + if (isScaledMaxCUSize && (puOrient.isRect || puOrient.isAmp)) + { + /* Skip PU index when current CTU is a 2Nx2N */ + if 
(partitionSize == SIZE_2Nx2N) + pu++; + /* Adjust d after completion of all 4 CTU copies */ + if (numCTU == 3 && (pu == (numPU - 1))) + d++; } } } - if (m_param->analysisReuseLevel == 10 && bIntraInInter) - memset(&((analysis_intra_data *)analysis->intraData)->chromaModes[count], chromaDir[d], bytes); + count += bytes; + if (m_param->scaleFactor) + d += getCUIndex(&cuLoc, &count, bytes, 1); } - count += bytes; } X265_FREE(tempBuf); @@ -3509,10 +3614,18 @@ } else { + cuLoc.evenRowIndex = 0; + cuLoc.oddRowIndex = m_param->num4x4Partitions * cuLoc.widthInCU; + cuLoc.switchCondition = 0; uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition); X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFileIn, intraPic->modes); - for (uint32_t ctu32Idx = 0, cnt = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++, cnt += factor) + size_t cnt = 0; + for (uint32_t ctu32Idx = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++) + { memset(&((analysis_intra_data *)analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor); + cnt += factor; + ctu32Idx += getCUIndex(&cuLoc, &cnt, factor, 0); + } X265_FREE(tempLumaBuf); } } @@ -3524,9 +3637,123 @@ if (numDir == 1) totalConsumedBytes = consumedBytes; } + + /* Restore to the current encode's numPartitions and numCUsInFrame */ + if (m_param->scaleFactor) + { + analysis->numPartitions = m_param->num4x4Partitions; + analysis->numCUsInFrame = cuLoc.heightInCU * cuLoc.widthInCU; + } #undef X265_FREAD } +/* Toggle between two consecutive CTU rows. 
The save's CTU is copied +twice consecutively in the first and second CTU row of load*/ + +int Encoder::getCUIndex(cuLocation* cuLoc, size_t* count, int bytes, int flag) +{ + int index = 0; + cuLoc->switchCondition += bytes; + int isBoundaryW = (*count % (m_param->num4x4Partitions * cuLoc->widthInCU) == 0); + + /* Width boundary case : + Skip to appropriate index when out of boundary cases occur + Out of boundary may occur when the out of bound pixels along + the width in low resoultion is greater than half of the maxCUSize */ + if (cuLoc->skipWidth && isBoundaryW) + { + if (flag) + index++; + else + { + /* Number of 4x4 blocks in out of bound region */ + int outOfBound = m_param->maxCUSize / 2; + uint32_t sum = (uint32_t)pow((outOfBound >> 2), 2); + index += sum; + } + cuLoc->switchCondition += m_param->num4x4Partitions; + } + + /* Completed writing 2 CTUs - move to the last remembered index of the next CTU row*/ + if (cuLoc->switchCondition == 2 * m_param->num4x4Partitions) + { + if (isBoundaryW) + cuLoc->evenRowIndex = *count + (m_param->num4x4Partitions * cuLoc->widthInCU); // end of row - skip to the next even row + else + cuLoc->evenRowIndex = *count; + *count = cuLoc->oddRowIndex; + + /* Height boundary case : + Skip to appropriate index when out of boundary cases occur + Out of boundary may occur when the out of bound pixels along + the height in low resoultion is greater than half of the maxCUSize */ + int isBoundaryH = (*count >= (m_param->num4x4Partitions * cuLoc->heightInCU * cuLoc->widthInCU)); + if (cuLoc->skipHeight && isBoundaryH) + { + if (flag) + index += 2; + else + { + int outOfBound = m_param->maxCUSize / 2; + uint32_t sum = (uint32_t)(2 * pow((abs(outOfBound) >> 2), 2)); + index += sum; + } + *count = cuLoc->evenRowIndex; + cuLoc->switchCondition = 0; + } + } + /* Completed writing 4 CTUs - move to the last remembered index of + the previous CTU row to copy the next save CTU's data*/ + else if (cuLoc->switchCondition == 4 * 
m_param->num4x4Partitions) + { + if (isBoundaryW) + cuLoc->oddRowIndex = *count + (m_param->num4x4Partitions * cuLoc->widthInCU); // end of row - skip to the next odd row + else + cuLoc->oddRowIndex = *count; + *count = cuLoc->evenRowIndex; + cuLoc->switchCondition = 0; + } + return index; +} + +/* save load + CTU0 CTU1 CTU2 CTU3 + 2NxN 2Nx2N 2Nx2N 2Nx2N 2Nx2N + NX2N 2Nx2N 2Nx2N 2Nx2N 2Nx2N + 2NxnU 2NxN 2NxN 2Nx2N 2Nx2N + 2NxnD 2Nx2N 2Nx2N 2NxN 2NxN + nLx2N Nx2N 2Nx2N Nx2N 2Nx2N + nRx2N 2Nx2N Nx2N 2Nx2N Nx2N +*/ +int Encoder::getPuShape(puOrientation* puOrient, int partSize, int numCTU) +{ + puOrient->isRect = true; + if (partSize == SIZE_Nx2N) + puOrient->isVert = true; + if (partSize >= SIZE_2NxnU) // All AMP modes + { + puOrient->isAmp = true; + puOrient->isRect = false; + if (partSize == SIZE_2NxnD && numCTU > 1) + return SIZE_2NxN; + else if (partSize == SIZE_2NxnU && numCTU < 2) + return SIZE_2NxN; + else if (partSize == SIZE_nLx2N) + { + puOrient->isVert = true; + if (!(numCTU % 2)) + return SIZE_Nx2N; + } + else if (partSize == SIZE_nRx2N) + { + puOrient->isVert = true; + if (numCTU % 2) + return SIZE_Nx2N; + } + } + return SIZE_2Nx2N; +} + void Encoder::readAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, int curPoc, int sliceType) { diff -r 7219376de42a -r 55eb39922995 source/encoder/encoder.h --- a/source/encoder/encoder.h Thu Feb 15 02:21:26 2018 -0800 +++ b/source/encoder/encoder.h Wed Feb 28 11:09:18 2018 +0530 @@ -90,6 +90,25 @@ RPSListNode* prior; }; +struct cuLocation +{ + bool skipWidth; + bool skipHeight; + uint32_t heightInCU; + uint32_t widthInCU; + size_t oddRowIndex; + size_t evenRowIndex; + uint32_t switchCondition; +}; + +struct puOrientation +{ + int isVert; + int isRect; + int isAmp; +}; + + class FrameEncoder; class DPB; class Lookahead; @@ -237,6 +256,10 @@ void readAnalysisFile(x265_analysis_data* analysis, int poc, const x265_picture* picIn); + int getCUIndex(cuLocation* cuLoc, size_t* count, int bytes, int flag); + + int 
getPuShape(puOrientation* puOrient, int partSize, int numCTU); + void writeAnalysisFile(x265_analysis_data* pic, FrameData &curEncData); void readAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, int poc, int sliceType); void writeAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, FrameData &curEncData, int slicetype);
# HG changeset patch # User Bhavna Hariharan <bha...@multicorewareinc.com> # Date 1519796358 -19800 # Wed Feb 28 11:09:18 2018 +0530 # Node ID 55eb3992299530de882829de0d3c0fea6d58b70d # Parent 7219376de42a1cc378ec957c886b511139d3c201 remove maxCTU size restriction in scaled save/load encodes The scaled save/load feature requires that the save encode has a maximum CTU size of 32. The 32x32 blocks are mapped to a 64x64 block in load encode. Due to this restriction we will be able to hierarchically encode only 3 resolutions. WxH - ctu 16 2Wx2H - ctu 32 4Wx4H - ctu 64 diff -r 7219376de42a -r 55eb39922995 source/encoder/encoder.cpp --- a/source/encoder/encoder.cpp Thu Feb 15 02:21:26 2018 -0800 +++ b/source/encoder/encoder.cpp Wed Feb 28 11:09:18 2018 +0530 @@ -3272,10 +3272,10 @@ #define X265_FREAD(val, size, readSize, fileOffset, src)\ if (!m_param->bUseAnalysisFile)\ - {\ + {\ memcpy(val, src, (size * readSize));\ - }\ - else if (fread(val, size, readSize, fileOffset) != readSize)\ + }\ + else if (fread(val, size, readSize, fileOffset) != readSize)\ {\ x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data\n");\ freeAnalysis(analysis);\ @@ -3334,10 +3334,37 @@ int scaledNumPartition = analysis->numPartitions; int factor = 1 << m_param->scaleFactor; + int numPartitions = analysis->numPartitions; + int numCUsInFrame = analysis->numCUsInFrame; + int extendedWidth, extendedHeight; + cuLocation cuLoc; + cuLoc.heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; + cuLoc.widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; + cuLoc.skipHeight = false; + cuLoc.skipWidth = false; + if (m_param->scaleFactor) - analysis->numPartitions *= factor; + { + /* Allocate memory for scaled resoultion's numPartitions and numCUsInFrame*/ + analysis->numPartitions = m_param->num4x4Partitions; + analysis->numCUsInFrame = cuLoc.heightInCU * cuLoc.widthInCU; + + /* Set skipWidth/skipHeight flags when the out of bound 
pixels in lowRes is greater than half of maxCUSize */ + extendedWidth = ((m_param->sourceWidth / 2 + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize) * m_param->maxCUSize; + extendedHeight = ((m_param->sourceHeight / 2 + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize) * m_param->maxCUSize; + uint32_t outOfBoundaryLowres = extendedWidth - m_param->sourceWidth / 2; + if (outOfBoundaryLowres * 2 >= m_param->maxCUSize) + cuLoc.skipWidth = true; + uint32_t outOfBoundaryLowresH = extendedHeight - m_param->sourceHeight / 2; + if (outOfBoundaryLowresH * 2 >= m_param->maxCUSize) + cuLoc.skipHeight = true; + } + /* Memory is allocated for inter and intra analysis data based on the slicetype */ allocAnalysis(analysis); + + analysis->numPartitions = numPartitions * factor; + analysis->numCUsInFrame = numCUsInFrame; if (m_param->bDisableLookahead && m_rateControl->m_isVbv) { X265_FREAD(analysis->lookahead.intraVbvCost, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFileIn, picData->lookahead.intraVbvCost); @@ -3345,6 +3372,11 @@ X265_FREAD(analysis->lookahead.satdForVbv, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFileIn, picData->lookahead.satdForVbv); X265_FREAD(analysis->lookahead.intraSatdForVbv, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFileIn, picData->lookahead.intraSatdForVbv); } + + cuLoc.evenRowIndex = 0; + cuLoc.oddRowIndex = m_param->num4x4Partitions * cuLoc.widthInCU; + cuLoc.switchCondition = 0; // To switch between odd and even rows + if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I) { if (m_param->analysisReuseLevel < 2) @@ -3365,17 +3397,30 @@ for (uint32_t d = 0; d < depthBytes; d++) { int bytes = analysis->numPartitions >> (depthBuf[d] * 2); + int numCTUCopied = 1; + if (m_param->scaleFactor) { - if (depthBuf[d] == 0) - depthBuf[d] = 1; + if (!depthBuf[d]) //copy data of one 64x64 to four scaled 64x64 CTUs. 
+ { + bytes /= 4; + numCTUCopied = 4; + } + if (partSizes[d] == SIZE_NxN) partSizes[d] = SIZE_2Nx2N; + if ((depthBuf[d] > 1 && m_param->maxCUSize == 64) || (depthBuf[d] && m_param->maxCUSize != 64)) + depthBuf[d]--; } - memset(&((analysis_intra_data *)analysis->intraData)->depth[count], depthBuf[d], bytes); - memset(&((analysis_intra_data *)analysis->intraData)->chromaModes[count], modeBuf[d], bytes); - memset(&((analysis_intra_data *)analysis->intraData)->partSizes[count], partSizes[d], bytes); - count += bytes; + for (int numCTU = 0; numCTU < numCTUCopied; numCTU++) + { + memset(&((analysis_intra_data *)analysis->intraData)->depth[count], depthBuf[d], bytes); + memset(&((analysis_intra_data *)analysis->intraData)->chromaModes[count], modeBuf[d], bytes); + memset(&((analysis_intra_data *)analysis->intraData)->partSizes[count], partSizes[d], bytes); + count += bytes; + if (m_param->scaleFactor) + d += getCUIndex(&cuLoc, &count, bytes, 1); + } } if (!m_param->scaleFactor) @@ -3384,10 +3429,18 @@ } else { + cuLoc.evenRowIndex = 0; + cuLoc.oddRowIndex = m_param->num4x4Partitions * cuLoc.widthInCU; + cuLoc.switchCondition = 0; uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition); X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFileIn, intraPic->modes); - for (uint32_t ctu32Idx = 0, cnt = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++, cnt += factor) + size_t cnt = 0; + for (uint32_t ctu32Idx = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++) + { memset(&((analysis_intra_data *)analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor); + cnt += factor; + ctu32Idx += getCUIndex(&cuLoc, &cnt, factor, 0); + } X265_FREE(tempLumaBuf); } X265_FREE(tempBuf); @@ -3452,43 +3505,95 @@ } size_t count = 0; + cuLoc.switchCondition = 0; for (uint32_t d = 0; d < depthBytes; d++) { int bytes = analysis->numPartitions >> (depthBuf[d] * 2); - if 
(m_param->scaleFactor && modeBuf[d] == MODE_INTRA && depthBuf[d] == 0) - depthBuf[d] = 1; - memset(&((analysis_inter_data *)analysis->interData)->depth[count], depthBuf[d], bytes); - memset(&((analysis_inter_data *)analysis->interData)->modes[count], modeBuf[d], bytes); - if (m_param->analysisReuseLevel > 4) + bool isScaledMaxCUSize = false; + int numCTUCopied = 1; + int writeDepth = depthBuf[d]; + if (m_param->scaleFactor) { - if (m_param->scaleFactor && modeBuf[d] == MODE_INTRA && partSize[d] == SIZE_NxN) - partSize[d] = SIZE_2Nx2N; - memset(&((analysis_inter_data *)analysis->interData)->partSize[count], partSize[d], bytes); - int numPU = (modeBuf[d] == MODE_INTRA) ? 1 : nbPartsTable[(int)partSize[d]]; - for (int pu = 0; pu < numPU; pu++) + if (!depthBuf[d]) //copy data of one 64x64 to four scaled 64x64 CTUs. { - if (pu) d++; - ((analysis_inter_data *)analysis->interData)->mergeFlag[count + pu] = mergeFlag[d]; - if (m_param->analysisReuseLevel == 10) + isScaledMaxCUSize = true; + bytes /= 4; + numCTUCopied = 4; + } + if ((modeBuf[d] != MODE_INTRA && depthBuf[d] != 0) || (modeBuf[d] == MODE_INTRA && depthBuf[d] > 1)) + writeDepth--; + } + + for (int numCTU = 0; numCTU < numCTUCopied; numCTU++) + { + memset(&((analysis_inter_data *)analysis->interData)->depth[count], writeDepth, bytes); + memset(&((analysis_inter_data *)analysis->interData)->modes[count], modeBuf[d], bytes); + if (m_param->analysisReuseLevel == 10 && bIntraInInter) + memset(&((analysis_intra_data *)analysis->intraData)->chromaModes[count], chromaDir[d], bytes); + + if (m_param->analysisReuseLevel > 4) + { + puOrientation puOrient; + puOrient.isRect = false; + puOrient.isAmp = false; + puOrient.isVert = false; + if (m_param->scaleFactor && modeBuf[d] == MODE_INTRA && partSize[d] == SIZE_NxN) + partSize[d] = SIZE_2Nx2N; + int partitionSize = partSize[d]; + if (isScaledMaxCUSize && partSize[d] != SIZE_2Nx2N) + partitionSize = getPuShape(&puOrient, partSize[d], numCTU); + memset(&((analysis_inter_data 
*)analysis->interData)->partSize[count], partitionSize, bytes); + int numPU = (modeBuf[d] == MODE_INTRA) ? 1 : nbPartsTable[(int)partSize[d]]; + for (int pu = 0; pu < numPU; pu++) { - ((analysis_inter_data *)analysis->interData)->interDir[count + pu] = interDir[d]; - for (uint32_t i = 0; i < numDir; i++) + if (!isScaledMaxCUSize && pu) + d++; + int restoreD = d; + /* Adjust d value when the current CTU takes data from 2nd PU */ + if (puOrient.isRect || (puOrient.isAmp && partitionSize == SIZE_2Nx2N)) { - ((analysis_inter_data *)analysis->interData)->mvpIdx[i][count + pu] = mvpIdx[i][d]; - ((analysis_inter_data *)analysis->interData)->refIdx[i][count + pu] = refIdx[i][d]; - if (m_param->scaleFactor) + if ((numCTU > 1 && !puOrient.isVert) || ((numCTU % 2 == 1) && puOrient.isVert)) + d++; + } + if (puOrient.isAmp && pu) + d++; + + ((analysis_inter_data *)analysis->interData)->mergeFlag[count + pu] = mergeFlag[d]; + if (m_param->analysisReuseLevel == 10) + { + ((analysis_inter_data *)analysis->interData)->interDir[count + pu] = interDir[d]; + for (uint32_t i = 0; i < numDir; i++) { - mv[i][d].x *= (int16_t)m_param->scaleFactor; - mv[i][d].y *= (int16_t)m_param->scaleFactor; + ((analysis_inter_data *)analysis->interData)->mvpIdx[i][count + pu] = mvpIdx[i][d]; + ((analysis_inter_data *)analysis->interData)->refIdx[i][count + pu] = refIdx[i][d]; + MV mvCopy[2]; + mvCopy[i].x = mv[i][d].x; + mvCopy[i].y = mv[i][d].y; + if (m_param->scaleFactor) + { + mvCopy[i].x = mv[i][d].x * (int16_t)m_param->scaleFactor; + mvCopy[i].y = mv[i][d].y * (int16_t)m_param->scaleFactor; + } + memcpy(&((analysis_inter_data *)analysis->interData)->mv[i][count + pu], &mvCopy[i], sizeof(MV)); } - memcpy(&((analysis_inter_data *)analysis->interData)->mv[i][count + pu], &mv[i][d], sizeof(MV)); + } + d = restoreD; // Restore d value after copying each of the 4 64x64 CTUs + + if (isScaledMaxCUSize && (puOrient.isRect || puOrient.isAmp)) + { + /* Skip PU index when current CTU is a 2Nx2N */ + if 
(partitionSize == SIZE_2Nx2N) + pu++; + /* Adjust d after completion of all 4 CTU copies */ + if (numCTU == 3 && (pu == (numPU - 1))) + d++; } } } - if (m_param->analysisReuseLevel == 10 && bIntraInInter) - memset(&((analysis_intra_data *)analysis->intraData)->chromaModes[count], chromaDir[d], bytes); + count += bytes; + if (m_param->scaleFactor) + d += getCUIndex(&cuLoc, &count, bytes, 1); } - count += bytes; } X265_FREE(tempBuf); @@ -3509,10 +3614,18 @@ } else { + cuLoc.evenRowIndex = 0; + cuLoc.oddRowIndex = m_param->num4x4Partitions * cuLoc.widthInCU; + cuLoc.switchCondition = 0; uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition); X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFileIn, intraPic->modes); - for (uint32_t ctu32Idx = 0, cnt = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++, cnt += factor) + size_t cnt = 0; + for (uint32_t ctu32Idx = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++) + { memset(&((analysis_intra_data *)analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor); + cnt += factor; + ctu32Idx += getCUIndex(&cuLoc, &cnt, factor, 0); + } X265_FREE(tempLumaBuf); } } @@ -3524,9 +3637,123 @@ if (numDir == 1) totalConsumedBytes = consumedBytes; } + + /* Restore to the current encode's numPartitions and numCUsInFrame */ + if (m_param->scaleFactor) + { + analysis->numPartitions = m_param->num4x4Partitions; + analysis->numCUsInFrame = cuLoc.heightInCU * cuLoc.widthInCU; + } #undef X265_FREAD } +/* Toggle between two consecutive CTU rows. 
The save's CTU is copied +twice consecutively in the first and second CTU row of load*/ + +int Encoder::getCUIndex(cuLocation* cuLoc, size_t* count, int bytes, int flag) +{ + int index = 0; + cuLoc->switchCondition += bytes; + int isBoundaryW = (*count % (m_param->num4x4Partitions * cuLoc->widthInCU) == 0); + + /* Width boundary case : + Skip to appropriate index when out of boundary cases occur + Out of boundary may occur when the out of bound pixels along + the width in low resoultion is greater than half of the maxCUSize */ + if (cuLoc->skipWidth && isBoundaryW) + { + if (flag) + index++; + else + { + /* Number of 4x4 blocks in out of bound region */ + int outOfBound = m_param->maxCUSize / 2; + uint32_t sum = (uint32_t)pow((outOfBound >> 2), 2); + index += sum; + } + cuLoc->switchCondition += m_param->num4x4Partitions; + } + + /* Completed writing 2 CTUs - move to the last remembered index of the next CTU row*/ + if (cuLoc->switchCondition == 2 * m_param->num4x4Partitions) + { + if (isBoundaryW) + cuLoc->evenRowIndex = *count + (m_param->num4x4Partitions * cuLoc->widthInCU); // end of row - skip to the next even row + else + cuLoc->evenRowIndex = *count; + *count = cuLoc->oddRowIndex; + + /* Height boundary case : + Skip to appropriate index when out of boundary cases occur + Out of boundary may occur when the out of bound pixels along + the height in low resoultion is greater than half of the maxCUSize */ + int isBoundaryH = (*count >= (m_param->num4x4Partitions * cuLoc->heightInCU * cuLoc->widthInCU)); + if (cuLoc->skipHeight && isBoundaryH) + { + if (flag) + index += 2; + else + { + int outOfBound = m_param->maxCUSize / 2; + uint32_t sum = (uint32_t)(2 * pow((abs(outOfBound) >> 2), 2)); + index += sum; + } + *count = cuLoc->evenRowIndex; + cuLoc->switchCondition = 0; + } + } + /* Completed writing 4 CTUs - move to the last remembered index of + the previous CTU row to copy the next save CTU's data*/ + else if (cuLoc->switchCondition == 4 * 
m_param->num4x4Partitions) + { + if (isBoundaryW) + cuLoc->oddRowIndex = *count + (m_param->num4x4Partitions * cuLoc->widthInCU); // end of row - skip to the next odd row + else + cuLoc->oddRowIndex = *count; + *count = cuLoc->evenRowIndex; + cuLoc->switchCondition = 0; + } + return index; +} + +/* save load + CTU0 CTU1 CTU2 CTU3 + 2NxN 2Nx2N 2Nx2N 2Nx2N 2Nx2N + NX2N 2Nx2N 2Nx2N 2Nx2N 2Nx2N + 2NxnU 2NxN 2NxN 2Nx2N 2Nx2N + 2NxnD 2Nx2N 2Nx2N 2NxN 2NxN + nLx2N Nx2N 2Nx2N Nx2N 2Nx2N + nRx2N 2Nx2N Nx2N 2Nx2N Nx2N +*/ +int Encoder::getPuShape(puOrientation* puOrient, int partSize, int numCTU) +{ + puOrient->isRect = true; + if (partSize == SIZE_Nx2N) + puOrient->isVert = true; + if (partSize >= SIZE_2NxnU) // All AMP modes + { + puOrient->isAmp = true; + puOrient->isRect = false; + if (partSize == SIZE_2NxnD && numCTU > 1) + return SIZE_2NxN; + else if (partSize == SIZE_2NxnU && numCTU < 2) + return SIZE_2NxN; + else if (partSize == SIZE_nLx2N) + { + puOrient->isVert = true; + if (!(numCTU % 2)) + return SIZE_Nx2N; + } + else if (partSize == SIZE_nRx2N) + { + puOrient->isVert = true; + if (numCTU % 2) + return SIZE_Nx2N; + } + } + return SIZE_2Nx2N; +} + void Encoder::readAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, int curPoc, int sliceType) { diff -r 7219376de42a -r 55eb39922995 source/encoder/encoder.h --- a/source/encoder/encoder.h Thu Feb 15 02:21:26 2018 -0800 +++ b/source/encoder/encoder.h Wed Feb 28 11:09:18 2018 +0530 @@ -90,6 +90,25 @@ RPSListNode* prior; }; +struct cuLocation +{ + bool skipWidth; + bool skipHeight; + uint32_t heightInCU; + uint32_t widthInCU; + size_t oddRowIndex; + size_t evenRowIndex; + uint32_t switchCondition; +}; + +struct puOrientation +{ + int isVert; + int isRect; + int isAmp; +}; + + class FrameEncoder; class DPB; class Lookahead; @@ -237,6 +256,10 @@ void readAnalysisFile(x265_analysis_data* analysis, int poc, const x265_picture* picIn); + int getCUIndex(cuLocation* cuLoc, size_t* count, int bytes, int flag); + + int 
getPuShape(puOrientation* puOrient, int partSize, int numCTU); + void writeAnalysisFile(x265_analysis_data* pic, FrameData &curEncData); void readAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, int poc, int sliceType); void writeAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, FrameData &curEncData, int slicetype);
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel