Pushed to the default branch. On Wed, Jul 10, 2019 at 9:47 AM <[email protected]> wrote:
> # HG changeset patch > # User Pooja Venkatesan <[email protected]> > # Date 1562562567 -19800 > # Mon Jul 08 10:39:27 2019 +0530 > # Node ID 2dcff9aea06f0f1c396fd2a62104e4fd5029bf40 > # Parent 14a235657a2011aa28d45544f33b7186c33b9218 > motion: Perform ME on each HME level > > This patch does the following: > 1) Perform level-0 ME > 2) Use the MVs as predictor for next level ME > 3) Restrict full-search within a range when HME is enabled > > diff -r 14a235657a20 -r 2dcff9aea06f source/common/lowres.cpp > --- a/source/common/lowres.cpp Fri Jul 05 11:17:26 2019 +0530 > +++ b/source/common/lowres.cpp Mon Jul 08 10:39:27 2019 +0530 > @@ -65,6 +65,7 @@ > maxBlocksInColFullRes = maxBlocksInCol * 2; > int cuCount = maxBlocksInRow * maxBlocksInCol; > int cuCountFullRes = (qgSize > 8) ? cuCount : cuCount << 2; > + isHMELowres = param->bEnableHME ? 1 : 0; > > /* rounding the width to multiple of lowres CU size */ > width = maxBlocksInRow * X265_LOWRES_CU_SIZE; > @@ -176,6 +177,16 @@ > CHECKED_MALLOC(lowresMvs[1][i], MV, cuCount); > CHECKED_MALLOC(lowresMvCosts[0][i], int32_t, cuCount); > CHECKED_MALLOC(lowresMvCosts[1][i], int32_t, cuCount); > + if (bEnableHME) > + { > + int maxBlocksInRowLowerRes = ((width/2) + X265_LOWRES_CU_SIZE > - 1) >> X265_LOWRES_CU_BITS; > + int maxBlocksInColLowerRes = ((lines/2) + X265_LOWRES_CU_SIZE > - 1) >> X265_LOWRES_CU_BITS; > + int cuCountLowerRes = maxBlocksInRowLowerRes * > maxBlocksInColLowerRes; > + CHECKED_MALLOC(lowerResMvs[0][i], MV, cuCountLowerRes); > + CHECKED_MALLOC(lowerResMvs[1][i], MV, cuCountLowerRes); > + CHECKED_MALLOC(lowerResMvCosts[0][i], int32_t, > cuCountLowerRes); > + CHECKED_MALLOC(lowerResMvCosts[1][i], int32_t, > cuCountLowerRes); > + } > } > > return true; > @@ -207,6 +218,13 @@ > X265_FREE(lowresMvs[1][i]); > X265_FREE(lowresMvCosts[0][i]); > X265_FREE(lowresMvCosts[1][i]); > + if (bEnableHME) > + { > + X265_FREE(lowerResMvs[0][i]); > + X265_FREE(lowerResMvs[1][i]); > + X265_FREE(lowerResMvCosts[0][i]); > + 
X265_FREE(lowerResMvCosts[1][i]); > + } > } > X265_FREE(qpAqOffset); > X265_FREE(invQscaleFactor); > diff -r 14a235657a20 -r 2dcff9aea06f source/common/lowres.h > --- a/source/common/lowres.h Fri Jul 05 11:17:26 2019 +0530 > +++ b/source/common/lowres.h Mon Jul 08 10:39:27 2019 +0530 > @@ -46,6 +46,7 @@ > > bool isWeighted; > bool isLowres; > + bool isHMELowres; > > intptr_t lumaStride; > intptr_t chromaStride; > @@ -63,46 +64,58 @@ > > /* lowres motion compensation, you must provide a buffer and stride > for QPEL averaged pixels > * in case QPEL is required. Else it returns a pointer to the HPEL > pixels */ > - inline pixel *lowresMC(intptr_t blockOffset, const MV& qmv, pixel > *buf, intptr_t& outstride) > + inline pixel *lowresMC(intptr_t blockOffset, const MV& qmv, pixel > *buf, intptr_t& outstride, bool hme) > { > + intptr_t YStride = hme ? lumaStride / 2 : lumaStride; > + pixel *plane[4]; > + for (int i = 0; i < 4; i++) > + { > + plane[i] = hme ? lowerResPlane[i] : lowresPlane[i]; > + } > if ((qmv.x | qmv.y) & 1) > { > int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1); > - pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> > 2) + (qmv.y >> 2) * lumaStride; > + pixel *frefA = plane[hpelA] + blockOffset + (qmv.x >> 2) + > (qmv.y >> 2) * YStride; > int qmvx = qmv.x + (qmv.x & 1); > int qmvy = qmv.y + (qmv.y & 1); > int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1); > - pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) > + (qmvy >> 2) * lumaStride; > - primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) && > (lumaStride % 64 == 0)](buf, outstride, frefA, lumaStride, frefB, > lumaStride, 32); > + pixel *frefB = plane[hpelB] + blockOffset + (qmvx >> 2) + > (qmvy >> 2) * YStride; > + primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) && > (YStride % 64 == 0)](buf, outstride, frefA, YStride, frefB, YStride, 32); > return buf; > } > else > { > - outstride = lumaStride; > + outstride = YStride; > int hpel = (qmv.y & 2) | ((qmv.x & 2) >> 1); > - 
return lowresPlane[hpel] + blockOffset + (qmv.x >> 2) + > (qmv.y >> 2) * lumaStride; > + return plane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> > 2) * YStride; > } > } > > - inline int lowresQPelCost(pixel *fenc, intptr_t blockOffset, const > MV& qmv, pixelcmp_t comp) > + inline int lowresQPelCost(pixel *fenc, intptr_t blockOffset, const > MV& qmv, pixelcmp_t comp, bool hme) > { > + intptr_t YStride = hme ? lumaStride / 2 : lumaStride; > + pixel *plane[4]; > + for (int i = 0; i < 4; i++) > + { > + plane[i] = hme ? lowerResPlane[i] : lowresPlane[i]; > + } > if ((qmv.x | qmv.y) & 1) > { > ALIGN_VAR_16(pixel, subpelbuf[8 * 8]); > int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1); > - pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> > 2) + (qmv.y >> 2) * lumaStride; > + pixel *frefA = plane[hpelA] + blockOffset + (qmv.x >> 2) + > (qmv.y >> 2) * YStride; > int qmvx = qmv.x + (qmv.x & 1); > int qmvy = qmv.y + (qmv.y & 1); > int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1); > - pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) > + (qmvy >> 2) * lumaStride; > - primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8, > frefA, lumaStride, frefB, lumaStride, 32); > + pixel *frefB = plane[hpelB] + blockOffset + (qmvx >> 2) + > (qmvy >> 2) * YStride; > + primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8, > frefA, YStride, frefB, YStride, 32); > return comp(fenc, FENC_STRIDE, subpelbuf, 8); > } > else > { > int hpel = (qmv.y & 2) | ((qmv.x & 2) >> 1); > - pixel *fref = lowresPlane[hpel] + blockOffset + (qmv.x >> 2) > + (qmv.y >> 2) * lumaStride; > - return comp(fenc, FENC_STRIDE, fref, lumaStride); > + pixel *fref = plane[hpel] + blockOffset + (qmv.x >> 2) + > (qmv.y >> 2) * YStride; > + return comp(fenc, FENC_STRIDE, fref, YStride); > } > } > }; > @@ -188,6 +201,8 @@ > > /* Hierarchical Motion Estimation */ > bool bEnableHME; > + int32_t* lowerResMvCosts[2][X265_BFRAME_MAX + 2]; > + MV* lowerResMvs[2][X265_BFRAME_MAX + 2]; > > /* used for 
vbvLookahead */ > int plannedType[X265_LOOKAHEAD_MAX + 1]; > diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/encoder.cpp > --- a/source/encoder/encoder.cpp Fri Jul 05 11:17:26 2019 +0530 > +++ b/source/encoder/encoder.cpp Mon Jul 08 10:39:27 2019 +0530 > @@ -3387,6 +3387,10 @@ > x265_log(p, X265_LOG_WARNING, "Source height < 540p is too > low for HME. Disabling HME.\n"); > p->bEnableHME = 0; > } > + if (m_param->bEnableHME && m_param->searchMethod != > m_param->hmeSearchMethod[2]) > + { > + m_param->searchMethod = m_param->hmeSearchMethod[2]; > + } > } > } > > diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/motion.cpp > --- a/source/encoder/motion.cpp Fri Jul 05 11:17:26 2019 +0530 > +++ b/source/encoder/motion.cpp Mon Jul 08 10:39:27 2019 +0530 > @@ -104,6 +104,8 @@ > ctuAddr = -1; > absPartIdx = -1; > searchMethod = X265_HEX_SEARCH; > + searchMethodL0 = X265_HEX_SEARCH; > + searchMethodL1 = X265_HEX_SEARCH; > subpelRefine = 2; > blockwidth = blockheight = 0; > blockOffset = 0; > @@ -162,7 +164,7 @@ > } > > /* Called by lookahead, luma only, no use of PicYuv */ > -void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t > offset, int pwidth, int pheight, const int method, const int refine) > +void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t > offset, int pwidth, int pheight, const int method, const int searchL0, > const int searchL1, const int refine) > { > partEnum = partitionFromSizes(pwidth, pheight); > X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n"); > @@ -179,6 +181,8 @@ > > /* Search params */ > searchMethod = method; > + searchMethodL0 = searchL0; > + searchMethodL1 = searchL1; > subpelRefine = refine; > > /* copy PU block into cache */ > @@ -743,9 +747,10 @@ > pixel * srcReferencePlane) > { > ALIGN_VAR_16(int, costs[16]); > + bool hme = srcReferencePlane && srcReferencePlane == > ref->fpelLowerResPlane[0]; > if (ctuAddr >= 0) > blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) 
- > ref->reconPic->getLumaAddr(0); > - intptr_t stride = ref->lumaStride; > + intptr_t stride = hme ? ref->lumaStride / 2 : ref->lumaStride; > pixel* fenc = fencPUYuv.m_buf[0]; > pixel* fref = srcReferencePlane == 0 ? ref->fpelPlane[0] + > blockOffset : srcReferencePlane + blockOffset; > > @@ -767,7 +772,7 @@ > int bprecost; > > if (ref->isLowres) > - bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad); > + bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad, hme); > else > bprecost = subpelCompare(ref, pmv, sad); > > @@ -808,7 +813,8 @@ > pmv = pmv.roundToFPel(); > MV omv = bmv; // current search origin or starting point > > - switch (searchMethod) > + int search = ref->isHMELowres ? (hme ? searchMethodL0 : > searchMethodL1) : searchMethod; > + switch (search) > { > case X265_DIA_SEARCH: > { > @@ -1391,11 +1397,20 @@ > { > // dead slow exhaustive search, but at least it uses sad_x4() > MV tmv; > - for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y++) > + int32_t mvmin_y = mvmin.y, mvmin_x = mvmin.x, mvmax_y = mvmax.y, > mvmax_x = mvmax.x; > + if (ref->isHMELowres) > { > - for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x++) > + merange = (merange < 0 ? 
-merange : merange); > + mvmin_y = X265_MAX(mvmin.y, -merange); > + mvmin_x = X265_MAX(mvmin.x, -merange); > + mvmax_y = X265_MIN(mvmax.y, merange); > + mvmax_x = X265_MIN(mvmax.x, merange); > + } > + for (tmv.y = mvmin_y; tmv.y <= mvmax_y; tmv.y++) > + { > + for (tmv.x = mvmin_x; tmv.x <= mvmax_x; tmv.x++) > { > - if (tmv.x + 3 <= mvmax.x) > + if (tmv.x + 3 <= mvmax_x) > { > pixel *pix_base = fref + tmv.y * stride + tmv.x; > sad_x4(fenc, > @@ -1463,12 +1478,12 @@ > if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y)) > continue; > > - int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + > mvcost(qmv); > + int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad, > hme) + mvcost(qmv); > COPY2_IF_LT(bcost, cost, bdir, i); > } > > bmv += square1[bdir] * 2; > - bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + > mvcost(bmv); > + bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd, hme) + > mvcost(bmv); > > bdir = 0; > for (int i = 1; i <= wl.qpel_dirs; i++) > @@ -1479,7 +1494,7 @@ > if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y)) > continue; > > - int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) > + mvcost(qmv); > + int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd, > hme) + mvcost(qmv); > COPY2_IF_LT(bcost, cost, bdir, i); > } > > diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/motion.h > --- a/source/encoder/motion.h Fri Jul 05 11:17:26 2019 +0530 > +++ b/source/encoder/motion.h Mon Jul 08 10:39:27 2019 +0530 > @@ -44,6 +44,8 @@ > int absPartIdx; // part index of PU, including CU offset within CTU > > int searchMethod; > + int searchMethodL0; > + int searchMethodL1; > int subpelRefine; > > int blockwidth; > @@ -76,7 +78,7 @@ > > /* Methods called at slice setup */ > > - void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int > pwidth, int pheight, const int searchMethod, const int subpelRefine); > + void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int > pwidth, int pheight, const int searchMethod, 
const int searchL0, const int > searchL1, const int subpelRefine); > void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, > int puPartIdx, int pwidth, int pheight, const int searchMethod, const int > subpelRefine, bool bChroma); > > /* buf*() and motionEstimate() methods all use cached fenc pixels and > thus > diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/search.cpp > --- a/source/encoder/search.cpp Fri Jul 05 11:17:26 2019 +0530 > +++ b/source/encoder/search.cpp Mon Jul 08 10:39:27 2019 +0530 > @@ -2096,13 +2096,16 @@ > > const MV* amvp = interMode.amvpCand[list][ref]; > int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref); > - MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx]; > + bool bLowresMVP = false; > + MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres; > > if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents > load/save outputs from diverging if lowresMV is not available */ > { > MV lmv = getLowresMV(interMode.cu, pu, list, ref); > if (lmv.notZero()) > mvc[numMvc++] = lmv; > + if (m_param->bEnableHME) > + mvp_lowres = lmv; > } > > setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax); > @@ -2110,11 +2113,28 @@ > int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], > mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, > m_param->maxSlices, > m_param->bSourceReferenceEstimation ? > m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0); > > + if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp) > + { > + MV outmv_lowres; > + setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange, > mvmin, mvmax); > + int lowresMvCost = > m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, > numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices, > + m_param->bSourceReferenceEstimation ? 
> m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0); > + if (lowresMvCost < satdCost) > + { > + outmv = outmv_lowres; > + satdCost = lowresMvCost; > + bLowresMVP = true; > + } > + } > /* Get total cost of partition, but only include MV bit cost once */ > bits += m_me.bitcost(outmv); > uint32_t mvCost = m_me.mvcost(outmv); > uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits); > > + /* Update LowresMVP to best AMVP cand*/ > + if (bLowresMVP) > + updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres); > + > /* Refine MVP selection, updates: mvpIdx, bits, cost */ > mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost); > > @@ -2346,13 +2366,16 @@ > > const MV* amvp = interMode.amvpCand[list][ref]; > int mvpIdx = selectMVP(cu, pu, amvp, list, ref); > - MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx]; > + MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], > mvp_lowres; > + bool bLowresMVP = false; > > if (!m_param->analysisSave && !m_param->analysisLoad) > /* Prevents load/save outputs from diverging when lowresMV is not available > */ > { > MV lmv = getLowresMV(cu, pu, list, ref); > if (lmv.notZero()) > mvc[numMvc++] = lmv; > + if (m_param->bEnableHME) > + mvp_lowres = lmv; > } > if (m_param->searchMethod == X265_SEA) > { > @@ -2365,10 +2388,27 @@ > int satdCost = > m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, > mvc, m_param->searchRange, outmv, m_param->maxSlices, > m_param->bSourceReferenceEstimation ? > m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0); > > + if (m_param->bEnableHME && mvp_lowres.notZero() && > mvp_lowres != mvp) > + { > + MV outmv_lowres; > + setSearchRange(cu, mvp_lowres, > m_param->searchRange, mvmin, mvmax); > + int lowresMvCost = > m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, > numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices, > + m_param->bSourceReferenceEstimation ? 
> m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0); > + if (lowresMvCost < satdCost) > + { > + outmv = outmv_lowres; > + satdCost = lowresMvCost; > + bLowresMVP = true; > + } > + } > + > /* Get total cost of partition, but only include MV > bit cost once */ > bits += m_me.bitcost(outmv); > uint32_t mvCost = m_me.mvcost(outmv); > uint32_t cost = (satdCost - mvCost) + > m_rdCost.getCost(bits); > + /* Update LowresMVP to best AMVP cand*/ > + if (bLowresMVP) > + updateMVP(amvp[mvpIdx], outmv, bits, cost, > mvp_lowres); > > /* Refine MVP selection, updates: mvpIdx, bits, cost > */ > mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost); > @@ -2631,6 +2671,15 @@ > return amvpCand[mvpIdx]; > } > > +/* Update to default MVP when using an alternative mvp */ > +void Search::updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, > uint32_t& outCost, const MV& alterMVP) > +{ > + int diffBits = m_me.bitcost(mv, amvp) - m_me.bitcost(mv, alterMVP); > + uint32_t origOutBits = outBits; > + outBits = origOutBits + diffBits; > + outCost = (outCost - m_rdCost.getCost(origOutBits)) + > m_rdCost.getCost(outBits); > +} > + > void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange, > MV& mvmin, MV& mvmax) const > { > MV dist((int32_t)merange << 2, (int32_t)merange << 2); > diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/search.h > --- a/source/encoder/search.h Fri Jul 05 11:17:26 2019 +0530 > +++ b/source/encoder/search.h Mon Jul 08 10:39:27 2019 +0530 > @@ -425,6 +425,7 @@ > void setSearchRange(const CUData& cu, const MV& mvp, int merange, > MV& mvmin, MV& mvmax) const; > uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, const > PredictionUnit& pu, int puIdx, MergeData& m); > static void getBlkBits(PartSize cuMode, bool bPSlice, int puIdx, > uint32_t lastMode, uint32_t blockBit[3]); > + void updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, > uint32_t& outCost, const MV& alterMVP); > > /* intra helper functions */ > enum { 
MAX_RD_INTRA_MODES = 16 }; > diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/slicetype.cpp > --- a/source/encoder/slicetype.cpp Fri Jul 05 11:17:26 2019 +0530 > +++ b/source/encoder/slicetype.cpp Mon Jul 08 10:39:27 2019 +0530 > @@ -664,6 +664,7 @@ > weightedRef.lumaStride = fenc.lumaStride; > weightedRef.isLowres = true; > weightedRef.isWeighted = false; > + weightedRef.isHMELowres = ref.bEnableHME; > > /* epsilon is chosen to require at least a numerator of 127 (with > denominator = 128) */ > float guessScale, fencMean, refMean; > @@ -759,6 +760,8 @@ > m_extendGopBoundary = false; > m_8x8Height = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) > >> X265_LOWRES_CU_BITS; > m_8x8Width = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) > >> X265_LOWRES_CU_BITS; > + m_4x4Height = ((m_param->sourceHeight / 4) + X265_LOWRES_CU_SIZE - 1) > >> X265_LOWRES_CU_BITS; > + m_4x4Width = ((m_param->sourceWidth / 4) + X265_LOWRES_CU_SIZE - 1) > >> X265_LOWRES_CU_BITS; > m_cuCount = m_8x8Width * m_8x8Height; > m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_cuCount + 4 - 2 > * (m_8x8Width + m_8x8Height)) : m_cuCount; > m_isFadeIn = false; > @@ -2782,16 +2785,32 @@ > > X265_CHECK(i < MAX_COOP_SLICES, "impossible number of coop > slices\n"); > > - int firstY = m_lookahead.m_numRowsPerSlice * i; > - int lastY = (i == m_jobTotal - 1) ? m_lookahead.m_8x8Height - > 1 : m_lookahead.m_numRowsPerSlice * (i + 1) - 1; > - > - bool lastRow = true; > + int firstY, lastY; > + bool lastRow; > + if (m_lookahead.m_param->bEnableHME) > + { > + int numRowsPerSlice = m_lookahead.m_4x4Height / > m_lookahead.m_param->lookaheadSlices; > + numRowsPerSlice = X265_MIN(X265_MAX(numRowsPerSlice, 5), > m_lookahead.m_4x4Height); > + firstY = numRowsPerSlice * i; > + lastY = (i == m_jobTotal - 1) ? 
m_lookahead.m_4x4Height - > 1 : numRowsPerSlice * (i + 1) - 1; > + lastRow = true; > + for (int cuY = lastY; cuY >= firstY; cuY--) > + { > + for (int cuX = m_lookahead.m_4x4Width - 1; cuX >= 0; > cuX--) > + estimateCUCost(tld, cuX, cuY, m_coop.p0, > m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i, 1); > + lastRow = false; > + } > + } > + > + firstY = m_lookahead.m_numRowsPerSlice * i; > + lastY = (i == m_jobTotal - 1) ? m_lookahead.m_8x8Height - 1 : > m_lookahead.m_numRowsPerSlice * (i + 1) - 1; > + lastRow = true; > for (int cuY = lastY; cuY >= firstY; cuY--) > { > m_frames[m_coop.b]->rowSatds[m_coop.b - > m_coop.p0][m_coop.p1 - m_coop.b][cuY] = 0; > > for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0; > cuX--) > - estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, > m_coop.b, m_coop.bDoSearch, lastRow, i); > + estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, > m_coop.b, m_coop.bDoSearch, lastRow, i, 0); > > lastRow = false; > } > @@ -2864,13 +2883,25 @@ > } > else > { > - bool lastRow = true; > + /* Calculate MVs for 1/16th resolution*/ > + bool lastRow; > + if (param->bEnableHME) > + { > + lastRow = true; > + for (int cuY = m_lookahead.m_4x4Height - 1; cuY >= 0; > cuY--) > + { > + for (int cuX = m_lookahead.m_4x4Width - 1; cuX >= 0; > cuX--) > + estimateCUCost(tld, cuX, cuY, p0, p1, b, > bDoSearch, lastRow, -1, 1); > + lastRow = false; > + } > + } > + lastRow = true; > for (int cuY = m_lookahead.m_8x8Height - 1; cuY >= 0; cuY--) > { > fenc->rowSatds[b - p0][p1 - b][cuY] = 0; > > for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0; > cuX--) > - estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, > lastRow, -1); > + estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, > lastRow, -1, 0); > > lastRow = false; > } > @@ -2891,23 +2922,27 @@ > return score; > } > > -void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int > cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice) > +void 
CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int > cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice, > bool hme) > { > Lowres *fref0 = m_frames[p0]; > Lowres *fref1 = m_frames[p1]; > Lowres *fenc = m_frames[b]; > > - ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted ? > &fenc->weightedRef[b - p0] : fref0; > - > - const int widthInCU = m_lookahead.m_8x8Width; > - const int heightInCU = m_lookahead.m_8x8Height; > + ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted && > !hme ? &fenc->weightedRef[b - p0] : fref0; > + > + const int widthInCU = hme ? m_lookahead.m_4x4Width : > m_lookahead.m_8x8Width; > + const int heightInCU = hme ? m_lookahead.m_4x4Height : > m_lookahead.m_8x8Height; > const int bBidir = (b < p1); > const int cuXY = cuX + cuY * widthInCU; > + const int cuXY_4x4 = (cuX / 2) + (cuY / 2) * widthInCU / 2; > const int cuSize = X265_LOWRES_CU_SIZE; > - const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * > fenc->lumaStride; > - > - if (bBidir || bDoSearch[0] || bDoSearch[1]) > - tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, > pelOffset, cuSize, cuSize, X265_HEX_SEARCH, 1); > + const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * (hme ? > fenc->lumaStride/2 : fenc->lumaStride); > + > + if ((bBidir || bDoSearch[0] || bDoSearch[1]) && hme) > + tld.me.setSourcePU(fenc->lowerResPlane[0], fenc->lumaStride / 2, > pelOffset, cuSize, cuSize, X265_HEX_SEARCH, > m_lookahead.m_param->hmeSearchMethod[0], > m_lookahead.m_param->hmeSearchMethod[1], 1); > + else if((bBidir || bDoSearch[0] || bDoSearch[1]) && !hme) > + tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, > pelOffset, cuSize, cuSize, X265_HEX_SEARCH, > m_lookahead.m_param->hmeSearchMethod[0], > m_lookahead.m_param->hmeSearchMethod[1], 1); > + > > /* A small, arbitrary bias to avoid VBV problems caused by > zero-residual lookahead blocks. 
*/ > int lowresPenalty = 4; > @@ -2926,7 +2961,7 @@ > > for (int i = 0; i < 1 + bBidir; i++) > { > - int& fencCost = fenc->lowresMvCosts[i][listDist[i]][cuXY]; > + int& fencCost = hme ? fenc->lowerResMvCosts[i][listDist[i]][cuXY] > : fenc->lowresMvCosts[i][listDist[i]][cuXY]; > int skipCost = INT_MAX; > > if (!bDoSearch[i]) > @@ -2936,8 +2971,8 @@ > } > > int numc = 0; > - MV mvc[4], mvp; > - MV* fencMV = &fenc->lowresMvs[i][listDist[i]][cuXY]; > + MV mvc[5], mvp; > + MV* fencMV = hme ? &fenc->lowerResMvs[i][listDist[i]][cuXY] : > &fenc->lowresMvs[i][listDist[i]][cuXY]; > ReferencePlanes* fref = i ? fref1 : wfref0; > > /* Reverse-order MV prediction */ > @@ -2952,6 +2987,10 @@ > if (cuX < widthInCU - 1) > MVC(fencMV[widthInCU + 1]); > } > + if (fenc->lowerResMvs[0][0] && !hme && > fenc->lowerResMvCosts[i][listDist[i]][cuXY_4x4] > 0) > + { > + MVC((fenc->lowerResMvs[i][listDist[i]][cuXY_4x4]) * 2); > + } > #undef MVC > > if (!numc) > @@ -2967,7 +3006,7 @@ > for (int idx = 0; idx < numc; idx++) > { > intptr_t stride = X265_LOWRES_CU_SIZE; > - pixel *src = fref->lowresMC(pelOffset, mvc[idx], > subpelbuf, stride); > + pixel *src = fref->lowresMC(pelOffset, mvc[idx], > subpelbuf, stride, hme); > int cost = tld.me.bufSATD(src, stride); > COPY2_IF_LT(mvpcost, cost, mvp, mvc[idx]); > /* Except for mv0 case, everyting else is likely to have > enough residual to not trigger the skip. 
*/ > @@ -2978,7 +3017,10 @@ > > /* ME will never return a cost larger than the cost @MVP, so we > do not > * have to check that ME cost is more than the estimated merge > cost */ > - fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, > NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices); > + if(!hme) > + fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, > NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices); > + else > + fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, > NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices, > fref->lowerResPlane[0]); > if (skipCost < 64 && skipCost < fencCost && bBidir) > { > fencCost = skipCost; > @@ -2986,6 +3028,8 @@ > } > COPY2_IF_LT(bcost, fencCost, listused, i + 1); > } > + if (hme) > + return; > > if (bBidir) /* B, also consider bidir */ > { > @@ -2995,8 +3039,8 @@ > ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * > X265_LOWRES_CU_SIZE]); > ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * > X265_LOWRES_CU_SIZE]); > intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = > X265_LOWRES_CU_SIZE; > - pixel *src0 = fref0->lowresMC(pelOffset, > fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0); > - pixel *src1 = fref1->lowresMC(pelOffset, > fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1); > + pixel *src0 = fref0->lowresMC(pelOffset, > fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0, 0); > + pixel *src1 = fref1->lowresMC(pelOffset, > fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1, 0); > ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * > X265_LOWRES_CU_SIZE]); > primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, > X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32); > int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE); > diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/slicetype.h > --- a/source/encoder/slicetype.h Fri Jul 05 11:17:26 2019 +0530 > +++ b/source/encoder/slicetype.h Mon Jul 08 10:39:27 2019 +0530 > @@ -124,6 +124,10 
@@ > int m_inputCount; > double m_cuTreeStrength; > > + /* HME */ > + int m_4x4Width; > + int m_4x4Height; > + > bool m_isActive; > bool m_sliceTypeBusy; > bool m_bAdaptiveQuant; > @@ -246,7 +250,7 @@ > void processTasks(int workerThreadID); > > int64_t estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b, > bool intraPenalty); > - void estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0, > int p1, int b, bool bDoSearch[2], bool lastRow, int slice); > + void estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0, > int p1, int b, bool bDoSearch[2], bool lastRow, int slice, bool hme); > > CostEstimateGroup& operator=(const CostEstimateGroup&); > }; > diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/weightPrediction.cpp > --- a/source/encoder/weightPrediction.cpp Fri Jul 05 11:17:26 2019 > +0530 > +++ b/source/encoder/weightPrediction.cpp Mon Jul 08 10:39:27 2019 > +0530 > @@ -82,7 +82,7 @@ > /* clip MV to available pixels */ > MV mv = mvs[cu]; > mv = mv.clipped(mvmin, mvmax); > - pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride); > + pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride, 0); > primitives.cu[BLOCK_8x8].copy_pp(mcout + pixoff, stride, > tmp, bstride); > } > } > _______________________________________________ > x265-devel mailing list > [email protected] > https://mailman.videolan.org/listinfo/x265-devel > -- Regards, Aruna
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
