Re: [x265] [PATCH 2 of 2] motion: Perform ME on each HME level

Aruna Matheswaran Thu, 11 Jul 2019 06:09:44 -0700

Pushed to default.

On Wed, Jul 10, 2019 at 9:47 AM <[email protected]> wrote:


> # HG changeset patch
> # User Pooja Venkatesan <[email protected]>
> # Date 1562562567 -19800
> #      Mon Jul 08 10:39:27 2019 +0530
> # Node ID 2dcff9aea06f0f1c396fd2a62104e4fd5029bf40
> # Parent  14a235657a2011aa28d45544f33b7186c33b9218
> motion: Perform ME on each HME level
>
> This patch does the following:
> 1) Perform level-0 ME
> 2) Use the MVs as predictor for next level ME
> 3) Restrict full-search within a range when HME is enabled
>
> diff -r 14a235657a20 -r 2dcff9aea06f source/common/lowres.cpp
> --- a/source/common/lowres.cpp  Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/common/lowres.cpp  Mon Jul 08 10:39:27 2019 +0530
> @@ -65,6 +65,7 @@
>      maxBlocksInColFullRes = maxBlocksInCol * 2;
>      int cuCount = maxBlocksInRow * maxBlocksInCol;
>      int cuCountFullRes = (qgSize > 8) ? cuCount : cuCount << 2;
> +    isHMELowres = param->bEnableHME ? 1 : 0;
>
>      /* rounding the width to multiple of lowres CU size */
>      width = maxBlocksInRow * X265_LOWRES_CU_SIZE;
> @@ -176,6 +177,16 @@
>          CHECKED_MALLOC(lowresMvs[1][i], MV, cuCount);
>          CHECKED_MALLOC(lowresMvCosts[0][i], int32_t, cuCount);
>          CHECKED_MALLOC(lowresMvCosts[1][i], int32_t, cuCount);
> +        if (bEnableHME)
> +        {
> +            int maxBlocksInRowLowerRes = ((width/2) + X265_LOWRES_CU_SIZE
> - 1) >> X265_LOWRES_CU_BITS;
> +            int maxBlocksInColLowerRes = ((lines/2) + X265_LOWRES_CU_SIZE
> - 1) >> X265_LOWRES_CU_BITS;
> +            int cuCountLowerRes = maxBlocksInRowLowerRes *
> maxBlocksInColLowerRes;
> +            CHECKED_MALLOC(lowerResMvs[0][i], MV, cuCountLowerRes);
> +            CHECKED_MALLOC(lowerResMvs[1][i], MV, cuCountLowerRes);
> +            CHECKED_MALLOC(lowerResMvCosts[0][i], int32_t,
> cuCountLowerRes);
> +            CHECKED_MALLOC(lowerResMvCosts[1][i], int32_t,
> cuCountLowerRes);
> +        }
>      }
>
>      return true;
> @@ -207,6 +218,13 @@
>          X265_FREE(lowresMvs[1][i]);
>          X265_FREE(lowresMvCosts[0][i]);
>          X265_FREE(lowresMvCosts[1][i]);
> +        if (bEnableHME)
> +        {
> +            X265_FREE(lowerResMvs[0][i]);
> +            X265_FREE(lowerResMvs[1][i]);
> +            X265_FREE(lowerResMvCosts[0][i]);
> +            X265_FREE(lowerResMvCosts[1][i]);
> +        }
>      }
>      X265_FREE(qpAqOffset);
>      X265_FREE(invQscaleFactor);
> diff -r 14a235657a20 -r 2dcff9aea06f source/common/lowres.h
> --- a/source/common/lowres.h    Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/common/lowres.h    Mon Jul 08 10:39:27 2019 +0530
> @@ -46,6 +46,7 @@
>
>      bool     isWeighted;
>      bool     isLowres;
> +    bool     isHMELowres;
>
>      intptr_t lumaStride;
>      intptr_t chromaStride;
> @@ -63,46 +64,58 @@
>
>      /* lowres motion compensation, you must provide a buffer and stride
> for QPEL averaged pixels
>       * in case QPEL is required.  Else it returns a pointer to the HPEL
> pixels */
> -    inline pixel *lowresMC(intptr_t blockOffset, const MV& qmv, pixel
> *buf, intptr_t& outstride)
> +    inline pixel *lowresMC(intptr_t blockOffset, const MV& qmv, pixel
> *buf, intptr_t& outstride, bool hme)
>      {
> +        intptr_t YStride = hme ? lumaStride / 2 : lumaStride;
> +        pixel *plane[4];
> +        for (int i = 0; i < 4; i++)
> +        {
> +            plane[i] = hme ? lowerResPlane[i] : lowresPlane[i];
> +        }
>          if ((qmv.x | qmv.y) & 1)
>          {
>              int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);
> -            pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >>
> 2) + (qmv.y >> 2) * lumaStride;
> +            pixel *frefA = plane[hpelA] + blockOffset + (qmv.x >> 2) +
> (qmv.y >> 2) * YStride;
>              int qmvx = qmv.x + (qmv.x & 1);
>              int qmvy = qmv.y + (qmv.y & 1);
>              int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
> -            pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2)
> + (qmvy >> 2) * lumaStride;
> -            primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) &&
> (lumaStride % 64 == 0)](buf, outstride, frefA, lumaStride, frefB,
> lumaStride, 32);
> +            pixel *frefB = plane[hpelB] + blockOffset + (qmvx >> 2) +
> (qmvy >> 2) * YStride;
> +            primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) &&
> (YStride % 64 == 0)](buf, outstride, frefA, YStride, frefB, YStride, 32);
>              return buf;
>          }
>          else
>          {
> -            outstride = lumaStride;
> +            outstride = YStride;
>              int hpel = (qmv.y & 2) | ((qmv.x & 2) >> 1);
> -            return lowresPlane[hpel] + blockOffset + (qmv.x >> 2) +
> (qmv.y >> 2) * lumaStride;
> +            return plane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >>
> 2) * YStride;
>          }
>      }
>
> -    inline int lowresQPelCost(pixel *fenc, intptr_t blockOffset, const
> MV& qmv, pixelcmp_t comp)
> +    inline int lowresQPelCost(pixel *fenc, intptr_t blockOffset, const
> MV& qmv, pixelcmp_t comp, bool hme)
>      {
> +        intptr_t YStride = hme ? lumaStride / 2 : lumaStride;
> +        pixel *plane[4];
> +        for (int i = 0; i < 4; i++)
> +        {
> +            plane[i] = hme ? lowerResPlane[i] : lowresPlane[i];
> +        }
>          if ((qmv.x | qmv.y) & 1)
>          {
>              ALIGN_VAR_16(pixel, subpelbuf[8 * 8]);
>              int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);
> -            pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >>
> 2) + (qmv.y >> 2) * lumaStride;
> +            pixel *frefA = plane[hpelA] + blockOffset + (qmv.x >> 2) +
> (qmv.y >> 2) * YStride;
>              int qmvx = qmv.x + (qmv.x & 1);
>              int qmvy = qmv.y + (qmv.y & 1);
>              int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
> -            pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2)
> + (qmvy >> 2) * lumaStride;
> -            primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8,
> frefA, lumaStride, frefB, lumaStride, 32);
> +            pixel *frefB = plane[hpelB] + blockOffset + (qmvx >> 2) +
> (qmvy >> 2) * YStride;
> +            primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8,
> frefA, YStride, frefB, YStride, 32);
>              return comp(fenc, FENC_STRIDE, subpelbuf, 8);
>          }
>          else
>          {
>              int hpel = (qmv.y & 2) | ((qmv.x & 2) >> 1);
> -            pixel *fref = lowresPlane[hpel] + blockOffset + (qmv.x >> 2)
> + (qmv.y >> 2) * lumaStride;
> -            return comp(fenc, FENC_STRIDE, fref, lumaStride);
> +            pixel *fref = plane[hpel] + blockOffset + (qmv.x >> 2) +
> (qmv.y >> 2) * YStride;
> +            return comp(fenc, FENC_STRIDE, fref, YStride);
>          }
>      }
>  };
> @@ -188,6 +201,8 @@
>
>      /* Hierarchical Motion Estimation */
>      bool      bEnableHME;
> +    int32_t*  lowerResMvCosts[2][X265_BFRAME_MAX + 2];
> +    MV*       lowerResMvs[2][X265_BFRAME_MAX + 2];
>
>      /* used for vbvLookahead */
>      int       plannedType[X265_LOOKAHEAD_MAX + 1];
> diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp        Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/encoder/encoder.cpp        Mon Jul 08 10:39:27 2019 +0530
> @@ -3387,6 +3387,10 @@
>              x265_log(p, X265_LOG_WARNING, "Source height < 540p is too
> low for HME. Disabling HME.\n");
>              p->bEnableHME = 0;
>          }
> +        if (m_param->bEnableHME && m_param->searchMethod !=
> m_param->hmeSearchMethod[2])
> +        {
> +            m_param->searchMethod = m_param->hmeSearchMethod[2];
> +        }
>      }
>  }
>
> diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/motion.cpp
> --- a/source/encoder/motion.cpp Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/encoder/motion.cpp Mon Jul 08 10:39:27 2019 +0530
> @@ -104,6 +104,8 @@
>      ctuAddr = -1;
>      absPartIdx = -1;
>      searchMethod = X265_HEX_SEARCH;
> +    searchMethodL0 = X265_HEX_SEARCH;
> +    searchMethodL1 = X265_HEX_SEARCH;
>      subpelRefine = 2;
>      blockwidth = blockheight = 0;
>      blockOffset = 0;
> @@ -162,7 +164,7 @@
>  }
>
>  /* Called by lookahead, luma only, no use of PicYuv */
> -void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t
> offset, int pwidth, int pheight, const int method, const int refine)
> +void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t
> offset, int pwidth, int pheight, const int method, const int searchL0,
> const int searchL1, const int refine)
>  {
>      partEnum = partitionFromSizes(pwidth, pheight);
>      X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
> @@ -179,6 +181,8 @@
>
>      /* Search params */
>      searchMethod = method;
> +    searchMethodL0 = searchL0;
> +    searchMethodL1 = searchL1;
>      subpelRefine = refine;
>
>      /* copy PU block into cache */
> @@ -743,9 +747,10 @@
>                                     pixel *          srcReferencePlane)
>  {
>      ALIGN_VAR_16(int, costs[16]);
> +    bool hme = srcReferencePlane && srcReferencePlane ==
> ref->fpelLowerResPlane[0];
>      if (ctuAddr >= 0)
>          blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) -
> ref->reconPic->getLumaAddr(0);
> -    intptr_t stride = ref->lumaStride;
> +    intptr_t stride = hme ? ref->lumaStride / 2 : ref->lumaStride;
>      pixel* fenc = fencPUYuv.m_buf[0];
>      pixel* fref = srcReferencePlane == 0 ? ref->fpelPlane[0] +
> blockOffset : srcReferencePlane + blockOffset;
>
> @@ -767,7 +772,7 @@
>      int bprecost;
>
>      if (ref->isLowres)
> -        bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad);
> +        bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad, hme);
>      else
>          bprecost = subpelCompare(ref, pmv, sad);
>
> @@ -808,7 +813,8 @@
>      pmv = pmv.roundToFPel();
>      MV omv = bmv;  // current search origin or starting point
>
> -    switch (searchMethod)
> +    int search = ref->isHMELowres ? (hme ? searchMethodL0 :
> searchMethodL1) : searchMethod;
> +    switch (search)
>      {
>      case X265_DIA_SEARCH:
>      {
> @@ -1391,11 +1397,20 @@
>      {
>          // dead slow exhaustive search, but at least it uses sad_x4()
>          MV tmv;
> -        for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y++)
> +        int32_t mvmin_y = mvmin.y, mvmin_x = mvmin.x, mvmax_y = mvmax.y,
> mvmax_x = mvmax.x;
> +        if (ref->isHMELowres)
>          {
> -            for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x++)
> +            merange = (merange < 0 ? -merange : merange);
> +            mvmin_y = X265_MAX(mvmin.y, -merange);
> +            mvmin_x = X265_MAX(mvmin.x, -merange);
> +            mvmax_y = X265_MIN(mvmax.y, merange);
> +            mvmax_x = X265_MIN(mvmax.x, merange);
> +        }
> +        for (tmv.y = mvmin_y; tmv.y <= mvmax_y; tmv.y++)
> +        {
> +            for (tmv.x = mvmin_x; tmv.x <= mvmax_x; tmv.x++)
>              {
> -                if (tmv.x + 3 <= mvmax.x)
> +                if (tmv.x + 3 <= mvmax_x)
>                  {
>                      pixel *pix_base = fref + tmv.y * stride + tmv.x;
>                      sad_x4(fenc,
> @@ -1463,12 +1478,12 @@
>              if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
>                  continue;
>
> -            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) +
> mvcost(qmv);
> +            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad,
> hme) + mvcost(qmv);
>              COPY2_IF_LT(bcost, cost, bdir, i);
>          }
>
>          bmv += square1[bdir] * 2;
> -        bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) +
> mvcost(bmv);
> +        bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd, hme) +
> mvcost(bmv);
>
>          bdir = 0;
>          for (int i = 1; i <= wl.qpel_dirs; i++)
> @@ -1479,7 +1494,7 @@
>              if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
>                  continue;
>
> -            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd)
> + mvcost(qmv);
> +            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd,
> hme) + mvcost(qmv);
>              COPY2_IF_LT(bcost, cost, bdir, i);
>          }
>
> diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/motion.h
> --- a/source/encoder/motion.h   Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/encoder/motion.h   Mon Jul 08 10:39:27 2019 +0530
> @@ -44,6 +44,8 @@
>      int absPartIdx;  // part index of PU, including CU offset within CTU
>
>      int searchMethod;
> +    int searchMethodL0;
> +    int searchMethodL1;
>      int subpelRefine;
>
>      int blockwidth;
> @@ -76,7 +78,7 @@
>
>      /* Methods called at slice setup */
>
> -    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int
> pwidth, int pheight, const int searchMethod, const int subpelRefine);
> +    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int
> pwidth, int pheight, const int searchMethod, const int searchL0, const int
> searchL1, const int subpelRefine);
>      void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx,
> int puPartIdx, int pwidth, int pheight, const int searchMethod, const int
> subpelRefine, bool bChroma);
>
>      /* buf*() and motionEstimate() methods all use cached fenc pixels and
> thus
> diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/search.cpp
> --- a/source/encoder/search.cpp Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/encoder/search.cpp Mon Jul 08 10:39:27 2019 +0530
> @@ -2096,13 +2096,16 @@
>
>      const MV* amvp = interMode.amvpCand[list][ref];
>      int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
> -    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
> +    bool bLowresMVP = false;
> +    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
>
>      if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents
> load/save outputs from diverging if lowresMV is not available */
>      {
>          MV lmv = getLowresMV(interMode.cu, pu, list, ref);
>          if (lmv.notZero())
>              mvc[numMvc++] = lmv;
> +        if (m_param->bEnableHME)
> +            mvp_lowres = lmv;
>      }
>
>      setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);
> @@ -2110,11 +2113,28 @@
>      int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref],
> mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv,
> m_param->maxSlices,
>        m_param->bSourceReferenceEstimation ?
> m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
>
> +    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
> +    {
> +        MV outmv_lowres;
> +        setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange,
> mvmin, mvmax);
> +        int lowresMvCost =
> m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres,
> numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
> +            m_param->bSourceReferenceEstimation ?
> m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
> +        if (lowresMvCost < satdCost)
> +        {
> +            outmv = outmv_lowres;
> +            satdCost = lowresMvCost;
> +            bLowresMVP = true;
> +        }
> +    }
>      /* Get total cost of partition, but only include MV bit cost once */
>      bits += m_me.bitcost(outmv);
>      uint32_t mvCost = m_me.mvcost(outmv);
>      uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
>
> +    /* Update LowresMVP to best AMVP cand*/
> +    if (bLowresMVP)
> +        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
> +
>      /* Refine MVP selection, updates: mvpIdx, bits, cost */
>      mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
>
> @@ -2346,13 +2366,16 @@
>
>                      const MV* amvp = interMode.amvpCand[list][ref];
>                      int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
> -                    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
> +                    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx],
> mvp_lowres;
> +                    bool bLowresMVP = false;
>
>                      if (!m_param->analysisSave && !m_param->analysisLoad)
> /* Prevents load/save outputs from diverging when lowresMV is not available
> */
>                      {
>                          MV lmv = getLowresMV(cu, pu, list, ref);
>                          if (lmv.notZero())
>                              mvc[numMvc++] = lmv;
> +                        if (m_param->bEnableHME)
> +                            mvp_lowres = lmv;
>                      }
>                      if (m_param->searchMethod == X265_SEA)
>                      {
> @@ -2365,10 +2388,27 @@
>                      int satdCost =
> m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc,
> mvc, m_param->searchRange, outmv, m_param->maxSlices,
>                        m_param->bSourceReferenceEstimation ?
> m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
>
> +                    if (m_param->bEnableHME && mvp_lowres.notZero() &&
> mvp_lowres != mvp)
> +                    {
> +                        MV outmv_lowres;
> +                        setSearchRange(cu, mvp_lowres,
> m_param->searchRange, mvmin, mvmax);
> +                        int lowresMvCost =
> m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres,
> numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
> +                            m_param->bSourceReferenceEstimation ?
> m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
> +                        if (lowresMvCost < satdCost)
> +                        {
> +                            outmv = outmv_lowres;
> +                            satdCost = lowresMvCost;
> +                            bLowresMVP = true;
> +                        }
> +                    }
> +
>                      /* Get total cost of partition, but only include MV
> bit cost once */
>                      bits += m_me.bitcost(outmv);
>                      uint32_t mvCost = m_me.mvcost(outmv);
>                      uint32_t cost = (satdCost - mvCost) +
> m_rdCost.getCost(bits);
> +                    /* Update LowresMVP to best AMVP cand*/
> +                    if (bLowresMVP)
> +                        updateMVP(amvp[mvpIdx], outmv, bits, cost,
> mvp_lowres);
>
>                      /* Refine MVP selection, updates: mvpIdx, bits, cost
> */
>                      mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
> @@ -2631,6 +2671,15 @@
>      return amvpCand[mvpIdx];
>  }
>
> +/* Update to default MVP when using an alternative mvp */
> +void Search::updateMVP(const MV amvp, const MV& mv, uint32_t& outBits,
> uint32_t& outCost, const MV& alterMVP)
> +{
> +    int diffBits = m_me.bitcost(mv, amvp) - m_me.bitcost(mv, alterMVP);
> +    uint32_t origOutBits = outBits;
> +    outBits = origOutBits + diffBits;
> +    outCost = (outCost - m_rdCost.getCost(origOutBits)) +
> m_rdCost.getCost(outBits);
> +}
> +
>  void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange,
> MV& mvmin, MV& mvmax) const
>  {
>      MV dist((int32_t)merange << 2, (int32_t)merange << 2);
> diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/search.h
> --- a/source/encoder/search.h   Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/encoder/search.h   Mon Jul 08 10:39:27 2019 +0530
> @@ -425,6 +425,7 @@
>      void     setSearchRange(const CUData& cu, const MV& mvp, int merange,
> MV& mvmin, MV& mvmax) const;
>      uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, const
> PredictionUnit& pu, int puIdx, MergeData& m);
>      static void getBlkBits(PartSize cuMode, bool bPSlice, int puIdx,
> uint32_t lastMode, uint32_t blockBit[3]);
> +    void      updateMVP(const MV amvp, const MV& mv, uint32_t& outBits,
> uint32_t& outCost, const MV& alterMVP);
>
>      /* intra helper functions */
>      enum { MAX_RD_INTRA_MODES = 16 };
> diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/slicetype.cpp
> --- a/source/encoder/slicetype.cpp      Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/encoder/slicetype.cpp      Mon Jul 08 10:39:27 2019 +0530
> @@ -664,6 +664,7 @@
>      weightedRef.lumaStride = fenc.lumaStride;
>      weightedRef.isLowres = true;
>      weightedRef.isWeighted = false;
> +    weightedRef.isHMELowres = ref.bEnableHME;
>
>      /* epsilon is chosen to require at least a numerator of 127 (with
> denominator = 128) */
>      float guessScale, fencMean, refMean;
> @@ -759,6 +760,8 @@
>      m_extendGopBoundary = false;
>      m_8x8Height = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1)
> >> X265_LOWRES_CU_BITS;
>      m_8x8Width = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1)
> >> X265_LOWRES_CU_BITS;
> +    m_4x4Height = ((m_param->sourceHeight / 4) + X265_LOWRES_CU_SIZE - 1)
> >> X265_LOWRES_CU_BITS;
> +    m_4x4Width = ((m_param->sourceWidth / 4) + X265_LOWRES_CU_SIZE - 1)
> >> X265_LOWRES_CU_BITS;
>      m_cuCount = m_8x8Width * m_8x8Height;
>      m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_cuCount + 4 - 2
> * (m_8x8Width + m_8x8Height)) : m_cuCount;
>      m_isFadeIn = false;
> @@ -2782,16 +2785,32 @@
>
>              X265_CHECK(i < MAX_COOP_SLICES, "impossible number of coop
> slices\n");
>
> -            int firstY = m_lookahead.m_numRowsPerSlice * i;
> -            int lastY = (i == m_jobTotal - 1) ? m_lookahead.m_8x8Height -
> 1 : m_lookahead.m_numRowsPerSlice * (i + 1) - 1;
> -
> -            bool lastRow = true;
> +            int firstY, lastY;
> +            bool lastRow;
> +            if (m_lookahead.m_param->bEnableHME)
> +            {
> +                int numRowsPerSlice = m_lookahead.m_4x4Height /
> m_lookahead.m_param->lookaheadSlices;
> +                numRowsPerSlice = X265_MIN(X265_MAX(numRowsPerSlice, 5),
> m_lookahead.m_4x4Height);
> +                firstY = numRowsPerSlice * i;
> +                lastY = (i == m_jobTotal - 1) ? m_lookahead.m_4x4Height -
> 1 : numRowsPerSlice * (i + 1) - 1;
> +                lastRow = true;
> +                for (int cuY = lastY; cuY >= firstY; cuY--)
> +                {
> +                    for (int cuX = m_lookahead.m_4x4Width - 1; cuX >= 0;
> cuX--)
> +                        estimateCUCost(tld, cuX, cuY, m_coop.p0,
> m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i, 1);
> +                    lastRow = false;
> +                }
> +            }
> +
> +            firstY = m_lookahead.m_numRowsPerSlice * i;
> +            lastY = (i == m_jobTotal - 1) ? m_lookahead.m_8x8Height - 1 :
> m_lookahead.m_numRowsPerSlice * (i + 1) - 1;
> +            lastRow = true;
>              for (int cuY = lastY; cuY >= firstY; cuY--)
>              {
>                  m_frames[m_coop.b]->rowSatds[m_coop.b -
> m_coop.p0][m_coop.p1 - m_coop.b][cuY] = 0;
>
>                  for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0;
> cuX--)
> -                    estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1,
> m_coop.b, m_coop.bDoSearch, lastRow, i);
> +                    estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1,
> m_coop.b, m_coop.bDoSearch, lastRow, i, 0);
>
>                  lastRow = false;
>              }
> @@ -2864,13 +2883,25 @@
>          }
>          else
>          {
> -            bool lastRow = true;
> +            /* Calculate MVs for 1/16th resolution*/
> +            bool lastRow;
> +            if (param->bEnableHME)
> +            {
> +                lastRow = true;
> +                for (int cuY = m_lookahead.m_4x4Height - 1; cuY >= 0;
> cuY--)
> +                {
> +                    for (int cuX = m_lookahead.m_4x4Width - 1; cuX >= 0;
> cuX--)
> +                        estimateCUCost(tld, cuX, cuY, p0, p1, b,
> bDoSearch, lastRow, -1, 1);
> +                    lastRow = false;
> +                }
> +            }
> +            lastRow = true;
>              for (int cuY = m_lookahead.m_8x8Height - 1; cuY >= 0; cuY--)
>              {
>                  fenc->rowSatds[b - p0][p1 - b][cuY] = 0;
>
>                  for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0;
> cuX--)
> -                    estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch,
> lastRow, -1);
> +                    estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch,
> lastRow, -1, 0);
>
>                  lastRow = false;
>              }
> @@ -2891,23 +2922,27 @@
>      return score;
>  }
>
> -void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int
> cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice)
> +void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int
> cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice,
> bool hme)
>  {
>      Lowres *fref0 = m_frames[p0];
>      Lowres *fref1 = m_frames[p1];
>      Lowres *fenc  = m_frames[b];
>
> -    ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted ?
> &fenc->weightedRef[b - p0] : fref0;
> -
> -    const int widthInCU = m_lookahead.m_8x8Width;
> -    const int heightInCU = m_lookahead.m_8x8Height;
> +    ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted &&
> !hme ? &fenc->weightedRef[b - p0] : fref0;
> +
> +    const int widthInCU = hme ? m_lookahead.m_4x4Width :
> m_lookahead.m_8x8Width;
> +    const int heightInCU = hme ? m_lookahead.m_4x4Height :
> m_lookahead.m_8x8Height;
>      const int bBidir = (b < p1);
>      const int cuXY = cuX + cuY * widthInCU;
> +    const int cuXY_4x4 = (cuX / 2) + (cuY / 2) * widthInCU / 2;
>      const int cuSize = X265_LOWRES_CU_SIZE;
> -    const intptr_t pelOffset = cuSize * cuX + cuSize * cuY *
> fenc->lumaStride;
> -
> -    if (bBidir || bDoSearch[0] || bDoSearch[1])
> -        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride,
> pelOffset, cuSize, cuSize, X265_HEX_SEARCH, 1);
> +    const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * (hme ?
> fenc->lumaStride/2 : fenc->lumaStride);
> +
> +    if ((bBidir || bDoSearch[0] || bDoSearch[1]) && hme)
> +        tld.me.setSourcePU(fenc->lowerResPlane[0], fenc->lumaStride / 2,
> pelOffset, cuSize, cuSize, X265_HEX_SEARCH,
> m_lookahead.m_param->hmeSearchMethod[0],
> m_lookahead.m_param->hmeSearchMethod[1], 1);
> +    else if((bBidir || bDoSearch[0] || bDoSearch[1]) && !hme)
> +        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride,
> pelOffset, cuSize, cuSize, X265_HEX_SEARCH,
> m_lookahead.m_param->hmeSearchMethod[0],
> m_lookahead.m_param->hmeSearchMethod[1], 1);
> +
>
>      /* A small, arbitrary bias to avoid VBV problems caused by
> zero-residual lookahead blocks. */
>      int lowresPenalty = 4;
> @@ -2926,7 +2961,7 @@
>
>      for (int i = 0; i < 1 + bBidir; i++)
>      {
> -        int& fencCost = fenc->lowresMvCosts[i][listDist[i]][cuXY];
> +        int& fencCost = hme ? fenc->lowerResMvCosts[i][listDist[i]][cuXY]
> : fenc->lowresMvCosts[i][listDist[i]][cuXY];
>          int skipCost = INT_MAX;
>
>          if (!bDoSearch[i])
> @@ -2936,8 +2971,8 @@
>          }
>
>          int numc = 0;
> -        MV mvc[4], mvp;
> -        MV* fencMV = &fenc->lowresMvs[i][listDist[i]][cuXY];
> +        MV mvc[5], mvp;
> +        MV* fencMV = hme ? &fenc->lowerResMvs[i][listDist[i]][cuXY] :
> &fenc->lowresMvs[i][listDist[i]][cuXY];
>          ReferencePlanes* fref = i ? fref1 : wfref0;
>
>          /* Reverse-order MV prediction */
> @@ -2952,6 +2987,10 @@
>              if (cuX < widthInCU - 1)
>                  MVC(fencMV[widthInCU + 1]);
>          }
> +        if (fenc->lowerResMvs[0][0] && !hme &&
> fenc->lowerResMvCosts[i][listDist[i]][cuXY_4x4] > 0)
> +        {
> +            MVC((fenc->lowerResMvs[i][listDist[i]][cuXY_4x4]) * 2);
> +        }
>  #undef MVC
>
>          if (!numc)
> @@ -2967,7 +3006,7 @@
>              for (int idx = 0; idx < numc; idx++)
>              {
>                  intptr_t stride = X265_LOWRES_CU_SIZE;
> -                pixel *src = fref->lowresMC(pelOffset, mvc[idx],
> subpelbuf, stride);
> +                pixel *src = fref->lowresMC(pelOffset, mvc[idx],
> subpelbuf, stride, hme);
>                  int cost = tld.me.bufSATD(src, stride);
>                  COPY2_IF_LT(mvpcost, cost, mvp, mvc[idx]);
>                  /* Except for mv0 case, everyting else is likely to have
> enough residual to not trigger the skip. */
> @@ -2978,7 +3017,10 @@
>
>          /* ME will never return a cost larger than the cost @MVP, so we
> do not
>           * have to check that ME cost is more than the estimated merge
> cost */
> -        fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0,
> NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices);
> +        if(!hme)
> +            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0,
> NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices);
> +        else
> +            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0,
> NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices,
> fref->lowerResPlane[0]);
>          if (skipCost < 64 && skipCost < fencCost && bBidir)
>          {
>              fencCost = skipCost;
> @@ -2986,6 +3028,8 @@
>          }
>          COPY2_IF_LT(bcost, fencCost, listused, i + 1);
>      }
> +    if (hme)
> +        return;
>
>      if (bBidir) /* B, also consider bidir */
>      {
> @@ -2995,8 +3039,8 @@
>          ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE *
> X265_LOWRES_CU_SIZE]);
>          ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE *
> X265_LOWRES_CU_SIZE]);
>          intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 =
> X265_LOWRES_CU_SIZE;
> -        pixel *src0 = fref0->lowresMC(pelOffset,
> fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0);
> -        pixel *src1 = fref1->lowresMC(pelOffset,
> fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1);
> +        pixel *src0 = fref0->lowresMC(pelOffset,
> fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0, 0);
> +        pixel *src1 = fref1->lowresMC(pelOffset,
> fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1, 0);
>          ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE *
> X265_LOWRES_CU_SIZE]);
>          primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref,
> X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
>          int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
> diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/slicetype.h
> --- a/source/encoder/slicetype.h        Fri Jul 05 11:17:26 2019 +0530
> +++ b/source/encoder/slicetype.h        Mon Jul 08 10:39:27 2019 +0530
> @@ -124,6 +124,10 @@
>      int           m_inputCount;
>      double        m_cuTreeStrength;
>
> +    /* HME */
> +    int           m_4x4Width;
> +    int           m_4x4Height;
> +
>      bool          m_isActive;
>      bool          m_sliceTypeBusy;
>      bool          m_bAdaptiveQuant;
> @@ -246,7 +250,7 @@
>      void    processTasks(int workerThreadID);
>
>      int64_t estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b,
> bool intraPenalty);
> -    void    estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0,
> int p1, int b, bool bDoSearch[2], bool lastRow, int slice);
> +    void    estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0,
> int p1, int b, bool bDoSearch[2], bool lastRow, int slice, bool hme);
>
>      CostEstimateGroup& operator=(const CostEstimateGroup&);
>  };
> diff -r 14a235657a20 -r 2dcff9aea06f source/encoder/weightPrediction.cpp
> --- a/source/encoder/weightPrediction.cpp       Fri Jul 05 11:17:26 2019
> +0530
> +++ b/source/encoder/weightPrediction.cpp       Mon Jul 08 10:39:27 2019
> +0530
> @@ -82,7 +82,7 @@
>              /* clip MV to available pixels */
>              MV mv = mvs[cu];
>              mv = mv.clipped(mvmin, mvmax);
> -            pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride);
> +            pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride, 0);
>              primitives.cu[BLOCK_8x8].copy_pp(mcout + pixoff, stride,
> tmp, bstride);
>          }
>      }
> _______________________________________________
> x265-devel mailing list
> [email protected]
> https://mailman.videolan.org/listinfo/x265-devel
>


-- 
Regards,
Aruna

_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel

Re: [x265] [PATCH 2 of 2] motion: Perform ME on each HME level

Reply via email to