Re: [FFmpeg-devel] [PATCH] avcodec/hevc: reduce memory used by the SAO

Christophe Gisquet Mon, 02 Feb 2015 05:23:03 -0800

Hi,

2015-02-02 13:32 GMT+01:00 Michael Niedermayer <[email protected]>:
> On Mon, Feb 02, 2015 at 07:41:54AM +0100, Christophe Gisquet wrote:
> hmm, is there a reason not to take the original commit unchanged ?
> I was hoping to reduce the difference to openhevc so that we also
> are able to merge future changes from openhevc with few conflicts
> but maybe I am missing something


Because there are alignment requirements in our dsp, and technically,
adding another buffer (which isn't aligned) while there are already
perfectly good ones is not the best solution (memory-wise and
bookkeeping-wise).

I'd go as far as suggesting that openhevc align to this version. But
maybe they have diverged too much.

Also, I don't really like merging for the sake of merging in that case,
because such matters are difficult to follow, and I prefer something
that is more logical and makes sense.

> also it seems this does not apply cleanly

OK, I took a shortcut thinking my branch wouldn't cause such an issue.
Please try the attached patch.

-- 
Christophe

From c691861e730f941412749b764a15d66c26d9a216 Mon Sep 17 00:00:00 2001
From: Fabrice Bellard <[email protected]>
Date: Mon, 12 Jan 2015 23:35:25 +0100
Subject: [PATCH] avcodec/hevc: reduce memory used by the SAO

SAO edge filter uses pre-SAO pixel data on the left and top of the ctb, so
this data must be kept available. This was done previously by having 2
copies of the frame, one before and one after SAO.

This patch reduces the storage to just that, instead of the previous whole
frame. A slight adaptation from Fabrice's version is to match our alignment
requirements, and abuse the edge emu buffers instead of adding a new
buffer.

Decicycles: 26772->26220 (BO32),  83803->80942 (BO64)

Signed-off-by: Christophe Gisquet <[email protected]>
---
 libavcodec/hevc.c        |  43 +++++------
 libavcodec/hevc.h        |   5 +-
 libavcodec/hevc_filter.c | 192 ++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 177 insertions(+), 63 deletions(-)

diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
index f24cd8f..7699297 100644
--- a/libavcodec/hevc.c
+++ b/libavcodec/hevc.c
@@ -280,24 +280,6 @@ static int decode_lt_rps(HEVCContext *s, LongTermRPS *rps, GetBitContext *gb)
     return 0;
 }
 
-static int get_buffer_sao(HEVCContext *s, AVFrame *frame, const HEVCSPS *sps)
-{
-    int ret, i;
-
-    frame->width  = FFALIGN(s->avctx->coded_width + 2, FF_INPUT_BUFFER_PADDING_SIZE);
-    frame->height = s->avctx->coded_height + 3;
-    if ((ret = ff_get_buffer(s->avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
-        return ret;
-    for (i = 0; frame->data[i]; i++) {
-        int offset = frame->linesize[i] + FF_INPUT_BUFFER_PADDING_SIZE;
-        frame->data[i] += offset;
-    }
-    frame->width  = s->avctx->coded_width;
-    frame->height = s->avctx->coded_height;
-
-    return 0;
-}
-
 static int set_sps(HEVCContext *s, const HEVCSPS *sps)
 {
     #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL)
@@ -353,9 +335,19 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps)
     ff_videodsp_init (&s->vdsp,    sps->bit_depth);
 
     if (sps->sao_enabled && !s->avctx->hwaccel) {
-        av_frame_unref(s->tmp_frame);
-        ret = get_buffer_sao(s, s->tmp_frame, sps);
-        s->sao_frame = s->tmp_frame;
+        int c_count = (sps->chroma_format_idc != 0) ? 3 : 1;
+        int c_idx;
+
+        for(c_idx = 0; c_idx < c_count; c_idx++) {
+            int w = sps->width >> sps->hshift[c_idx];
+            int h = sps->height >> sps->vshift[c_idx];
+            s->sao_pixel_buffer_h[c_idx] =
+                av_malloc((w * 2 * sps->ctb_height) <<
+                          sps->pixel_shift);
+            s->sao_pixel_buffer_v[c_idx] =
+                av_malloc((h * 2 * sps->ctb_width) <<
+                          sps->pixel_shift);
+        }
     }
 
     s->sps = sps;
@@ -3175,7 +3167,10 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
 
     av_freep(&s->cabac_state);
 
-    av_frame_free(&s->tmp_frame);
+    for (i = 0; i < 3; i++) {
+        av_freep(&s->sao_pixel_buffer_h[i]);
+        av_freep(&s->sao_pixel_buffer_v[i]);
+    }
     av_frame_free(&s->output_frame);
 
     for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
@@ -3235,10 +3230,6 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
     if (!s->cabac_state)
         goto fail;
 
-    s->tmp_frame = av_frame_alloc();
-    if (!s->tmp_frame)
-        goto fail;
-
     s->output_frame = av_frame_alloc();
     if (!s->output_frame)
         goto fail;
diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
index 1727b60..ae9a32a 100644
--- a/libavcodec/hevc.h
+++ b/libavcodec/hevc.h
@@ -769,6 +769,7 @@ typedef struct HEVCLocalContext {
     int     end_of_tiles_y;
     /* +7 is for subpixel interpolation, *2 for high bit depths */
     DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
+    /* The extended size between the new edge emu buffer is abused by SAO */
     DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
     DECLARE_ALIGNED(16, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
 
@@ -807,9 +808,9 @@ typedef struct HEVCContext {
     uint8_t slice_initialized;
 
     AVFrame *frame;
-    AVFrame *sao_frame;
-    AVFrame *tmp_frame;
     AVFrame *output_frame;
+    uint8_t *sao_pixel_buffer_h[3];
+    uint8_t *sao_pixel_buffer_v[3];
 
     const HEVCVPS *vps;
     const HEVCSPS *sps;
diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
index 7a0ec6d..92b431b 100644
--- a/libavcodec/hevc_filter.c
+++ b/libavcodec/hevc_filter.c
@@ -161,13 +161,62 @@ int i, j;
     }
 }
 
-static void restore_tqb_pixels(HEVCContext *s, int x0, int y0, int width, int height, int c_idx)
+static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
+{
+    if (pixel_shift)
+        *(uint16_t *)dst = *(uint16_t *)src;
+    else
+        *dst = *src;
+}
+
+static void copy_vert(uint8_t *dst, const uint8_t *src,
+                      int pixel_shift, int height,
+                      int stride_dst, int stride_src)
+{
+    int i;
+    if (pixel_shift == 0) {
+        for (i = 0; i < height; i++) {
+            *dst = *src;
+            dst += stride_dst;
+            src += stride_src;
+        }
+    } else {
+        for (i = 0; i < height; i++) {
+            *(uint16_t *)dst = *(uint16_t *)src;
+            dst += stride_dst;
+            src += stride_src;
+        }
+    }
+}
+
+static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src,
+                           int stride_src, int x, int y, int width, int height,
+                           int c_idx, int x_ctb, int y_ctb)
+{
+    int sh = s->sps->pixel_shift;
+    int w = s->sps->width >> s->sps->hshift[c_idx];
+    int h = s->sps->height >> s->sps->vshift[c_idx];
+
+    /* copy horizontal edges */
+    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh),
+        src, width << sh);
+    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh),
+        src + stride_src * (height - 1), width << sh);
+
+    /* copy vertical edges */
+    copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src);
+
+    copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src);
+}
+
+static void restore_tqb_pixels(HEVCContext *s,
+                               uint8_t *src1, const uint8_t *dst1,
+                               ptrdiff_t stride_src, ptrdiff_t stride_dst,
+                               int x0, int y0, int width, int height, int c_idx)
 {
     if ( s->pps->transquant_bypass_enable_flag ||
             (s->sps->pcm.loop_filter_disable_flag && s->sps->pcm_enabled_flag)) {
         int x, y;
-        ptrdiff_t stride_dst = s->sao_frame->linesize[c_idx];
-        ptrdiff_t stride_src = s->frame->linesize[c_idx];
         int min_pu_size  = 1 << s->sps->log2_min_pu_size;
         int hshift       = s->sps->hshift[c_idx];
         int vshift       = s->sps->vshift[c_idx];
@@ -175,13 +224,13 @@ static void restore_tqb_pixels(HEVCContext *s, int x0, int y0, int width, int he
         int y_min        = ((y0         ) >> s->sps->log2_min_pu_size);
         int x_max        = ((x0 + width ) >> s->sps->log2_min_pu_size);
         int y_max        = ((y0 + height) >> s->sps->log2_min_pu_size);
-        int len          = min_pu_size >> hshift;
+        int len          = (min_pu_size >> hshift) << s->sps->pixel_shift;
         for (y = y_min; y < y_max; y++) {
             for (x = x_min; x < x_max; x++) {
                 if (s->is_pcm[y * s->sps->min_pu_width + x]) {
                     int n;
-                    uint8_t *src = &s->frame->data[c_idx][    ((y << s->sps->log2_min_pu_size) >> vshift) * stride_src + (((x << s->sps->log2_min_pu_size) >> hshift) << s->sps->pixel_shift)];
-                    uint8_t *dst = &s->sao_frame->data[c_idx][((y << s->sps->log2_min_pu_size) >> vshift) * stride_dst + (((x << s->sps->log2_min_pu_size) >> hshift) << s->sps->pixel_shift)];
+                    uint8_t *src = src1 + (((y << s->sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->sps->log2_min_pu_size) - x0) >> hshift) << s->sps->pixel_shift);
+                    const uint8_t *dst = dst1 + (((y << s->sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->sps->log2_min_pu_size) - x0) >> hshift) << s->sps->pixel_shift);
                     for (n = 0; n < (min_pu_size >> vshift); n++) {
                         memcpy(src, dst, len);
                         src += stride_src;
@@ -198,6 +247,7 @@ static void restore_tqb_pixels(HEVCContext *s, int x0, int y0, int width, int he
 static void sao_filter_CTB(HEVCContext *s, int x, int y)
 {
     static const uint8_t band_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 };
+    HEVCLocalContext *lc = s->HEVClc;
     int c_idx;
     int edges[4];  // 0 left 1 top 2 right 3 bottom
     int x_ctb                = x >> s->sps->log2_ctb_size;
@@ -258,54 +308,125 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
         int x0       = x >> s->sps->hshift[c_idx];
         int y0       = y >> s->sps->vshift[c_idx];
         int stride_src = s->frame->linesize[c_idx];
-        int stride_dst = s->sao_frame->linesize[c_idx];
         int ctb_size_h = (1 << (s->sps->log2_ctb_size)) >> s->sps->hshift[c_idx];
         int ctb_size_v = (1 << (s->sps->log2_ctb_size)) >> s->sps->vshift[c_idx];
         int width    = FFMIN(ctb_size_h, (s->sps->width  >> s->sps->hshift[c_idx]) - x0);
         int height   = FFMIN(ctb_size_v, (s->sps->height >> s->sps->vshift[c_idx]) - y0);
         int tab      = band_tab[(FFALIGN(width, 8) >> 3) - 1];
         uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->sps->pixel_shift)];
-        uint8_t *dst = &s->sao_frame->data[c_idx][y0 * stride_dst + (x0 << s->sps->pixel_shift)];
+        int stride_dst;
+        uint8_t *dst;
 
         switch (sao->type_idx[c_idx]) {
         case SAO_BAND:
+            dst = lc->edge_emu_buffer;
+            stride_dst = 2*MAX_PB_SIZE;
             copy_CTB(dst, src, width << s->sps->pixel_shift, height, stride_dst, stride_src);
+            copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
+                           x_ctb, y_ctb);
             s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
                                             sao->offset_val[c_idx], sao->band_position[c_idx],
                                             width, height);
-            restore_tqb_pixels(s, x, y, width, height, c_idx);
+            restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+                               x, y, width, height, c_idx);
             sao->type_idx[c_idx] = SAO_APPLIED;
             break;
         case SAO_EDGE:
         {
-            uint8_t left_pixels = !edges[0] && (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] != SAO_APPLIED);
-            if (!edges[1]) {
-                uint8_t top_left  = !edges[0] && (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] != SAO_APPLIED);
-                uint8_t top_right = !edges[2] && (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] != SAO_APPLIED);
-                if (CTB(s->sao, x_ctb  , y_ctb-1).type_idx[c_idx] == 0)
-                    memcpy( dst - stride_dst - (top_left << s->sps->pixel_shift),
-                            src - stride_src - (top_left << s->sps->pixel_shift),
-                            (top_left + width + top_right) << s->sps->pixel_shift);
-                else {
-                    if (top_left)
-                        memcpy( dst - stride_dst - (1 << s->sps->pixel_shift),
-                                src - stride_src - (1 << s->sps->pixel_shift),
-                                1 << s->sps->pixel_shift);
-                    if(top_right)
-                        memcpy( dst - stride_dst + (width << s->sps->pixel_shift),
-                                src - stride_src + (width << s->sps->pixel_shift),
-                                1 << s->sps->pixel_shift);
+            int w = s->sps->width >> s->sps->hshift[c_idx];
+            int h = s->sps->height >> s->sps->vshift[c_idx];
+            int left_edge = edges[0];
+            int top_edge = edges[1];
+            int right_edge = edges[2];
+            int bottom_edge = edges[3];
+            int sh = s->sps->pixel_shift;
+            int left_pixels, right_pixels;
+
+            stride_dst = 2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE;
+            dst = lc->edge_emu_buffer + stride_dst + FF_INPUT_BUFFER_PADDING_SIZE;
+
+            if (!top_edge) {
+                int left = 1 - left_edge;
+                int right = 1 - right_edge;
+                const uint8_t *src1[2];
+                uint8_t *dst1;
+                int src_idx, pos;
+
+                dst1 = dst - stride_dst - (left << sh);
+                src1[0] = src - stride_src - (left << sh);
+                src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh);
+                pos = 0;
+                if (left) {
+                    src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1, src1[src_idx], sh);
+                    pos += (1 << sh);
+                }
+                src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
+                           SAO_APPLIED);
+                memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
+                if (right) {
+                    pos += width << sh;
+                    src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
                 }
             }
-            if (!edges[3]) {                                                                // bottom and bottom right
-                uint8_t bottom_left = !edges[0] && (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] != SAO_APPLIED);
-                memcpy( dst + height * stride_dst - (bottom_left << s->sps->pixel_shift),
-                        src + height * stride_src - (bottom_left << s->sps->pixel_shift),
-                        (width + 1 + bottom_left) << s->sps->pixel_shift);
+            if (!bottom_edge) {
+                int left = 1 - left_edge;
+                int right = 1 - right_edge;
+                const uint8_t *src1[2];
+                uint8_t *dst1;
+                int src_idx, pos;
+
+                dst1 = dst + height * stride_dst - (left << sh);
+                src1[0] = src + height * stride_src - (left << sh);
+                src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh);
+                pos = 0;
+                if (left) {
+                    src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1, src1[src_idx], sh);
+                    pos += (1 << sh);
+                }
+                src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
+                           SAO_APPLIED);
+                memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
+                if (right) {
+                    pos += width << sh;
+                    src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
+                               SAO_APPLIED);
+                    copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+                }
+            }
+            left_pixels = 0;
+            if (!left_edge) {
+                if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
+                    copy_vert(dst - (1 << sh),
+                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
+                              sh, height, stride_dst, 1 << sh);
+                } else {
+                    left_pixels = 1;
+                }
             }
-            copy_CTB(dst - (left_pixels << s->sps->pixel_shift),
-                     src - (left_pixels << s->sps->pixel_shift),
-                     (width + 1 + left_pixels) << s->sps->pixel_shift, height, stride_dst, stride_src);
+            right_pixels = 0;
+            if (!right_edge) {
+                if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
+                    copy_vert(dst + (width << sh),
+                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
+                              sh, height, stride_dst, 1 << sh);
+                } else {
+                    right_pixels = 1;
+                }
+            }
+
+            copy_CTB(dst - (left_pixels << sh),
+                     src - (left_pixels << sh),
+                     (width + left_pixels + right_pixels) << sh,
+                     height, stride_dst, stride_src);
+
+            copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
+                           x_ctb, y_ctb);
             s->hevcdsp.sao_edge_filter[restore](src, dst,
                                                 stride_src, stride_dst,
                                                 sao,
@@ -314,7 +435,8 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
                                                 vert_edge,
                                                 horiz_edge,
                                                 diag_edge);
-            restore_tqb_pixels(s, x, y, width, height, c_idx);
+            restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+                               x, y, width, height, c_idx);
             sao->type_idx[c_idx] = SAO_APPLIED;
             break;
         }
-- 
1.9.2.msysgit.0

_______________________________________________
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH] avcodec/hevc: reduce memory used by the SAO

Reply via email to