Hi, the attached patch is somewhat of a hack job: the commit I started from may already have been edited from its original version, and I have added some things on top of it (e.g. the commit message).
Thus I have signed it off. I haven't tested the edge filter case because that test was messier to set up, and the band filter test showed benefits besides the memory reduction. -- Christophe
From 81f305e68ad3cc589909b00d9b2644d3e0750ade Mon Sep 17 00:00:00 2001 From: Fabrice Bellard <fabr...@bellard.org> Date: Mon, 12 Jan 2015 23:35:25 +0100 Subject: [PATCH] avcodec/hevc: reduce memory used by the SAO SAO edge filter uses pre-SAO pixel data on the left and top of the ctb, so this data must be kept available. This was done previously by having 2 copies of the frame, one before and one after SAO. This patch reduces the storage to just that, instead of the previous whole frame. A slight adaptation from Fabrice's version is to match our alignment requirements, and abuse the edge emu buffers instead of adding a new buffer. Decicycles: 26772->26220 (BO32), 83803->80942 (BO64) Signed-off-by: Christophe Gisquet <christophe.gisq...@gmail.com> --- libavcodec/hevc.c | 44 +++++------ libavcodec/hevc.h | 5 +- libavcodec/hevc_filter.c | 192 ++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 177 insertions(+), 64 deletions(-) diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c index dab0be8..7699297 100644 --- a/libavcodec/hevc.c +++ b/libavcodec/hevc.c @@ -280,24 +280,6 @@ static int decode_lt_rps(HEVCContext *s, LongTermRPS *rps, GetBitContext *gb) return 0; } -static int get_buffer_sao(HEVCContext *s, AVFrame *frame, const HEVCSPS *sps) -{ - int ret, i; - - frame->width = FFALIGN(s->avctx->coded_width + 2, FF_INPUT_BUFFER_PADDING_SIZE); - frame->height = s->avctx->coded_height + 3; - if ((ret = ff_get_buffer(s->avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0) - return ret; - for (i = 0; frame->data[i]; i++) { - int offset = frame->linesize[i] + FF_INPUT_BUFFER_PADDING_SIZE; - frame->data[i] += offset; - } - frame->width = s->avctx->coded_width; - frame->height = s->avctx->coded_height; - - return 0; -} - static int set_sps(HEVCContext *s, const HEVCSPS *sps) { #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL) @@ -353,10 +335,19 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps) ff_videodsp_init (&s->vdsp, sps->bit_depth); if (sps->sao_enabled && 
!s->avctx->hwaccel) { - av_frame_unref(s->tmp_frame); - if ((ret = get_buffer_sao(s, s->tmp_frame, sps)) < 0) - goto fail; - s->sao_frame = s->tmp_frame; + int c_count = (sps->chroma_format_idc != 0) ? 3 : 1; + int c_idx; + + for(c_idx = 0; c_idx < c_count; c_idx++) { + int w = sps->width >> sps->hshift[c_idx]; + int h = sps->height >> sps->vshift[c_idx]; + s->sao_pixel_buffer_h[c_idx] = + av_malloc((w * 2 * sps->ctb_height) << + sps->pixel_shift); + s->sao_pixel_buffer_v[c_idx] = + av_malloc((h * 2 * sps->ctb_width) << + sps->pixel_shift); + } } s->sps = sps; @@ -3176,7 +3167,10 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) av_freep(&s->cabac_state); - av_frame_free(&s->tmp_frame); + for (i = 0; i < 3; i++) { + av_freep(&s->sao_pixel_buffer_h[i]); + av_freep(&s->sao_pixel_buffer_v[i]); + } av_frame_free(&s->output_frame); for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { @@ -3236,10 +3230,6 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) if (!s->cabac_state) goto fail; - s->tmp_frame = av_frame_alloc(); - if (!s->tmp_frame) - goto fail; - s->output_frame = av_frame_alloc(); if (!s->output_frame) goto fail; diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h index 1727b60..ae9a32a 100644 --- a/libavcodec/hevc.h +++ b/libavcodec/hevc.h @@ -769,6 +769,7 @@ typedef struct HEVCLocalContext { int end_of_tiles_y; /* +7 is for subpixel interpolation, *2 for high bit depths */ DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; + /* The extended size between the new edge emu buffer is abused by SAO */ DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; DECLARE_ALIGNED(16, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]); @@ -807,9 +808,9 @@ typedef struct HEVCContext { uint8_t slice_initialized; AVFrame *frame; - AVFrame *sao_frame; - AVFrame *tmp_frame; AVFrame *output_frame; + uint8_t *sao_pixel_buffer_h[3]; + uint8_t *sao_pixel_buffer_v[3]; const HEVCVPS *vps; 
const HEVCSPS *sps; diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c index 8b4c1be..f3a0963 100644 --- a/libavcodec/hevc_filter.c +++ b/libavcodec/hevc_filter.c @@ -161,13 +161,62 @@ int i, j; } } -static void restore_tqb_pixels(HEVCContext *s, int x0, int y0, int width, int height, int c_idx) +static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) +{ + if (pixel_shift) + *(uint16_t *)dst = *(uint16_t *)src; + else + *dst = *src; +} + +static void copy_vert(uint8_t *dst, const uint8_t *src, + int pixel_shift, int height, + int stride_dst, int stride_src) +{ + int i; + if (pixel_shift == 0) { + for (i = 0; i < height; i++) { + *dst = *src; + dst += stride_dst; + src += stride_src; + } + } else { + for (i = 0; i < height; i++) { + *(uint16_t *)dst = *(uint16_t *)src; + dst += stride_dst; + src += stride_src; + } + } +} + +static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, + int stride_src, int x, int y, int width, int height, + int c_idx, int x_ctb, int y_ctb) +{ + int sh = s->sps->pixel_shift; + int w = s->sps->width >> s->sps->hshift[c_idx]; + int h = s->sps->height >> s->sps->vshift[c_idx]; + + /* copy horizontal edges */ + memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh), + src, width << sh); + memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh), + src + stride_src * (height - 1), width << sh); + + /* copy vertical edges */ + copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src); + + copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src); +} + +static void restore_tqb_pixels(HEVCContext *s, + uint8_t *src1, const uint8_t *dst1, + ptrdiff_t stride_src, ptrdiff_t stride_dst, + int x0, int y0, int width, int height, int c_idx) { if ( s->pps->transquant_bypass_enable_flag || (s->sps->pcm.loop_filter_disable_flag && s->sps->pcm_enabled_flag)) { int 
x, y; - ptrdiff_t stride_dst = s->sao_frame->linesize[c_idx]; - ptrdiff_t stride_src = s->frame->linesize[c_idx]; int min_pu_size = 1 << s->sps->log2_min_pu_size; int hshift = s->sps->hshift[c_idx]; int vshift = s->sps->vshift[c_idx]; @@ -175,13 +224,13 @@ static void restore_tqb_pixels(HEVCContext *s, int x0, int y0, int width, int he int y_min = ((y0 ) >> s->sps->log2_min_pu_size); int x_max = ((x0 + width ) >> s->sps->log2_min_pu_size); int y_max = ((y0 + height) >> s->sps->log2_min_pu_size); - int len = min_pu_size >> hshift; + int len = (min_pu_size >> hshift) << s->sps->pixel_shift; for (y = y_min; y < y_max; y++) { for (x = x_min; x < x_max; x++) { if (s->is_pcm[y * s->sps->min_pu_width + x]) { int n; - uint8_t *src = &s->frame->data[c_idx][ ((y << s->sps->log2_min_pu_size) >> vshift) * stride_src + (((x << s->sps->log2_min_pu_size) >> hshift) << s->sps->pixel_shift)]; - uint8_t *dst = &s->sao_frame->data[c_idx][((y << s->sps->log2_min_pu_size) >> vshift) * stride_dst + (((x << s->sps->log2_min_pu_size) >> hshift) << s->sps->pixel_shift)]; + uint8_t *src = src1 + (((y << s->sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->sps->log2_min_pu_size) - x0) >> hshift) << s->sps->pixel_shift); + const uint8_t *dst = dst1 + (((y << s->sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->sps->log2_min_pu_size) - x0) >> hshift) << s->sps->pixel_shift); for (n = 0; n < (min_pu_size >> vshift); n++) { memcpy(src, dst, len); src += stride_src; @@ -198,6 +247,7 @@ static void restore_tqb_pixels(HEVCContext *s, int x0, int y0, int width, int he static void sao_filter_CTB(HEVCContext *s, int x, int y) { static const uint8_t band_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; + HEVCLocalContext *lc = s->HEVClc; int c_idx; int edges[4]; // 0 left 1 top 2 right 3 bottom int x_ctb = x >> s->sps->log2_ctb_size; @@ -258,54 +308,125 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) int x0 = x >> s->sps->hshift[c_idx]; int y0 = y >> 
s->sps->vshift[c_idx]; int stride_src = s->frame->linesize[c_idx]; - int stride_dst = s->sao_frame->linesize[c_idx]; int ctb_size_h = (1 << (s->sps->log2_ctb_size)) >> s->sps->hshift[c_idx]; int ctb_size_v = (1 << (s->sps->log2_ctb_size)) >> s->sps->vshift[c_idx]; int width = FFMIN(ctb_size_h, (s->sps->width >> s->sps->hshift[c_idx]) - x0); int height = FFMIN(ctb_size_v, (s->sps->height >> s->sps->vshift[c_idx]) - y0); int tab = band_tab[(FFALIGN(width, 8) >> 3) - 1]; uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->sps->pixel_shift)]; - uint8_t *dst = &s->sao_frame->data[c_idx][y0 * stride_dst + (x0 << s->sps->pixel_shift)]; + int stride_dst; + uint8_t *dst; switch (sao->type_idx[c_idx]) { case SAO_BAND: + dst = lc->edge_emu_buffer; + stride_dst = 2*MAX_PB_SIZE; copy_CTB(dst, src, width << s->sps->pixel_shift, height, stride_dst, stride_src); + copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, + x_ctb, y_ctb); s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, sao->offset_val[c_idx], sao->band_position[c_idx], width, height); - restore_tqb_pixels(s, x, y, width, height, c_idx); + restore_tqb_pixels(s, src, dst, stride_src, stride_dst, + x, y, width, height, c_idx); sao->type_idx[c_idx] = SAO_APPLIED; break; case SAO_EDGE: { - uint8_t left_pixels = !edges[0] && (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] != SAO_APPLIED); - if (!edges[1]) { - uint8_t top_left = !edges[0] && (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] != SAO_APPLIED); - uint8_t top_right = !edges[2] && (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] != SAO_APPLIED); - if (CTB(s->sao, x_ctb , y_ctb-1).type_idx[c_idx] == 0) - memcpy( dst - stride_dst - (top_left << s->sps->pixel_shift), - src - stride_src - (top_left << s->sps->pixel_shift), - (top_left + width + top_right) << s->sps->pixel_shift); - else { - if (top_left) - memcpy( dst - stride_dst - (1 << s->sps->pixel_shift), - src - stride_src - (1 << s->sps->pixel_shift), - 1 << 
s->sps->pixel_shift); - if(top_right) - memcpy( dst - stride_dst + (width << s->sps->pixel_shift), - src - stride_src + (width << s->sps->pixel_shift), - 1 << s->sps->pixel_shift); + int w = s->sps->width >> s->sps->hshift[c_idx]; + int h = s->sps->height >> s->sps->vshift[c_idx]; + int left_edge = edges[0]; + int top_edge = edges[1]; + int right_edge = edges[2]; + int bottom_edge = edges[3]; + int sh = s->sps->pixel_shift; + int left_pixels, right_pixels; + + stride_dst = 2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE; + dst = lc->edge_emu_buffer + stride_dst + FF_INPUT_BUFFER_PADDING_SIZE; + + if (!top_edge) { + int left = 1 - left_edge; + int right = 1 - right_edge; + const uint8_t *src1[2]; + uint8_t *dst1; + int src_idx, pos; + + dst1 = dst - stride_dst - (left << sh); + src1[0] = src - stride_src - (left << sh); + src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh); + pos = 0; + if (left) { + src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); + copy_pixel(dst1, src1[src_idx], sh); + pos += (1 << sh); + } + src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); + memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); + if (right) { + pos += width << sh; + src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); + copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); } } - if (!edges[3]) { // bottom and bottom right - uint8_t bottom_left = !edges[0] && (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] != SAO_APPLIED); - memcpy( dst + height * stride_dst - (bottom_left << s->sps->pixel_shift), - src + height * stride_src - (bottom_left << s->sps->pixel_shift), - (width + 1 + bottom_left) << s->sps->pixel_shift); + if (!bottom_edge) { + int left = 1 - left_edge; + int right = 1 - right_edge; + const uint8_t *src1[2]; + uint8_t *dst1; + int src_idx, pos; + + dst1 = dst + height * stride_dst - (left << sh); + src1[0] = src + height * stride_src - (left << sh); + src1[1] = 
s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh); + pos = 0; + if (left) { + src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); + copy_pixel(dst1, src1[src_idx], sh); + pos += (1 << sh); + } + src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); + memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); + if (right) { + pos += width << sh; + src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); + copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); + } + } + left_pixels = 0; + if (!left_edge) { + if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { + copy_vert(dst - (1 << sh), + s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), + sh, height, stride_dst, 1 << sh); + } else { + left_pixels = 1; + } } - copy_CTB(dst - (left_pixels << s->sps->pixel_shift), - src - (left_pixels << s->sps->pixel_shift), - (width + 1 + left_pixels) << s->sps->pixel_shift, height, stride_dst, stride_src); + right_pixels = 0; + if (!right_edge) { + if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { + copy_vert(dst + (width << sh), + s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), + sh, height, stride_dst, 1 << sh); + } else { + right_pixels = 1; + } + } + + copy_CTB(dst - (left_pixels << sh), + src - (left_pixels << sh), + (width + left_pixels + right_pixels) << sh, + height, stride_dst, stride_src); + + copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, + x_ctb, y_ctb); s->hevcdsp.sao_edge_filter[restore](src, dst, stride_src, stride_dst, sao, @@ -314,7 +435,8 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) vert_edge, horiz_edge, diag_edge); - restore_tqb_pixels(s, x, y, width, height, c_idx); + restore_tqb_pixels(s, src, dst, stride_src, stride_dst, + x, y, width, height, c_idx); sao->type_idx[c_idx] = SAO_APPLIED; break; } -- 1.9.2.msysgit.0
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel