Hi, now for review of the vp9-specific bits.
On Tue, Aug 22, 2017 at 7:59 PM, Ilia Valiakhmetov <zakne...@gmail.com> wrote: > + lflvl_len*sizeof(*s->lflvl) + 16 * > sizeof(*s->above_mv_ctx))); > lflvl_len * sizeof (add the missing spaces around '*'). > - // these will be re-allocated a little later > - av_freep(&s->b_base); > - av_freep(&s->block_base); > I don't think you can remove that. I understand that td may or may not be allocated here, so you probably need an if or so, but it's critically important that we re-allocate these upon a size change, so this behaviour probably needs to stay in. > } else { > - s->b_base = av_malloc(sizeof(VP9Block)); > - s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * > bytesperpixel * sizeof(int16_t) + > - 16 * 16 + 2 * chroma_eobs); > - if (!s->b_base || !s->block_base) > - return AVERROR(ENOMEM); > - s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel; > - s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * > bytesperpixel; > - s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * > bytesperpixel); > - s->uveob_base[0] = s->eob_base + 16 * 16; > - s->uveob_base[1] = s->uveob_base[0] + chroma_eobs; > + for (i = 1; i < s->s.h.tiling.tile_cols; i++) { > + if (s->td[i].b_base && s->td[i].block_base) { > + av_free(s->td[i].b_base); > + av_free(s->td[i].block_base); > + } > + } > + for (i = 0; i < s->s.h.tiling.tile_cols; i++) { > + s->td[i].b_base = av_malloc(sizeof(VP9Block)); > + s->td[i].block_base = av_mallocz((64 * 64 + 2 * > chroma_blocks) * bytesperpixel * sizeof(int16_t) + > + 16 * 16 + 2 * chroma_eobs); > + if (!s->td[i].b_base || !s->td[i].block_base) > + return AVERROR(ENOMEM); > + s->td[i].uvblock_base[0] = s->td[i].block_base + 64 * 64 * > bytesperpixel; > + s->td[i].uvblock_base[1] = s->td[i].uvblock_base[0] + > chroma_blocks * bytesperpixel; > + s->td[i].eob_base = (uint8_t *) (s->td[i].uvblock_base[1] + > chroma_blocks * bytesperpixel); > + s->td[i].uveob_base[0] = s->td[i].eob_base + 16 * 16; > + s->td[i].uveob_base[1] = s->td[i].uveob_base[0] + 
chroma_eobs; > + } > This makes the non-threaded decoder use a lot more memory, I'm not sure that's a good idea, it's certainly not required, since I believe the codepath of non-threaded is shared with frame-mt, so it only uses s->td[0]. Can we allocate less memory here if there's no frame-threading at all? > +static int decode_tiles(AVCodecContext *avctx) > +{ > + VP9Context *s = avctx->priv_data; > + VP9TileData *td = &s->td[0]; > + int row, col, tile_row, tile_col; > + int bytesperpixel; > + int tile_row_start, tile_row_end, tile_col_start, tile_col_end; > + AVFrame *f; > + ptrdiff_t yoff, uvoff, ls_y, ls_uv; > + > + f = s->s.frames[CUR_FRAME].tf.f; > + ls_y = f->linesize[0]; > + ls_uv =f->linesize[1]; > + bytesperpixel = s->bytesperpixel; > + > + yoff = uvoff = 0; > + for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) { > + set_tile_offset(&tile_row_start, &tile_row_end, > + tile_row, s->s.h.tiling.log2_tile_rows, > s->sb_rows); > + > + for (row = tile_row_start; row < tile_row_end; > + row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) > { > + VP9Filter *lflvl_ptr = s->lflvl; > + ptrdiff_t yoff2 = yoff, uvoff2 = uvoff; > + > + for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; > tile_col++) { > + set_tile_offset(&tile_col_start, &tile_col_end, > + tile_col, s->s.h.tiling.log2_tile_cols, > s->sb_cols); > + td->tile_col_start = tile_col_start; > + if (s->pass != 2) { > + memset(td->left_partition_ctx, 0, 8); > + memset(td->left_skip_ctx, 0, 8); > + if (s->s.h.keyframe || s->s.h.intraonly) { > + memset(td->left_mode_ctx, DC_PRED, 16); > + } else { > + memset(td->left_mode_ctx, NEARESTMV, 8); > + } > + memset(td->left_y_nnz_ctx, 0, 16); > + memset(td->left_uv_nnz_ctx, 0, 32); > + memset(td->left_segpred_ctx, 0, 8); > + > + memcpy(&td->c, &s->td[tile_col].c_b[tile_row], > sizeof(td->c)); > + } > + > + for (col = tile_col_start; > + col < tile_col_end; > + col += 8, yoff2 += 64 * bytesperpixel, > + uvoff2 += 64 * bytesperpixel >> 
s->ss_h, > lflvl_ptr++) { > + // FIXME integrate with lf code (i.e. zero after each > + // use, similar to invtxfm coefficients, or similar) > + if (s->pass != 1) { > + memset(lflvl_ptr->mask, 0, > sizeof(lflvl_ptr->mask)); > + } > + > + if (s->pass == 2) { > + decode_sb_mem(td, row, col, lflvl_ptr, > + yoff2, uvoff2, BL_64X64); > + } else { > + decode_sb(td, row, col, lflvl_ptr, > + yoff2, uvoff2, BL_64X64); > + } > + } > + if (s->pass != 2) > + memcpy(&s->td[tile_col].c_b[tile_row], &td->c, > sizeof(td->c)); > + } > + > + if (s->pass == 1) > + continue; > + > + // backup pre-loopfilter reconstruction data for intra > + // prediction of next row of sb64s > + if (row + 8 < s->rows) { > + memcpy(s->intra_pred_data[0], > + f->data[0] + yoff + 63 * ls_y, > + 8 * s->cols * bytesperpixel); > + memcpy(s->intra_pred_data[1], > + f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv, > + 8 * s->cols * bytesperpixel >> s->ss_h); > + memcpy(s->intra_pred_data[2], > + f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv, > + 8 * s->cols * bytesperpixel >> s->ss_h); > + } > + > + // loopfilter one row > + if (s->s.h.filter.level) { > + yoff2 = yoff; > + uvoff2 = uvoff; > + lflvl_ptr = s->lflvl; > + for (col = 0; col < s->cols; > + col += 8, yoff2 += 64 * bytesperpixel, > + uvoff2 += 64 * bytesperpixel >> s->ss_h, > lflvl_ptr++) { > + ff_vp9_loopfilter_sb(avctx, lflvl_ptr, row, col, > + yoff2, uvoff2); > + } > + } > + > + // FIXME maybe we can make this more finegrained by running > the > + // loopfilter per-block instead of after each sbrow > + // In fact that would also make intra pred left preparation > easier? 
> + ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> > 3, 0); > + } > + } > + return 0; > +} > + > + > +static av_always_inline > +int decode_tiles_mt(AVCodecContext *avctx, void *tdata, int jobnr, > + int threadnr) > +{ > + VP9Context *s = avctx->priv_data; > + VP9TileData *td = &s->td[jobnr]; > + ptrdiff_t uvoff, yoff, ls_y, ls_uv; > + int bytesperpixel = s->bytesperpixel, row, col, tile_row; > + unsigned tile_cols_len; > + int tile_row_start, tile_row_end, tile_col_start, tile_col_end; > + VP9Filter *lflvl_ptr_base; > + AVFrame *f; > + > + f = s->s.frames[CUR_FRAME].tf.f; > + ls_y = f->linesize[0]; > + ls_uv =f->linesize[1]; > + > + set_tile_offset(&tile_col_start, &tile_col_end, > + jobnr, s->s.h.tiling.log2_tile_cols, s->sb_cols); > + td->tile_col_start = tile_col_start; > + uvoff = (64 * bytesperpixel >> s->ss_h)*(tile_col_start >> 3); > + yoff = (64 * bytesperpixel)*(tile_col_start >> 3); > + lflvl_ptr_base = s->lflvl+(tile_col_start >> 3); > + > + for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) { > + set_tile_offset(&tile_row_start, &tile_row_end, > + tile_row, s->s.h.tiling.log2_tile_rows, > s->sb_rows); > + > + memcpy(&td->c, &td->c_b[tile_row], sizeof(td->c)); > + for (row = tile_row_start; row < tile_row_end; > + row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) > { > + ptrdiff_t yoff2 = yoff, uvoff2 = uvoff; > + VP9Filter *lflvl_ptr = lflvl_ptr_base+s->sb_cols*(row >> 3); > + > + memset(td->left_partition_ctx, 0, 8); > + memset(td->left_skip_ctx, 0, 8); > + if (s->s.h.keyframe || s->s.h.intraonly) { > + memset(td->left_mode_ctx, DC_PRED, 16); > + } else { > + memset(td->left_mode_ctx, NEARESTMV, 8); > + } > + memset(td->left_y_nnz_ctx, 0, 16); > + memset(td->left_uv_nnz_ctx, 0, 32); > + memset(td->left_segpred_ctx, 0, 8); > + > + for (col = tile_col_start; > + col < tile_col_end; > + col += 8, yoff2 += 64 * bytesperpixel, > + uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) { > + // FIXME integrate with lf 
code (i.e. zero after each > + // use, similar to invtxfm coefficients, or similar) > + memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask)); > + decode_sb(td, row, col, lflvl_ptr, > + yoff2, uvoff2, BL_64X64); > + } > + > + // backup pre-loopfilter reconstruction data for intra > + // prediction of next row of sb64s > + tile_cols_len = tile_col_end - tile_col_start; > + if (row + 8 < s->rows) { > + memcpy(s->intra_pred_data[0] + (tile_col_start * 8 * > bytesperpixel), > + f->data[0] + yoff + 63 * ls_y, > + 8 * tile_cols_len * bytesperpixel); > + memcpy(s->intra_pred_data[1] + (tile_col_start * 8 * > bytesperpixel >> s->ss_h), > + f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv, > + 8 * tile_cols_len * bytesperpixel >> s->ss_h); > + memcpy(s->intra_pred_data[2] + (tile_col_start * 8 * > bytesperpixel >> s->ss_h), > + f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv, > + 8 * tile_cols_len * bytesperpixel >> s->ss_h); > + } > + > + ff_thread_report_progress2(avctx, row >> 3, 0, 1); > + } > + } > + return 0; > +} > + > +static av_always_inline > +int loopfilter_proc(AVCodecContext *avctx) > +{ > + VP9Context *s = avctx->priv_data; > + ptrdiff_t uvoff, yoff, ls_y, ls_uv; > + VP9Filter *lflvl_ptr; > + int bytesperpixel = s->bytesperpixel, col, i; > + AVFrame *f; > + > + f = s->s.frames[CUR_FRAME].tf.f; > + ls_y = f->linesize[0]; > + ls_uv =f->linesize[1]; > + > + for (i = 0; i < s->sb_rows; i++) { > + ff_thread_await_progress3(avctx, i, 0, s->s.h.tiling.tile_cols); > + > + if (s->s.h.filter.level) { > + yoff = (ls_y * 64)*i; > + uvoff = (ls_uv * 64 >> s->ss_v)*i; > + lflvl_ptr = s->lflvl+s->sb_cols*i; > + for (col = 0; col < s->cols; > + col += 8, yoff += 64 * bytesperpixel, > + uvoff += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) { > + ff_vp9_loopfilter_sb(avctx, lflvl_ptr, i << 3, col, > + yoff, uvoff); > + } > + } > + } > return 0; > } > Hm... There is some duplicate code here... I guess it's OK. 
+ if (avctx->active_thread_type == FF_THREAD_SLICE) > + for (i = 1; i < s->s.h.tiling.tile_cols; i++) > + for (j = 0; j < sizeof(s->td[i].counts) / > sizeof(unsigned); j++) > + ((unsigned *)&s->td[0].counts)[j] += ((unsigned > *)&s->td[i].counts)[j]; Add a comment so people understand what this does; this is kind of cryptic. > +typedef struct VP9TileData { > + VP9Context *s; > Can this be marked const? > + VP56RangeCoder c_b[4]; > + VP56RangeCoder c; > Why do you need 'c', and can that be a pointer into one of the c_b entries? Ronald _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel