[FFmpeg-devel] [PATCH v3 3/3] avcodec/vvc/dsp: prefix TxType and TxSize with VVC
From: Wu Jianhua See https://patchwork.ffmpeg.org/project/ffmpeg/patch/tyspr06mb64337c4a9adf5312e6648543aa...@tyspr06mb6433.apcprd06.prod.outlook.com/#81892 Signed-off-by: Wu Jianhua --- libavcodec/vvc/dsp.h | 28 ++-- libavcodec/vvc/dsp_template.c | 2 +- libavcodec/vvc/intra.c| 26 +- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h index 0b49b97021..38ff492a23 100644 --- a/libavcodec/vvc/dsp.h +++ b/libavcodec/vvc/dsp.h @@ -27,21 +27,21 @@ #include #include -enum TxType { -DCT2, -DST7, -DCT8, -N_TX_TYPE, +enum VVCTxType { +VVC_DCT2, +VVC_DST7, +VVC_DCT8, +VVC_N_TX_TYPE, }; -enum TxSize { -TX_SIZE_2, -TX_SIZE_4, -TX_SIZE_8, -TX_SIZE_16, -TX_SIZE_32, -TX_SIZE_64, -N_TX_SIZE, +enum VVCTxSize { +VVC_TX_SIZE_2, +VVC_TX_SIZE_4, +VVC_TX_SIZE_8, +VVC_TX_SIZE_16, +VVC_TX_SIZE_32, +VVC_TX_SIZE_64, +VVC_N_TX_SIZE, }; typedef struct VVCInterDSPContext { @@ -127,7 +127,7 @@ typedef struct VVCItxDSPContext { void (*add_residual_joint)(uint8_t *dst, const int *res, int width, int height, ptrdiff_t stride, int c_sign, int shift); void (*pred_residual_joint)(int *buf, int width, int height, int c_sign, int shift); -void (*itx[N_TX_TYPE][N_TX_SIZE])(int *coeffs, ptrdiff_t step, size_t nz); +void (*itx[VVC_N_TX_TYPE][VVC_N_TX_SIZE])(int *coeffs, ptrdiff_t step, size_t nz); void (*transform_bdpcm)(int *coeffs, int width, int height, int vertical, int log2_transform_range); } VVCItxDSPContext; diff --git a/libavcodec/vvc/dsp_template.c b/libavcodec/vvc/dsp_template.c index 8130abbccf..1aa1e027bd 100644 --- a/libavcodec/vvc/dsp_template.c +++ b/libavcodec/vvc/dsp_template.c @@ -97,7 +97,7 @@ static void FUNC(transform_bdpcm)(int *coeffs, const int width, const int height static void FUNC(ff_vvc_itx_dsp_init)(VVCItxDSPContext *const itx) { #define VVC_ITX(TYPE, type, s) \ -itx->itx[TYPE][TX_SIZE_##s] = ff_vvc_inv_##type##_##s; \ +itx->itx[VVC_##TYPE][VVC_##TX_SIZE_##s] = ff_vvc_inv_##type##_##s; \ #define VVC_ITX_COMMON(TYPE, 
type) \ VVC_ITX(TYPE, type, 4); \ diff --git a/libavcodec/vvc/intra.c b/libavcodec/vvc/intra.c index f77a012f09..73dca6dc85 100644 --- a/libavcodec/vvc/intra.c +++ b/libavcodec/vvc/intra.c @@ -128,15 +128,15 @@ static void ilfnst_transform(const VVCLocalContext *lc, TransformBlock *tb) } //part of 8.7.4 Transformation process for scaled transform coefficients -static void derive_transform_type(const VVCFrameContext *fc, const VVCLocalContext *lc, const TransformBlock *tb, enum TxType *trh, enum TxType *trv) +static void derive_transform_type(const VVCFrameContext *fc, const VVCLocalContext *lc, const TransformBlock *tb, enum VVCTxType *trh, enum VVCTxType *trv) { const CodingUnit *cu = lc->cu; -static const enum TxType mts_to_trh[] = {DCT2, DST7, DCT8, DST7, DCT8}; -static const enum TxType mts_to_trv[] = {DCT2, DST7, DST7, DCT8, DCT8}; +static const enum VVCTxType mts_to_trh[] = { VVC_DCT2, VVC_DST7, VVC_DCT8, VVC_DST7, VVC_DCT8 }; +static const enum VVCTxType mts_to_trv[] = { VVC_DCT2, VVC_DST7, VVC_DST7, VVC_DCT8, VVC_DCT8 }; const VVCSPS *sps = fc->ps.sps; int implicit_mts_enabled = 0; if (tb->c_idx || (cu->isp_split_type != ISP_NO_SPLIT && cu->lfnst_idx)) { -*trh = *trv = DCT2; +*trh = *trv = VVC_DCT2; return; } @@ -152,11 +152,11 @@ static void derive_transform_type(const VVCFrameContext *fc, const VVCLocalConte const int w = tb->tb_width; const int h = tb->tb_height; if (cu->sbt_flag) { -*trh = (cu->sbt_horizontal_flag || cu->sbt_pos_flag) ? DST7 : DCT8; -*trv = (!cu->sbt_horizontal_flag || cu->sbt_pos_flag) ? DST7 : DCT8; +*trh = (cu->sbt_horizontal_flag || cu->sbt_pos_flag) ? VVC_DST7 : VVC_DCT8; +*trv = (!cu->sbt_horizontal_flag || cu->sbt_pos_flag) ? VVC_DST7 : VVC_DCT8; } else { -*trh = (w >= 4 && w <= 16) ? DST7 : DCT2; -*trv = (h >= 4 && h <= 16) ? DST7 : DCT2; +*trh = (w >= 4 && w <= 16) ? VVC_DST7 : VVC_DCT2; +*trv = (h >= 4 && h <= 16) ? 
VVC_DST7 : VVC_DCT2; } return; } @@ -447,7 +447,7 @@ static void dequant(const VVCLocalContext *lc, const TransformUnit *tu, Transfor //transmatrix[0][0] #define DCT_A 64 -static void itx_2d(const VVCFrameContext *fc, TransformBlock *tb, const enum TxType trh, const enum TxType trv) +static void itx_2d(const VVCFrameContext *fc, TransformBlock *tb, const enum VVCTxType trh, const enum VVCTxType trv) { const VVCSPS *sps = fc->ps
[FFmpeg-devel] [PATCH v3 2/3] avcodec/vvc/cabac: remove vvc_refill2
From: Wu Jianhua vvc_refill2 is identical to refill2 in cabac_functions.h. Remove it to reduce code duplication. Signed-off-by: Wu Jianhua --- libavcodec/cabac_functions.h | 2 +- libavcodec/vvc/cabac.c | 28 +--- 2 files changed, 2 insertions(+), 28 deletions(-) diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h index c3f08d3410..9bee401f2c 100644 --- a/libavcodec/cabac_functions.h +++ b/libavcodec/cabac_functions.h @@ -85,7 +85,7 @@ static inline void renorm_cabac_decoder_once(CABACContext *c){ } #endif -#ifndef get_cabac_inline +#if !defined(get_cabac_inline) || !defined(refill2) static void refill2(CABACContext *c){ int i; unsigned x; diff --git a/libavcodec/vvc/cabac.c b/libavcodec/vvc/cabac.c index 0d45eec751..c9b6f9bf3e 100644 --- a/libavcodec/vvc/cabac.c +++ b/libavcodec/vvc/cabac.c @@ -856,32 +856,6 @@ int ff_vvc_cabac_init(VVCLocalContext *lc, return ret; } -//fixme -static void vvc_refill2(CABACContext* c) { -int i; -unsigned x; -#if !HAVE_FAST_CLZ -x = c->low ^ (c->low - 1); -i = 7 - ff_h264_norm_shift[x >> (CABAC_BITS - 1)]; -#else -i = ff_ctz(c->low) - CABAC_BITS; -#endif - -x = -CABAC_MASK; - -#if CABAC_BITS == 16 -x += (c->bytestream[0] << 9) + (c->bytestream[1] << 1); -#else -x += c->bytestream[0] << 1; -#endif - -c->low += x << i; -#if !UNCHECKED_BITSTREAM_READER -if (c->bytestream < c->bytestream_end) -#endif -c->bytestream += CABAC_BITS / 8; -} - static int inline vvc_get_cabac(CABACContext *c, VVCCabacState* base, const int ctx) { VVCCabacState *s = base + ctx; @@ -904,7 +878,7 @@ static int inline vvc_get_cabac(CABACContext *c, VVCCabacState* base, const int c->low <<= lps_mask; if (!(c->low & CABAC_MASK)) -vvc_refill2(c); +refill2(c); s->state[0] = s->state[0] - (s->state[0] >> s->shift[0]) + (1023 * bit >> s->shift[0]); s->state[1] = s->state[1] - (s->state[1] >> s->shift[1]) + (16383 * bit >> s->shift[1]); return bit; -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org 
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v3 1/3] avcodec/vvc_parser: move avctx->has_b_frames initialization to dec
From: Wu Jianhua >From Jun Zhao : > Should we relocate this to the decoder? Other codecs typically set this > parameter in the decoder. Signed-off-by: Wu Jianhua --- libavcodec/vvc/dec.c| 1 + libavcodec/vvc_parser.c | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/libavcodec/vvc/dec.c b/libavcodec/vvc/dec.c index d04f68e4cf..6e225d278a 100644 --- a/libavcodec/vvc/dec.c +++ b/libavcodec/vvc/dec.c @@ -748,6 +748,7 @@ static void export_frame_params(VVCContext *s, const VVCFrameContext *fc) c->coded_height = pps->height; c->width= pps->width - ((pps->r->pps_conf_win_left_offset + pps->r->pps_conf_win_right_offset) << sps->hshift[CHROMA]); c->height = pps->height - ((pps->r->pps_conf_win_top_offset + pps->r->pps_conf_win_bottom_offset) << sps->vshift[CHROMA]); +c->has_b_frames = sps->r->sps_dpb_params.dpb_max_num_reorder_pics[sps->r->sps_max_sublayers_minus1]; } static int frame_setup(VVCFrameContext *fc, VVCContext *s) diff --git a/libavcodec/vvc_parser.c b/libavcodec/vvc_parser.c index 5373875aae..8d32d66573 100644 --- a/libavcodec/vvc_parser.c +++ b/libavcodec/vvc_parser.c @@ -185,9 +185,6 @@ static void set_parser_ctx(AVCodecParserContext *s, AVCodecContext *avctx, avctx->color_range = sps->vui.vui_full_range_flag ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG; -avctx->has_b_frames = - sps->sps_dpb_params.dpb_max_num_reorder_pics[sps->sps_max_sublayers_minus1]; - if (sps->sps_ptl_dpb_hrd_params_present_flag && sps->sps_timing_hrd_params_present_flag) { uint32_t num = sps->sps_general_timing_hrd_parameters.num_units_in_tick; -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 3/3] avcodec/vvc/dsp: prefix TxType and TxSize with VVC
From: Wu Jianhua See https://github.com/ffvvc/FFmpeg/issues/180 Signed-off-by: Wu Jianhua --- libavcodec/vvc/dsp.h | 28 ++-- libavcodec/vvc/dsp_template.c | 2 +- libavcodec/vvc/intra.c| 26 +- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h index 0b49b97021..38ff492a23 100644 --- a/libavcodec/vvc/dsp.h +++ b/libavcodec/vvc/dsp.h @@ -27,21 +27,21 @@ #include #include -enum TxType { -DCT2, -DST7, -DCT8, -N_TX_TYPE, +enum VVCTxType { +VVC_DCT2, +VVC_DST7, +VVC_DCT8, +VVC_N_TX_TYPE, }; -enum TxSize { -TX_SIZE_2, -TX_SIZE_4, -TX_SIZE_8, -TX_SIZE_16, -TX_SIZE_32, -TX_SIZE_64, -N_TX_SIZE, +enum VVCTxSize { +VVC_TX_SIZE_2, +VVC_TX_SIZE_4, +VVC_TX_SIZE_8, +VVC_TX_SIZE_16, +VVC_TX_SIZE_32, +VVC_TX_SIZE_64, +VVC_N_TX_SIZE, }; typedef struct VVCInterDSPContext { @@ -127,7 +127,7 @@ typedef struct VVCItxDSPContext { void (*add_residual_joint)(uint8_t *dst, const int *res, int width, int height, ptrdiff_t stride, int c_sign, int shift); void (*pred_residual_joint)(int *buf, int width, int height, int c_sign, int shift); -void (*itx[N_TX_TYPE][N_TX_SIZE])(int *coeffs, ptrdiff_t step, size_t nz); +void (*itx[VVC_N_TX_TYPE][VVC_N_TX_SIZE])(int *coeffs, ptrdiff_t step, size_t nz); void (*transform_bdpcm)(int *coeffs, int width, int height, int vertical, int log2_transform_range); } VVCItxDSPContext; diff --git a/libavcodec/vvc/dsp_template.c b/libavcodec/vvc/dsp_template.c index 8130abbccf..1aa1e027bd 100644 --- a/libavcodec/vvc/dsp_template.c +++ b/libavcodec/vvc/dsp_template.c @@ -97,7 +97,7 @@ static void FUNC(transform_bdpcm)(int *coeffs, const int width, const int height static void FUNC(ff_vvc_itx_dsp_init)(VVCItxDSPContext *const itx) { #define VVC_ITX(TYPE, type, s) \ -itx->itx[TYPE][TX_SIZE_##s] = ff_vvc_inv_##type##_##s; \ +itx->itx[VVC_##TYPE][VVC_##TX_SIZE_##s] = ff_vvc_inv_##type##_##s; \ #define VVC_ITX_COMMON(TYPE, type) \ VVC_ITX(TYPE, type, 4); \ diff --git a/libavcodec/vvc/intra.c b/libavcodec/vvc/intra.c 
index f77a012f09..73dca6dc85 100644 --- a/libavcodec/vvc/intra.c +++ b/libavcodec/vvc/intra.c @@ -128,15 +128,15 @@ static void ilfnst_transform(const VVCLocalContext *lc, TransformBlock *tb) } //part of 8.7.4 Transformation process for scaled transform coefficients -static void derive_transform_type(const VVCFrameContext *fc, const VVCLocalContext *lc, const TransformBlock *tb, enum TxType *trh, enum TxType *trv) +static void derive_transform_type(const VVCFrameContext *fc, const VVCLocalContext *lc, const TransformBlock *tb, enum VVCTxType *trh, enum VVCTxType *trv) { const CodingUnit *cu = lc->cu; -static const enum TxType mts_to_trh[] = {DCT2, DST7, DCT8, DST7, DCT8}; -static const enum TxType mts_to_trv[] = {DCT2, DST7, DST7, DCT8, DCT8}; +static const enum VVCTxType mts_to_trh[] = { VVC_DCT2, VVC_DST7, VVC_DCT8, VVC_DST7, VVC_DCT8 }; +static const enum VVCTxType mts_to_trv[] = { VVC_DCT2, VVC_DST7, VVC_DST7, VVC_DCT8, VVC_DCT8 }; const VVCSPS *sps = fc->ps.sps; int implicit_mts_enabled = 0; if (tb->c_idx || (cu->isp_split_type != ISP_NO_SPLIT && cu->lfnst_idx)) { -*trh = *trv = DCT2; +*trh = *trv = VVC_DCT2; return; } @@ -152,11 +152,11 @@ static void derive_transform_type(const VVCFrameContext *fc, const VVCLocalConte const int w = tb->tb_width; const int h = tb->tb_height; if (cu->sbt_flag) { -*trh = (cu->sbt_horizontal_flag || cu->sbt_pos_flag) ? DST7 : DCT8; -*trv = (!cu->sbt_horizontal_flag || cu->sbt_pos_flag) ? DST7 : DCT8; +*trh = (cu->sbt_horizontal_flag || cu->sbt_pos_flag) ? VVC_DST7 : VVC_DCT8; +*trv = (!cu->sbt_horizontal_flag || cu->sbt_pos_flag) ? VVC_DST7 : VVC_DCT8; } else { -*trh = (w >= 4 && w <= 16) ? DST7 : DCT2; -*trv = (h >= 4 && h <= 16) ? DST7 : DCT2; +*trh = (w >= 4 && w <= 16) ? VVC_DST7 : VVC_DCT2; +*trv = (h >= 4 && h <= 16) ? 
VVC_DST7 : VVC_DCT2; } return; } @@ -447,7 +447,7 @@ static void dequant(const VVCLocalContext *lc, const TransformUnit *tu, Transfor //transmatrix[0][0] #define DCT_A 64 -static void itx_2d(const VVCFrameContext *fc, TransformBlock *tb, const enum TxType trh, const enum TxType trv) +static void itx_2d(const VVCFrameContext *fc, TransformBlock *tb, const enum VVCTxType trh, const enum VVCTxType trv) { const VVCSPS *sps = fc->ps.sps; const int w = tb->tb_width; @@ -456,7 +456,7 @@ static void itx_2d(const VV
[FFmpeg-devel] [PATCH v2 2/3] avcodec/vvc/cabac: remove vvc_refill2
From: Wu Jianhua See https://github.com/ffvvc/FFmpeg/issues/178 Signed-off-by: Wu Jianhua --- libavcodec/cabac_functions.h | 2 +- libavcodec/vvc/cabac.c | 28 +--- 2 files changed, 2 insertions(+), 28 deletions(-) diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h index c3f08d3410..9bee401f2c 100644 --- a/libavcodec/cabac_functions.h +++ b/libavcodec/cabac_functions.h @@ -85,7 +85,7 @@ static inline void renorm_cabac_decoder_once(CABACContext *c){ } #endif -#ifndef get_cabac_inline +#if !defined(get_cabac_inline) || !defined(refill2) static void refill2(CABACContext *c){ int i; unsigned x; diff --git a/libavcodec/vvc/cabac.c b/libavcodec/vvc/cabac.c index 0d45eec751..c9b6f9bf3e 100644 --- a/libavcodec/vvc/cabac.c +++ b/libavcodec/vvc/cabac.c @@ -856,32 +856,6 @@ int ff_vvc_cabac_init(VVCLocalContext *lc, return ret; } -//fixme -static void vvc_refill2(CABACContext* c) { -int i; -unsigned x; -#if !HAVE_FAST_CLZ -x = c->low ^ (c->low - 1); -i = 7 - ff_h264_norm_shift[x >> (CABAC_BITS - 1)]; -#else -i = ff_ctz(c->low) - CABAC_BITS; -#endif - -x = -CABAC_MASK; - -#if CABAC_BITS == 16 -x += (c->bytestream[0] << 9) + (c->bytestream[1] << 1); -#else -x += c->bytestream[0] << 1; -#endif - -c->low += x << i; -#if !UNCHECKED_BITSTREAM_READER -if (c->bytestream < c->bytestream_end) -#endif -c->bytestream += CABAC_BITS / 8; -} - static int inline vvc_get_cabac(CABACContext *c, VVCCabacState* base, const int ctx) { VVCCabacState *s = base + ctx; @@ -904,7 +878,7 @@ static int inline vvc_get_cabac(CABACContext *c, VVCCabacState* base, const int c->low <<= lps_mask; if (!(c->low & CABAC_MASK)) -vvc_refill2(c); +refill2(c); s->state[0] = s->state[0] - (s->state[0] >> s->shift[0]) + (1023 * bit >> s->shift[0]); s->state[1] = s->state[1] - (s->state[1] >> s->shift[1]) + (16383 * bit >> s->shift[1]); return bit; -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link 
above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 1/3] avcodec/vvc_parser: move avctx->has_b_frames initialization to dec
From: Wu Jianhua >From Jun Zhao : > Should we relocate this to the decoder? Other codecs typically set this > parameter in the decoder. Signed-off-by: Wu Jianhua --- libavcodec/vvc/dec.c| 1 + libavcodec/vvc_parser.c | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/libavcodec/vvc/dec.c b/libavcodec/vvc/dec.c index d04f68e4cf..6e225d278a 100644 --- a/libavcodec/vvc/dec.c +++ b/libavcodec/vvc/dec.c @@ -748,6 +748,7 @@ static void export_frame_params(VVCContext *s, const VVCFrameContext *fc) c->coded_height = pps->height; c->width= pps->width - ((pps->r->pps_conf_win_left_offset + pps->r->pps_conf_win_right_offset) << sps->hshift[CHROMA]); c->height = pps->height - ((pps->r->pps_conf_win_top_offset + pps->r->pps_conf_win_bottom_offset) << sps->vshift[CHROMA]); +c->has_b_frames = sps->r->sps_dpb_params.dpb_max_num_reorder_pics[sps->r->sps_max_sublayers_minus1]; } static int frame_setup(VVCFrameContext *fc, VVCContext *s) diff --git a/libavcodec/vvc_parser.c b/libavcodec/vvc_parser.c index 5373875aae..8d32d66573 100644 --- a/libavcodec/vvc_parser.c +++ b/libavcodec/vvc_parser.c @@ -185,9 +185,6 @@ static void set_parser_ctx(AVCodecParserContext *s, AVCodecContext *avctx, avctx->color_range = sps->vui.vui_full_range_flag ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG; -avctx->has_b_frames = - sps->sps_dpb_params.dpb_max_num_reorder_pics[sps->sps_max_sublayers_minus1]; - if (sps->sps_ptl_dpb_hrd_params_present_flag && sps->sps_timing_hrd_params_present_flag) { uint32_t num = sps->sps_general_timing_hrd_parameters.num_units_in_tick; -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 3/3] avcodec/vvc/dsp: prefix TxType and TxSize with VVC
From: Wu Jianhua See https://github.com/ffvvc/FFmpeg/issues/180 Signed-off-by: Wu Jianhua --- libavcodec/vvc/dsp.h | 28 ++-- libavcodec/vvc/dsp_template.c | 2 +- libavcodec/vvc/intra.c| 26 +- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h index 1f14096c41..662092fafc 100644 --- a/libavcodec/vvc/dsp.h +++ b/libavcodec/vvc/dsp.h @@ -27,21 +27,21 @@ #include #include -enum TxType { -DCT2, -DST7, -DCT8, -N_TX_TYPE, +enum VVCTxType { +VVC_DCT2, +VVC_DST7, +VVC_DCT8, +VVC_N_TX_TYPE, }; -enum TxSize { -TX_SIZE_2, -TX_SIZE_4, -TX_SIZE_8, -TX_SIZE_16, -TX_SIZE_32, -TX_SIZE_64, -N_TX_SIZE, +enum VVCTxSize { +VVC_TX_SIZE_2, +VVC_TX_SIZE_4, +VVC_TX_SIZE_8, +VVC_TX_SIZE_16, +VVC_TX_SIZE_32, +VVC_TX_SIZE_64, +VVC_N_TX_SIZE, }; typedef struct VVCInterDSPContext { @@ -127,7 +127,7 @@ typedef struct VVCItxDSPContext { void (*add_residual_joint)(uint8_t *dst, const int *res, int width, int height, ptrdiff_t stride, int c_sign, int shift); void (*pred_residual_joint)(int *buf, int width, int height, int c_sign, int shift); -void (*itx[N_TX_TYPE][N_TX_SIZE])(int *coeffs, ptrdiff_t step, size_t nz); +void (*itx[VVC_N_TX_TYPE][VVC_N_TX_SIZE])(int *coeffs, ptrdiff_t step, size_t nz); void (*transform_bdpcm)(int *coeffs, int width, int height, int vertical, int log2_transform_range); } VVCItxDSPContext; diff --git a/libavcodec/vvc/dsp_template.c b/libavcodec/vvc/dsp_template.c index 8130abbccf..1aa1e027bd 100644 --- a/libavcodec/vvc/dsp_template.c +++ b/libavcodec/vvc/dsp_template.c @@ -97,7 +97,7 @@ static void FUNC(transform_bdpcm)(int *coeffs, const int width, const int height static void FUNC(ff_vvc_itx_dsp_init)(VVCItxDSPContext *const itx) { #define VVC_ITX(TYPE, type, s) \ -itx->itx[TYPE][TX_SIZE_##s] = ff_vvc_inv_##type##_##s; \ +itx->itx[VVC_##TYPE][VVC_##TX_SIZE_##s] = ff_vvc_inv_##type##_##s; \ #define VVC_ITX_COMMON(TYPE, type) \ VVC_ITX(TYPE, type, 4); \ diff --git a/libavcodec/vvc/intra.c b/libavcodec/vvc/intra.c 
index f77a012f09..73dca6dc85 100644 --- a/libavcodec/vvc/intra.c +++ b/libavcodec/vvc/intra.c @@ -128,15 +128,15 @@ static void ilfnst_transform(const VVCLocalContext *lc, TransformBlock *tb) } //part of 8.7.4 Transformation process for scaled transform coefficients -static void derive_transform_type(const VVCFrameContext *fc, const VVCLocalContext *lc, const TransformBlock *tb, enum TxType *trh, enum TxType *trv) +static void derive_transform_type(const VVCFrameContext *fc, const VVCLocalContext *lc, const TransformBlock *tb, enum VVCTxType *trh, enum VVCTxType *trv) { const CodingUnit *cu = lc->cu; -static const enum TxType mts_to_trh[] = {DCT2, DST7, DCT8, DST7, DCT8}; -static const enum TxType mts_to_trv[] = {DCT2, DST7, DST7, DCT8, DCT8}; +static const enum VVCTxType mts_to_trh[] = { VVC_DCT2, VVC_DST7, VVC_DCT8, VVC_DST7, VVC_DCT8 }; +static const enum VVCTxType mts_to_trv[] = { VVC_DCT2, VVC_DST7, VVC_DST7, VVC_DCT8, VVC_DCT8 }; const VVCSPS *sps = fc->ps.sps; int implicit_mts_enabled = 0; if (tb->c_idx || (cu->isp_split_type != ISP_NO_SPLIT && cu->lfnst_idx)) { -*trh = *trv = DCT2; +*trh = *trv = VVC_DCT2; return; } @@ -152,11 +152,11 @@ static void derive_transform_type(const VVCFrameContext *fc, const VVCLocalConte const int w = tb->tb_width; const int h = tb->tb_height; if (cu->sbt_flag) { -*trh = (cu->sbt_horizontal_flag || cu->sbt_pos_flag) ? DST7 : DCT8; -*trv = (!cu->sbt_horizontal_flag || cu->sbt_pos_flag) ? DST7 : DCT8; +*trh = (cu->sbt_horizontal_flag || cu->sbt_pos_flag) ? VVC_DST7 : VVC_DCT8; +*trv = (!cu->sbt_horizontal_flag || cu->sbt_pos_flag) ? VVC_DST7 : VVC_DCT8; } else { -*trh = (w >= 4 && w <= 16) ? DST7 : DCT2; -*trv = (h >= 4 && h <= 16) ? DST7 : DCT2; +*trh = (w >= 4 && w <= 16) ? VVC_DST7 : VVC_DCT2; +*trv = (h >= 4 && h <= 16) ? 
VVC_DST7 : VVC_DCT2; } return; } @@ -447,7 +447,7 @@ static void dequant(const VVCLocalContext *lc, const TransformUnit *tu, Transfor //transmatrix[0][0] #define DCT_A 64 -static void itx_2d(const VVCFrameContext *fc, TransformBlock *tb, const enum TxType trh, const enum TxType trv) +static void itx_2d(const VVCFrameContext *fc, TransformBlock *tb, const enum VVCTxType trh, const enum VVCTxType trv) { const VVCSPS *sps = fc->ps.sps; const int w = tb->tb_width; @@ -456,7 +456,7 @@ static void itx_2d(const VV
[FFmpeg-devel] [PATCH 2/3] avcodec/vvc/cabac: remove vvc_refill2
From: Wu Jianhua See https://github.com/ffvvc/FFmpeg/issues/178 Signed-off-by: Wu Jianhua --- libavcodec/vvc/cabac.c | 28 +--- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/libavcodec/vvc/cabac.c b/libavcodec/vvc/cabac.c index 0d45eec751..c9b6f9bf3e 100644 --- a/libavcodec/vvc/cabac.c +++ b/libavcodec/vvc/cabac.c @@ -856,32 +856,6 @@ int ff_vvc_cabac_init(VVCLocalContext *lc, return ret; } -//fixme -static void vvc_refill2(CABACContext* c) { -int i; -unsigned x; -#if !HAVE_FAST_CLZ -x = c->low ^ (c->low - 1); -i = 7 - ff_h264_norm_shift[x >> (CABAC_BITS - 1)]; -#else -i = ff_ctz(c->low) - CABAC_BITS; -#endif - -x = -CABAC_MASK; - -#if CABAC_BITS == 16 -x += (c->bytestream[0] << 9) + (c->bytestream[1] << 1); -#else -x += c->bytestream[0] << 1; -#endif - -c->low += x << i; -#if !UNCHECKED_BITSTREAM_READER -if (c->bytestream < c->bytestream_end) -#endif -c->bytestream += CABAC_BITS / 8; -} - static int inline vvc_get_cabac(CABACContext *c, VVCCabacState* base, const int ctx) { VVCCabacState *s = base + ctx; @@ -904,7 +878,7 @@ static int inline vvc_get_cabac(CABACContext *c, VVCCabacState* base, const int c->low <<= lps_mask; if (!(c->low & CABAC_MASK)) -vvc_refill2(c); +refill2(c); s->state[0] = s->state[0] - (s->state[0] >> s->shift[0]) + (1023 * bit >> s->shift[0]); s->state[1] = s->state[1] - (s->state[1] >> s->shift[1]) + (16383 * bit >> s->shift[1]); return bit; -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/3] avcodec/vvc_parser: move avctx->has_b_frames initialization to dec
From: Wu Jianhua >From Jun Zhao : > Should we relocate this to the decoder? Other codecs typically set this > parameter in the decoder. Signed-off-by: Wu Jianhua --- libavcodec/vvc/dec.c| 1 + libavcodec/vvc_parser.c | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/libavcodec/vvc/dec.c b/libavcodec/vvc/dec.c index d04f68e4cf..6e225d278a 100644 --- a/libavcodec/vvc/dec.c +++ b/libavcodec/vvc/dec.c @@ -748,6 +748,7 @@ static void export_frame_params(VVCContext *s, const VVCFrameContext *fc) c->coded_height = pps->height; c->width= pps->width - ((pps->r->pps_conf_win_left_offset + pps->r->pps_conf_win_right_offset) << sps->hshift[CHROMA]); c->height = pps->height - ((pps->r->pps_conf_win_top_offset + pps->r->pps_conf_win_bottom_offset) << sps->vshift[CHROMA]); +c->has_b_frames = sps->r->sps_dpb_params.dpb_max_num_reorder_pics[sps->r->sps_max_sublayers_minus1]; } static int frame_setup(VVCFrameContext *fc, VVCContext *s) diff --git a/libavcodec/vvc_parser.c b/libavcodec/vvc_parser.c index 5373875aae..8d32d66573 100644 --- a/libavcodec/vvc_parser.c +++ b/libavcodec/vvc_parser.c @@ -185,9 +185,6 @@ static void set_parser_ctx(AVCodecParserContext *s, AVCodecContext *avctx, avctx->color_range = sps->vui.vui_full_range_flag ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG; -avctx->has_b_frames = - sps->sps_dpb_params.dpb_max_num_reorder_pics[sps->sps_max_sublayers_minus1]; - if (sps->sps_ptl_dpb_hrd_params_present_flag && sps->sps_timing_hrd_params_present_flag) { uint32_t num = sps->sps_general_timing_hrd_parameters.num_units_in_tick; -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 3/3] tests/checkasm/vvc_alf: change alf step size to 8
From: Wu Jianhua >From Benjamin Bross: > for ALF where functions are in increments of 4 while 8 should be sufficient > according to the spec. Signed-off-by: Wu Jianhua --- tests/checkasm/vvc_alf.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/checkasm/vvc_alf.c b/tests/checkasm/vvc_alf.c index f35fd2cd3e..84b0f9da15 100644 --- a/tests/checkasm/vvc_alf.c +++ b/tests/checkasm/vvc_alf.c @@ -90,8 +90,8 @@ static void check_alf_filter(VVCDSPContext *c, const int bit_depth) randomize_buffers2(filter, LUMA_PARAMS_SIZE, 1); randomize_buffers2(clip, LUMA_PARAMS_SIZE, 0); -for (int h = 4; h <= MAX_CTU_SIZE; h += 4) { -for (int w = 4; w <= MAX_CTU_SIZE; w += 4) { +for (int h = 4; h <= MAX_CTU_SIZE; h += 8) { +for (int w = 4; w <= MAX_CTU_SIZE; w += 8) { const int ctu_size = MAX_CTU_SIZE; if (check_func(c->alf.filter[LUMA], "vvc_alf_filter_luma_%dx%d_%d", w, h, bit_depth)) { const int vb_pos = ctu_size - ALF_VB_POS_ABOVE_LUMA; @@ -142,8 +142,8 @@ static void check_alf_classify(VVCDSPContext *c, const int bit_depth) randomize_buffers(src0, src1, SRC_BUF_SIZE); -for (int h = 4; h <= MAX_CTU_SIZE; h += 4) { -for (int w = 4; w <= MAX_CTU_SIZE; w += 4) { +for (int h = 4; h <= MAX_CTU_SIZE; h += 8) { +for (int w = 4; w <= MAX_CTU_SIZE; w += 8) { const int id_size = w * h / ALF_BLOCK_SIZE / ALF_BLOCK_SIZE * sizeof(int); const int vb_pos = MAX_CTU_SIZE - ALF_BLOCK_SIZE; if (check_func(c->alf.classify, "vvc_alf_classify_%dx%d_%d", w, h, bit_depth)) { -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 2/3] avcodec/x86/vvc/vvc_alf: use xq to match ptrdiff_t
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/x86/vvc/vvc_alf.asm | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libavcodec/x86/vvc/vvc_alf.asm b/libavcodec/x86/vvc/vvc_alf.asm index f7b3e2a6cc..b35dd9b0e9 100644 --- a/libavcodec/x86/vvc/vvc_alf.asm +++ b/libavcodec/x86/vvc/vvc_alf.asm @@ -409,7 +409,7 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x28, dst, dst_stride, src, src_s .loop: pushsrcq pushdstq -xor xd, xd +xor xq, xq .loop_w: LOAD_PARAMS @@ -417,8 +417,8 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x28, dst, dst_stride, src, src_s add srcq, 16 * ps add dstq, 16 * ps -add xd, 16 -cmp xd, widthd +add xq, 16 +cmp xq, widthq jl .loop_w pop dstq @@ -427,7 +427,7 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x28, dst, dst_stride, src, src_s lea dstq, [dstq + 4 * dst_strideq] lea filterq, [filterq + 2 * strideq] -leaclipq, [clipq + 2 * strideq] +leaclipq, [clipq + 2 * strideq] sub vb_posq, 4 sub heightq, 4 -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 1/3] avcodec/x86/vvc/vvc_alf: fix integer overflow
From: Wu Jianhua Some tests fail with certain seeds tests/checkasm/checkasm 2325607578 --test=vvc_alf checkasm: using random seed 2325607578 AVX2: vvc_alf_filter_luma_120x20_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x24_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x28_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x32_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x36_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x40_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x44_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x48_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x52_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x56_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x60_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x64_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x68_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x72_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x76_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x80_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x84_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x88_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x92_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x96_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x100_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x104_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x108_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x112_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x116_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x120_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x124_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x128_12_avx2 (vvc_alf.c:104) - vvc_alf.alf_filter [FAILED] - vvc_alf.alf_classify [OK] checkasm: 28 of 9216 tests have failed Reported-by: James Almer Signed-off-by: Wu Jianhua --- libavcodec/x86/vvc/vvc_alf.asm | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libavcodec/x86/vvc/vvc_alf.asm b/libavcodec/x86/vvc/vvc_alf.asm index 71e821c27b..f7b3e2a6cc 100644 --- a/libavcodec/x86/vvc/vvc_alf.asm +++ 
b/libavcodec/x86/vvc/vvc_alf.asm @@ -356,7 +356,8 @@ SECTION .text FILTER_VB xq -paddw m0, m2 +; sum += curr +paddsw m0, m2 ; clip to pixel CLIPW m0, m14, m15 -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 3/3] tests/checkasm/vvc_alf: change alf step size to 8
From: Wu Jianhua >From Benjamin Bross: > for ALF where functions are in increments of 4 while 8 should be sufficient > according to the spec. Signed-off-by: Wu Jianhua --- tests/checkasm/vvc_alf.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/checkasm/vvc_alf.c b/tests/checkasm/vvc_alf.c index f35fd2cd3e..84b0f9da15 100644 --- a/tests/checkasm/vvc_alf.c +++ b/tests/checkasm/vvc_alf.c @@ -90,8 +90,8 @@ static void check_alf_filter(VVCDSPContext *c, const int bit_depth) randomize_buffers2(filter, LUMA_PARAMS_SIZE, 1); randomize_buffers2(clip, LUMA_PARAMS_SIZE, 0); -for (int h = 4; h <= MAX_CTU_SIZE; h += 4) { -for (int w = 4; w <= MAX_CTU_SIZE; w += 4) { +for (int h = 4; h <= MAX_CTU_SIZE; h += 8) { +for (int w = 4; w <= MAX_CTU_SIZE; w += 8) { const int ctu_size = MAX_CTU_SIZE; if (check_func(c->alf.filter[LUMA], "vvc_alf_filter_luma_%dx%d_%d", w, h, bit_depth)) { const int vb_pos = ctu_size - ALF_VB_POS_ABOVE_LUMA; @@ -142,8 +142,8 @@ static void check_alf_classify(VVCDSPContext *c, const int bit_depth) randomize_buffers(src0, src1, SRC_BUF_SIZE); -for (int h = 4; h <= MAX_CTU_SIZE; h += 4) { -for (int w = 4; w <= MAX_CTU_SIZE; w += 4) { +for (int h = 4; h <= MAX_CTU_SIZE; h += 8) { +for (int w = 4; w <= MAX_CTU_SIZE; w += 8) { const int id_size = w * h / ALF_BLOCK_SIZE / ALF_BLOCK_SIZE * sizeof(int); const int vb_pos = MAX_CTU_SIZE - ALF_BLOCK_SIZE; if (check_func(c->alf.classify, "vvc_alf_classify_%dx%d_%d", w, h, bit_depth)) { -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/3] avcodec/x86/vvc/vvc_alf: use xq to match ptrdiff_t
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/x86/vvc/vvc_alf.asm | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libavcodec/x86/vvc/vvc_alf.asm b/libavcodec/x86/vvc/vvc_alf.asm index 91f158bac9..8bb698955c 100644 --- a/libavcodec/x86/vvc/vvc_alf.asm +++ b/libavcodec/x86/vvc/vvc_alf.asm @@ -421,7 +421,7 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x28, dst, dst_stride, src, src_s .loop: pushsrcq pushdstq -xor xd, xd +xor xq, xq .loop_w: LOAD_PARAMS @@ -429,8 +429,8 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x28, dst, dst_stride, src, src_s add srcq, 16 * ps add dstq, 16 * ps -add xd, 16 -cmp xd, widthd +add xq, 16 +cmp xq, widthq jl .loop_w pop dstq @@ -439,7 +439,7 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x28, dst, dst_stride, src, src_s lea dstq, [dstq + 4 * dst_strideq] lea filterq, [filterq + 2 * strideq] -leaclipq, [clipq + 2 * strideq] +leaclipq, [clipq + 2 * strideq] sub vb_posq, 4 sub heightq, 4 -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/3] avcodec/x86/vvc/vvc_alf: fix integer overflow
From: Wu Jianhua Some tests fail with certain seeds tests/checkasm/checkasm 2325607578 --test=vvc_alf checkasm: using random seed 2325607578 AVX2: vvc_alf_filter_luma_120x20_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x24_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x28_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x32_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x36_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x40_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x44_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x48_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x52_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x56_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x60_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x64_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x68_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x72_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x76_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x80_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x84_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x88_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x92_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x96_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x100_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x104_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x108_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x112_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x116_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x120_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x124_12_avx2 (vvc_alf.c:104) vvc_alf_filter_luma_120x128_12_avx2 (vvc_alf.c:104) - vvc_alf.alf_filter [FAILED] - vvc_alf.alf_classify [OK] checkasm: 28 of 9216 tests have failed Reported-by: James Almer Signed-off-by: Wu Jianhua --- libavcodec/x86/vvc/vvc_alf.asm | 13 + 1 file changed, 13 insertions(+) diff --git a/libavcodec/x86/vvc/vvc_alf.asm b/libavcodec/x86/vvc/vvc_alf.asm index 71e821c27b..91f158bac9 100644 --- a/libavcodec/x86/vvc/vvc_alf.asm +++ b/libavcodec/x86/vvc/vvc_alf.asm @@ 
-278,7 +278,9 @@ SECTION .text psrad m0, SHIFT + 3 psrad m1, SHIFT + 3 %%shift_end: +%if ps == 1 packssdw m0, m0, m1 +%endif %endmacro ; FILTER_VB(line) @@ -356,7 +358,18 @@ SECTION .text FILTER_VB xq +; sum += curr +%if ps == 1 paddw m0, m2 +%else +vpunpcklqdq m11, m2, m2 +vpunpckhqdq m12, m2, m2 +vpunpcklwd m11, m11, m14 +vpunpcklwd m12, m12, m14 +paddd m0, m11 +paddd m1, m12 +packssdw m0, m0, m1 +%endif ; clip to pixel CLIPW m0, m14, m15 -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v3 1/4] avcodec/x86/vvc: add alf filter luma and chroma avx2 optimizations
From: Wu Jianhua ff_vvc_alf_filter_luma_4x4_10_c: 135 ff_vvc_alf_filter_luma_4x4_10_avx2: 54 ff_vvc_alf_filter_luma_4x8_10_c: 268 ff_vvc_alf_filter_luma_4x8_10_avx2: 106 ff_vvc_alf_filter_luma_4x12_10_c: 400 ff_vvc_alf_filter_luma_4x12_10_avx2: 160 ff_vvc_alf_filter_luma_4x16_10_c: 535 ff_vvc_alf_filter_luma_4x16_10_avx2: 213 ff_vvc_alf_filter_luma_4x20_10_c: 646 ff_vvc_alf_filter_luma_4x20_10_avx2: 262 ff_vvc_alf_filter_luma_4x24_10_c: 783 ff_vvc_alf_filter_luma_4x24_10_avx2: 309 ff_vvc_alf_filter_luma_4x28_10_c: 908 ff_vvc_alf_filter_luma_4x28_10_avx2: 361 ff_vvc_alf_filter_luma_4x32_10_c: 1039 ff_vvc_alf_filter_luma_4x32_10_avx2: 412 ff_vvc_alf_filter_luma_8x4_10_c: 260 ff_vvc_alf_filter_luma_8x4_10_avx2: 53 ff_vvc_alf_filter_luma_8x8_10_c: 516 ff_vvc_alf_filter_luma_8x8_10_avx2: 105 ff_vvc_alf_filter_luma_8x12_10_c: 779 ff_vvc_alf_filter_luma_8x12_10_avx2: 157 ff_vvc_alf_filter_luma_8x16_10_c: 1038 ff_vvc_alf_filter_luma_8x16_10_avx2: 210 ff_vvc_alf_filter_luma_8x20_10_c: 1293 ff_vvc_alf_filter_luma_8x20_10_avx2: 259 ff_vvc_alf_filter_luma_8x24_10_c: 1553 ff_vvc_alf_filter_luma_8x24_10_avx2: 309 ff_vvc_alf_filter_luma_8x28_10_c: 1815 ff_vvc_alf_filter_luma_8x28_10_avx2: 361 ff_vvc_alf_filter_luma_8x32_10_c: 2067 ff_vvc_alf_filter_luma_8x32_10_avx2: 419 ff_vvc_alf_filter_luma_12x4_10_c: 390 ff_vvc_alf_filter_luma_12x4_10_avx2: 54 ff_vvc_alf_filter_luma_12x8_10_c: 773 ff_vvc_alf_filter_luma_12x8_10_avx2: 107 ff_vvc_alf_filter_luma_12x12_10_c: 1159 ff_vvc_alf_filter_luma_12x12_10_avx2: 155 ff_vvc_alf_filter_luma_12x16_10_c: 1550 ff_vvc_alf_filter_luma_12x16_10_avx2: 207 ff_vvc_alf_filter_luma_12x20_10_c: 1970 ff_vvc_alf_filter_luma_12x20_10_avx2: 260 ff_vvc_alf_filter_luma_12x24_10_c: 2379 ff_vvc_alf_filter_luma_12x24_10_avx2: 309 ff_vvc_alf_filter_luma_12x28_10_c: 2763 ff_vvc_alf_filter_luma_12x28_10_avx2: 362 ff_vvc_alf_filter_luma_12x32_10_c: 3158 ff_vvc_alf_filter_luma_12x32_10_avx2: 419 ff_vvc_alf_filter_luma_16x4_10_c: 523 
ff_vvc_alf_filter_luma_16x4_10_avx2: 53 ff_vvc_alf_filter_luma_16x8_10_c: 1049 ff_vvc_alf_filter_luma_16x8_10_avx2: 103 ff_vvc_alf_filter_luma_16x12_10_c: 1566 ff_vvc_alf_filter_luma_16x12_10_avx2: 159 ff_vvc_alf_filter_luma_16x16_10_c: 2078 ff_vvc_alf_filter_luma_16x16_10_avx2: 211 ff_vvc_alf_filter_luma_16x20_10_c: 2631 ff_vvc_alf_filter_luma_16x20_10_avx2: 259 ff_vvc_alf_filter_luma_16x24_10_c: 3149 ff_vvc_alf_filter_luma_16x24_10_avx2: 316 ff_vvc_alf_filter_luma_16x28_10_c: 3631 ff_vvc_alf_filter_luma_16x28_10_avx2: 359 ff_vvc_alf_filter_luma_16x32_10_c: 4233 ff_vvc_alf_filter_luma_16x32_10_avx2: 428 ff_vvc_alf_filter_luma_20x4_10_c: 649 ff_vvc_alf_filter_luma_20x4_10_avx2: 106 ff_vvc_alf_filter_luma_20x8_10_c: 1294 ff_vvc_alf_filter_luma_20x8_10_avx2: 206 ff_vvc_alf_filter_luma_20x12_10_c: 1936 ff_vvc_alf_filter_luma_20x12_10_avx2: 310 ff_vvc_alf_filter_luma_20x16_10_c: 2594 ff_vvc_alf_filter_luma_20x16_10_avx2: 411 ff_vvc_alf_filter_luma_20x20_10_c: 3234 ff_vvc_alf_filter_luma_20x20_10_avx2: 517 ff_vvc_alf_filter_luma_20x24_10_c: 3894 ff_vvc_alf_filter_luma_20x24_10_avx2: 621 ff_vvc_alf_filter_luma_20x28_10_c: 4542 ff_vvc_alf_filter_luma_20x28_10_avx2: 722 ff_vvc_alf_filter_luma_20x32_10_c: 5205 ff_vvc_alf_filter_luma_20x32_10_avx2: 832 ff_vvc_alf_filter_luma_24x4_10_c: 774 ff_vvc_alf_filter_luma_24x4_10_avx2: 104 ff_vvc_alf_filter_luma_24x8_10_c: 1546 ff_vvc_alf_filter_luma_24x8_10_avx2: 206 ff_vvc_alf_filter_luma_24x12_10_c: 2318 ff_vvc_alf_filter_luma_24x12_10_avx2: 312 ff_vvc_alf_filter_luma_24x16_10_c: 3104 ff_vvc_alf_filter_luma_24x16_10_avx2: 411 ff_vvc_alf_filter_luma_24x20_10_c: 3893 ff_vvc_alf_filter_luma_24x20_10_avx2: 513 ff_vvc_alf_filter_luma_24x24_10_c: 4681 ff_vvc_alf_filter_luma_24x24_10_avx2: 616 ff_vvc_alf_filter_luma_24x28_10_c: 5474 ff_vvc_alf_filter_luma_24x28_10_avx2: 721 ff_vvc_alf_filter_luma_24x32_10_c: 6271 ff_vvc_alf_filter_luma_24x32_10_avx2: 832 ff_vvc_alf_filter_luma_28x4_10_c: 907 ff_vvc_alf_filter_luma_28x4_10_avx2: 103 
ff_vvc_alf_filter_luma_28x8_10_c: 1797 ff_vvc_alf_filter_luma_28x8_10_avx2: 206 ff_vvc_alf_filter_luma_28x12_10_c: 2708 ff_vvc_alf_filter_luma_28x12_10_avx2: 309 ff_vvc_alf_filter_luma_28x16_10_c: 3632 ff_vvc_alf_filter_luma_28x16_10_avx2: 413 ff_vvc_alf_filter_luma_28x20_10_c: 4537 ff_vvc_alf_filter_luma_28x20_10_avx2: 519 ff_vvc_alf_filter_luma_28x24_10_c: 5463 ff_vvc_alf_filter_luma_28x24_10_avx2: 616 ff_vvc_alf_filter_luma_28x28_10_c: 6372 ff_vvc_alf_filter_luma_28x28_10_avx2: 719 ff_vvc_alf_filter_luma_28x32_10_c: 7274 ff_vvc_alf_filter_luma_28x32_10_avx2: 823 ff_vvc_alf_filter_luma_32x4_10_c: 1029 ff_vvc_alf_filter_luma_32x4_10_avx2: 104 ff_vvc_alf_filter_luma_32x8_10_c: 2060 ff_vvc_alf_filter_luma_32x8_10_avx2: 206 ff_vvc_alf_filter_luma_32x12_10_c: 3112 ff_vvc_alf_filter_luma_32x12_10_avx2: 307 ff_vvc_alf_filter_luma_32x16_10_c: 4161 ff_vvc_alf_filter_luma_32x16_10_avx2: 413 ff_vvc_alf_filter_luma_32x20_10_c: 5211 ff_vvc_alf_filter_luma_32x20_10_avx2: 514 ff_vvc_alf_filter_luma_32x24_10_c: 6238 ff_vvc_alf_filter_luma
[FFmpeg-devel] [PATCH v3 4/4] tests/checkasm/vvc_alf: add check_alf_classify
From: Wu Jianhua Performance Test (fps): clip before after delta Tango2_3840x2160_60_10_420_27_LD.266 56 115 105.36% RitualDance_1920x1080_60_10_420_32_LD.266 272 481 76.83% RitualDance_1920x1080_60_10_420_37_RA.266 303 426 40.59% Signed-off-by: Wu Jianhua --- tests/checkasm/vvc_alf.c | 47 1 file changed, 47 insertions(+) diff --git a/tests/checkasm/vvc_alf.c b/tests/checkasm/vvc_alf.c index 10469e1528..9526260598 100644 --- a/tests/checkasm/vvc_alf.c +++ b/tests/checkasm/vvc_alf.c @@ -121,6 +121,47 @@ static void check_alf_filter(VVCDSPContext *c, const int bit_depth) } } +static void check_alf_classify(VVCDSPContext *c, const int bit_depth) +{ +LOCAL_ALIGNED_32(int, class_idx0, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int, transpose_idx0, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int, class_idx1, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int, transpose_idx1, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int32_t, alf_gradient_tmp, [ALF_GRADIENT_SIZE * ALF_GRADIENT_SIZE * ALF_NUM_DIR]); + +ptrdiff_t stride = SRC_PIXEL_STRIDE * SIZEOF_PIXEL; +int offset = (3 * SRC_PIXEL_STRIDE + 3) * SIZEOF_PIXEL; + +declare_func_emms(AV_CPU_FLAG_AVX2, void, int *class_idx, int *transpose_idx, +const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp); + +randomize_buffers(src0, src1, SRC_BUF_SIZE); + +for (int h = 4; h <= MAX_CTU_SIZE; h += 4) { +for (int w = 4; w <= MAX_CTU_SIZE; w += 4) { +const int id_size = w * h / ALF_BLOCK_SIZE / ALF_BLOCK_SIZE * sizeof(int); +const int vb_pos = MAX_CTU_SIZE - ALF_BLOCK_SIZE; +if (check_func(c->alf.classify, "vvc_alf_classify_%dx%d_%d", w, h, bit_depth)) { +memset(class_idx0, 0, id_size); +memset(class_idx1, 0, id_size); +memset(transpose_idx0, 0, id_size); +memset(transpose_idx1, 0, id_size); +call_ref(class_idx0, transpose_idx0, src0 + offset, stride, w, h, vb_pos, alf_gradient_tmp); + +call_new(class_idx1, transpose_idx1, src1 + 
offset, stride, w, h, vb_pos, alf_gradient_tmp); + +if (memcmp(class_idx0, class_idx1, id_size)) +fail(); +if (memcmp(transpose_idx0, transpose_idx1, id_size)) +fail(); +bench_new(class_idx1, transpose_idx1, src1 + offset, stride, w, h, vb_pos, alf_gradient_tmp); +} +} +} +} + void checkasm_check_vvc_alf(void) { int bit_depth; @@ -130,4 +171,10 @@ void checkasm_check_vvc_alf(void) check_alf_filter(&h, bit_depth); } report("alf_filter"); + +for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) { +ff_vvc_dsp_init(&h, bit_depth); +check_alf_classify(&h, bit_depth); +} +report("alf_classify"); } -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v3 2/4] tests/checkasm: add checkasm_check_vvc_alf and check_alf_filter
From: Wu Jianhua Signed-off-by: Wu Jianhua --- tests/checkasm/Makefile | 2 +- tests/checkasm/checkasm.c | 3 +- tests/checkasm/checkasm.h | 1 + tests/checkasm/vvc_alf.c | 133 ++ 4 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 tests/checkasm/vvc_alf.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index b5bb885201..92624aab0a 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -43,7 +43,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o AVCODECOBJS-$(CONFIG_VORBIS_DECODER)+= vorbisdsp.o AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o -AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_mc.o +AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_alf.o vvc_mc.o CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 04f94f9d09..ffc89882b1 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -204,7 +204,8 @@ static const struct { { "vorbisdsp", checkasm_check_vorbisdsp }, #endif #if CONFIG_VVC_DECODER -{ "vvc_mc", checkasm_check_vvc_mc }, +{ "vvc_alf", checkasm_check_vvc_alf }, +{ "vvc_mc", checkasm_check_vvc_mc }, #endif #endif #if CONFIG_AVFILTER diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 8807a37a43..07fcc751ff 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -134,6 +134,7 @@ void checkasm_check_vp8dsp(void); void checkasm_check_vp9dsp(void); void checkasm_check_videodsp(void); void checkasm_check_vorbisdsp(void); +void checkasm_check_vvc_alf(void); void checkasm_check_vvc_mc(void); struct CheckasmPerf; diff --git a/tests/checkasm/vvc_alf.c b/tests/checkasm/vvc_alf.c new file mode 100644 index 00..10469e1528 --- /dev/null +++ b/tests/checkasm/vvc_alf.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2023-2024 Nuo Mi + * Copyright (c) 2023-2024 Wu Jianhua + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include + +#include "checkasm.h" +#include "libavcodec/vvc/ctu.h" +#include "libavcodec/vvc/data.h" +#include "libavcodec/vvc/dsp.h" + +#include "libavutil/common.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/mem_internal.h" + +static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff }; + +#define SIZEOF_PIXEL ((bit_depth + 7) / 8) +#define SRC_PIXEL_STRIDE (MAX_CTU_SIZE + 2 * ALF_PADDING_SIZE) +#define DST_PIXEL_STRIDE (SRC_PIXEL_STRIDE + 4) +#define SRC_BUF_SIZE (SRC_PIXEL_STRIDE * (MAX_CTU_SIZE + 3 * 2) * 2) //+3 * 2 for top and bottom row, *2 for high bit depth +#define DST_BUF_SIZE (DST_PIXEL_STRIDE * (MAX_CTU_SIZE + 3 * 2) * 2) +#define LUMA_PARAMS_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE / ALF_BLOCK_SIZE / ALF_BLOCK_SIZE * ALF_NUM_COEFF_LUMA) + +#define randomize_buffers(buf0, buf1, size) \ +do {\ +uint32_t mask = pixel_mask[(bit_depth - 8) >> 1]; \ +int k; \ +for (k = 0; k < size; k += 4) { \ +uint32_t r = rnd() & mask; \ +AV_WN32A(buf0 + k, r); \ +AV_WN32A(buf1 + k, r); \ +} \ +} while (0) + +#define randomize_buffers2(buf, size, filter) \ +do {\ +int k; \ +if (filter) { \ +for (k = 0; k < size; k++) {\ +int8_t r = rnd(); \ +buf[k] = r; \ +} \ +} else {\ +for (k = 0; k < size; k++) {\ +int r = rnd() % FF_ARRAY_ELEMS(clip_set); \ 
+buf[k] = clip_set[r]; \ +}
[FFmpeg-devel] [PATCH v2 4/4] tests/checkasm/vvc_alf: add check_alf_classify
From: Wu Jianhua Performance Test (fps): clip before after delta Tango2_3840x2160_60_10_420_27_LD.266 56 115 105.36% RitualDance_1920x1080_60_10_420_32_LD.266 272 481 76.83% RitualDance_1920x1080_60_10_420_37_RA.266 303 426 40.59% Signed-off-by: Wu Jianhua --- tests/checkasm/vvc_alf.c | 47 1 file changed, 47 insertions(+) diff --git a/tests/checkasm/vvc_alf.c b/tests/checkasm/vvc_alf.c index 10469e1528..9526260598 100644 --- a/tests/checkasm/vvc_alf.c +++ b/tests/checkasm/vvc_alf.c @@ -121,6 +121,47 @@ static void check_alf_filter(VVCDSPContext *c, const int bit_depth) } } +static void check_alf_classify(VVCDSPContext *c, const int bit_depth) +{ +LOCAL_ALIGNED_32(int, class_idx0, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int, transpose_idx0, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int, class_idx1, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int, transpose_idx1, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int32_t, alf_gradient_tmp, [ALF_GRADIENT_SIZE * ALF_GRADIENT_SIZE * ALF_NUM_DIR]); + +ptrdiff_t stride = SRC_PIXEL_STRIDE * SIZEOF_PIXEL; +int offset = (3 * SRC_PIXEL_STRIDE + 3) * SIZEOF_PIXEL; + +declare_func_emms(AV_CPU_FLAG_AVX2, void, int *class_idx, int *transpose_idx, +const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp); + +randomize_buffers(src0, src1, SRC_BUF_SIZE); + +for (int h = 4; h <= MAX_CTU_SIZE; h += 4) { +for (int w = 4; w <= MAX_CTU_SIZE; w += 4) { +const int id_size = w * h / ALF_BLOCK_SIZE / ALF_BLOCK_SIZE * sizeof(int); +const int vb_pos = MAX_CTU_SIZE - ALF_BLOCK_SIZE; +if (check_func(c->alf.classify, "vvc_alf_classify_%dx%d_%d", w, h, bit_depth)) { +memset(class_idx0, 0, id_size); +memset(class_idx1, 0, id_size); +memset(transpose_idx0, 0, id_size); +memset(transpose_idx1, 0, id_size); +call_ref(class_idx0, transpose_idx0, src0 + offset, stride, w, h, vb_pos, alf_gradient_tmp); + +call_new(class_idx1, transpose_idx1, src1 + 
offset, stride, w, h, vb_pos, alf_gradient_tmp); + +if (memcmp(class_idx0, class_idx1, id_size)) +fail(); +if (memcmp(transpose_idx0, transpose_idx1, id_size)) +fail(); +bench_new(class_idx1, transpose_idx1, src1 + offset, stride, w, h, vb_pos, alf_gradient_tmp); +} +} +} +} + void checkasm_check_vvc_alf(void) { int bit_depth; @@ -130,4 +171,10 @@ void checkasm_check_vvc_alf(void) check_alf_filter(&h, bit_depth); } report("alf_filter"); + +for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) { +ff_vvc_dsp_init(&h, bit_depth); +check_alf_classify(&h, bit_depth); +} +report("alf_classify"); } -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 2/4] tests/checkasm: add checkasm_check_vvc_alf and check_alf_filter
From: Wu Jianhua Signed-off-by: Wu Jianhua --- tests/checkasm/Makefile | 2 +- tests/checkasm/checkasm.c | 3 +- tests/checkasm/checkasm.h | 1 + tests/checkasm/vvc_alf.c | 133 ++ 4 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 tests/checkasm/vvc_alf.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 2673e1d098..5a3e3985c4 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -41,7 +41,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o AVCODECOBJS-$(CONFIG_VORBIS_DECODER)+= vorbisdsp.o AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o -AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_mc.o +AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_alf.o vvc_mc.o CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 8be6cb0f55..8b2bf2827b 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -198,7 +198,8 @@ static const struct { { "vorbisdsp", checkasm_check_vorbisdsp }, #endif #if CONFIG_VVC_DECODER -{ "vvc_mc", checkasm_check_vvc_mc }, +{ "vvc_alf", checkasm_check_vvc_alf }, +{ "vvc_mc", checkasm_check_vvc_mc }, #endif #endif #if CONFIG_AVFILTER diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index f90920dee7..c6a5cf42dd 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -132,6 +132,7 @@ void checkasm_check_vp8dsp(void); void checkasm_check_vp9dsp(void); void checkasm_check_videodsp(void); void checkasm_check_vorbisdsp(void); +void checkasm_check_vvc_alf(void); void checkasm_check_vvc_mc(void); struct CheckasmPerf; diff --git a/tests/checkasm/vvc_alf.c b/tests/checkasm/vvc_alf.c new file mode 100644 index 00..10469e1528 --- /dev/null +++ b/tests/checkasm/vvc_alf.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2023-2024 Nuo Mi + * Copyright (c) 2023-2024 Wu Jianhua + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include + +#include "checkasm.h" +#include "libavcodec/vvc/ctu.h" +#include "libavcodec/vvc/data.h" +#include "libavcodec/vvc/dsp.h" + +#include "libavutil/common.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/mem_internal.h" + +static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff }; + +#define SIZEOF_PIXEL ((bit_depth + 7) / 8) +#define SRC_PIXEL_STRIDE (MAX_CTU_SIZE + 2 * ALF_PADDING_SIZE) +#define DST_PIXEL_STRIDE (SRC_PIXEL_STRIDE + 4) +#define SRC_BUF_SIZE (SRC_PIXEL_STRIDE * (MAX_CTU_SIZE + 3 * 2) * 2) //+3 * 2 for top and bottom row, *2 for high bit depth +#define DST_BUF_SIZE (DST_PIXEL_STRIDE * (MAX_CTU_SIZE + 3 * 2) * 2) +#define LUMA_PARAMS_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE / ALF_BLOCK_SIZE / ALF_BLOCK_SIZE * ALF_NUM_COEFF_LUMA) + +#define randomize_buffers(buf0, buf1, size) \ +do {\ +uint32_t mask = pixel_mask[(bit_depth - 8) >> 1]; \ +int k; \ +for (k = 0; k < size; k += 4) { \ +uint32_t r = rnd() & mask; \ +AV_WN32A(buf0 + k, r); \ +AV_WN32A(buf1 + k, r); \ +} \ +} while (0) + +#define randomize_buffers2(buf, size, filter) \ +do {\ +int k; \ +if (filter) { \ +for (k = 0; k < size; k++) {\ +int8_t r = rnd(); \ +buf[k] = r; \ +} \ +} else {\ +for (k = 0; k < size; k++) {\ +int r = rnd() % FF_ARRAY_ELEMS(clip_set); \ 
+buf[k] = clip_set[r]; \ +}
[FFmpeg-devel] [PATCH v2 1/4] avcodec/x86/vvc: add alf filter luma and chroma avx2 optimizations
From: Wu Jianhua ff_vvc_alf_filter_luma_4x4_10_c: 135 ff_vvc_alf_filter_luma_4x4_10_avx2: 54 ff_vvc_alf_filter_luma_4x8_10_c: 268 ff_vvc_alf_filter_luma_4x8_10_avx2: 106 ff_vvc_alf_filter_luma_4x12_10_c: 400 ff_vvc_alf_filter_luma_4x12_10_avx2: 160 ff_vvc_alf_filter_luma_4x16_10_c: 535 ff_vvc_alf_filter_luma_4x16_10_avx2: 213 ff_vvc_alf_filter_luma_4x20_10_c: 646 ff_vvc_alf_filter_luma_4x20_10_avx2: 262 ff_vvc_alf_filter_luma_4x24_10_c: 783 ff_vvc_alf_filter_luma_4x24_10_avx2: 309 ff_vvc_alf_filter_luma_4x28_10_c: 908 ff_vvc_alf_filter_luma_4x28_10_avx2: 361 ff_vvc_alf_filter_luma_4x32_10_c: 1039 ff_vvc_alf_filter_luma_4x32_10_avx2: 412 ff_vvc_alf_filter_luma_8x4_10_c: 260 ff_vvc_alf_filter_luma_8x4_10_avx2: 53 ff_vvc_alf_filter_luma_8x8_10_c: 516 ff_vvc_alf_filter_luma_8x8_10_avx2: 105 ff_vvc_alf_filter_luma_8x12_10_c: 779 ff_vvc_alf_filter_luma_8x12_10_avx2: 157 ff_vvc_alf_filter_luma_8x16_10_c: 1038 ff_vvc_alf_filter_luma_8x16_10_avx2: 210 ff_vvc_alf_filter_luma_8x20_10_c: 1293 ff_vvc_alf_filter_luma_8x20_10_avx2: 259 ff_vvc_alf_filter_luma_8x24_10_c: 1553 ff_vvc_alf_filter_luma_8x24_10_avx2: 309 ff_vvc_alf_filter_luma_8x28_10_c: 1815 ff_vvc_alf_filter_luma_8x28_10_avx2: 361 ff_vvc_alf_filter_luma_8x32_10_c: 2067 ff_vvc_alf_filter_luma_8x32_10_avx2: 419 ff_vvc_alf_filter_luma_12x4_10_c: 390 ff_vvc_alf_filter_luma_12x4_10_avx2: 54 ff_vvc_alf_filter_luma_12x8_10_c: 773 ff_vvc_alf_filter_luma_12x8_10_avx2: 107 ff_vvc_alf_filter_luma_12x12_10_c: 1159 ff_vvc_alf_filter_luma_12x12_10_avx2: 155 ff_vvc_alf_filter_luma_12x16_10_c: 1550 ff_vvc_alf_filter_luma_12x16_10_avx2: 207 ff_vvc_alf_filter_luma_12x20_10_c: 1970 ff_vvc_alf_filter_luma_12x20_10_avx2: 260 ff_vvc_alf_filter_luma_12x24_10_c: 2379 ff_vvc_alf_filter_luma_12x24_10_avx2: 309 ff_vvc_alf_filter_luma_12x28_10_c: 2763 ff_vvc_alf_filter_luma_12x28_10_avx2: 362 ff_vvc_alf_filter_luma_12x32_10_c: 3158 ff_vvc_alf_filter_luma_12x32_10_avx2: 419 ff_vvc_alf_filter_luma_16x4_10_c: 523 
ff_vvc_alf_filter_luma_16x4_10_avx2: 53 ff_vvc_alf_filter_luma_16x8_10_c: 1049 ff_vvc_alf_filter_luma_16x8_10_avx2: 103 ff_vvc_alf_filter_luma_16x12_10_c: 1566 ff_vvc_alf_filter_luma_16x12_10_avx2: 159 ff_vvc_alf_filter_luma_16x16_10_c: 2078 ff_vvc_alf_filter_luma_16x16_10_avx2: 211 ff_vvc_alf_filter_luma_16x20_10_c: 2631 ff_vvc_alf_filter_luma_16x20_10_avx2: 259 ff_vvc_alf_filter_luma_16x24_10_c: 3149 ff_vvc_alf_filter_luma_16x24_10_avx2: 316 ff_vvc_alf_filter_luma_16x28_10_c: 3631 ff_vvc_alf_filter_luma_16x28_10_avx2: 359 ff_vvc_alf_filter_luma_16x32_10_c: 4233 ff_vvc_alf_filter_luma_16x32_10_avx2: 428 ff_vvc_alf_filter_luma_20x4_10_c: 649 ff_vvc_alf_filter_luma_20x4_10_avx2: 106 ff_vvc_alf_filter_luma_20x8_10_c: 1294 ff_vvc_alf_filter_luma_20x8_10_avx2: 206 ff_vvc_alf_filter_luma_20x12_10_c: 1936 ff_vvc_alf_filter_luma_20x12_10_avx2: 310 ff_vvc_alf_filter_luma_20x16_10_c: 2594 ff_vvc_alf_filter_luma_20x16_10_avx2: 411 ff_vvc_alf_filter_luma_20x20_10_c: 3234 ff_vvc_alf_filter_luma_20x20_10_avx2: 517 ff_vvc_alf_filter_luma_20x24_10_c: 3894 ff_vvc_alf_filter_luma_20x24_10_avx2: 621 ff_vvc_alf_filter_luma_20x28_10_c: 4542 ff_vvc_alf_filter_luma_20x28_10_avx2: 722 ff_vvc_alf_filter_luma_20x32_10_c: 5205 ff_vvc_alf_filter_luma_20x32_10_avx2: 832 ff_vvc_alf_filter_luma_24x4_10_c: 774 ff_vvc_alf_filter_luma_24x4_10_avx2: 104 ff_vvc_alf_filter_luma_24x8_10_c: 1546 ff_vvc_alf_filter_luma_24x8_10_avx2: 206 ff_vvc_alf_filter_luma_24x12_10_c: 2318 ff_vvc_alf_filter_luma_24x12_10_avx2: 312 ff_vvc_alf_filter_luma_24x16_10_c: 3104 ff_vvc_alf_filter_luma_24x16_10_avx2: 411 ff_vvc_alf_filter_luma_24x20_10_c: 3893 ff_vvc_alf_filter_luma_24x20_10_avx2: 513 ff_vvc_alf_filter_luma_24x24_10_c: 4681 ff_vvc_alf_filter_luma_24x24_10_avx2: 616 ff_vvc_alf_filter_luma_24x28_10_c: 5474 ff_vvc_alf_filter_luma_24x28_10_avx2: 721 ff_vvc_alf_filter_luma_24x32_10_c: 6271 ff_vvc_alf_filter_luma_24x32_10_avx2: 832 ff_vvc_alf_filter_luma_28x4_10_c: 907 ff_vvc_alf_filter_luma_28x4_10_avx2: 103 
ff_vvc_alf_filter_luma_28x8_10_c: 1797 ff_vvc_alf_filter_luma_28x8_10_avx2: 206 ff_vvc_alf_filter_luma_28x12_10_c: 2708 ff_vvc_alf_filter_luma_28x12_10_avx2: 309 ff_vvc_alf_filter_luma_28x16_10_c: 3632 ff_vvc_alf_filter_luma_28x16_10_avx2: 413 ff_vvc_alf_filter_luma_28x20_10_c: 4537 ff_vvc_alf_filter_luma_28x20_10_avx2: 519 ff_vvc_alf_filter_luma_28x24_10_c: 5463 ff_vvc_alf_filter_luma_28x24_10_avx2: 616 ff_vvc_alf_filter_luma_28x28_10_c: 6372 ff_vvc_alf_filter_luma_28x28_10_avx2: 719 ff_vvc_alf_filter_luma_28x32_10_c: 7274 ff_vvc_alf_filter_luma_28x32_10_avx2: 823 ff_vvc_alf_filter_luma_32x4_10_c: 1029 ff_vvc_alf_filter_luma_32x4_10_avx2: 104 ff_vvc_alf_filter_luma_32x8_10_c: 2060 ff_vvc_alf_filter_luma_32x8_10_avx2: 206 ff_vvc_alf_filter_luma_32x12_10_c: 3112 ff_vvc_alf_filter_luma_32x12_10_avx2: 307 ff_vvc_alf_filter_luma_32x16_10_c: 4161 ff_vvc_alf_filter_luma_32x16_10_avx2: 413 ff_vvc_alf_filter_luma_32x20_10_c: 5211 ff_vvc_alf_filter_luma_32x20_10_avx2: 514 ff_vvc_alf_filter_luma_32x24_10_c: 6238 ff_vvc_alf_filter_luma
[FFmpeg-devel] [PATCH 4/4] tests/checkasm/vvc_alf: add check_alf_classify
From: Wu Jianhua Performance Test: clip before (fps) after (fps) delta Tango2_3840x2160_60_10_420_27_LD.266 56 115 105.36% RitualDance_1920x1080_60_10_420_32_LD.266 272 481 76.83% RitualDance_1920x1080_60_10_420_37_RA.266 303 426 40.59% Signed-off-by: Wu Jianhua --- tests/checkasm/vvc_alf.c | 47 1 file changed, 47 insertions(+) diff --git a/tests/checkasm/vvc_alf.c b/tests/checkasm/vvc_alf.c index 10469e1528..9526260598 100644 --- a/tests/checkasm/vvc_alf.c +++ b/tests/checkasm/vvc_alf.c @@ -121,6 +121,47 @@ static void check_alf_filter(VVCDSPContext *c, const int bit_depth) } } +static void check_alf_classify(VVCDSPContext *c, const int bit_depth) +{ +LOCAL_ALIGNED_32(int, class_idx0, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int, transpose_idx0, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int, class_idx1, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int, transpose_idx1, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int32_t, alf_gradient_tmp, [ALF_GRADIENT_SIZE * ALF_GRADIENT_SIZE * ALF_NUM_DIR]); + +ptrdiff_t stride = SRC_PIXEL_STRIDE * SIZEOF_PIXEL; +int offset = (3 * SRC_PIXEL_STRIDE + 3) * SIZEOF_PIXEL; + +declare_func_emms(AV_CPU_FLAG_AVX2, void, int *class_idx, int *transpose_idx, +const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp); + +randomize_buffers(src0, src1, SRC_BUF_SIZE); + +for (int h = 4; h <= MAX_CTU_SIZE; h += 4) { +for (int w = 4; w <= MAX_CTU_SIZE; w += 4) { +const int id_size = w * h / ALF_BLOCK_SIZE / ALF_BLOCK_SIZE * sizeof(int); +const int vb_pos = MAX_CTU_SIZE - ALF_BLOCK_SIZE; +if (check_func(c->alf.classify, "vvc_alf_classify_%dx%d_%d", w, h, bit_depth)) { +memset(class_idx0, 0, id_size); +memset(class_idx1, 0, id_size); +memset(transpose_idx0, 0, id_size); +memset(transpose_idx1, 0, id_size); +call_ref(class_idx0, transpose_idx0, src0 + offset, stride, w, h, vb_pos, alf_gradient_tmp); + +call_new(class_idx1, transpose_idx1, src1 + 
offset, stride, w, h, vb_pos, alf_gradient_tmp); + +if (memcmp(class_idx0, class_idx1, id_size)) +fail(); +if (memcmp(transpose_idx0, transpose_idx1, id_size)) +fail(); +bench_new(class_idx1, transpose_idx1, src1 + offset, stride, w, h, vb_pos, alf_gradient_tmp); +} +} +} +} + void checkasm_check_vvc_alf(void) { int bit_depth; @@ -130,4 +171,10 @@ void checkasm_check_vvc_alf(void) check_alf_filter(&h, bit_depth); } report("alf_filter"); + +for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) { +ff_vvc_dsp_init(&h, bit_depth); +check_alf_classify(&h, bit_depth); +} +report("alf_classify"); } -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/4] tests/checkasm: add checkasm_check_vvc_alf and check_alf_filter
From: Wu Jianhua Signed-off-by: Wu Jianhua --- tests/checkasm/Makefile | 2 +- tests/checkasm/checkasm.c | 3 +- tests/checkasm/checkasm.h | 1 + tests/checkasm/vvc_alf.c | 133 ++ 4 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 tests/checkasm/vvc_alf.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 2673e1d098..5a3e3985c4 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -41,7 +41,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o AVCODECOBJS-$(CONFIG_VORBIS_DECODER)+= vorbisdsp.o AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o -AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_mc.o +AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_alf.o vvc_mc.o CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 8be6cb0f55..8b2bf2827b 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -198,7 +198,8 @@ static const struct { { "vorbisdsp", checkasm_check_vorbisdsp }, #endif #if CONFIG_VVC_DECODER -{ "vvc_mc", checkasm_check_vvc_mc }, +{ "vvc_alf", checkasm_check_vvc_alf }, +{ "vvc_mc", checkasm_check_vvc_mc }, #endif #endif #if CONFIG_AVFILTER diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index f90920dee7..c6a5cf42dd 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -132,6 +132,7 @@ void checkasm_check_vp8dsp(void); void checkasm_check_vp9dsp(void); void checkasm_check_videodsp(void); void checkasm_check_vorbisdsp(void); +void checkasm_check_vvc_alf(void); void checkasm_check_vvc_mc(void); struct CheckasmPerf; diff --git a/tests/checkasm/vvc_alf.c b/tests/checkasm/vvc_alf.c new file mode 100644 index 00..10469e1528 --- /dev/null +++ b/tests/checkasm/vvc_alf.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2023-2024 Nuo Mi + * Copyright (c) 2023-2024 Wu Jianhua + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include + +#include "checkasm.h" +#include "libavcodec/vvc/ctu.h" +#include "libavcodec/vvc/data.h" +#include "libavcodec/vvc/dsp.h" + +#include "libavutil/common.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/mem_internal.h" + +static const uint32_t pixel_mask[3] = { 0x, 0x03ff03ff, 0x0fff0fff }; + +#define SIZEOF_PIXEL ((bit_depth + 7) / 8) +#define SRC_PIXEL_STRIDE (MAX_CTU_SIZE + 2 * ALF_PADDING_SIZE) +#define DST_PIXEL_STRIDE (SRC_PIXEL_STRIDE + 4) +#define SRC_BUF_SIZE (SRC_PIXEL_STRIDE * (MAX_CTU_SIZE + 3 * 2) * 2) //+3 * 2 for top and bottom row, *2 for high bit depth +#define DST_BUF_SIZE (DST_PIXEL_STRIDE * (MAX_CTU_SIZE + 3 * 2) * 2) +#define LUMA_PARAMS_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE / ALF_BLOCK_SIZE / ALF_BLOCK_SIZE * ALF_NUM_COEFF_LUMA) + +#define randomize_buffers(buf0, buf1, size) \ +do {\ +uint32_t mask = pixel_mask[(bit_depth - 8) >> 1]; \ +int k; \ +for (k = 0; k < size; k += 4) { \ +uint32_t r = rnd() & mask; \ +AV_WN32A(buf0 + k, r); \ +AV_WN32A(buf1 + k, r); \ +} \ +} while (0) + +#define randomize_buffers2(buf, size, filter) \ +do {\ +int k; \ +if (filter) { \ +for (k = 0; k < size; k++) {\ +int8_t r = rnd(); \ +buf[k] = r; \ +} \ +} else {\ +for (k = 0; k < size; k++) {\ +int r = rnd() % FF_ARRAY_ELEMS(clip_set); \ 
+buf[k] = clip_set[r]; \ +}
[FFmpeg-devel] [PATCH 1/4] avcodec/x86/vvc: add alf filter luma and chroma avx2 optimizations
From: Wu Jianhua vvc_alf_filter_chroma_4x4_10_c: 657.0 vvc_alf_filter_chroma_4x4_10_avx2: 138.0 vvc_alf_filter_chroma_4x8_10_c: 1264.7 vvc_alf_filter_chroma_4x8_10_avx2: 253.5 vvc_alf_filter_chroma_4x12_10_c: 1841.7 vvc_alf_filter_chroma_4x12_10_avx2: 375.5 vvc_alf_filter_chroma_4x16_10_c: 2442.7 vvc_alf_filter_chroma_4x16_10_avx2: 491.7 vvc_alf_filter_chroma_4x20_10_c: 3057.0 vvc_alf_filter_chroma_4x20_10_avx2: 607.2 vvc_alf_filter_chroma_4x24_10_c: 3667.0 vvc_alf_filter_chroma_4x24_10_avx2: 747.5 vvc_alf_filter_chroma_4x28_10_c: 4286.7 vvc_alf_filter_chroma_4x28_10_avx2: 849.0 vvc_alf_filter_chroma_4x32_10_c: 4886.0 vvc_alf_filter_chroma_4x32_10_avx2: 967.5 vvc_alf_filter_chroma_8x4_10_c: 1250.5 vvc_alf_filter_chroma_8x4_10_avx2: 261.0 vvc_alf_filter_chroma_8x8_10_c: 2430.7 vvc_alf_filter_chroma_8x8_10_avx2: 494.7 vvc_alf_filter_chroma_8x12_10_c: 3631.2 vvc_alf_filter_chroma_8x12_10_avx2: 734.5 vvc_alf_filter_chroma_8x16_10_c: 13675.7 vvc_alf_filter_chroma_8x16_10_avx2: 972.0 vvc_alf_filter_chroma_8x20_10_c: 6212.0 vvc_alf_filter_chroma_8x20_10_avx2: 1211.0 vvc_alf_filter_chroma_8x24_10_c: 7440.7 vvc_alf_filter_chroma_8x24_10_avx2: 1447.0 vvc_alf_filter_chroma_8x28_10_c: 8460.5 vvc_alf_filter_chroma_8x28_10_avx2: 1682.5 vvc_alf_filter_chroma_8x32_10_c: 9665.2 vvc_alf_filter_chroma_8x32_10_avx2: 1917.7 vvc_alf_filter_chroma_12x4_10_c: 1865.2 vvc_alf_filter_chroma_12x4_10_avx2: 391.7 vvc_alf_filter_chroma_12x8_10_c: 3625.2 vvc_alf_filter_chroma_12x8_10_avx2: 739.0 vvc_alf_filter_chroma_12x12_10_c: 5427.5 vvc_alf_filter_chroma_12x12_10_avx2: 1094.2 vvc_alf_filter_chroma_12x16_10_c: 7237.7 vvc_alf_filter_chroma_12x16_10_avx2: 1447.2 vvc_alf_filter_chroma_12x20_10_c: 9035.2 vvc_alf_filter_chroma_12x20_10_avx2: 1805.2 vvc_alf_filter_chroma_12x24_10_c: 11135.7 vvc_alf_filter_chroma_12x24_10_avx2: 2158.2 vvc_alf_filter_chroma_12x28_10_c: 12644.0 vvc_alf_filter_chroma_12x28_10_avx2: 2511.2 vvc_alf_filter_chroma_12x32_10_c: 14441.7 vvc_alf_filter_chroma_12x32_10_avx2: 
2888.0 vvc_alf_filter_chroma_16x4_10_c: 2410.0 vvc_alf_filter_chroma_16x4_10_avx2: 251.7 vvc_alf_filter_chroma_16x8_10_c: 4943.0 vvc_alf_filter_chroma_16x8_10_avx2: 479.0 vvc_alf_filter_chroma_16x12_10_c: 7235.5 vvc_alf_filter_chroma_16x12_10_avx2: 9751.0 vvc_alf_filter_chroma_16x16_10_c: 10142.7 vvc_alf_filter_chroma_16x16_10_avx2: 935.5 vvc_alf_filter_chroma_16x20_10_c: 12029.0 vvc_alf_filter_chroma_16x20_10_avx2: 1174.5 vvc_alf_filter_chroma_16x24_10_c: 14414.2 vvc_alf_filter_chroma_16x24_10_avx2: 1410.5 vvc_alf_filter_chroma_16x28_10_c: 16813.0 vvc_alf_filter_chroma_16x28_10_avx2: 1713.0 vvc_alf_filter_chroma_16x32_10_c: 19228.5 vvc_alf_filter_chroma_16x32_10_avx2: 2256.0 vvc_alf_filter_chroma_20x4_10_c: 3015.2 vvc_alf_filter_chroma_20x4_10_avx2: 371.7 vvc_alf_filter_chroma_20x8_10_c: 6170.2 vvc_alf_filter_chroma_20x8_10_avx2: 721.0 vvc_alf_filter_chroma_20x12_10_c: 9019.7 vvc_alf_filter_chroma_20x12_10_avx2: 1102.7 vvc_alf_filter_chroma_20x16_10_c: 12040.2 vvc_alf_filter_chroma_20x16_10_avx2: 1422.5 vvc_alf_filter_chroma_20x20_10_c: 15010.7 vvc_alf_filter_chroma_20x20_10_avx2: 1765.7 vvc_alf_filter_chroma_20x24_10_c: 18017.7 vvc_alf_filter_chroma_20x24_10_avx2: 2124.7 vvc_alf_filter_chroma_20x28_10_c: 21025.5 vvc_alf_filter_chroma_20x28_10_avx2: 2488.2 vvc_alf_filter_chroma_20x32_10_c: 31128.5 vvc_alf_filter_chroma_20x32_10_avx2: 3205.2 vvc_alf_filter_chroma_24x4_10_c: 3701.2 vvc_alf_filter_chroma_24x4_10_avx2: 494.7 vvc_alf_filter_chroma_24x8_10_c: 7613.0 vvc_alf_filter_chroma_24x8_10_avx2: 957.2 vvc_alf_filter_chroma_24x12_10_c: 10816.7 vvc_alf_filter_chroma_24x12_10_avx2: 1427.7 vvc_alf_filter_chroma_24x16_10_c: 14390.5 vvc_alf_filter_chroma_24x16_10_avx2: 1948.2 vvc_alf_filter_chroma_24x20_10_c: 17989.5 vvc_alf_filter_chroma_24x20_10_avx2: 2363.7 vvc_alf_filter_chroma_24x24_10_c: 21581.7 vvc_alf_filter_chroma_24x24_10_avx2: 2839.7 vvc_alf_filter_chroma_24x28_10_c: 25179.2 vvc_alf_filter_chroma_24x28_10_avx2: 3313.2 vvc_alf_filter_chroma_24x32_10_c: 28776.2 
vvc_alf_filter_chroma_24x32_10_avx2: 4154.7 vvc_alf_filter_chroma_28x4_10_c: 4331.2 vvc_alf_filter_chroma_28x4_10_avx2: 624.2 vvc_alf_filter_chroma_28x8_10_c: 8445.0 vvc_alf_filter_chroma_28x8_10_avx2: 1197.7 vvc_alf_filter_chroma_28x12_10_c: 12684.5 vvc_alf_filter_chroma_28x12_10_avx2: 1786.7 vvc_alf_filter_chroma_28x16_10_c: 16924.5 vvc_alf_filter_chroma_28x16_10_avx2: 2378.7 vvc_alf_filter_chroma_28x20_10_c: 38361.0 vvc_alf_filter_chroma_28x20_10_avx2: 2967.0 vvc_alf_filter_chroma_28x24_10_c: 25329.0 vvc_alf_filter_chroma_28x24_10_avx2: 3564.2 vvc_alf_filter_chroma_28x28_10_c: 29514.0 vvc_alf_filter_chroma_28x28_10_avx2: 4151.7 vvc_alf_filter_chroma_28x32_10_c: 33673.2 vvc_alf_filter_chroma_28x32_10_avx2: 5125.0 vvc_alf_filter_chroma_32x4_10_c: 4945.2 vvc_alf_filter_chroma_32x4_10_avx2: 485.7 vvc_alf_filter_chroma_32x8_10_c: 9658.7 vvc_alf_filter_chroma_32x8_10_avx2: 943.7 vvc_alf_filter_chroma_32x12_10_c: 16177.7 vvc_alf_filter_chroma_32x12_10_avx2: 1443.7 vvc_alf_filter_chroma_32x16
[FFmpeg-devel] [PATCH v3 3/3] avcodec/x86/vvc/vvcdsp_init: fix linking error when configuring with --disable-ssse3 --disable-optimizations options
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/x86/vvc/vvcdsp_init.c | 46 +--- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c index aef6699c35..985d750472 100644 --- a/libavcodec/x86/vvc/vvcdsp_init.c +++ b/libavcodec/x86/vvc/vvcdsp_init.c @@ -88,6 +88,7 @@ AVG_PROTOTYPES(10, avx2) AVG_PROTOTYPES(12, avx2) #if ARCH_X86_64 +#if HAVE_SSE4_EXTERNAL #define FW_PUT(name, depth, opt) \ void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride,\ int height, const int8_t *hf, const int8_t *vf, int width)\ @@ -125,7 +126,9 @@ void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *sr FW_PUT_SSE4( 8) FW_PUT_SSE4(10) FW_PUT_SSE4(12) +#endif +#if HAVE_AVX2_EXTERNAL #define FW_PUT_TAP_AVX2(n, bitd)\ FW_PUT(n ## tap_h32, bitd, avx2) \ FW_PUT(n ## tap_h64, bitd, avx2) \ @@ -161,6 +164,25 @@ FW_PUT_AVX2(12) FW_PUT_16BPC_AVX2(10) FW_PUT_16BPC_AVX2(12) +#define AVG_FUNCS(bpc, bd, opt) \ +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, int width, int height) \ +{ \ +BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \ +} \ +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, int width, int height, \ +int denom, int w0, int w1, int o0, int o1) \ +{ \ +BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, \ +denom, w0, w1, o0, o1, (1 << bd) - 1); \ +} + +AVG_FUNCS(8, 8, avx2) +AVG_FUNCS(16, 10, avx2) +AVG_FUNCS(16, 12, avx2) +#endif + #define PEL_LINK(dst, C, W, idx1, idx2, name, D, opt) \ dst[C][W][idx1][idx2] = ff_vvc_put_## name ## _ ## D ## _##opt; \ dst ## _uni[C][W][idx1][idx2] = ff_h2656_put_uni_ ## name ## _ ## D ## _##opt; \ @@ -226,27 +248,9 @@ FW_PUT_16BPC_AVX2(12) MC_TAP_LINKS_16BPC_AVX2(LUMA, 8, bd); \ MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd); 
-#define AVG_FUNCS(bpc, bd, opt) \ -void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ -const int16_t *src0, const int16_t *src1, int width, int height) \ -{ \ -BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \ -} \ -void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ -const int16_t *src0, const int16_t *src1, int width, int height, \ -int denom, int w0, int w1, int o0, int o1) \ -{ \ -BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, \ -denom, w0, w1, o0, o1, (1 << bd) - 1); \ -} - -AVG_FUNCS(8, 8, avx2) -AVG_FUNCS(16, 10, avx2) -AVG_FUNCS(16, 12, avx2) - -#define AVG_INIT(bd, opt) do { \ -c->inter.avg= bf(ff_vvc_avg, bd, opt); \ -c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt);\ +#define AVG_INIT(bd, opt) do { \ +c->inter.avg= bf(ff_vvc_avg, bd, opt); \ +c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt); \ } while (0) #endif -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v3 2/3] avcodec/x86/vvc/vvcdsp_init: add avg prototypes
From: Wu Jianhua When FFmpeg is configured with the --disable-ssse3 --disable-optimizations options, the compiler does not skip the MC_LINKS code the way an optimized build does, so it fails to find the function prototypes. Hence, this commit adds prototypes for the functions in the same way as the HEVC DSP code does. Once prototypes are added for the functions, the static qualifier can no longer be used on them. Therefore, the ff_vvc prefix is needed to avoid naming conflicts. Signed-off-by: Wu Jianhua --- libavcodec/x86/vvc/vvcdsp_init.c | 45 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c index d9203f4d5f..aef6699c35 100644 --- a/libavcodec/x86/vvc/vvcdsp_init.c +++ b/libavcodec/x86/vvc/vvcdsp_init.c @@ -63,6 +63,30 @@ PUT_TAP_PROTOTYPES(8, sse4) PUT_TAP_PROTOTYPES(4, avx2) PUT_TAP_PROTOTYPES(8, avx2) +#define bf(fn, bd, opt) fn##_##bd##_##opt +#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt + +#define AVG_BPC_PROTOTYPES(bpc, opt) \ +void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max); \ +void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, \ +intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max); + +#define AVG_PROTOTYPES(bd, opt) \ +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, int width, int height); \ +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, int width, int height, \ +int denom, int w0, int w1, int o0, int o1); + +AVG_BPC_PROTOTYPES( 8, avx2) +AVG_BPC_PROTOTYPES(16, avx2) + +AVG_PROTOTYPES( 8, avx2) +AVG_PROTOTYPES(10, avx2) +AVG_PROTOTYPES(12, avx2) + #if ARCH_X86_64 #define FW_PUT(name, depth, opt) \ void ff_vvc_put_ ## name 
## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride,\ @@ -202,23 +226,13 @@ FW_PUT_16BPC_AVX2(12) MC_TAP_LINKS_16BPC_AVX2(LUMA, 8, bd); \ MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd); -#define bf(fn, bd, opt) fn##_##bd##_##opt -#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt - -#define AVG_BPC_FUNC(bpc, opt) \ -void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ -const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max); \ -void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ -const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, \ -intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max); - #define AVG_FUNCS(bpc, bd, opt) \ -static void bf(avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ const int16_t *src0, const int16_t *src1, int width, int height) \ { \ BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \ } \ -static void bf(w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ const int16_t *src0, const int16_t *src1, int width, int height, \ int denom, int w0, int w1, int o0, int o1) \ { \ @@ -226,16 +240,13 @@ static void bf(w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, denom, w0, w1, o0, o1, (1 << bd) - 1); \ } -AVG_BPC_FUNC(8, avx2) -AVG_BPC_FUNC(16, avx2) - AVG_FUNCS(8, 8, avx2) AVG_FUNCS(16, 10, avx2) AVG_FUNCS(16, 12, avx2) #define AVG_INI
[FFmpeg-devel] [PATCH v3 1/3] avcodec/x86/vvc/vvcdsp_init: add put prototypes
From: Wu Jianhua When FFmpeg is configured with the --disable-ssse3 --disable-optimizations options, the compiler does not skip the MC_LINKS code the way an optimized build does, so it fails to find the function prototypes. Hence, this commit adds prototypes for the functions in the same way as the HEVC DSP code does. Signed-off-by: Wu Jianhua --- libavcodec/x86/vvc/vvcdsp_init.c | 35 +++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c index 23a3172c45..d9203f4d5f 100644 --- a/libavcodec/x86/vvc/vvcdsp_init.c +++ b/libavcodec/x86/vvc/vvcdsp_init.c @@ -30,9 +30,42 @@ #include "libavcodec/vvc/dsp.h" #include "libavcodec/x86/h26x/h2656dsp.h" +#define PUT_PROTOTYPE(name, depth, opt) \ +void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, int height, const int8_t *hf, const int8_t *vf, int width); + +#define PUT_PROTOTYPES(name, bitd, opt) \ +PUT_PROTOTYPE(name##2, bitd, opt) \ +PUT_PROTOTYPE(name##4, bitd, opt) \ +PUT_PROTOTYPE(name##8, bitd, opt) \ +PUT_PROTOTYPE(name##12, bitd, opt) \ +PUT_PROTOTYPE(name##16, bitd, opt) \ +PUT_PROTOTYPE(name##24, bitd, opt) \ +PUT_PROTOTYPE(name##32, bitd, opt) \ +PUT_PROTOTYPE(name##48, bitd, opt) \ +PUT_PROTOTYPE(name##64, bitd, opt) \ +PUT_PROTOTYPE(name##128, bitd, opt) + +#define PUT_BPC_PROTOTYPES(name, opt) \ +PUT_PROTOTYPES(name, 8, opt) \ +PUT_PROTOTYPES(name, 10, opt) \ +PUT_PROTOTYPES(name, 12, opt) + +#define PUT_TAP_PROTOTYPES(n, opt) \ +PUT_BPC_PROTOTYPES(n##tap_h, opt) \ +PUT_BPC_PROTOTYPES(n##tap_v, opt) \ +PUT_BPC_PROTOTYPES(n##tap_hv, opt) + +PUT_BPC_PROTOTYPES(pixels, sse4) +PUT_BPC_PROTOTYPES(pixels, avx2) + +PUT_TAP_PROTOTYPES(4, sse4) +PUT_TAP_PROTOTYPES(8, sse4) +PUT_TAP_PROTOTYPES(4, avx2) +PUT_TAP_PROTOTYPES(8, avx2) + #if ARCH_X86_64 #define FW_PUT(name, depth, opt) \ -static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \ +void 
ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride,\ int height, const int8_t *hf, const int8_t *vf, int width)\ { \ ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, srcstride, height, hf, vf, width); \ -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 3/3] avcodec/x86/vvc/vvcdsp_init: fix linking error when configuring with --disable-ssse3 --disable-optimizations options
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/x86/vvc/vvcdsp_init.c | 46 +--- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c index aef6699c35..985d750472 100644 --- a/libavcodec/x86/vvc/vvcdsp_init.c +++ b/libavcodec/x86/vvc/vvcdsp_init.c @@ -88,6 +88,7 @@ AVG_PROTOTYPES(10, avx2) AVG_PROTOTYPES(12, avx2) #if ARCH_X86_64 +#if HAVE_SSE4_EXTERNAL #define FW_PUT(name, depth, opt) \ void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride,\ int height, const int8_t *hf, const int8_t *vf, int width)\ @@ -125,7 +126,9 @@ void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *sr FW_PUT_SSE4( 8) FW_PUT_SSE4(10) FW_PUT_SSE4(12) +#endif +#if HAVE_AVX2_EXTERNAL #define FW_PUT_TAP_AVX2(n, bitd)\ FW_PUT(n ## tap_h32, bitd, avx2) \ FW_PUT(n ## tap_h64, bitd, avx2) \ @@ -161,6 +164,25 @@ FW_PUT_AVX2(12) FW_PUT_16BPC_AVX2(10) FW_PUT_16BPC_AVX2(12) +#define AVG_FUNCS(bpc, bd, opt) \ +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, int width, int height) \ +{ \ +BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \ +} \ +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, int width, int height, \ +int denom, int w0, int w1, int o0, int o1) \ +{ \ +BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, \ +denom, w0, w1, o0, o1, (1 << bd) - 1); \ +} + +AVG_FUNCS(8, 8, avx2) +AVG_FUNCS(16, 10, avx2) +AVG_FUNCS(16, 12, avx2) +#endif + #define PEL_LINK(dst, C, W, idx1, idx2, name, D, opt) \ dst[C][W][idx1][idx2] = ff_vvc_put_## name ## _ ## D ## _##opt; \ dst ## _uni[C][W][idx1][idx2] = ff_h2656_put_uni_ ## name ## _ ## D ## _##opt; \ @@ -226,27 +248,9 @@ FW_PUT_16BPC_AVX2(12) MC_TAP_LINKS_16BPC_AVX2(LUMA, 8, bd); \ MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd); 
-#define AVG_FUNCS(bpc, bd, opt) \ -void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ -const int16_t *src0, const int16_t *src1, int width, int height) \ -{ \ -BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \ -} \ -void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ -const int16_t *src0, const int16_t *src1, int width, int height, \ -int denom, int w0, int w1, int o0, int o1) \ -{ \ -BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, \ -denom, w0, w1, o0, o1, (1 << bd) - 1); \ -} - -AVG_FUNCS(8, 8, avx2) -AVG_FUNCS(16, 10, avx2) -AVG_FUNCS(16, 12, avx2) - -#define AVG_INIT(bd, opt) do { \ -c->inter.avg= bf(ff_vvc_avg, bd, opt); \ -c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt);\ +#define AVG_INIT(bd, opt) do { \ +c->inter.avg= bf(ff_vvc_avg, bd, opt); \ +c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt); \ } while (0) #endif -- 2.44.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 2/3] avcodec/x86/vvc/vvcdsp_init: add avg prototypes
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/x86/vvc/vvcdsp_init.c | 45 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c index d9203f4d5f..aef6699c35 100644 --- a/libavcodec/x86/vvc/vvcdsp_init.c +++ b/libavcodec/x86/vvc/vvcdsp_init.c @@ -63,6 +63,30 @@ PUT_TAP_PROTOTYPES(8, sse4) PUT_TAP_PROTOTYPES(4, avx2) PUT_TAP_PROTOTYPES(8, avx2) +#define bf(fn, bd, opt) fn##_##bd##_##opt +#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt + +#define AVG_BPC_PROTOTYPES(bpc, opt) \ +void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max); \ +void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, \ +intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max); + +#define AVG_PROTOTYPES(bd, opt) \ +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, int width, int height); \ +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, int width, int height, \ +int denom, int w0, int w1, int o0, int o1); + +AVG_BPC_PROTOTYPES( 8, avx2) +AVG_BPC_PROTOTYPES(16, avx2) + +AVG_PROTOTYPES( 8, avx2) +AVG_PROTOTYPES(10, avx2) +AVG_PROTOTYPES(12, avx2) + #if ARCH_X86_64 #define FW_PUT(name, depth, opt) \ void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride,\ @@ -202,23 +226,13 @@ FW_PUT_16BPC_AVX2(12) MC_TAP_LINKS_16BPC_AVX2(LUMA, 8, bd); \ MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd); -#define bf(fn, bd, opt) fn##_##bd##_##opt -#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt - -#define AVG_BPC_FUNC(bpc, opt) \ -void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ -const int16_t *src0, const int16_t *src1, 
intptr_t width, intptr_t height, intptr_t pixel_max); \ -void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ -const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, \ -intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max); - #define AVG_FUNCS(bpc, bd, opt) \ -static void bf(avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ const int16_t *src0, const int16_t *src1, int width, int height) \ { \ BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \ } \ -static void bf(w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ const int16_t *src0, const int16_t *src1, int width, int height, \ int denom, int w0, int w1, int o0, int o1) \ { \ @@ -226,16 +240,13 @@ static void bf(w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, denom, w0, w1, o0, o1, (1 << bd) - 1); \ } -AVG_BPC_FUNC(8, avx2) -AVG_BPC_FUNC(16, avx2) - AVG_FUNCS(8, 8, avx2) AVG_FUNCS(16, 10, avx2) AVG_FUNCS(16, 12, avx2) #define AVG_INIT(bd, opt) do { \ -c->inter.avg= bf(avg, bd, opt); \ -c->inter.w_avg = bf(w_avg, bd, opt); \ +c->inter.avg= bf(ff_vvc_avg, bd, opt); \ +c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt);\ } while (0) #endif -- 2.44.0.windows.1 ___ f
[FFmpeg-devel] [PATCH v2 1/3] avcodec/x86/vvc/vvcdsp_init: add put prototypes
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/x86/vvc/vvcdsp_init.c | 35 +++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c index 23a3172c45..d9203f4d5f 100644 --- a/libavcodec/x86/vvc/vvcdsp_init.c +++ b/libavcodec/x86/vvc/vvcdsp_init.c @@ -30,9 +30,42 @@ #include "libavcodec/vvc/dsp.h" #include "libavcodec/x86/h26x/h2656dsp.h" +#define PUT_PROTOTYPE(name, depth, opt) \ +void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, int height, const int8_t *hf, const int8_t *vf, int width); + +#define PUT_PROTOTYPES(name, bitd, opt) \ +PUT_PROTOTYPE(name##2, bitd, opt) \ +PUT_PROTOTYPE(name##4, bitd, opt) \ +PUT_PROTOTYPE(name##8, bitd, opt) \ +PUT_PROTOTYPE(name##12, bitd, opt) \ +PUT_PROTOTYPE(name##16, bitd, opt) \ +PUT_PROTOTYPE(name##24, bitd, opt) \ +PUT_PROTOTYPE(name##32, bitd, opt) \ +PUT_PROTOTYPE(name##48, bitd, opt) \ +PUT_PROTOTYPE(name##64, bitd, opt) \ +PUT_PROTOTYPE(name##128, bitd, opt) + +#define PUT_BPC_PROTOTYPES(name, opt) \ +PUT_PROTOTYPES(name, 8, opt) \ +PUT_PROTOTYPES(name, 10, opt) \ +PUT_PROTOTYPES(name, 12, opt) + +#define PUT_TAP_PROTOTYPES(n, opt) \ +PUT_BPC_PROTOTYPES(n##tap_h, opt) \ +PUT_BPC_PROTOTYPES(n##tap_v, opt) \ +PUT_BPC_PROTOTYPES(n##tap_hv, opt) + +PUT_BPC_PROTOTYPES(pixels, sse4) +PUT_BPC_PROTOTYPES(pixels, avx2) + +PUT_TAP_PROTOTYPES(4, sse4) +PUT_TAP_PROTOTYPES(8, sse4) +PUT_TAP_PROTOTYPES(4, avx2) +PUT_TAP_PROTOTYPES(8, avx2) + #if ARCH_X86_64 #define FW_PUT(name, depth, opt) \ -static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \ +void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride,\ int height, const int8_t *hf, const int8_t *vf, int width)\ { \ ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, srcstride, height, hf, vf, width); \ -- 2.44.0.windows.1 
___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] avcodec/x86/vvc/vvcdsp_init: fix linking error when configuring with --disable-ssse3 --disable-optimizations options
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/x86/vvc/vvcdsp_init.c | 115 ++- 1 file changed, 82 insertions(+), 33 deletions(-) diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c index 0d2c683f0f..9ae84bda48 100644 --- a/libavcodec/x86/vvc/vvcdsp_init.c +++ b/libavcodec/x86/vvc/vvcdsp_init.c @@ -31,9 +31,67 @@ #include "libavcodec/vvc/vvcdsp.h" #include "libavcodec/x86/h26x/h2656dsp.h" +#define PUT_PROTOTYPE(name, depth, opt) \ +void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, int height, const int8_t *hf, const int8_t *vf, int width); + +#define PUT_PROTOTYPES(name, bitd, opt) \ +PUT_PROTOTYPE(name##2, bitd, opt) \ +PUT_PROTOTYPE(name##4, bitd, opt) \ +PUT_PROTOTYPE(name##8, bitd, opt) \ +PUT_PROTOTYPE(name##12, bitd, opt) \ +PUT_PROTOTYPE(name##16, bitd, opt) \ +PUT_PROTOTYPE(name##24, bitd, opt) \ +PUT_PROTOTYPE(name##32, bitd, opt) \ +PUT_PROTOTYPE(name##48, bitd, opt) \ +PUT_PROTOTYPE(name##64, bitd, opt) \ +PUT_PROTOTYPE(name##128, bitd, opt) + +#define PUT_BPC_PROTOTYPES(name, opt) \ +PUT_PROTOTYPES(name, 8, opt) \ +PUT_PROTOTYPES(name, 10, opt) \ +PUT_PROTOTYPES(name, 12, opt) + +#define PUT_TAP_PROTOTYPES(n, opt) \ +PUT_BPC_PROTOTYPES(n##tap_h, opt) \ +PUT_BPC_PROTOTYPES(n##tap_v, opt) \ +PUT_BPC_PROTOTYPES(n##tap_hv, opt) + +PUT_BPC_PROTOTYPES(pixels, sse4) +PUT_BPC_PROTOTYPES(pixels, avx2) + +PUT_TAP_PROTOTYPES(4, sse4) +PUT_TAP_PROTOTYPES(8, sse4) +PUT_TAP_PROTOTYPES(4, avx2) +PUT_TAP_PROTOTYPES(8, avx2) + +#define bf(fn, bd, opt) fn##_##bd##_##opt +#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt + +#define AVG_BPC_PROTOTYPES(bpc, opt) \ +void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max); \ +void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, \ +intptr_t 
denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max); + +#define AVG_PROTOTYPES(bd, opt) \ +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, int width, int height); \ +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, int width, int height, \ +int denom, int w0, int w1, int o0, int o1); + +AVG_BPC_PROTOTYPES( 8, avx2) +AVG_BPC_PROTOTYPES(16, avx2) + +AVG_PROTOTYPES( 8, avx2) +AVG_PROTOTYPES(10, avx2) +AVG_PROTOTYPES(12, avx2) + #if ARCH_X86_64 +#if HAVE_SSE4_EXTERNAL #define FW_PUT(name, depth, opt) \ -static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \ +void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride,\ int height, const int8_t *hf, const int8_t *vf, int width)\ { \ ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, srcstride, height, hf, vf, width); \ @@ -69,7 +127,9 @@ static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint FW_PUT_SSE4( 8) FW_PUT_SSE4(10) FW_PUT_SSE4(12) +#endif +#if HAVE_AVX2_EXTERNAL #define FW_PUT_TAP_AVX2(n, bitd)\ FW_PUT(n ## tap_h32, bitd, avx2) \ FW_PUT(n ## tap_h64, bitd, avx2) \ @@ -105,6 +165,25 @@ FW_PUT_AVX2(12) FW_PUT_16BPC_AVX2(10) FW_PUT_16BPC_AVX2(12) +#define AVG_FUNCS(bpc, bd, opt) \ +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, int width, int height) \ +{ \ +BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \ +} \ +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ +const int16_t *src0, const int16_t *src1, int width, int height,
[FFmpeg-devel] [PATCH] avcodec/x86/vvc/vvcdsp_init: fix unresolved external symbol on ARCH_X86_32
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/x86/vvc/vvcdsp_init.c | 78 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c index 909ef9f56b..8ee4074350 100644 --- a/libavcodec/x86/vvc/vvcdsp_init.c +++ b/libavcodec/x86/vvc/vvcdsp_init.c @@ -31,6 +31,7 @@ #include "libavcodec/vvc/vvcdsp.h" #include "libavcodec/x86/h26x/h2656dsp.h" +#if ARCH_X86_64 #define FW_PUT(name, depth, opt) \ static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \ int height, const int8_t *hf, const int8_t *vf, int width)\ @@ -204,51 +205,52 @@ AVG_FUNCS(16, 12, avx2) c->inter.avg= bf(avg, bd, opt); \ c->inter.w_avg = bf(w_avg, bd, opt); \ } while (0) +#endif void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) { +#if ARCH_X86_64 const int cpu_flags = av_get_cpu_flags(); -if (ARCH_X86_64) { -if (bd == 8) { -if (EXTERNAL_SSE4(cpu_flags)) { -MC_LINK_SSE4(8); -} -if (EXTERNAL_AVX2_FAST(cpu_flags)) { -MC_LINKS_AVX2(8); -} -} else if (bd == 10) { -if (EXTERNAL_SSE4(cpu_flags)) { -MC_LINK_SSE4(10); -} -if (EXTERNAL_AVX2_FAST(cpu_flags)) { -MC_LINKS_AVX2(10); -MC_LINKS_16BPC_AVX2(10); -} -} else if (bd == 12) { -if (EXTERNAL_SSE4(cpu_flags)) { -MC_LINK_SSE4(12); -} -if (EXTERNAL_AVX2_FAST(cpu_flags)) { -MC_LINKS_AVX2(12); -MC_LINKS_16BPC_AVX2(12); -} +if (bd == 8) { +if (EXTERNAL_SSE4(cpu_flags)) { +MC_LINK_SSE4(8); } +if (EXTERNAL_AVX2_FAST(cpu_flags)) { +MC_LINKS_AVX2(8); +} +} else if (bd == 10) { +if (EXTERNAL_SSE4(cpu_flags)) { +MC_LINK_SSE4(10); +} +if (EXTERNAL_AVX2_FAST(cpu_flags)) { +MC_LINKS_AVX2(10); +MC_LINKS_16BPC_AVX2(10); +} +} else if (bd == 12) { +if (EXTERNAL_SSE4(cpu_flags)) { +MC_LINK_SSE4(12); +} +if (EXTERNAL_AVX2_FAST(cpu_flags)) { +MC_LINKS_AVX2(12); +MC_LINKS_16BPC_AVX2(12); +} +} -if (EXTERNAL_AVX2(cpu_flags)) { -switch (bd) { -case 8: -AVG_INIT(8, avx2); -break; -case 10: -AVG_INIT(10, avx2); -break; -case 12: 
-AVG_INIT(12, avx2); -break; -default: -break; -} +if (EXTERNAL_AVX2(cpu_flags)) { +switch (bd) { +case 8: +AVG_INIT(8, avx2); +break; +case 10: +AVG_INIT(10, avx2); +break; +case 12: +AVG_INIT(12, avx2); +break; +default: +break; } } +#endif } -- 2.43.0.windows.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v4 6/8] tests/checkasm: add checkasm_check_vvc_mc
From: Wu Jianhua Signed-off-by: Wu Jianhua --- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 + tests/checkasm/checkasm.h | 1 + tests/checkasm/vvc_mc.c | 270 ++ 4 files changed, 275 insertions(+) create mode 100644 tests/checkasm/vvc_mc.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 3b5b54352b..3562acb2b2 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -40,6 +40,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o AVCODECOBJS-$(CONFIG_VORBIS_DECODER)+= vorbisdsp.o AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o +AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_mc.o CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 87f24c77ca..36a97957e5 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -194,6 +194,9 @@ static const struct { #if CONFIG_VORBIS_DECODER { "vorbisdsp", checkasm_check_vorbisdsp }, #endif +#if CONFIG_VVC_DECODER +{ "vvc_mc", checkasm_check_vvc_mc }, +#endif #endif #if CONFIG_AVFILTER #if CONFIG_AFIR_FILTER diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 4db8c495ea..53cb3ccfbf 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -131,6 +131,7 @@ void checkasm_check_vp8dsp(void); void checkasm_check_vp9dsp(void); void checkasm_check_videodsp(void); void checkasm_check_vorbisdsp(void); +void checkasm_check_vvc_mc(void); struct CheckasmPerf; diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c new file mode 100644 index 00..711280deec --- /dev/null +++ b/tests/checkasm/vvc_mc.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2023-2024 Nuo Mi + * Copyright (c) 2023-2024 Wu Jianhua + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include + +#include "checkasm.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/vvc/vvc_ctu.h" +#include "libavcodec/vvc/vvc_data.h" + +#include "libavutil/common.h" +#include "libavutil/internal.h" +#include "libavutil/internal.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/mem_internal.h" + +static const uint32_t pixel_mask[] = { 0x, 0x03ff03ff, 0x0fff0fff, 0x3fff3fff, 0x }; +static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 }; + +#define PIXEL_STRIDE (MAX_CTU_SIZE * 2) +#define EXTRA_BEFORE 3 +#define EXTRA_AFTER 4 +#define SRC_EXTRA(EXTRA_BEFORE + EXTRA_AFTER) * 2 +#define SRC_BUF_SIZE (PIXEL_STRIDE + SRC_EXTRA) * (PIXEL_STRIDE + SRC_EXTRA) +#define DST_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE * 2) +#define SRC_OFFSET ((PIXEL_STRIDE + EXTRA_BEFORE * 2) * EXTRA_BEFORE) + +#define randomize_buffers(buf0, buf1, size, mask) \ +do {\ +int k; \ +for (k = 0; k < size; k += 4) { \ +uint32_t r = rnd() & mask; \ +AV_WN32A(buf0 + k, r); \ +AV_WN32A(buf1 + k, r); \ +} \ +} while (0) + +#define randomize_pixels(buf0, buf1, size) \ +do {\ +uint32_t mask = pixel_mask[(bit_depth - 8) >> 1]; \ +randomize_buffers(buf0, buf1, size, mask); \ +} while (0) + +#define randomize_avg_src(buf0, buf1, size) \ +do {\ +uint32_t mask = 0x3fff3fff; \ 
+randomize_buffers(buf0, buf1, size, mask); \ +} while (0) + +static void check_put_vvc_luma(void) +{ +LOCAL_ALIGNED_32(int16_t, dst0, [DST_BUF_SIZE / 2]); +LOCAL_ALIGNED_32(int16_t, dst1, [DST_BUF_SIZE / 2]); +LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]); +VVCDSPContext c; + +declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, int16_t *dst, const uint8_t *src, const ptrdiff_t src_stride, +
[FFmpeg-devel] [PATCH v4 8/8] tests/checkasm/vvc_mc: add check_avg
From: Wu Jianhua Signed-off-by: Wu Jianhua --- tests/checkasm/vvc_mc.c | 64 + 1 file changed, 64 insertions(+) diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c index 711280deec..8adb00573f 100644 --- a/tests/checkasm/vvc_mc.c +++ b/tests/checkasm/vvc_mc.c @@ -35,6 +35,7 @@ static const uint32_t pixel_mask[] = { 0x, 0x03ff03ff, 0x0fff0fff, 0x3fff3fff, 0x }; static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 }; +#define SIZEOF_PIXEL ((bit_depth + 7) / 8) #define PIXEL_STRIDE (MAX_CTU_SIZE * 2) #define EXTRA_BEFORE 3 #define EXTRA_AFTER 4 @@ -261,10 +262,73 @@ static void check_put_vvc_chroma_uni(void) report("put_uni_chroma"); } +#define AVG_SRC_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE) +#define AVG_DST_BUF_SIZE (MAX_PB_SIZE * MAX_PB_SIZE * 2) + +static void check_avg(void) +{ +LOCAL_ALIGNED_32(int16_t, src00, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int16_t, src01, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int16_t, src10, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int16_t, src11, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, dst0, [AVG_DST_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, dst1, [AVG_DST_BUF_SIZE]); +VVCDSPContext c; + +for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) { +randomize_avg_src((uint8_t*)src00, (uint8_t*)src10, AVG_SRC_BUF_SIZE * sizeof(int16_t)); +randomize_avg_src((uint8_t*)src01, (uint8_t*)src11, AVG_SRC_BUF_SIZE * sizeof(int16_t)); +ff_vvc_dsp_init(&c, bit_depth); +for (int h = 2; h <= MAX_CTU_SIZE; h *= 2) { +for (int w = 2; w <= MAX_CTU_SIZE; w *= 2) { +{ + declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride, +const int16_t *src0, const int16_t *src1, int width, int height); +if (check_func(c.inter.avg, "avg_%d_%dx%d", bit_depth, w, h)) { +memset(dst0, 0, AVG_DST_BUF_SIZE); +memset(dst1, 0, AVG_DST_BUF_SIZE); +call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h); +call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, src11, w, h); +if (memcmp(dst0, dst1, DST_BUF_SIZE)) +fail(); 
+if (w == h) +bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h); +} +} +{ +declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride, +const int16_t *src0, const int16_t *src1, int width, int height, +int denom, int w0, int w1, int o0, int o1); +{ +const int denom = rnd() % 8; +const int w0= rnd() % 256 - 128; +const int w1= rnd() % 256 - 128; +const int o0= rnd() % 256 - 128; +const int o1= rnd() % 256 - 128; +if (check_func(c.inter.w_avg, "w_avg_%d_%dx%d", bit_depth, w, h)) { +memset(dst0, 0, AVG_DST_BUF_SIZE); +memset(dst1, 0, AVG_DST_BUF_SIZE); + +call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h, denom, w0, w1, o0, o1); +call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, src11, w, h, denom, w0, w1, o0, o1); +if (memcmp(dst0, dst1, DST_BUF_SIZE)) +fail(); +if (w == h) +bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h, denom, w0, w1, o0, o1); +} +} +} +} +} +} +report("avg"); +} + void checkasm_check_vvc_mc(void) { check_put_vvc_luma(); check_put_vvc_luma_uni(); check_put_vvc_chroma(); check_put_vvc_chroma_uni(); +check_avg(); } -- 2.34.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v4 7/8] avcodec/x86/vvc: add avg and w_avg AVX2 optimizations
From: Wu Jianhua The avg/avg_w is based on dav1d. See https://code.videolan.org/videolan/dav1d/-/blob/master/src/x86/mc_avx2.asm vvc_avg_8_2x2_c: 71.6 vvc_avg_8_2x2_avx2: 26.8 vvc_avg_8_2x4_c: 140.8 vvc_avg_8_2x4_avx2: 34.6 vvc_avg_8_2x8_c: 410.3 vvc_avg_8_2x8_avx2: 41.3 vvc_avg_8_2x16_c: 769.3 vvc_avg_8_2x16_avx2: 60.3 vvc_avg_8_2x32_c: 1669.6 vvc_avg_8_2x32_avx2: 105.1 vvc_avg_8_2x64_c: 1978.3 vvc_avg_8_2x64_avx2: 425.8 vvc_avg_8_2x128_c: 6536.8 vvc_avg_8_2x128_avx2: 1315.1 vvc_avg_8_4x2_c: 155.6 vvc_avg_8_4x2_avx2: 26.1 vvc_avg_8_4x4_c: 250.3 vvc_avg_8_4x4_avx2: 31.3 vvc_avg_8_4x8_c: 831.8 vvc_avg_8_4x8_avx2: 41.3 vvc_avg_8_4x16_c: 1461.1 vvc_avg_8_4x16_avx2: 57.1 vvc_avg_8_4x32_c: 2821.6 vvc_avg_8_4x32_avx2: 105.1 vvc_avg_8_4x64_c: 3615.8 vvc_avg_8_4x64_avx2: 412.6 vvc_avg_8_4x128_c: 11962.6 vvc_avg_8_4x128_avx2: 1274.3 vvc_avg_8_8x2_c: 215.8 vvc_avg_8_8x2_avx2: 29.1 vvc_avg_8_8x4_c: 430.6 vvc_avg_8_8x4_avx2: 37.6 vvc_avg_8_8x8_c: 1463.3 vvc_avg_8_8x8_avx2: 51.8 vvc_avg_8_8x16_c: 2630.1 vvc_avg_8_8x16_avx2: 97.6 vvc_avg_8_8x32_c: 5813.8 vvc_avg_8_8x32_avx2: 196.6 vvc_avg_8_8x64_c: 6687.3 vvc_avg_8_8x64_avx2: 487.8 vvc_avg_8_8x128_c: 13178.6 vvc_avg_8_8x128_avx2: 1290.6 vvc_avg_8_16x2_c: 443.8 vvc_avg_8_16x2_avx2: 28.3 vvc_avg_8_16x4_c: 1253.3 vvc_avg_8_16x4_avx2: 32.1 vvc_avg_8_16x8_c: 2236.3 vvc_avg_8_16x8_avx2: 44.3 vvc_avg_8_16x16_c: 5127.8 vvc_avg_8_16x16_avx2: 63.3 vvc_avg_8_16x32_c: 6573.3 vvc_avg_8_16x32_avx2: 223.6 vvc_avg_8_16x64_c: 30311.8 vvc_avg_8_16x64_avx2: 437.8 vvc_avg_8_16x128_c: 25693.3 vvc_avg_8_16x128_avx2: 1266.8 vvc_avg_8_32x2_c: 954.6 vvc_avg_8_32x2_avx2: 32.1 vvc_avg_8_32x4_c: 2359.6 vvc_avg_8_32x4_avx2: 39.6 vvc_avg_8_32x8_c: 5703.6 vvc_avg_8_32x8_avx2: 57.1 vvc_avg_8_32x16_c: 9967.6 vvc_avg_8_32x16_avx2: 107.1 vvc_avg_8_32x32_c: 21327.6 vvc_avg_8_32x32_avx2: 272.6 vvc_avg_8_32x64_c: 39240.8 vvc_avg_8_32x64_avx2: 529.6 vvc_avg_8_32x128_c: 52580.8 vvc_avg_8_32x128_avx2: 1338.8 vvc_avg_8_64x2_c: 1647.3 vvc_avg_8_64x2_avx2: 38.8 
vvc_avg_8_64x4_c: 5130.1 vvc_avg_8_64x4_avx2: 58.8 vvc_avg_8_64x8_c: 6529.3 vvc_avg_8_64x8_avx2: 88.3 vvc_avg_8_64x16_c: 19913.6 vvc_avg_8_64x16_avx2: 162.3 vvc_avg_8_64x32_c: 39360.8 vvc_avg_8_64x32_avx2: 295.8 vvc_avg_8_64x64_c: 49658.3 vvc_avg_8_64x64_avx2: 784.1 vvc_avg_8_64x128_c: 108513.1 vvc_avg_8_64x128_avx2: 1977.1 vvc_avg_8_128x2_c: 3226.1 vvc_avg_8_128x2_avx2: 61.1 vvc_avg_8_128x4_c: 10280.3 vvc_avg_8_128x4_avx2: 94.6 vvc_avg_8_128x8_c: 18079.3 vvc_avg_8_128x8_avx2: 155.3 vvc_avg_8_128x16_c: 45121.8 vvc_avg_8_128x16_avx2: 285.3 vvc_avg_8_128x32_c: 48651.8 vvc_avg_8_128x32_avx2: 581.6 vvc_avg_8_128x64_c: 165078.6 vvc_avg_8_128x64_avx2: 1942.8 vvc_avg_8_128x128_c: 339103.1 vvc_avg_8_128x128_avx2: 4332.6 vvc_avg_10_2x2_c: 144.3 vvc_avg_10_2x2_avx2: 26.8 vvc_avg_10_2x4_c: 142.6 vvc_avg_10_2x4_avx2: 45.3 vvc_avg_10_2x8_c: 478.1 vvc_avg_10_2x8_avx2: 38.1 vvc_avg_10_2x16_c: 518.3 vvc_avg_10_2x16_avx2: 58.1 vvc_avg_10_2x32_c: 2059.8 vvc_avg_10_2x32_avx2: 93.1 vvc_avg_10_2x64_c: 2383.8 vvc_avg_10_2x64_avx2: 714.8 vvc_avg_10_2x128_c: 4498.3 vvc_avg_10_2x128_avx2: 1466.3 vvc_avg_10_4x2_c: 228.6 vvc_avg_10_4x2_avx2: 26.8 vvc_avg_10_4x4_c: 378.3 vvc_avg_10_4x4_avx2: 30.6 vvc_avg_10_4x8_c: 866.8 vvc_avg_10_4x8_avx2: 44.6 vvc_avg_10_4x16_c: 1018.1 vvc_avg_10_4x16_avx2: 58.1 vvc_avg_10_4x32_c: 3590.8 vvc_avg_10_4x32_avx2: 128.8 vvc_avg_10_4x64_c: 4200.8 vvc_avg_10_4x64_avx2: 663.6 vvc_avg_10_4x128_c: 8450.8 vvc_avg_10_4x128_avx2: 1531.8 vvc_avg_10_8x2_c: 369.3 vvc_avg_10_8x2_avx2: 28.3 vvc_avg_10_8x4_c: 513.8 vvc_avg_10_8x4_avx2: 32.1 vvc_avg_10_8x8_c: 1720.3 vvc_avg_10_8x8_avx2: 49.1 vvc_avg_10_8x16_c: 1894.8 vvc_avg_10_8x16_avx2: 71.6 vvc_avg_10_8x32_c: 3931.3 vvc_avg_10_8x32_avx2: 148.1 vvc_avg_10_8x64_c: 7964.3 vvc_avg_10_8x64_avx2: 613.1 vvc_avg_10_8x128_c: 15540.1 vvc_avg_10_8x128_avx2: 1585.1 vvc_avg_10_16x2_c: 877.3 vvc_avg_10_16x2_avx2: 27.6 vvc_avg_10_16x4_c: 955.8 vvc_avg_10_16x4_avx2: 29.8 vvc_avg_10_16x8_c: 3419.6 vvc_avg_10_16x8_avx2: 62.6 
vvc_avg_10_16x16_c: 3826.8 vvc_avg_10_16x16_avx2: 54.3 vvc_avg_10_16x32_c: 7655.3 vvc_avg_10_16x32_avx2: 86.3 vvc_avg_10_16x64_c: 30011.1 vvc_avg_10_16x64_avx2: 692.6 vvc_avg_10_16x128_c: 47894.8 vvc_avg_10_16x128_avx2: 1580.3 vvc_avg_10_32x2_c: 944.3 vvc_avg_10_32x2_avx2: 29.8 vvc_avg_10_32x4_c: 2022.6 vvc_avg_10_32x4_avx2: 35.1 vvc_avg_10_32x8_c: 6148.8 vvc_avg_10_32x8_avx2: 51.3 vvc_avg_10_32x16_c: 12601.6 vvc_avg_10_32x16_avx2: 70.8 vvc_avg_10_32x32_c: 15958.6 vvc_avg_10_32x32_avx2: 124.3 vvc_avg_10_32x64_c: 31784.6 vvc_avg_10_32x64_avx2: 757.3 vvc_avg_10_32x128_c: 63892.8 vvc_avg_10_32x128_avx2: 1711.3 vvc_avg_10_64x2_c: 1890.8 vvc_avg_10_64x2_avx2: 34.3 vvc_avg_10_64x4_c: 6267.3 vvc_avg_10_64x4_avx2: 42.6 vvc_avg_10_64x8_c: 12778.1 vvc_avg_10_64x8_avx2: 67.8 vvc_avg_10_64x16_c: 22304.3 vvc_avg_10_64x16_avx2: 116.8 vvc_avg_10_64x32_c: 30777.1 vvc_avg_10_64x32_avx2: 201.1 vvc_avg_10_64x64_c: 60169.1 vvc_avg_10_64x64_avx2: 1454.3 vvc_avg_10_64x128_c: 124392.8 vvc_avg_10_64x128_avx2: 3648.6 vvc_avg_10_128x2
[FFmpeg-devel] [PATCH v4 5/8] avcodec/vvcdec: reuse h26x/2656_inter.asm to enable x86 optimizations
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/Makefile | 1 + libavcodec/vvc/vvcdsp.c | 4 + libavcodec/vvc/vvcdsp.h | 2 + libavcodec/x86/vvc/Makefile | 6 + libavcodec/x86/vvc/vvcdsp_init.c | 202 +++ 5 files changed, 215 insertions(+) create mode 100644 libavcodec/x86/vvc/Makefile create mode 100644 libavcodec/x86/vvc/vvcdsp_init.c diff --git a/libavcodec/Makefile b/libavcodec/Makefile index bb42095165..ce33631b60 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -65,6 +65,7 @@ OBJS = ac3_parser.o \ # subsystems include $(SRC_PATH)/libavcodec/vvc/Makefile +include $(SRC_PATH)/libavcodec/x86/vvc/Makefile OBJS-$(CONFIG_AANDCTTABLES)+= aandcttab.o OBJS-$(CONFIG_AC3DSP) += ac3dsp.o ac3.o ac3tab.o OBJS-$(CONFIG_ADTS_HEADER) += adts_header.o mpeg4audio_sample_rates.o diff --git a/libavcodec/vvc/vvcdsp.c b/libavcodec/vvc/vvcdsp.c index c82ea7be30..c542be5258 100644 --- a/libavcodec/vvc/vvcdsp.c +++ b/libavcodec/vvc/vvcdsp.c @@ -138,4 +138,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth) VVC_DSP(8); break; } + +#if ARCH_X86 +ff_vvc_dsp_init_x86(vvcdsp, bit_depth); +#endif } diff --git a/libavcodec/vvc/vvcdsp.h b/libavcodec/vvc/vvcdsp.h index b5a63c5833..6f59e73654 100644 --- a/libavcodec/vvc/vvcdsp.h +++ b/libavcodec/vvc/vvcdsp.h @@ -167,4 +167,6 @@ typedef struct VVCDSPContext { void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth); +void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth); + #endif /* AVCODEC_VVC_VVCDSP_H */ diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile new file mode 100644 index 00..b4acc22501 --- /dev/null +++ b/libavcodec/x86/vvc/Makefile @@ -0,0 +1,6 @@ +clean:: + $(RM) $(CLEANSUFFIXES:%=libavcodec/x86/vvc/%) + +OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o +X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/h26x/h2656dsp.o \ + x86/h26x/h2656_inter.o diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c new file mode 100644 index 00..c197cdb4cc --- 
/dev/null +++ b/libavcodec/x86/vvc/vvcdsp_init.c @@ -0,0 +1,202 @@ +/* + * VVC DSP init for x86 + * + * Copyright (C) 2022-2024 Nuo Mi + * Copyright (c) 2023-2024 Wu Jianhua + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/vvc/vvcdec.h" +#include "libavcodec/vvc/vvc_ctu.h" +#include "libavcodec/vvc/vvcdsp.h" +#include "libavcodec/x86/h26x/h2656dsp.h" + +#define FW_PUT(name, depth, opt) \ +static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \ + int height, const int8_t *hf, const int8_t *vf, int width)\ +{ \ +ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, srcstride, height, hf, vf, width); \ +} + +#define FW_PUT_TAP(fname, bitd, opt ) \ +FW_PUT(fname##4, bitd, opt ); \ +FW_PUT(fname##8, bitd, opt ); \ +FW_PUT(fname##16, bitd, opt ); \ +FW_PUT(fname##32, bitd, opt ); \ +FW_PUT(fname##64, bitd, opt ); \ +FW_PUT(fname##128, bitd, opt ); \ + +#define FW_PUT_4TAP(fname, bitd, opt) \ +FW_PUT(fname ## 2, bitd, opt) \ +FW_PUT_TAP(fname, bitd, opt) + +#define FW_PUT_4TAP_SSE4(bitd) \ +FW_PUT_4TAP(pixels, bitd, sse4) \ +FW_PUT_4TAP(4tap_h, 
bitd, sse4) \ +FW_PUT_4TAP(4tap_v, bitd, sse4) \ +FW_PUT_4TAP(4tap_hv, bitd, sse4) + +#define FW_PUT_8TAP_SSE4(bitd) \ +FW_PUT_TAP(8tap_h, bitd, sse4) \ +FW_PUT_TAP(8tap_v, bitd, sse4) \ +FW_PUT_TAP(8tap_hv, bitd, sse4) + +#define FW_PUT_SSE4(bitd) \ +FW_PUT_4TAP_SSE4(bitd) \ +FW_PUT_8TAP_SSE4(bitd) + +FW_PUT_SSE4( 8); +FW_PUT_SSE4(10); +FW_PUT_SSE4(12); + +#define FW_PUT_T
[FFmpeg-devel] [PATCH v4 3/8] avcodec/x86/hevc_mc: move put/put_uni to h26x/h2656_inter.asm
From: Wu Jianhua This enables the asm optimization to be reused by VVC Signed-off-by: Wu Jianhua --- libavcodec/x86/Makefile |1 + libavcodec/x86/h26x/h2656_inter.asm | 1145 +++ libavcodec/x86/h26x/h2656dsp.c | 98 +++ libavcodec/x86/h26x/h2656dsp.h | 103 +++ libavcodec/x86/hevc_mc.asm | 462 +-- libavcodec/x86/hevcdsp_init.c | 108 ++- 6 files changed, 1471 insertions(+), 446 deletions(-) create mode 100644 libavcodec/x86/h26x/h2656_inter.asm create mode 100644 libavcodec/x86/h26x/h2656dsp.c create mode 100644 libavcodec/x86/h26x/h2656dsp.h diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index d5fb30645a..8098cd840c 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -167,6 +167,7 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o\ x86/hevc_deblock.o\ x86/hevc_idct.o \ x86/hevc_mc.o \ + x86/h26x/h2656_inter.o\ x86/hevc_sao.o\ x86/hevc_sao_10bit.o X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm new file mode 100644 index 00..aa296d549c --- /dev/null +++ b/libavcodec/x86/h26x/h2656_inter.asm @@ -0,0 +1,1145 @@ +; /* +; * Provide SSE luma and chroma mc functions for HEVC/VVC decoding +; * Copyright (c) 2013 Pierre-Edouard LEPERE +; * Copyright (c) 2023-2024 Nuo Mi +; * Copyright (c) 2023-2024 Wu Jianhua +; * +; * This file is part of FFmpeg. +; * +; * FFmpeg is free software; you can redistribute it and/or +; * modify it under the terms of the GNU Lesser General Public +; * License as published by the Free Software Foundation; either +; * version 2.1 of the License, or (at your option) any later version. +; * +; * FFmpeg is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; * Lesser General Public License for more details. 
+; * +; * You should have received a copy of the GNU Lesser General Public +; * License along with FFmpeg; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +; */ +%include "libavutil/x86/x86util.asm" + +%define MAX_PB_SIZE 64 + +SECTION_RODATA 32 +cextern pw_255 +cextern pw_512 +cextern pw_2048 +cextern pw_1023 +cextern pw_1024 +cextern pw_4096 +cextern pw_8192 +%define scale_8 pw_512 +%define scale_10 pw_2048 +%define scale_12 pw_8192 +%define max_pixels_8 pw_255 +%define max_pixels_10 pw_1023 +max_pixels_12: times 16 dw ((1 << 12)-1) +cextern pb_0 + +SECTION .text +%macro SIMPLE_LOAD 4;width, bitd, tab, r1 +%if %1 == 2 || (%2 == 8 && %1 <= 4) +movd %4, [%3] ; load data from source +%elif %1 == 4 || (%2 == 8 && %1 <= 8) +movq %4, [%3] ; load data from source +%elif notcpuflag(avx) +movu %4, [%3] ; load data from source +%elif %1 <= 8 || (%2 == 8 && %1 <= 16) +movdqu %4, [%3] +%else +movu %4, [%3] +%endif +%endmacro + +%macro VPBROADCASTW 2 +%if notcpuflag(avx2) +movd %1, %2 +pshuflw%1, %1, 0 +punpcklwd %1, %1 +%else +vpbroadcastw %1, %2 +%endif +%endmacro + +%macro MC_4TAP_FILTER 4 ; bitdepth, filter, a, b, +VPBROADCASTW %3, [%2q + 0 * 2] ; coeff 0, 1 +VPBROADCASTW %4, [%2q + 1 * 2] ; coeff 2, 3 +%if %1 != 8 +pmovsxbw %3, xmm%3 +pmovsxbw %4, xmm%4 +%endif +%endmacro + +%macro MC_4TAP_HV_FILTER 1 +VPBROADCASTW m12, [vfq + 0 * 2] ; vf 0, 1 +VPBROADCASTW m13, [vfq + 1 * 2] ; vf 2, 3 +VPBROADCASTW m14, [hfq + 0 * 2] ; hf 0, 1 +VPBROADCASTW m15, [hfq + 1 * 2] ; hf 2, 3 + +pmovsxbw m12, xm12 +pmovsxbw m13, xm13 +%if %1 != 8 +pmovsxbw m14, xm14 +pmovsxbw m15, xm15 +%endif +lea r3srcq, [srcstrideq*3] +%endmacro + +%macro MC_8TAP_SAVE_FILTER 5;offset, mm registers +mova [rsp + %1 + 0*mmsize], %2 +mova [rsp + %1 + 1*mmsize], %3 +mova [rsp + %1 + 2*mmsize], %4 +mova [rsp + %1 + 3*mmsize], %5 +%endmacro + +%macro MC_8TAP_FILTER 2-3 ;bitdepth, filter, offset +VPBROADCASTW m12, [%2q + 0 * 2] ; coeff 0, 1 
+VPBROADCASTW m13, [%2q + 1 * 2] ; coeff 2, 3 +VPBROADCASTW m14, [%2q + 2 * 2] ; coeff 4, 5 +VPBROADCASTW m15, [%2q + 3 * 2]
[FFmpeg-devel] [PATCH v4 2/8] avcodec/hevcdsp_template: reuse put/put_luma/put_chroma from h2656_inter_template
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/hevcdsp_template.c | 594 +++--- 1 file changed, 46 insertions(+), 548 deletions(-) diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c index 0de14e9dcf..9b48bdf08e 100644 --- a/libavcodec/hevcdsp_template.c +++ b/libavcodec/hevcdsp_template.c @@ -26,6 +26,7 @@ #include "bit_depth_template.c" #include "hevcdsp.h" #include "h26x/h2656_sao_template.c" +#include "h26x/h2656_inter_template.c" static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height, GetBitContext *gb, int pcm_bit_depth) @@ -299,37 +300,51 @@ IDCT_DC(32) // -static void FUNC(put_hevc_pel_pixels)(int16_t *dst, - const uint8_t *_src, ptrdiff_t _srcstride, - int height, intptr_t mx, intptr_t my, int width) -{ -int x, y; -const pixel *src= (const pixel *)_src; -ptrdiff_t srcstride = _srcstride / sizeof(pixel); - -for (y = 0; y < height; y++) { -for (x = 0; x < width; x++) -dst[x] = src[x] << (14 - BIT_DEPTH); -src += srcstride; -dst += MAX_PB_SIZE; -} -} - -static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, - int height, intptr_t mx, intptr_t my, int width) -{ -int y; -const pixel *src= (const pixel *)_src; -ptrdiff_t srcstride = _srcstride / sizeof(pixel); -pixel *dst = (pixel *)_dst; -ptrdiff_t dststride = _dststride / sizeof(pixel); - -for (y = 0; y < height; y++) { -memcpy(dst, src, width * sizeof(pixel)); -src += srcstride; -dst += dststride; -} -} +#define ff_hevc_pel_filters ff_hevc_qpel_filters +#define DECL_HV_FILTER(f) \ +const uint8_t *hf = ff_hevc_ ## f ## _filters[mx - 1]; \ +const uint8_t *vf = ff_hevc_ ## f ## _filters[my - 1]; + +#define FW_PUT(p, f, t) \ +static void FUNC(put_hevc_## f)(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, int height,\ + intptr_t mx, intptr_t my, int width) \ +{ \ +DECL_HV_FILTER(p) \ +FUNC(put_ ## t)(dst, src, srcstride, height, hf, vf, width); \ +} + +#define FW_PUT_UNI(p, f, t) \ 
+static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \ + ptrdiff_t srcstride, int height, intptr_t mx, intptr_t my, int width) \ +{ \ +DECL_HV_FILTER(p) \ +FUNC(put_ ## t)(dst, dststride, src, srcstride, height, hf, vf, width); \ +} + +#define FW_PUT_UNI_W(p, f, t) \ +static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \ + ptrdiff_t srcstride,int height, int denom, int wx, int ox, \ + intptr_t mx, intptr_t my, int width) \ +{ \ +DECL_HV_FILTER(p) \ +FUNC(put_ ## t)(dst, dststride, src, srcstride, height, denom, wx, ox, hf, vf, width);\ +} + +#define FW_PUT_FUNCS(f, t, dir) \ +FW_PUT(f, f ## _ ## dir, t ## _ ## dir) \ +FW_PUT_UNI(f, f ## _uni_ ## dir, uni_ ## t ## _ ## dir)\ +FW_PUT_UNI_W(f, f ## _uni_w_ ## dir, uni_## t ## _w_ ## dir) + +FW_PUT(pel, pel_pixels, pixels) +FW_PUT_UNI(pel, pel_uni_pixels, uni_pixels) +FW_PUT_UNI_W(pel, pel_uni_w_pixels, uni_w_pixels) + +FW_PUT_FUNCS(qpel, luma, h ) +FW_PUT_FUNCS(qpel, luma, v ) +FW_PUT_FUNCS(qpel, luma, hv) +FW_PUT_FUNCS(epel, chroma, h ) +FW_PUT_FUNCS(epel, chroma, v ) +FW_PUT_FUNCS(epel, chroma, hv) static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_
[FFmpeg-devel] [PATCH v4 4/8] avcodec/x86/h26x/h2656_inter: add dststride to put
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/x86/h26x/h2656_inter.asm | 32 ++--- libavcodec/x86/h26x/h2656dsp.c | 4 ++-- libavcodec/x86/h26x/h2656dsp.h | 2 +- libavcodec/x86/hevcdsp_init.c | 2 +- 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm index aa296d549c..cbba0c1ea5 100644 --- a/libavcodec/x86/h26x/h2656_inter.asm +++ b/libavcodec/x86/h26x/h2656_inter.asm @@ -22,8 +22,6 @@ ; */ %include "libavutil/x86/x86util.asm" -%define MAX_PB_SIZE 64 - SECTION_RODATA 32 cextern pw_255 cextern pw_512 @@ -342,7 +340,7 @@ SECTION .text %endmacro %macro LOOP_END 3 -add %1q, 2*MAX_PB_SIZE ; dst += dststride +add %1q, dststrideq ; dst += dststride add %2q, %3q; src += srcstride dec heightd ; cmp height jnz .loop ; height loop @@ -539,7 +537,7 @@ SECTION .text ; ** -; void %1_put_pixels(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride, +; void %1_put_pixels(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, ; int height, const int8_t *hf, const int8_t *vf, int width) ; ** @@ -549,7 +547,7 @@ SECTION .text %endmacro %macro MC_PIXELS 3 -cglobal %1_put_pixels%2_%3, 4, 4, 3, dst, src, srcstride, height +cglobal %1_put_pixels%2_%3, 5, 5, 3, dst, dststride, src, srcstride, height pxor m2, m2 .loop: SIMPLE_LOAD %2, %3, srcq, m0 @@ -579,10 +577,10 @@ cglobal %1_put_uni_pixels%2_%3, 5, 5, 2, dst, dststride, src, srcstride, height %endif ; ** -; void %1_put_4tap_hX(int16_t *dst, +; void %1_put_4tap_hX(int16_t *dst, ptrdiff_t dststride, ; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width); ; ** -cglobal %1_put_4tap_h%2_%3, 5, 5, XMM_REGS, dst, src, srcstride, height, hf +cglobal %1_put_4tap_h%2_%3, 6, 6, XMM_REGS, dst, dststride, src, srcstride, height, hf %assign %%stride ((%3 + 7)/8) MC_4TAP_FILTER %3, hf, m4, m5 .loop: @@ -612,10 +610,10 @@ cglobal %1_put_uni_4tap_h%2_%3, 6, 7, XMM_REGS, dst, dststride, src, 
srcstride, RET ; ** -; void %1_put_4tap_v(int16_t *dst, +; void %1_put_4tap_v(int16_t *dst, ptrdiff_t dststride, ; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width) ; ** -cglobal %1_put_4tap_v%2_%3, 6, 6, XMM_REGS, dst, src, srcstride, height, r3src, vf +cglobal %1_put_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, vf sub srcq, srcstrideq MC_4TAP_FILTER%3, vf, m4, m5 lea r3srcq, [srcstrideq*3] @@ -649,10 +647,10 @@ cglobal %1_put_uni_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride, %macro PUT_4TAP_HV 3 ; ** -; void put_4tap_hv(int16_t *dst, +; void put_4tap_hv(int16_t *dst, ptrdiff_t dststride, ; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width) ; ** -cglobal %1_put_4tap_hv%2_%3, 6, 7, 16 , dst, src, srcstride, height, hf, vf, r3src +cglobal %1_put_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, height, hf, vf, r3src %assign %%stride ((%3 + 7)/8) sub srcq, srcstrideq MC_4TAP_HV_FILTER%3 @@ -784,12 +782,12 @@ cglobal %1_put_uni_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, heig %endmacro ; ** -; void put_8tap_hX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride, +; void put_8tap_hX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, ; int height, const int8_t *hf, const int8_t *vf, int width) ; ** %macro PUT_8TAP 3 -cglobal %1_put_8tap_h%2_%3, 5, 5, 16, dst, src, srcstride, height, hf +cglobal %1_put_8tap_h%2_%3, 6, 6, 16, dst, dststride, src, srcstride, height, hf MC_8TAP_FILTER %3, hf .loop: MC_8TAP_H_LOAD %3, srcq, %2, 10 @@ -824,10 +822,10 @@ cglobal %1_put_uni_8tap_h%2_%3, 6, 7, 16 , dst, dststride, src, srcstride, heigh ; ** -; void put_8tap_vX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride, +; void put_8tap_vX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, ; int height, const int8_t *hf, const int8_t *vf, int width) ; ** -cglobal 
%1_put_8tap_v%2_%3, 6, 8, 16, dst, src, srcstride, height, r3src, vf +cglobal %1_put_8tap_v%2_%3, 7, 8, 16, dst, dststride, src, srcstride, height,
[FFmpeg-devel] [PATCH v4 1/8] avcodec/vvc/vvc_inter_template: move put/put_luma/put_chroma template to h2656_inter_template.c
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/h26x/h2656_inter_template.c | 577 + libavcodec/vvc/vvc_inter_template.c| 559 +--- 2 files changed, 578 insertions(+), 558 deletions(-) create mode 100644 libavcodec/h26x/h2656_inter_template.c diff --git a/libavcodec/h26x/h2656_inter_template.c b/libavcodec/h26x/h2656_inter_template.c new file mode 100644 index 00..864f6c7e7d --- /dev/null +++ b/libavcodec/h26x/h2656_inter_template.c @@ -0,0 +1,577 @@ +/* + * inter prediction template for HEVC/VVC + * + * Copyright (C) 2022 Nuo Mi + * Copyright (C) 2024 Wu Jianhua + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define CHROMA_EXTRA_BEFORE 1 +#define CHROMA_EXTRA3 +#define LUMA_EXTRA_BEFORE 3 +#define LUMA_EXTRA 7 + +static void FUNC(put_pixels)(int16_t *dst, +const uint8_t *_src, const ptrdiff_t _src_stride, +const int height, const int8_t *hf, const int8_t *vf, const int width) +{ +const pixel *src= (const pixel *)_src; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + +for (int y = 0; y < height; y++) { +for (int x = 0; x < width; x++) +dst[x] = src[x] << (14 - BIT_DEPTH); +src += src_stride; +dst += MAX_PB_SIZE; +} +} + +static void FUNC(put_uni_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride, +const uint8_t *_src, const ptrdiff_t _src_stride, const int height, + const int8_t *hf, const int8_t *vf, const int width) +{ +const pixel *src= (const pixel *)_src; +pixel *dst = (pixel *)_dst; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); +const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + +for (int y = 0; y < height; y++) { +memcpy(dst, src, width * sizeof(pixel)); +src += src_stride; +dst += dst_stride; +} +} + +static void FUNC(put_uni_w_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride, +const uint8_t *_src, const ptrdiff_t _src_stride, const int height, +const int denom, const int wx, const int _ox, const int8_t *hf, const int8_t *vf, +const int width) +{ +const pixel *src= (const pixel *)_src; +pixel *dst = (pixel *)_dst; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); +const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); +const int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 +const int offset= 1 << (shift - 1); +#else +const int offset= 0; +#endif +const int ox= _ox * (1 << (BIT_DEPTH - 8)); + +for (int y = 0; y < height; y++) { +for (int x = 0; x < width; x++) { +const int v = (src[x] 
<< (14 - BIT_DEPTH)); +dst[x] = av_clip_pixel(((v * wx + offset) >> shift) + ox); +} +src += src_stride; +dst += dst_stride; +} +} + +#define LUMA_FILTER(src, stride) \ +(filter[0] * src[x - 3 * stride] + \ + filter[1] * src[x - 2 * stride] + \ + filter[2] * src[x - stride] + \ + filter[3] * src[x ] + \ + filter[4] * src[x + stride] + \ + filter[5] * src[x + 2 * stride] + \ + filter[6] * src[x + 3 * stride] + \ + filter[7] * src[x + 4 * stride]) + +static void FUNC(put_luma_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, +const int height, const int8_t *hf, const int8_t *vf, const int width) +{ +const pixel *src = (const pixel*)_src; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); +const int8_t *filter = hf; + +for (int y = 0; y < height; y++) { +for (int x = 0; x < width; x++) +dst[x] = LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8); +src += src_stride; +dst += MAX_PB_SIZE; +} +} + +static void FUNC(put_luma_v)(int
[FFmpeg-devel] [PATCH v3 8/8] tests/checkasm/vvc_mc: add check_avg
From: Wu Jianhua Signed-off-by: Wu Jianhua --- tests/checkasm/vvc_mc.c | 64 + 1 file changed, 64 insertions(+) diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c index 711280deec..8adb00573f 100644 --- a/tests/checkasm/vvc_mc.c +++ b/tests/checkasm/vvc_mc.c @@ -35,6 +35,7 @@ static const uint32_t pixel_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff, 0x3fff3fff, 0xffffffff }; static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 }; +#define SIZEOF_PIXEL ((bit_depth + 7) / 8) #define PIXEL_STRIDE (MAX_CTU_SIZE * 2) #define EXTRA_BEFORE 3 #define EXTRA_AFTER 4 @@ -261,10 +262,73 @@ static void check_put_vvc_chroma_uni(void) report("put_uni_chroma"); } +#define AVG_SRC_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE) +#define AVG_DST_BUF_SIZE (MAX_PB_SIZE * MAX_PB_SIZE * 2) + +static void check_avg(void) +{ +LOCAL_ALIGNED_32(int16_t, src00, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int16_t, src01, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int16_t, src10, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int16_t, src11, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, dst0, [AVG_DST_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, dst1, [AVG_DST_BUF_SIZE]); +VVCDSPContext c; + +for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) { +randomize_avg_src((uint8_t*)src00, (uint8_t*)src10, AVG_SRC_BUF_SIZE * sizeof(int16_t)); +randomize_avg_src((uint8_t*)src01, (uint8_t*)src11, AVG_SRC_BUF_SIZE * sizeof(int16_t)); +ff_vvc_dsp_init(&c, bit_depth); +for (int h = 2; h <= MAX_CTU_SIZE; h *= 2) { +for (int w = 2; w <= MAX_CTU_SIZE; w *= 2) { +{ + declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride, +const int16_t *src0, const int16_t *src1, int width, int height); +if (check_func(c.inter.avg, "avg_%d_%dx%d", bit_depth, w, h)) { +memset(dst0, 0, AVG_DST_BUF_SIZE); +memset(dst1, 0, AVG_DST_BUF_SIZE); +call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h); +call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, src11, w, h); +if (memcmp(dst0, dst1, DST_BUF_SIZE)) +fail(); 
+if (w == h) +bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h); +} +} +{ +declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride, +const int16_t *src0, const int16_t *src1, int width, int height, +int denom, int w0, int w1, int o0, int o1); +{ +const int denom = rnd() % 8; +const int w0= rnd() % 256 - 128; +const int w1= rnd() % 256 - 128; +const int o0= rnd() % 256 - 128; +const int o1= rnd() % 256 - 128; +if (check_func(c.inter.w_avg, "w_avg_%d_%dx%d", bit_depth, w, h)) { +memset(dst0, 0, AVG_DST_BUF_SIZE); +memset(dst1, 0, AVG_DST_BUF_SIZE); + +call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h, denom, w0, w1, o0, o1); +call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, src11, w, h, denom, w0, w1, o0, o1); +if (memcmp(dst0, dst1, DST_BUF_SIZE)) +fail(); +if (w == h) +bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h, denom, w0, w1, o0, o1); +} +} +} +} +} +} +report("avg"); +} + void checkasm_check_vvc_mc(void) { check_put_vvc_luma(); check_put_vvc_luma_uni(); check_put_vvc_chroma(); check_put_vvc_chroma_uni(); +check_avg(); } -- 2.34.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v3 5/8] avcodec/vvcdec: reuse h26x/2656_inter.asm to enable x86 optimizations
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/Makefile | 1 + libavcodec/vvc/vvcdsp.c | 4 + libavcodec/vvc/vvcdsp.h | 2 + libavcodec/x86/vvc/Makefile | 6 + libavcodec/x86/vvc/vvcdsp_init.c | 202 +++ 5 files changed, 215 insertions(+) create mode 100644 libavcodec/x86/vvc/Makefile create mode 100644 libavcodec/x86/vvc/vvcdsp_init.c diff --git a/libavcodec/Makefile b/libavcodec/Makefile index bb42095165..ce33631b60 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -65,6 +65,7 @@ OBJS = ac3_parser.o \ # subsystems include $(SRC_PATH)/libavcodec/vvc/Makefile +include $(SRC_PATH)/libavcodec/x86/vvc/Makefile OBJS-$(CONFIG_AANDCTTABLES)+= aandcttab.o OBJS-$(CONFIG_AC3DSP) += ac3dsp.o ac3.o ac3tab.o OBJS-$(CONFIG_ADTS_HEADER) += adts_header.o mpeg4audio_sample_rates.o diff --git a/libavcodec/vvc/vvcdsp.c b/libavcodec/vvc/vvcdsp.c index c82ea7be30..c542be5258 100644 --- a/libavcodec/vvc/vvcdsp.c +++ b/libavcodec/vvc/vvcdsp.c @@ -138,4 +138,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth) VVC_DSP(8); break; } + +#if ARCH_X86 +ff_vvc_dsp_init_x86(vvcdsp, bit_depth); +#endif } diff --git a/libavcodec/vvc/vvcdsp.h b/libavcodec/vvc/vvcdsp.h index b5a63c5833..6f59e73654 100644 --- a/libavcodec/vvc/vvcdsp.h +++ b/libavcodec/vvc/vvcdsp.h @@ -167,4 +167,6 @@ typedef struct VVCDSPContext { void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth); +void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth); + #endif /* AVCODEC_VVC_VVCDSP_H */ diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile new file mode 100644 index 00..b4acc22501 --- /dev/null +++ b/libavcodec/x86/vvc/Makefile @@ -0,0 +1,6 @@ +clean:: + $(RM) $(CLEANSUFFIXES:%=libavcodec/x86/vvc/%) + +OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o +X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/h26x/h2656dsp.o \ + x86/h26x/h2656_inter.o diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c new file mode 100644 index 00..c197cdb4cc --- 
/dev/null +++ b/libavcodec/x86/vvc/vvcdsp_init.c @@ -0,0 +1,202 @@ +/* + * VVC DSP init for x86 + * + * Copyright (C) 2022-2024 Nuo Mi + * Copyright (c) 2023-2024 Wu Jianhua + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/vvc/vvcdec.h" +#include "libavcodec/vvc/vvc_ctu.h" +#include "libavcodec/vvc/vvcdsp.h" +#include "libavcodec/x86/h26x/h2656dsp.h" + +#define FW_PUT(name, depth, opt) \ +static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \ + int height, const int8_t *hf, const int8_t *vf, int width)\ +{ \ +ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, srcstride, height, hf, vf, width); \ +} + +#define FW_PUT_TAP(fname, bitd, opt ) \ +FW_PUT(fname##4, bitd, opt ); \ +FW_PUT(fname##8, bitd, opt ); \ +FW_PUT(fname##16, bitd, opt ); \ +FW_PUT(fname##32, bitd, opt ); \ +FW_PUT(fname##64, bitd, opt ); \ +FW_PUT(fname##128, bitd, opt ); \ + +#define FW_PUT_4TAP(fname, bitd, opt) \ +FW_PUT(fname ## 2, bitd, opt) \ +FW_PUT_TAP(fname, bitd, opt) + +#define FW_PUT_4TAP_SSE4(bitd) \ +FW_PUT_4TAP(pixels, bitd, sse4) \ +FW_PUT_4TAP(4tap_h, 
bitd, sse4) \ +FW_PUT_4TAP(4tap_v, bitd, sse4) \ +FW_PUT_4TAP(4tap_hv, bitd, sse4) + +#define FW_PUT_8TAP_SSE4(bitd) \ +FW_PUT_TAP(8tap_h, bitd, sse4) \ +FW_PUT_TAP(8tap_v, bitd, sse4) \ +FW_PUT_TAP(8tap_hv, bitd, sse4) + +#define FW_PUT_SSE4(bitd) \ +FW_PUT_4TAP_SSE4(bitd) \ +FW_PUT_8TAP_SSE4(bitd) + +FW_PUT_SSE4( 8); +FW_PUT_SSE4(10); +FW_PUT_SSE4(12); + +#define FW_PUT_T
[FFmpeg-devel] [PATCH v3 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations
From: Wu Jianhua The avg/avg_w is based on dav1d. See https://code.videolan.org/videolan/dav1d/-/blob/master/src/x86/mc_avx2.asm vvc_avg_8_2x2_c: 71.6 vvc_avg_8_2x2_avx2: 26.8 vvc_avg_8_2x4_c: 140.8 vvc_avg_8_2x4_avx2: 34.6 vvc_avg_8_2x8_c: 410.3 vvc_avg_8_2x8_avx2: 41.3 vvc_avg_8_2x16_c: 769.3 vvc_avg_8_2x16_avx2: 60.3 vvc_avg_8_2x32_c: 1669.6 vvc_avg_8_2x32_avx2: 105.1 vvc_avg_8_2x64_c: 1978.3 vvc_avg_8_2x64_avx2: 425.8 vvc_avg_8_2x128_c: 6536.8 vvc_avg_8_2x128_avx2: 1315.1 vvc_avg_8_4x2_c: 155.6 vvc_avg_8_4x2_avx2: 26.1 vvc_avg_8_4x4_c: 250.3 vvc_avg_8_4x4_avx2: 31.3 vvc_avg_8_4x8_c: 831.8 vvc_avg_8_4x8_avx2: 41.3 vvc_avg_8_4x16_c: 1461.1 vvc_avg_8_4x16_avx2: 57.1 vvc_avg_8_4x32_c: 2821.6 vvc_avg_8_4x32_avx2: 105.1 vvc_avg_8_4x64_c: 3615.8 vvc_avg_8_4x64_avx2: 412.6 vvc_avg_8_4x128_c: 11962.6 vvc_avg_8_4x128_avx2: 1274.3 vvc_avg_8_8x2_c: 215.8 vvc_avg_8_8x2_avx2: 29.1 vvc_avg_8_8x4_c: 430.6 vvc_avg_8_8x4_avx2: 37.6 vvc_avg_8_8x8_c: 1463.3 vvc_avg_8_8x8_avx2: 51.8 vvc_avg_8_8x16_c: 2630.1 vvc_avg_8_8x16_avx2: 97.6 vvc_avg_8_8x32_c: 5813.8 vvc_avg_8_8x32_avx2: 196.6 vvc_avg_8_8x64_c: 6687.3 vvc_avg_8_8x64_avx2: 487.8 vvc_avg_8_8x128_c: 13178.6 vvc_avg_8_8x128_avx2: 1290.6 vvc_avg_8_16x2_c: 443.8 vvc_avg_8_16x2_avx2: 28.3 vvc_avg_8_16x4_c: 1253.3 vvc_avg_8_16x4_avx2: 32.1 vvc_avg_8_16x8_c: 2236.3 vvc_avg_8_16x8_avx2: 44.3 vvc_avg_8_16x16_c: 5127.8 vvc_avg_8_16x16_avx2: 63.3 vvc_avg_8_16x32_c: 6573.3 vvc_avg_8_16x32_avx2: 223.6 vvc_avg_8_16x64_c: 30311.8 vvc_avg_8_16x64_avx2: 437.8 vvc_avg_8_16x128_c: 25693.3 vvc_avg_8_16x128_avx2: 1266.8 vvc_avg_8_32x2_c: 954.6 vvc_avg_8_32x2_avx2: 32.1 vvc_avg_8_32x4_c: 2359.6 vvc_avg_8_32x4_avx2: 39.6 vvc_avg_8_32x8_c: 5703.6 vvc_avg_8_32x8_avx2: 57.1 vvc_avg_8_32x16_c: 9967.6 vvc_avg_8_32x16_avx2: 107.1 vvc_avg_8_32x32_c: 21327.6 vvc_avg_8_32x32_avx2: 272.6 vvc_avg_8_32x64_c: 39240.8 vvc_avg_8_32x64_avx2: 529.6 vvc_avg_8_32x128_c: 52580.8 vvc_avg_8_32x128_avx2: 1338.8 vvc_avg_8_64x2_c: 1647.3 vvc_avg_8_64x2_avx2: 38.8 
vvc_avg_8_64x4_c: 5130.1 vvc_avg_8_64x4_avx2: 58.8 vvc_avg_8_64x8_c: 6529.3 vvc_avg_8_64x8_avx2: 88.3 vvc_avg_8_64x16_c: 19913.6 vvc_avg_8_64x16_avx2: 162.3 vvc_avg_8_64x32_c: 39360.8 vvc_avg_8_64x32_avx2: 295.8 vvc_avg_8_64x64_c: 49658.3 vvc_avg_8_64x64_avx2: 784.1 vvc_avg_8_64x128_c: 108513.1 vvc_avg_8_64x128_avx2: 1977.1 vvc_avg_8_128x2_c: 3226.1 vvc_avg_8_128x2_avx2: 61.1 vvc_avg_8_128x4_c: 10280.3 vvc_avg_8_128x4_avx2: 94.6 vvc_avg_8_128x8_c: 18079.3 vvc_avg_8_128x8_avx2: 155.3 vvc_avg_8_128x16_c: 45121.8 vvc_avg_8_128x16_avx2: 285.3 vvc_avg_8_128x32_c: 48651.8 vvc_avg_8_128x32_avx2: 581.6 vvc_avg_8_128x64_c: 165078.6 vvc_avg_8_128x64_avx2: 1942.8 vvc_avg_8_128x128_c: 339103.1 vvc_avg_8_128x128_avx2: 4332.6 vvc_avg_10_2x2_c: 144.3 vvc_avg_10_2x2_avx2: 26.8 vvc_avg_10_2x4_c: 142.6 vvc_avg_10_2x4_avx2: 45.3 vvc_avg_10_2x8_c: 478.1 vvc_avg_10_2x8_avx2: 38.1 vvc_avg_10_2x16_c: 518.3 vvc_avg_10_2x16_avx2: 58.1 vvc_avg_10_2x32_c: 2059.8 vvc_avg_10_2x32_avx2: 93.1 vvc_avg_10_2x64_c: 2383.8 vvc_avg_10_2x64_avx2: 714.8 vvc_avg_10_2x128_c: 4498.3 vvc_avg_10_2x128_avx2: 1466.3 vvc_avg_10_4x2_c: 228.6 vvc_avg_10_4x2_avx2: 26.8 vvc_avg_10_4x4_c: 378.3 vvc_avg_10_4x4_avx2: 30.6 vvc_avg_10_4x8_c: 866.8 vvc_avg_10_4x8_avx2: 44.6 vvc_avg_10_4x16_c: 1018.1 vvc_avg_10_4x16_avx2: 58.1 vvc_avg_10_4x32_c: 3590.8 vvc_avg_10_4x32_avx2: 128.8 vvc_avg_10_4x64_c: 4200.8 vvc_avg_10_4x64_avx2: 663.6 vvc_avg_10_4x128_c: 8450.8 vvc_avg_10_4x128_avx2: 1531.8 vvc_avg_10_8x2_c: 369.3 vvc_avg_10_8x2_avx2: 28.3 vvc_avg_10_8x4_c: 513.8 vvc_avg_10_8x4_avx2: 32.1 vvc_avg_10_8x8_c: 1720.3 vvc_avg_10_8x8_avx2: 49.1 vvc_avg_10_8x16_c: 1894.8 vvc_avg_10_8x16_avx2: 71.6 vvc_avg_10_8x32_c: 3931.3 vvc_avg_10_8x32_avx2: 148.1 vvc_avg_10_8x64_c: 7964.3 vvc_avg_10_8x64_avx2: 613.1 vvc_avg_10_8x128_c: 15540.1 vvc_avg_10_8x128_avx2: 1585.1 vvc_avg_10_16x2_c: 877.3 vvc_avg_10_16x2_avx2: 27.6 vvc_avg_10_16x4_c: 955.8 vvc_avg_10_16x4_avx2: 29.8 vvc_avg_10_16x8_c: 3419.6 vvc_avg_10_16x8_avx2: 62.6 
vvc_avg_10_16x16_c: 3826.8 vvc_avg_10_16x16_avx2: 54.3 vvc_avg_10_16x32_c: 7655.3 vvc_avg_10_16x32_avx2: 86.3 vvc_avg_10_16x64_c: 30011.1 vvc_avg_10_16x64_avx2: 692.6 vvc_avg_10_16x128_c: 47894.8 vvc_avg_10_16x128_avx2: 1580.3 vvc_avg_10_32x2_c: 944.3 vvc_avg_10_32x2_avx2: 29.8 vvc_avg_10_32x4_c: 2022.6 vvc_avg_10_32x4_avx2: 35.1 vvc_avg_10_32x8_c: 6148.8 vvc_avg_10_32x8_avx2: 51.3 vvc_avg_10_32x16_c: 12601.6 vvc_avg_10_32x16_avx2: 70.8 vvc_avg_10_32x32_c: 15958.6 vvc_avg_10_32x32_avx2: 124.3 vvc_avg_10_32x64_c: 31784.6 vvc_avg_10_32x64_avx2: 757.3 vvc_avg_10_32x128_c: 63892.8 vvc_avg_10_32x128_avx2: 1711.3 vvc_avg_10_64x2_c: 1890.8 vvc_avg_10_64x2_avx2: 34.3 vvc_avg_10_64x4_c: 6267.3 vvc_avg_10_64x4_avx2: 42.6 vvc_avg_10_64x8_c: 12778.1 vvc_avg_10_64x8_avx2: 67.8 vvc_avg_10_64x16_c: 22304.3 vvc_avg_10_64x16_avx2: 116.8 vvc_avg_10_64x32_c: 30777.1 vvc_avg_10_64x32_avx2: 201.1 vvc_avg_10_64x64_c: 60169.1 vvc_avg_10_64x64_avx2: 1454.3 vvc_avg_10_64x128_c: 124392.8 vvc_avg_10_64x128_avx2: 3648.6 vvc_avg_10_128x2
[FFmpeg-devel] [PATCH v3 4/8] avcodec/x86/h26x/h2656_inter: add dststride to put
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/x86/h26x/h2656_inter.asm | 32 ++--- libavcodec/x86/h26x/h2656dsp.c | 4 ++-- libavcodec/x86/h26x/h2656dsp.h | 2 +- libavcodec/x86/hevcdsp_init.c | 2 +- 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm index aa296d549c..cbba0c1ea5 100644 --- a/libavcodec/x86/h26x/h2656_inter.asm +++ b/libavcodec/x86/h26x/h2656_inter.asm @@ -22,8 +22,6 @@ ; */ %include "libavutil/x86/x86util.asm" -%define MAX_PB_SIZE 64 - SECTION_RODATA 32 cextern pw_255 cextern pw_512 @@ -342,7 +340,7 @@ SECTION .text %endmacro %macro LOOP_END 3 -add %1q, 2*MAX_PB_SIZE ; dst += dststride +add %1q, dststrideq ; dst += dststride add %2q, %3q; src += srcstride dec heightd ; cmp height jnz .loop ; height loop @@ -539,7 +537,7 @@ SECTION .text ; ** -; void %1_put_pixels(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride, +; void %1_put_pixels(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, ; int height, const int8_t *hf, const int8_t *vf, int width) ; ** @@ -549,7 +547,7 @@ SECTION .text %endmacro %macro MC_PIXELS 3 -cglobal %1_put_pixels%2_%3, 4, 4, 3, dst, src, srcstride, height +cglobal %1_put_pixels%2_%3, 5, 5, 3, dst, dststride, src, srcstride, height pxor m2, m2 .loop: SIMPLE_LOAD %2, %3, srcq, m0 @@ -579,10 +577,10 @@ cglobal %1_put_uni_pixels%2_%3, 5, 5, 2, dst, dststride, src, srcstride, height %endif ; ** -; void %1_put_4tap_hX(int16_t *dst, +; void %1_put_4tap_hX(int16_t *dst, ptrdiff_t dststride, ; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width); ; ** -cglobal %1_put_4tap_h%2_%3, 5, 5, XMM_REGS, dst, src, srcstride, height, hf +cglobal %1_put_4tap_h%2_%3, 6, 6, XMM_REGS, dst, dststride, src, srcstride, height, hf %assign %%stride ((%3 + 7)/8) MC_4TAP_FILTER %3, hf, m4, m5 .loop: @@ -612,10 +610,10 @@ cglobal %1_put_uni_4tap_h%2_%3, 6, 7, XMM_REGS, dst, dststride, src, 
srcstride, RET ; ** -; void %1_put_4tap_v(int16_t *dst, +; void %1_put_4tap_v(int16_t *dst, ptrdiff_t dststride, ; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width) ; ** -cglobal %1_put_4tap_v%2_%3, 6, 6, XMM_REGS, dst, src, srcstride, height, r3src, vf +cglobal %1_put_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, vf sub srcq, srcstrideq MC_4TAP_FILTER%3, vf, m4, m5 lea r3srcq, [srcstrideq*3] @@ -649,10 +647,10 @@ cglobal %1_put_uni_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride, %macro PUT_4TAP_HV 3 ; ** -; void put_4tap_hv(int16_t *dst, +; void put_4tap_hv(int16_t *dst, ptrdiff_t dststride, ; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width) ; ** -cglobal %1_put_4tap_hv%2_%3, 6, 7, 16 , dst, src, srcstride, height, hf, vf, r3src +cglobal %1_put_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, height, hf, vf, r3src %assign %%stride ((%3 + 7)/8) sub srcq, srcstrideq MC_4TAP_HV_FILTER%3 @@ -784,12 +782,12 @@ cglobal %1_put_uni_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, heig %endmacro ; ** -; void put_8tap_hX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride, +; void put_8tap_hX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, ; int height, const int8_t *hf, const int8_t *vf, int width) ; ** %macro PUT_8TAP 3 -cglobal %1_put_8tap_h%2_%3, 5, 5, 16, dst, src, srcstride, height, hf +cglobal %1_put_8tap_h%2_%3, 6, 6, 16, dst, dststride, src, srcstride, height, hf MC_8TAP_FILTER %3, hf .loop: MC_8TAP_H_LOAD %3, srcq, %2, 10 @@ -824,10 +822,10 @@ cglobal %1_put_uni_8tap_h%2_%3, 6, 7, 16 , dst, dststride, src, srcstride, heigh ; ** -; void put_8tap_vX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride, +; void put_8tap_vX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, ; int height, const int8_t *hf, const int8_t *vf, int width) ; ** -cglobal 
%1_put_8tap_v%2_%3, 6, 8, 16, dst, src, srcstride, height, r3src, vf +cglobal %1_put_8tap_v%2_%3, 7, 8, 16, dst, dststride, src, srcstride, height,
[FFmpeg-devel] [PATCH v3 6/8] tests/checkasm: add checkasm_check_vvc_mc
From: Wu Jianhua Signed-off-by: Wu Jianhua --- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 + tests/checkasm/checkasm.h | 1 + tests/checkasm/vvc_mc.c | 270 ++ 4 files changed, 275 insertions(+) create mode 100644 tests/checkasm/vvc_mc.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 3b5b54352b..3562acb2b2 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -40,6 +40,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o AVCODECOBJS-$(CONFIG_VORBIS_DECODER)+= vorbisdsp.o AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o +AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_mc.o CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 87f24c77ca..36a97957e5 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -194,6 +194,9 @@ static const struct { #if CONFIG_VORBIS_DECODER { "vorbisdsp", checkasm_check_vorbisdsp }, #endif +#if CONFIG_VVC_DECODER +{ "vvc_mc", checkasm_check_vvc_mc }, +#endif #endif #if CONFIG_AVFILTER #if CONFIG_AFIR_FILTER diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 4db8c495ea..53cb3ccfbf 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -131,6 +131,7 @@ void checkasm_check_vp8dsp(void); void checkasm_check_vp9dsp(void); void checkasm_check_videodsp(void); void checkasm_check_vorbisdsp(void); +void checkasm_check_vvc_mc(void); struct CheckasmPerf; diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c new file mode 100644 index 00..711280deec --- /dev/null +++ b/tests/checkasm/vvc_mc.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2023-2024 Nuo Mi + * Copyright (c) 2023-2024 Wu Jianhua + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <string.h> + +#include "checkasm.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/vvc/vvc_ctu.h" +#include "libavcodec/vvc/vvc_data.h" + +#include "libavutil/common.h" +#include "libavutil/internal.h" +#include "libavutil/internal.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/mem_internal.h" + +static const uint32_t pixel_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff, 0x3fff3fff, 0xffffffff }; +static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 }; + +#define PIXEL_STRIDE (MAX_CTU_SIZE * 2) +#define EXTRA_BEFORE 3 +#define EXTRA_AFTER 4 +#define SRC_EXTRA ((EXTRA_BEFORE + EXTRA_AFTER) * 2) +#define SRC_BUF_SIZE (PIXEL_STRIDE + SRC_EXTRA) * (PIXEL_STRIDE + SRC_EXTRA) +#define DST_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE * 2) +#define SRC_OFFSET ((PIXEL_STRIDE + EXTRA_BEFORE * 2) * EXTRA_BEFORE) + +#define randomize_buffers(buf0, buf1, size, mask) \ +do {\ +int k; \ +for (k = 0; k < size; k += 4) { \ +uint32_t r = rnd() & mask; \ +AV_WN32A(buf0 + k, r); \ +AV_WN32A(buf1 + k, r); \ +} \ +} while (0) + +#define randomize_pixels(buf0, buf1, size) \ +do {\ +uint32_t mask = pixel_mask[(bit_depth - 8) >> 1]; \ +randomize_buffers(buf0, buf1, size, mask); \ +} while (0) + +#define randomize_avg_src(buf0, buf1, size) \ +do {\ +uint32_t mask = 0x3fff3fff; \ 
+randomize_buffers(buf0, buf1, size, mask); \ +} while (0) + +static void check_put_vvc_luma(void) +{ +LOCAL_ALIGNED_32(int16_t, dst0, [DST_BUF_SIZE / 2]); +LOCAL_ALIGNED_32(int16_t, dst1, [DST_BUF_SIZE / 2]); +LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]); +VVCDSPContext c; + +declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, int16_t *dst, const uint8_t *src, const ptrdiff_t src_stride, +
[FFmpeg-devel] [PATCH v3 2/8] avcodec/hevcdsp_template: reuse put/put_luma/put_chroma from h2656_inter_template
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/hevcdsp_template.c | 594 +++--- 1 file changed, 46 insertions(+), 548 deletions(-) diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c index 0de14e9dcf..9b48bdf08e 100644 --- a/libavcodec/hevcdsp_template.c +++ b/libavcodec/hevcdsp_template.c @@ -26,6 +26,7 @@ #include "bit_depth_template.c" #include "hevcdsp.h" #include "h26x/h2656_sao_template.c" +#include "h26x/h2656_inter_template.c" static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height, GetBitContext *gb, int pcm_bit_depth) @@ -299,37 +300,51 @@ IDCT_DC(32) // -static void FUNC(put_hevc_pel_pixels)(int16_t *dst, - const uint8_t *_src, ptrdiff_t _srcstride, - int height, intptr_t mx, intptr_t my, int width) -{ -int x, y; -const pixel *src= (const pixel *)_src; -ptrdiff_t srcstride = _srcstride / sizeof(pixel); - -for (y = 0; y < height; y++) { -for (x = 0; x < width; x++) -dst[x] = src[x] << (14 - BIT_DEPTH); -src += srcstride; -dst += MAX_PB_SIZE; -} -} - -static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, - int height, intptr_t mx, intptr_t my, int width) -{ -int y; -const pixel *src= (const pixel *)_src; -ptrdiff_t srcstride = _srcstride / sizeof(pixel); -pixel *dst = (pixel *)_dst; -ptrdiff_t dststride = _dststride / sizeof(pixel); - -for (y = 0; y < height; y++) { -memcpy(dst, src, width * sizeof(pixel)); -src += srcstride; -dst += dststride; -} -} +#define ff_hevc_pel_filters ff_hevc_qpel_filters +#define DECL_HV_FILTER(f) \ +const uint8_t *hf = ff_hevc_ ## f ## _filters[mx - 1]; \ +const uint8_t *vf = ff_hevc_ ## f ## _filters[my - 1]; + +#define FW_PUT(p, f, t) \ +static void FUNC(put_hevc_## f)(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, int height,\ + intptr_t mx, intptr_t my, int width) \ +{ \ +DECL_HV_FILTER(p) \ +FUNC(put_ ## t)(dst, src, srcstride, height, hf, vf, width); \ +} + +#define FW_PUT_UNI(p, f, t) \ 
+static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \ + ptrdiff_t srcstride, int height, intptr_t mx, intptr_t my, int width) \ +{ \ +DECL_HV_FILTER(p) \ +FUNC(put_ ## t)(dst, dststride, src, srcstride, height, hf, vf, width); \ +} + +#define FW_PUT_UNI_W(p, f, t) \ +static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \ + ptrdiff_t srcstride,int height, int denom, int wx, int ox, \ + intptr_t mx, intptr_t my, int width) \ +{ \ +DECL_HV_FILTER(p) \ +FUNC(put_ ## t)(dst, dststride, src, srcstride, height, denom, wx, ox, hf, vf, width);\ +} + +#define FW_PUT_FUNCS(f, t, dir) \ +FW_PUT(f, f ## _ ## dir, t ## _ ## dir) \ +FW_PUT_UNI(f, f ## _uni_ ## dir, uni_ ## t ## _ ## dir)\ +FW_PUT_UNI_W(f, f ## _uni_w_ ## dir, uni_## t ## _w_ ## dir) + +FW_PUT(pel, pel_pixels, pixels) +FW_PUT_UNI(pel, pel_uni_pixels, uni_pixels) +FW_PUT_UNI_W(pel, pel_uni_w_pixels, uni_w_pixels) + +FW_PUT_FUNCS(qpel, luma, h ) +FW_PUT_FUNCS(qpel, luma, v ) +FW_PUT_FUNCS(qpel, luma, hv) +FW_PUT_FUNCS(epel, chroma, h ) +FW_PUT_FUNCS(epel, chroma, v ) +FW_PUT_FUNCS(epel, chroma, hv) static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_
[FFmpeg-devel] [PATCH v3 3/8] avcodec/x86/hevc_mc: move put/put_uni to h26x/h2656_inter.asm
From: Wu Jianhua This enables the asm optimization to be reused by VVC Signed-off-by: Wu Jianhua --- libavcodec/x86/Makefile |1 + libavcodec/x86/h26x/h2656_inter.asm | 1145 +++ libavcodec/x86/h26x/h2656dsp.c | 98 +++ libavcodec/x86/h26x/h2656dsp.h | 103 +++ libavcodec/x86/hevc_mc.asm | 462 +-- libavcodec/x86/hevcdsp_init.c | 108 ++- 6 files changed, 1471 insertions(+), 446 deletions(-) create mode 100644 libavcodec/x86/h26x/h2656_inter.asm create mode 100644 libavcodec/x86/h26x/h2656dsp.c create mode 100644 libavcodec/x86/h26x/h2656dsp.h diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index d5fb30645a..8098cd840c 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -167,6 +167,7 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o\ x86/hevc_deblock.o\ x86/hevc_idct.o \ x86/hevc_mc.o \ + x86/h26x/h2656_inter.o\ x86/hevc_sao.o\ x86/hevc_sao_10bit.o X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm new file mode 100644 index 0000000000..aa296d549c --- /dev/null +++ b/libavcodec/x86/h26x/h2656_inter.asm @@ -0,0 +1,1145 @@ +; /* +; * Provide SSE luma and chroma mc functions for HEVC/VVC decoding +; * Copyright (c) 2013 Pierre-Edouard LEPERE +; * Copyright (c) 2023-2024 Nuo Mi +; * Copyright (c) 2023-2024 Wu Jianhua +; * +; * This file is part of FFmpeg. +; * +; * FFmpeg is free software; you can redistribute it and/or +; * modify it under the terms of the GNU Lesser General Public +; * License as published by the Free Software Foundation; either +; * version 2.1 of the License, or (at your option) any later version. +; * +; * FFmpeg is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; * Lesser General Public License for more details. 
+; * +; * You should have received a copy of the GNU Lesser General Public +; * License along with FFmpeg; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +; */ +%include "libavutil/x86/x86util.asm" + +%define MAX_PB_SIZE 64 + +SECTION_RODATA 32 +cextern pw_255 +cextern pw_512 +cextern pw_2048 +cextern pw_1023 +cextern pw_1024 +cextern pw_4096 +cextern pw_8192 +%define scale_8 pw_512 +%define scale_10 pw_2048 +%define scale_12 pw_8192 +%define max_pixels_8 pw_255 +%define max_pixels_10 pw_1023 +max_pixels_12: times 16 dw ((1 << 12)-1) +cextern pb_0 + +SECTION .text +%macro SIMPLE_LOAD 4;width, bitd, tab, r1 +%if %1 == 2 || (%2 == 8 && %1 <= 4) +movd %4, [%3] ; load data from source +%elif %1 == 4 || (%2 == 8 && %1 <= 8) +movq %4, [%3] ; load data from source +%elif notcpuflag(avx) +movu %4, [%3] ; load data from source +%elif %1 <= 8 || (%2 == 8 && %1 <= 16) +movdqu %4, [%3] +%else +movu %4, [%3] +%endif +%endmacro + +%macro VPBROADCASTW 2 +%if notcpuflag(avx2) +movd %1, %2 +pshuflw%1, %1, 0 +punpcklwd %1, %1 +%else +vpbroadcastw %1, %2 +%endif +%endmacro + +%macro MC_4TAP_FILTER 4 ; bitdepth, filter, a, b, +VPBROADCASTW %3, [%2q + 0 * 2] ; coeff 0, 1 +VPBROADCASTW %4, [%2q + 1 * 2] ; coeff 2, 3 +%if %1 != 8 +pmovsxbw %3, xmm%3 +pmovsxbw %4, xmm%4 +%endif +%endmacro + +%macro MC_4TAP_HV_FILTER 1 +VPBROADCASTW m12, [vfq + 0 * 2] ; vf 0, 1 +VPBROADCASTW m13, [vfq + 1 * 2] ; vf 2, 3 +VPBROADCASTW m14, [hfq + 0 * 2] ; hf 0, 1 +VPBROADCASTW m15, [hfq + 1 * 2] ; hf 2, 3 + +pmovsxbw m12, xm12 +pmovsxbw m13, xm13 +%if %1 != 8 +pmovsxbw m14, xm14 +pmovsxbw m15, xm15 +%endif +lea r3srcq, [srcstrideq*3] +%endmacro + +%macro MC_8TAP_SAVE_FILTER 5;offset, mm registers +mova [rsp + %1 + 0*mmsize], %2 +mova [rsp + %1 + 1*mmsize], %3 +mova [rsp + %1 + 2*mmsize], %4 +mova [rsp + %1 + 3*mmsize], %5 +%endmacro + +%macro MC_8TAP_FILTER 2-3 ;bitdepth, filter, offset +VPBROADCASTW m12, [%2q + 0 * 2] ; coeff 0, 1 
+VPBROADCASTW m13, [%2q + 1 * 2] ; coeff 2, 3 +VPBROADCASTW m14, [%2q + 2 * 2] ; coeff 4, 5 +VPBROADCASTW m15, [%2q + 3 * 2]
[FFmpeg-devel] [PATCH v3 1/8] avcodec/vvc/vvc_inter_template: move put/put_luma/put_chroma template to h2656_inter_template.c
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/h26x/h2656_inter_template.c | 577 + libavcodec/vvc/vvc_inter_template.c| 559 +--- 2 files changed, 578 insertions(+), 558 deletions(-) create mode 100644 libavcodec/h26x/h2656_inter_template.c diff --git a/libavcodec/h26x/h2656_inter_template.c b/libavcodec/h26x/h2656_inter_template.c new file mode 100644 index 00..864f6c7e7d --- /dev/null +++ b/libavcodec/h26x/h2656_inter_template.c @@ -0,0 +1,577 @@ +/* + * inter prediction template for HEVC/VVC + * + * Copyright (C) 2022 Nuo Mi + * Copyright (C) 2024 Wu Jianhua + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define CHROMA_EXTRA_BEFORE 1 +#define CHROMA_EXTRA 3 +#define LUMA_EXTRA_BEFORE 3 +#define LUMA_EXTRA 7 + +static void FUNC(put_pixels)(int16_t *dst, +const uint8_t *_src, const ptrdiff_t _src_stride, +const int height, const int8_t *hf, const int8_t *vf, const int width) +{ +const pixel *src= (const pixel *)_src; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + +for (int y = 0; y < height; y++) { +for (int x = 0; x < width; x++) +dst[x] = src[x] << (14 - BIT_DEPTH); +src += src_stride; +dst += MAX_PB_SIZE; +} +} + +static void FUNC(put_uni_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride, +const uint8_t *_src, const ptrdiff_t _src_stride, const int height, + const int8_t *hf, const int8_t *vf, const int width) +{ +const pixel *src= (const pixel *)_src; +pixel *dst = (pixel *)_dst; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); +const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + +for (int y = 0; y < height; y++) { +memcpy(dst, src, width * sizeof(pixel)); +src += src_stride; +dst += dst_stride; +} +} + +static void FUNC(put_uni_w_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride, +const uint8_t *_src, const ptrdiff_t _src_stride, const int height, +const int denom, const int wx, const int _ox, const int8_t *hf, const int8_t *vf, +const int width) +{ +const pixel *src= (const pixel *)_src; +pixel *dst = (pixel *)_dst; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); +const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); +const int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 +const int offset= 1 << (shift - 1); +#else +const int offset= 0; +#endif +const int ox= _ox * (1 << (BIT_DEPTH - 8)); + +for (int y = 0; y < height; y++) { +for (int x = 0; x < width; x++) { +const int v = (src[x] 
<< (14 - BIT_DEPTH)); +dst[x] = av_clip_pixel(((v * wx + offset) >> shift) + ox); +} +src += src_stride; +dst += dst_stride; +} +} + +#define LUMA_FILTER(src, stride) \ +(filter[0] * src[x - 3 * stride] + \ + filter[1] * src[x - 2 * stride] + \ + filter[2] * src[x - stride] + \ + filter[3] * src[x ] + \ + filter[4] * src[x + stride] + \ + filter[5] * src[x + 2 * stride] + \ + filter[6] * src[x + 3 * stride] + \ + filter[7] * src[x + 4 * stride]) + +static void FUNC(put_luma_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, +const int height, const int8_t *hf, const int8_t *vf, const int width) +{ +const pixel *src = (const pixel*)_src; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); +const int8_t *filter = hf; + +for (int y = 0; y < height; y++) { +for (int x = 0; x < width; x++) +dst[x] = LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8); +src += src_stride; +dst += MAX_PB_SIZE; +} +} + +static void FUNC(put_luma_v)(int
[FFmpeg-devel] [PATCH v3 6/8] tests/checkasm: add checkasm_check_vvc_mc
From: Wu Jianhua Signed-off-by: Wu Jianhua --- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 + tests/checkasm/checkasm.h | 1 + tests/checkasm/vvc_mc.c | 270 ++ 4 files changed, 275 insertions(+) create mode 100644 tests/checkasm/vvc_mc.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 3b5b54352b..3562acb2b2 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -40,6 +40,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o AVCODECOBJS-$(CONFIG_VORBIS_DECODER)+= vorbisdsp.o AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o +AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_mc.o CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 87f24c77ca..36a97957e5 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -194,6 +194,9 @@ static const struct { #if CONFIG_VORBIS_DECODER { "vorbisdsp", checkasm_check_vorbisdsp }, #endif +#if CONFIG_VVC_DECODER +{ "vvc_mc", checkasm_check_vvc_mc }, +#endif #endif #if CONFIG_AVFILTER #if CONFIG_AFIR_FILTER diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 4db8c495ea..53cb3ccfbf 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -131,6 +131,7 @@ void checkasm_check_vp8dsp(void); void checkasm_check_vp9dsp(void); void checkasm_check_videodsp(void); void checkasm_check_vorbisdsp(void); +void checkasm_check_vvc_mc(void); struct CheckasmPerf; diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c new file mode 100644 index 00..711280deec --- /dev/null +++ b/tests/checkasm/vvc_mc.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2023-2024 Nuo Mi + * Copyright (c) 2023-2024 Wu Jianhua + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include + +#include "checkasm.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/vvc/vvc_ctu.h" +#include "libavcodec/vvc/vvc_data.h" + +#include "libavutil/common.h" +#include "libavutil/internal.h" +#include "libavutil/internal.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/mem_internal.h" + +static const uint32_t pixel_mask[] = { 0x, 0x03ff03ff, 0x0fff0fff, 0x3fff3fff, 0x }; +static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 }; + +#define PIXEL_STRIDE (MAX_CTU_SIZE * 2) +#define EXTRA_BEFORE 3 +#define EXTRA_AFTER 4 +#define SRC_EXTRA(EXTRA_BEFORE + EXTRA_AFTER) * 2 +#define SRC_BUF_SIZE (PIXEL_STRIDE + SRC_EXTRA) * (PIXEL_STRIDE + SRC_EXTRA) +#define DST_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE * 2) +#define SRC_OFFSET ((PIXEL_STRIDE + EXTRA_BEFORE * 2) * EXTRA_BEFORE) + +#define randomize_buffers(buf0, buf1, size, mask) \ +do {\ +int k; \ +for (k = 0; k < size; k += 4) { \ +uint32_t r = rnd() & mask; \ +AV_WN32A(buf0 + k, r); \ +AV_WN32A(buf1 + k, r); \ +} \ +} while (0) + +#define randomize_pixels(buf0, buf1, size) \ +do {\ +uint32_t mask = pixel_mask[(bit_depth - 8) >> 1]; \ +randomize_buffers(buf0, buf1, size, mask); \ +} while (0) + +#define randomize_avg_src(buf0, buf1, size) \ +do {\ +uint32_t mask = 0x3fff3fff; \ 
+randomize_buffers(buf0, buf1, size, mask); \ +} while (0) + +static void check_put_vvc_luma(void) +{ +LOCAL_ALIGNED_32(int16_t, dst0, [DST_BUF_SIZE / 2]); +LOCAL_ALIGNED_32(int16_t, dst1, [DST_BUF_SIZE / 2]); +LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]); +VVCDSPContext c; + +declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, int16_t *dst, const uint8_t *src, const ptrdiff_t src_stride, +
[FFmpeg-devel] [PATCH v3 8/8] tests/checkasm/vvc_mc: add check_avg
From: Wu Jianhua Signed-off-by: Wu Jianhua --- tests/checkasm/vvc_mc.c | 64 + 1 file changed, 64 insertions(+) diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c index 711280deec..8adb00573f 100644 --- a/tests/checkasm/vvc_mc.c +++ b/tests/checkasm/vvc_mc.c @@ -35,6 +35,7 @@ static const uint32_t pixel_mask[] = { 0x, 0x03ff03ff, 0x0fff0fff, 0x3fff3fff, 0x }; static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 }; +#define SIZEOF_PIXEL ((bit_depth + 7) / 8) #define PIXEL_STRIDE (MAX_CTU_SIZE * 2) #define EXTRA_BEFORE 3 #define EXTRA_AFTER 4 @@ -261,10 +262,73 @@ static void check_put_vvc_chroma_uni(void) report("put_uni_chroma"); } +#define AVG_SRC_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE) +#define AVG_DST_BUF_SIZE (MAX_PB_SIZE * MAX_PB_SIZE * 2) + +static void check_avg(void) +{ +LOCAL_ALIGNED_32(int16_t, src00, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int16_t, src01, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int16_t, src10, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int16_t, src11, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, dst0, [AVG_DST_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, dst1, [AVG_DST_BUF_SIZE]); +VVCDSPContext c; + +for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) { +randomize_avg_src((uint8_t*)src00, (uint8_t*)src10, AVG_SRC_BUF_SIZE * sizeof(int16_t)); +randomize_avg_src((uint8_t*)src01, (uint8_t*)src11, AVG_SRC_BUF_SIZE * sizeof(int16_t)); +ff_vvc_dsp_init(&c, bit_depth); +for (int h = 2; h <= MAX_CTU_SIZE; h *= 2) { +for (int w = 2; w <= MAX_CTU_SIZE; w *= 2) { +{ + declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride, +const int16_t *src0, const int16_t *src1, int width, int height); +if (check_func(c.inter.avg, "avg_%d_%dx%d", bit_depth, w, h)) { +memset(dst0, 0, AVG_DST_BUF_SIZE); +memset(dst1, 0, AVG_DST_BUF_SIZE); +call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h); +call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, src11, w, h); +if (memcmp(dst0, dst1, DST_BUF_SIZE)) +fail(); 
+if (w == h) +bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h); +} +} +{ +declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride, +const int16_t *src0, const int16_t *src1, int width, int height, +int denom, int w0, int w1, int o0, int o1); +{ +const int denom = rnd() % 8; +const int w0= rnd() % 256 - 128; +const int w1= rnd() % 256 - 128; +const int o0= rnd() % 256 - 128; +const int o1= rnd() % 256 - 128; +if (check_func(c.inter.w_avg, "w_avg_%d_%dx%d", bit_depth, w, h)) { +memset(dst0, 0, AVG_DST_BUF_SIZE); +memset(dst1, 0, AVG_DST_BUF_SIZE); + +call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h, denom, w0, w1, o0, o1); +call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, src11, w, h, denom, w0, w1, o0, o1); +if (memcmp(dst0, dst1, DST_BUF_SIZE)) +fail(); +if (w == h) +bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h, denom, w0, w1, o0, o1); +} +} +} +} +} +} +report("avg"); +} + void checkasm_check_vvc_mc(void) { check_put_vvc_luma(); check_put_vvc_luma_uni(); check_put_vvc_chroma(); check_put_vvc_chroma_uni(); +check_avg(); } -- 2.34.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v3 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations
From: Wu Jianhua The avg/avg_w is based on dav1d. See https://code.videolan.org/videolan/dav1d/-/blob/master/src/x86/mc_avx2.asm vvc_avg_8_2x2_c: 71.6 vvc_avg_8_2x2_avx2: 26.8 vvc_avg_8_2x4_c: 140.8 vvc_avg_8_2x4_avx2: 34.6 vvc_avg_8_2x8_c: 410.3 vvc_avg_8_2x8_avx2: 41.3 vvc_avg_8_2x16_c: 769.3 vvc_avg_8_2x16_avx2: 60.3 vvc_avg_8_2x32_c: 1669.6 vvc_avg_8_2x32_avx2: 105.1 vvc_avg_8_2x64_c: 1978.3 vvc_avg_8_2x64_avx2: 425.8 vvc_avg_8_2x128_c: 6536.8 vvc_avg_8_2x128_avx2: 1315.1 vvc_avg_8_4x2_c: 155.6 vvc_avg_8_4x2_avx2: 26.1 vvc_avg_8_4x4_c: 250.3 vvc_avg_8_4x4_avx2: 31.3 vvc_avg_8_4x8_c: 831.8 vvc_avg_8_4x8_avx2: 41.3 vvc_avg_8_4x16_c: 1461.1 vvc_avg_8_4x16_avx2: 57.1 vvc_avg_8_4x32_c: 2821.6 vvc_avg_8_4x32_avx2: 105.1 vvc_avg_8_4x64_c: 3615.8 vvc_avg_8_4x64_avx2: 412.6 vvc_avg_8_4x128_c: 11962.6 vvc_avg_8_4x128_avx2: 1274.3 vvc_avg_8_8x2_c: 215.8 vvc_avg_8_8x2_avx2: 29.1 vvc_avg_8_8x4_c: 430.6 vvc_avg_8_8x4_avx2: 37.6 vvc_avg_8_8x8_c: 1463.3 vvc_avg_8_8x8_avx2: 51.8 vvc_avg_8_8x16_c: 2630.1 vvc_avg_8_8x16_avx2: 97.6 vvc_avg_8_8x32_c: 5813.8 vvc_avg_8_8x32_avx2: 196.6 vvc_avg_8_8x64_c: 6687.3 vvc_avg_8_8x64_avx2: 487.8 vvc_avg_8_8x128_c: 13178.6 vvc_avg_8_8x128_avx2: 1290.6 vvc_avg_8_16x2_c: 443.8 vvc_avg_8_16x2_avx2: 28.3 vvc_avg_8_16x4_c: 1253.3 vvc_avg_8_16x4_avx2: 32.1 vvc_avg_8_16x8_c: 2236.3 vvc_avg_8_16x8_avx2: 44.3 vvc_avg_8_16x16_c: 5127.8 vvc_avg_8_16x16_avx2: 63.3 vvc_avg_8_16x32_c: 6573.3 vvc_avg_8_16x32_avx2: 223.6 vvc_avg_8_16x64_c: 30311.8 vvc_avg_8_16x64_avx2: 437.8 vvc_avg_8_16x128_c: 25693.3 vvc_avg_8_16x128_avx2: 1266.8 vvc_avg_8_32x2_c: 954.6 vvc_avg_8_32x2_avx2: 32.1 vvc_avg_8_32x4_c: 2359.6 vvc_avg_8_32x4_avx2: 39.6 vvc_avg_8_32x8_c: 5703.6 vvc_avg_8_32x8_avx2: 57.1 vvc_avg_8_32x16_c: 9967.6 vvc_avg_8_32x16_avx2: 107.1 vvc_avg_8_32x32_c: 21327.6 vvc_avg_8_32x32_avx2: 272.6 vvc_avg_8_32x64_c: 39240.8 vvc_avg_8_32x64_avx2: 529.6 vvc_avg_8_32x128_c: 52580.8 vvc_avg_8_32x128_avx2: 1338.8 vvc_avg_8_64x2_c: 1647.3 vvc_avg_8_64x2_avx2: 38.8 
vvc_avg_8_64x4_c: 5130.1 vvc_avg_8_64x4_avx2: 58.8 vvc_avg_8_64x8_c: 6529.3 vvc_avg_8_64x8_avx2: 88.3 vvc_avg_8_64x16_c: 19913.6 vvc_avg_8_64x16_avx2: 162.3 vvc_avg_8_64x32_c: 39360.8 vvc_avg_8_64x32_avx2: 295.8 vvc_avg_8_64x64_c: 49658.3 vvc_avg_8_64x64_avx2: 784.1 vvc_avg_8_64x128_c: 108513.1 vvc_avg_8_64x128_avx2: 1977.1 vvc_avg_8_128x2_c: 3226.1 vvc_avg_8_128x2_avx2: 61.1 vvc_avg_8_128x4_c: 10280.3 vvc_avg_8_128x4_avx2: 94.6 vvc_avg_8_128x8_c: 18079.3 vvc_avg_8_128x8_avx2: 155.3 vvc_avg_8_128x16_c: 45121.8 vvc_avg_8_128x16_avx2: 285.3 vvc_avg_8_128x32_c: 48651.8 vvc_avg_8_128x32_avx2: 581.6 vvc_avg_8_128x64_c: 165078.6 vvc_avg_8_128x64_avx2: 1942.8 vvc_avg_8_128x128_c: 339103.1 vvc_avg_8_128x128_avx2: 4332.6 vvc_avg_10_2x2_c: 144.3 vvc_avg_10_2x2_avx2: 26.8 vvc_avg_10_2x4_c: 142.6 vvc_avg_10_2x4_avx2: 45.3 vvc_avg_10_2x8_c: 478.1 vvc_avg_10_2x8_avx2: 38.1 vvc_avg_10_2x16_c: 518.3 vvc_avg_10_2x16_avx2: 58.1 vvc_avg_10_2x32_c: 2059.8 vvc_avg_10_2x32_avx2: 93.1 vvc_avg_10_2x64_c: 2383.8 vvc_avg_10_2x64_avx2: 714.8 vvc_avg_10_2x128_c: 4498.3 vvc_avg_10_2x128_avx2: 1466.3 vvc_avg_10_4x2_c: 228.6 vvc_avg_10_4x2_avx2: 26.8 vvc_avg_10_4x4_c: 378.3 vvc_avg_10_4x4_avx2: 30.6 vvc_avg_10_4x8_c: 866.8 vvc_avg_10_4x8_avx2: 44.6 vvc_avg_10_4x16_c: 1018.1 vvc_avg_10_4x16_avx2: 58.1 vvc_avg_10_4x32_c: 3590.8 vvc_avg_10_4x32_avx2: 128.8 vvc_avg_10_4x64_c: 4200.8 vvc_avg_10_4x64_avx2: 663.6 vvc_avg_10_4x128_c: 8450.8 vvc_avg_10_4x128_avx2: 1531.8 vvc_avg_10_8x2_c: 369.3 vvc_avg_10_8x2_avx2: 28.3 vvc_avg_10_8x4_c: 513.8 vvc_avg_10_8x4_avx2: 32.1 vvc_avg_10_8x8_c: 1720.3 vvc_avg_10_8x8_avx2: 49.1 vvc_avg_10_8x16_c: 1894.8 vvc_avg_10_8x16_avx2: 71.6 vvc_avg_10_8x32_c: 3931.3 vvc_avg_10_8x32_avx2: 148.1 vvc_avg_10_8x64_c: 7964.3 vvc_avg_10_8x64_avx2: 613.1 vvc_avg_10_8x128_c: 15540.1 vvc_avg_10_8x128_avx2: 1585.1 vvc_avg_10_16x2_c: 877.3 vvc_avg_10_16x2_avx2: 27.6 vvc_avg_10_16x4_c: 955.8 vvc_avg_10_16x4_avx2: 29.8 vvc_avg_10_16x8_c: 3419.6 vvc_avg_10_16x8_avx2: 62.6 
vvc_avg_10_16x16_c: 3826.8 vvc_avg_10_16x16_avx2: 54.3 vvc_avg_10_16x32_c: 7655.3 vvc_avg_10_16x32_avx2: 86.3 vvc_avg_10_16x64_c: 30011.1 vvc_avg_10_16x64_avx2: 692.6 vvc_avg_10_16x128_c: 47894.8 vvc_avg_10_16x128_avx2: 1580.3 vvc_avg_10_32x2_c: 944.3 vvc_avg_10_32x2_avx2: 29.8 vvc_avg_10_32x4_c: 2022.6 vvc_avg_10_32x4_avx2: 35.1 vvc_avg_10_32x8_c: 6148.8 vvc_avg_10_32x8_avx2: 51.3 vvc_avg_10_32x16_c: 12601.6 vvc_avg_10_32x16_avx2: 70.8 vvc_avg_10_32x32_c: 15958.6 vvc_avg_10_32x32_avx2: 124.3 vvc_avg_10_32x64_c: 31784.6 vvc_avg_10_32x64_avx2: 757.3 vvc_avg_10_32x128_c: 63892.8 vvc_avg_10_32x128_avx2: 1711.3 vvc_avg_10_64x2_c: 1890.8 vvc_avg_10_64x2_avx2: 34.3 vvc_avg_10_64x4_c: 6267.3 vvc_avg_10_64x4_avx2: 42.6 vvc_avg_10_64x8_c: 12778.1 vvc_avg_10_64x8_avx2: 67.8 vvc_avg_10_64x16_c: 22304.3 vvc_avg_10_64x16_avx2: 116.8 vvc_avg_10_64x32_c: 30777.1 vvc_avg_10_64x32_avx2: 201.1 vvc_avg_10_64x64_c: 60169.1 vvc_avg_10_64x64_avx2: 1454.3 vvc_avg_10_64x128_c: 124392.8 vvc_avg_10_64x128_avx2: 3648.6 vvc_avg_10_128x2
[FFmpeg-devel] [PATCH v3 5/8] avcodec/vvcdec: reuse h26x/2656_inter.asm to enable x86 optimizations
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/Makefile | 1 + libavcodec/vvc/vvcdsp.c | 4 + libavcodec/vvc/vvcdsp.h | 2 + libavcodec/x86/vvc/Makefile | 6 + libavcodec/x86/vvc/vvcdsp_init.c | 202 +++ 5 files changed, 215 insertions(+) create mode 100644 libavcodec/x86/vvc/Makefile create mode 100644 libavcodec/x86/vvc/vvcdsp_init.c diff --git a/libavcodec/Makefile b/libavcodec/Makefile index bb42095165..ce33631b60 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -65,6 +65,7 @@ OBJS = ac3_parser.o \ # subsystems include $(SRC_PATH)/libavcodec/vvc/Makefile +include $(SRC_PATH)/libavcodec/x86/vvc/Makefile OBJS-$(CONFIG_AANDCTTABLES)+= aandcttab.o OBJS-$(CONFIG_AC3DSP) += ac3dsp.o ac3.o ac3tab.o OBJS-$(CONFIG_ADTS_HEADER) += adts_header.o mpeg4audio_sample_rates.o diff --git a/libavcodec/vvc/vvcdsp.c b/libavcodec/vvc/vvcdsp.c index c82ea7be30..c542be5258 100644 --- a/libavcodec/vvc/vvcdsp.c +++ b/libavcodec/vvc/vvcdsp.c @@ -138,4 +138,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth) VVC_DSP(8); break; } + +#if ARCH_X86 +ff_vvc_dsp_init_x86(vvcdsp, bit_depth); +#endif } diff --git a/libavcodec/vvc/vvcdsp.h b/libavcodec/vvc/vvcdsp.h index b5a63c5833..6f59e73654 100644 --- a/libavcodec/vvc/vvcdsp.h +++ b/libavcodec/vvc/vvcdsp.h @@ -167,4 +167,6 @@ typedef struct VVCDSPContext { void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth); +void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth); + #endif /* AVCODEC_VVC_VVCDSP_H */ diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile new file mode 100644 index 00..b4acc22501 --- /dev/null +++ b/libavcodec/x86/vvc/Makefile @@ -0,0 +1,6 @@ +clean:: + $(RM) $(CLEANSUFFIXES:%=libavcodec/x86/vvc/%) + +OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o +X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/h26x/h2656dsp.o \ + x86/h26x/h2656_inter.o diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c new file mode 100644 index 00..c197cdb4cc --- 
/dev/null +++ b/libavcodec/x86/vvc/vvcdsp_init.c @@ -0,0 +1,202 @@ +/* + * VVC DSP init for x86 + * + * Copyright (C) 2022-2024 Nuo Mi + * Copyright (c) 2023-2024 Wu Jianhua + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/vvc/vvcdec.h" +#include "libavcodec/vvc/vvc_ctu.h" +#include "libavcodec/vvc/vvcdsp.h" +#include "libavcodec/x86/h26x/h2656dsp.h" + +#define FW_PUT(name, depth, opt) \ +static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \ + int height, const int8_t *hf, const int8_t *vf, int width)\ +{ \ +ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, srcstride, height, hf, vf, width); \ +} + +#define FW_PUT_TAP(fname, bitd, opt ) \ +FW_PUT(fname##4, bitd, opt ); \ +FW_PUT(fname##8, bitd, opt ); \ +FW_PUT(fname##16, bitd, opt ); \ +FW_PUT(fname##32, bitd, opt ); \ +FW_PUT(fname##64, bitd, opt ); \ +FW_PUT(fname##128, bitd, opt ); \ + +#define FW_PUT_4TAP(fname, bitd, opt) \ +FW_PUT(fname ## 2, bitd, opt) \ +FW_PUT_TAP(fname, bitd, opt) + +#define FW_PUT_4TAP_SSE4(bitd) \ +FW_PUT_4TAP(pixels, bitd, sse4) \ +FW_PUT_4TAP(4tap_h, 
bitd, sse4) \ +FW_PUT_4TAP(4tap_v, bitd, sse4) \ +FW_PUT_4TAP(4tap_hv, bitd, sse4) + +#define FW_PUT_8TAP_SSE4(bitd) \ +FW_PUT_TAP(8tap_h, bitd, sse4) \ +FW_PUT_TAP(8tap_v, bitd, sse4) \ +FW_PUT_TAP(8tap_hv, bitd, sse4) + +#define FW_PUT_SSE4(bitd) \ +FW_PUT_4TAP_SSE4(bitd) \ +FW_PUT_8TAP_SSE4(bitd) + +FW_PUT_SSE4( 8); +FW_PUT_SSE4(10); +FW_PUT_SSE4(12); + +#define FW_PUT_T
[FFmpeg-devel] [PATCH v3 4/8] avcodec/x86/h26x/h2656_inter: add dststride to put
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/x86/h26x/h2656_inter.asm | 32 ++--- libavcodec/x86/h26x/h2656dsp.c | 4 ++-- libavcodec/x86/h26x/h2656dsp.h | 2 +- libavcodec/x86/hevcdsp_init.c | 2 +- 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm index aa296d549c..cbba0c1ea5 100644 --- a/libavcodec/x86/h26x/h2656_inter.asm +++ b/libavcodec/x86/h26x/h2656_inter.asm @@ -22,8 +22,6 @@ ; */ %include "libavutil/x86/x86util.asm" -%define MAX_PB_SIZE 64 - SECTION_RODATA 32 cextern pw_255 cextern pw_512 @@ -342,7 +340,7 @@ SECTION .text %endmacro %macro LOOP_END 3 -add %1q, 2*MAX_PB_SIZE ; dst += dststride +add %1q, dststrideq ; dst += dststride add %2q, %3q; src += srcstride dec heightd ; cmp height jnz .loop ; height loop @@ -539,7 +537,7 @@ SECTION .text ; ** -; void %1_put_pixels(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride, +; void %1_put_pixels(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, ; int height, const int8_t *hf, const int8_t *vf, int width) ; ** @@ -549,7 +547,7 @@ SECTION .text %endmacro %macro MC_PIXELS 3 -cglobal %1_put_pixels%2_%3, 4, 4, 3, dst, src, srcstride, height +cglobal %1_put_pixels%2_%3, 5, 5, 3, dst, dststride, src, srcstride, height pxor m2, m2 .loop: SIMPLE_LOAD %2, %3, srcq, m0 @@ -579,10 +577,10 @@ cglobal %1_put_uni_pixels%2_%3, 5, 5, 2, dst, dststride, src, srcstride, height %endif ; ** -; void %1_put_4tap_hX(int16_t *dst, +; void %1_put_4tap_hX(int16_t *dst, ptrdiff_t dststride, ; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width); ; ** -cglobal %1_put_4tap_h%2_%3, 5, 5, XMM_REGS, dst, src, srcstride, height, hf +cglobal %1_put_4tap_h%2_%3, 6, 6, XMM_REGS, dst, dststride, src, srcstride, height, hf %assign %%stride ((%3 + 7)/8) MC_4TAP_FILTER %3, hf, m4, m5 .loop: @@ -612,10 +610,10 @@ cglobal %1_put_uni_4tap_h%2_%3, 6, 7, XMM_REGS, dst, dststride, src, 
srcstride, RET ; ** -; void %1_put_4tap_v(int16_t *dst, +; void %1_put_4tap_v(int16_t *dst, ptrdiff_t dststride, ; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width) ; ** -cglobal %1_put_4tap_v%2_%3, 6, 6, XMM_REGS, dst, src, srcstride, height, r3src, vf +cglobal %1_put_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, vf sub srcq, srcstrideq MC_4TAP_FILTER%3, vf, m4, m5 lea r3srcq, [srcstrideq*3] @@ -649,10 +647,10 @@ cglobal %1_put_uni_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride, %macro PUT_4TAP_HV 3 ; ** -; void put_4tap_hv(int16_t *dst, +; void put_4tap_hv(int16_t *dst, ptrdiff_t dststride, ; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width) ; ** -cglobal %1_put_4tap_hv%2_%3, 6, 7, 16 , dst, src, srcstride, height, hf, vf, r3src +cglobal %1_put_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, height, hf, vf, r3src %assign %%stride ((%3 + 7)/8) sub srcq, srcstrideq MC_4TAP_HV_FILTER%3 @@ -784,12 +782,12 @@ cglobal %1_put_uni_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, heig %endmacro ; ** -; void put_8tap_hX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride, +; void put_8tap_hX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, ; int height, const int8_t *hf, const int8_t *vf, int width) ; ** %macro PUT_8TAP 3 -cglobal %1_put_8tap_h%2_%3, 5, 5, 16, dst, src, srcstride, height, hf +cglobal %1_put_8tap_h%2_%3, 6, 6, 16, dst, dststride, src, srcstride, height, hf MC_8TAP_FILTER %3, hf .loop: MC_8TAP_H_LOAD %3, srcq, %2, 10 @@ -824,10 +822,10 @@ cglobal %1_put_uni_8tap_h%2_%3, 6, 7, 16 , dst, dststride, src, srcstride, heigh ; ** -; void put_8tap_vX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride, +; void put_8tap_vX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, ; int height, const int8_t *hf, const int8_t *vf, int width) ; ** -cglobal 
%1_put_8tap_v%2_%3, 6, 8, 16, dst, src, srcstride, height, r3src, vf +cglobal %1_put_8tap_v%2_%3, 7, 8, 16, dst, dststride, src, srcstride, height,
[FFmpeg-devel] [PATCH v3 3/8] avcodec/x86/hevc_mc: move put/put_uni to h26x/h2656_inter.asm
From: Wu Jianhua This enables the asm optimization to be reused by VVC Signed-off-by: Wu Jianhua --- libavcodec/x86/Makefile |1 + libavcodec/x86/h26x/h2656_inter.asm | 1145 +++ libavcodec/x86/h26x/h2656dsp.c | 98 +++ libavcodec/x86/h26x/h2656dsp.h | 103 +++ libavcodec/x86/hevc_mc.asm | 462 +-- libavcodec/x86/hevcdsp_init.c | 108 ++- 6 files changed, 1471 insertions(+), 446 deletions(-) create mode 100644 libavcodec/x86/h26x/h2656_inter.asm create mode 100644 libavcodec/x86/h26x/h2656dsp.c create mode 100644 libavcodec/x86/h26x/h2656dsp.h diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index d5fb30645a..8098cd840c 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -167,6 +167,7 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o\ x86/hevc_deblock.o\ x86/hevc_idct.o \ x86/hevc_mc.o \ + x86/h26x/h2656_inter.o\ x86/hevc_sao.o\ x86/hevc_sao_10bit.o X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm new file mode 100644 index 00..aa296d549c --- /dev/null +++ b/libavcodec/x86/h26x/h2656_inter.asm @@ -0,0 +1,1145 @@ +; /* +; * Provide SSE luma and chroma mc functions for HEVC/VVC decoding +; * Copyright (c) 2013 Pierre-Edouard LEPERE +; * Copyright (c) 2023-2024 Nuo Mi +; * Copyright (c) 2023-2024 Wu Jianhua +; * +; * This file is part of FFmpeg. +; * +; * FFmpeg is free software; you can redistribute it and/or +; * modify it under the terms of the GNU Lesser General Public +; * License as published by the Free Software Foundation; either +; * version 2.1 of the License, or (at your option) any later version. +; * +; * FFmpeg is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; * Lesser General Public License for more details. 
+; * +; * You should have received a copy of the GNU Lesser General Public +; * License along with FFmpeg; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +; */ +%include "libavutil/x86/x86util.asm" + +%define MAX_PB_SIZE 64 + +SECTION_RODATA 32 +cextern pw_255 +cextern pw_512 +cextern pw_2048 +cextern pw_1023 +cextern pw_1024 +cextern pw_4096 +cextern pw_8192 +%define scale_8 pw_512 +%define scale_10 pw_2048 +%define scale_12 pw_8192 +%define max_pixels_8 pw_255 +%define max_pixels_10 pw_1023 +max_pixels_12: times 16 dw ((1 << 12)-1) +cextern pb_0 + +SECTION .text +%macro SIMPLE_LOAD 4;width, bitd, tab, r1 +%if %1 == 2 || (%2 == 8 && %1 <= 4) +movd %4, [%3] ; load data from source +%elif %1 == 4 || (%2 == 8 && %1 <= 8) +movq %4, [%3] ; load data from source +%elif notcpuflag(avx) +movu %4, [%3] ; load data from source +%elif %1 <= 8 || (%2 == 8 && %1 <= 16) +movdqu %4, [%3] +%else +movu %4, [%3] +%endif +%endmacro + +%macro VPBROADCASTW 2 +%if notcpuflag(avx2) +movd %1, %2 +pshuflw%1, %1, 0 +punpcklwd %1, %1 +%else +vpbroadcastw %1, %2 +%endif +%endmacro + +%macro MC_4TAP_FILTER 4 ; bitdepth, filter, a, b, +VPBROADCASTW %3, [%2q + 0 * 2] ; coeff 0, 1 +VPBROADCASTW %4, [%2q + 1 * 2] ; coeff 2, 3 +%if %1 != 8 +pmovsxbw %3, xmm%3 +pmovsxbw %4, xmm%4 +%endif +%endmacro + +%macro MC_4TAP_HV_FILTER 1 +VPBROADCASTW m12, [vfq + 0 * 2] ; vf 0, 1 +VPBROADCASTW m13, [vfq + 1 * 2] ; vf 2, 3 +VPBROADCASTW m14, [hfq + 0 * 2] ; hf 0, 1 +VPBROADCASTW m15, [hfq + 1 * 2] ; hf 2, 3 + +pmovsxbw m12, xm12 +pmovsxbw m13, xm13 +%if %1 != 8 +pmovsxbw m14, xm14 +pmovsxbw m15, xm15 +%endif +lea r3srcq, [srcstrideq*3] +%endmacro + +%macro MC_8TAP_SAVE_FILTER 5;offset, mm registers +mova [rsp + %1 + 0*mmsize], %2 +mova [rsp + %1 + 1*mmsize], %3 +mova [rsp + %1 + 2*mmsize], %4 +mova [rsp + %1 + 3*mmsize], %5 +%endmacro + +%macro MC_8TAP_FILTER 2-3 ;bitdepth, filter, offset +VPBROADCASTW m12, [%2q + 0 * 2] ; coeff 0, 1 
+VPBROADCASTW m13, [%2q + 1 * 2] ; coeff 2, 3 +VPBROADCASTW m14, [%2q + 2 * 2] ; coeff 4, 5 +VPBROADCASTW m15, [%2q + 3 * 2]
[FFmpeg-devel] [PATCH v3 2/8] avcodec/hevcdsp_template: reuse put/put_luma/put_chroma from h2656_inter_template
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/hevcdsp_template.c | 594 +++--- 1 file changed, 46 insertions(+), 548 deletions(-) diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c index 0de14e9dcf..9b48bdf08e 100644 --- a/libavcodec/hevcdsp_template.c +++ b/libavcodec/hevcdsp_template.c @@ -26,6 +26,7 @@ #include "bit_depth_template.c" #include "hevcdsp.h" #include "h26x/h2656_sao_template.c" +#include "h26x/h2656_inter_template.c" static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height, GetBitContext *gb, int pcm_bit_depth) @@ -299,37 +300,51 @@ IDCT_DC(32) // -static void FUNC(put_hevc_pel_pixels)(int16_t *dst, - const uint8_t *_src, ptrdiff_t _srcstride, - int height, intptr_t mx, intptr_t my, int width) -{ -int x, y; -const pixel *src= (const pixel *)_src; -ptrdiff_t srcstride = _srcstride / sizeof(pixel); - -for (y = 0; y < height; y++) { -for (x = 0; x < width; x++) -dst[x] = src[x] << (14 - BIT_DEPTH); -src += srcstride; -dst += MAX_PB_SIZE; -} -} - -static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, - int height, intptr_t mx, intptr_t my, int width) -{ -int y; -const pixel *src= (const pixel *)_src; -ptrdiff_t srcstride = _srcstride / sizeof(pixel); -pixel *dst = (pixel *)_dst; -ptrdiff_t dststride = _dststride / sizeof(pixel); - -for (y = 0; y < height; y++) { -memcpy(dst, src, width * sizeof(pixel)); -src += srcstride; -dst += dststride; -} -} +#define ff_hevc_pel_filters ff_hevc_qpel_filters +#define DECL_HV_FILTER(f) \ +const uint8_t *hf = ff_hevc_ ## f ## _filters[mx - 1]; \ +const uint8_t *vf = ff_hevc_ ## f ## _filters[my - 1]; + +#define FW_PUT(p, f, t) \ +static void FUNC(put_hevc_## f)(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, int height,\ + intptr_t mx, intptr_t my, int width) \ +{ \ +DECL_HV_FILTER(p) \ +FUNC(put_ ## t)(dst, src, srcstride, height, hf, vf, width); \ +} + +#define FW_PUT_UNI(p, f, t) \ 
+static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \ + ptrdiff_t srcstride, int height, intptr_t mx, intptr_t my, int width) \ +{ \ +DECL_HV_FILTER(p) \ +FUNC(put_ ## t)(dst, dststride, src, srcstride, height, hf, vf, width); \ +} + +#define FW_PUT_UNI_W(p, f, t) \ +static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \ + ptrdiff_t srcstride,int height, int denom, int wx, int ox, \ + intptr_t mx, intptr_t my, int width) \ +{ \ +DECL_HV_FILTER(p) \ +FUNC(put_ ## t)(dst, dststride, src, srcstride, height, denom, wx, ox, hf, vf, width);\ +} + +#define FW_PUT_FUNCS(f, t, dir) \ +FW_PUT(f, f ## _ ## dir, t ## _ ## dir) \ +FW_PUT_UNI(f, f ## _uni_ ## dir, uni_ ## t ## _ ## dir)\ +FW_PUT_UNI_W(f, f ## _uni_w_ ## dir, uni_## t ## _w_ ## dir) + +FW_PUT(pel, pel_pixels, pixels) +FW_PUT_UNI(pel, pel_uni_pixels, uni_pixels) +FW_PUT_UNI_W(pel, pel_uni_w_pixels, uni_w_pixels) + +FW_PUT_FUNCS(qpel, luma, h ) +FW_PUT_FUNCS(qpel, luma, v ) +FW_PUT_FUNCS(qpel, luma, hv) +FW_PUT_FUNCS(epel, chroma, h ) +FW_PUT_FUNCS(epel, chroma, v ) +FW_PUT_FUNCS(epel, chroma, hv) static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_
[FFmpeg-devel] [PATCH v3 1/8] avcodec/vvc/vvc_inter_template: move put/put_luma/put_chroma template to h2656_inter_template.c
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/h26x/h2656_inter_template.c | 577 + libavcodec/vvc/vvc_inter_template.c| 559 +--- 2 files changed, 578 insertions(+), 558 deletions(-) create mode 100644 libavcodec/h26x/h2656_inter_template.c diff --git a/libavcodec/h26x/h2656_inter_template.c b/libavcodec/h26x/h2656_inter_template.c new file mode 100644 index 00..864f6c7e7d --- /dev/null +++ b/libavcodec/h26x/h2656_inter_template.c @@ -0,0 +1,577 @@ +/* + * inter prediction template for HEVC/VVC + * + * Copyright (C) 2022 Nuo Mi + * Copyright (C) 2024 Wu Jianhua + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define CHROMA_EXTRA_BEFORE 1 +#define CHROMA_EXTRA3 +#define LUMA_EXTRA_BEFORE 3 +#define LUMA_EXTRA 7 + +static void FUNC(put_pixels)(int16_t *dst, +const uint8_t *_src, const ptrdiff_t _src_stride, +const int height, const int8_t *hf, const int8_t *vf, const int width) +{ +const pixel *src= (const pixel *)_src; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + +for (int y = 0; y < height; y++) { +for (int x = 0; x < width; x++) +dst[x] = src[x] << (14 - BIT_DEPTH); +src += src_stride; +dst += MAX_PB_SIZE; +} +} + +static void FUNC(put_uni_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride, +const uint8_t *_src, const ptrdiff_t _src_stride, const int height, + const int8_t *hf, const int8_t *vf, const int width) +{ +const pixel *src= (const pixel *)_src; +pixel *dst = (pixel *)_dst; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); +const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + +for (int y = 0; y < height; y++) { +memcpy(dst, src, width * sizeof(pixel)); +src += src_stride; +dst += dst_stride; +} +} + +static void FUNC(put_uni_w_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride, +const uint8_t *_src, const ptrdiff_t _src_stride, const int height, +const int denom, const int wx, const int _ox, const int8_t *hf, const int8_t *vf, +const int width) +{ +const pixel *src= (const pixel *)_src; +pixel *dst = (pixel *)_dst; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); +const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); +const int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 +const int offset= 1 << (shift - 1); +#else +const int offset= 0; +#endif +const int ox= _ox * (1 << (BIT_DEPTH - 8)); + +for (int y = 0; y < height; y++) { +for (int x = 0; x < width; x++) { +const int v = (src[x] 
<< (14 - BIT_DEPTH)); +dst[x] = av_clip_pixel(((v * wx + offset) >> shift) + ox); +} +src += src_stride; +dst += dst_stride; +} +} + +#define LUMA_FILTER(src, stride) \ +(filter[0] * src[x - 3 * stride] + \ + filter[1] * src[x - 2 * stride] + \ + filter[2] * src[x - stride] + \ + filter[3] * src[x ] + \ + filter[4] * src[x + stride] + \ + filter[5] * src[x + 2 * stride] + \ + filter[6] * src[x + 3 * stride] + \ + filter[7] * src[x + 4 * stride]) + +static void FUNC(put_luma_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, +const int height, const int8_t *hf, const int8_t *vf, const int width) +{ +const pixel *src = (const pixel*)_src; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); +const int8_t *filter = hf; + +for (int y = 0; y < height; y++) { +for (int x = 0; x < width; x++) +dst[x] = LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8); +src += src_stride; +dst += MAX_PB_SIZE; +} +} + +static void FUNC(put_luma_v)(int
[FFmpeg-devel] [PATCH v2 8/8] tests/checkasm/vvc_mc: add check_avg
From: Wu Jianhua Signed-off-by: Wu Jianhua --- tests/checkasm/vvc_mc.c | 64 + 1 file changed, 64 insertions(+) diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c index 711280deec..8adb00573f 100644 --- a/tests/checkasm/vvc_mc.c +++ b/tests/checkasm/vvc_mc.c @@ -35,6 +35,7 @@ static const uint32_t pixel_mask[] = { 0x, 0x03ff03ff, 0x0fff0fff, 0x3fff3fff, 0x }; static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 }; +#define SIZEOF_PIXEL ((bit_depth + 7) / 8) #define PIXEL_STRIDE (MAX_CTU_SIZE * 2) #define EXTRA_BEFORE 3 #define EXTRA_AFTER 4 @@ -261,10 +262,73 @@ static void check_put_vvc_chroma_uni(void) report("put_uni_chroma"); } +#define AVG_SRC_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE) +#define AVG_DST_BUF_SIZE (MAX_PB_SIZE * MAX_PB_SIZE * 2) + +static void check_avg(void) +{ +LOCAL_ALIGNED_32(int16_t, src00, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int16_t, src01, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int16_t, src10, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int16_t, src11, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, dst0, [AVG_DST_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, dst1, [AVG_DST_BUF_SIZE]); +VVCDSPContext c; + +for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) { +randomize_avg_src((uint8_t*)src00, (uint8_t*)src10, AVG_SRC_BUF_SIZE * sizeof(int16_t)); +randomize_avg_src((uint8_t*)src01, (uint8_t*)src11, AVG_SRC_BUF_SIZE * sizeof(int16_t)); +ff_vvc_dsp_init(&c, bit_depth); +for (int h = 2; h <= MAX_CTU_SIZE; h *= 2) { +for (int w = 2; w <= MAX_CTU_SIZE; w *= 2) { +{ + declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride, +const int16_t *src0, const int16_t *src1, int width, int height); +if (check_func(c.inter.avg, "avg_%d_%dx%d", bit_depth, w, h)) { +memset(dst0, 0, AVG_DST_BUF_SIZE); +memset(dst1, 0, AVG_DST_BUF_SIZE); +call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h); +call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, src11, w, h); +if (memcmp(dst0, dst1, DST_BUF_SIZE)) +fail(); 
+if (w == h) +bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h); +} +} +{ +declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride, +const int16_t *src0, const int16_t *src1, int width, int height, +int denom, int w0, int w1, int o0, int o1); +{ +const int denom = rnd() % 8; +const int w0= rnd() % 256 - 128; +const int w1= rnd() % 256 - 128; +const int o0= rnd() % 256 - 128; +const int o1= rnd() % 256 - 128; +if (check_func(c.inter.w_avg, "w_avg_%d_%dx%d", bit_depth, w, h)) { +memset(dst0, 0, AVG_DST_BUF_SIZE); +memset(dst1, 0, AVG_DST_BUF_SIZE); + +call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h, denom, w0, w1, o0, o1); +call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, src11, w, h, denom, w0, w1, o0, o1); +if (memcmp(dst0, dst1, DST_BUF_SIZE)) +fail(); +if (w == h) +bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h, denom, w0, w1, o0, o1); +} +} +} +} +} +} +report("avg"); +} + void checkasm_check_vvc_mc(void) { check_put_vvc_luma(); check_put_vvc_luma_uni(); check_put_vvc_chroma(); check_put_vvc_chroma_uni(); +check_avg(); } -- 2.34.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations
From: Wu Jianhua The avg/avg_w is based on dav1d. See https://code.videolan.org/videolan/dav1d/-/blob/master/src/x86/mc_avx2.asm vvc_avg_8_2x2_c: 71.6 vvc_avg_8_2x2_avx2: 26.8 vvc_avg_8_2x4_c: 140.8 vvc_avg_8_2x4_avx2: 34.6 vvc_avg_8_2x8_c: 410.3 vvc_avg_8_2x8_avx2: 41.3 vvc_avg_8_2x16_c: 769.3 vvc_avg_8_2x16_avx2: 60.3 vvc_avg_8_2x32_c: 1669.6 vvc_avg_8_2x32_avx2: 105.1 vvc_avg_8_2x64_c: 1978.3 vvc_avg_8_2x64_avx2: 425.8 vvc_avg_8_2x128_c: 6536.8 vvc_avg_8_2x128_avx2: 1315.1 vvc_avg_8_4x2_c: 155.6 vvc_avg_8_4x2_avx2: 26.1 vvc_avg_8_4x4_c: 250.3 vvc_avg_8_4x4_avx2: 31.3 vvc_avg_8_4x8_c: 831.8 vvc_avg_8_4x8_avx2: 41.3 vvc_avg_8_4x16_c: 1461.1 vvc_avg_8_4x16_avx2: 57.1 vvc_avg_8_4x32_c: 2821.6 vvc_avg_8_4x32_avx2: 105.1 vvc_avg_8_4x64_c: 3615.8 vvc_avg_8_4x64_avx2: 412.6 vvc_avg_8_4x128_c: 11962.6 vvc_avg_8_4x128_avx2: 1274.3 vvc_avg_8_8x2_c: 215.8 vvc_avg_8_8x2_avx2: 29.1 vvc_avg_8_8x4_c: 430.6 vvc_avg_8_8x4_avx2: 37.6 vvc_avg_8_8x8_c: 1463.3 vvc_avg_8_8x8_avx2: 51.8 vvc_avg_8_8x16_c: 2630.1 vvc_avg_8_8x16_avx2: 97.6 vvc_avg_8_8x32_c: 5813.8 vvc_avg_8_8x32_avx2: 196.6 vvc_avg_8_8x64_c: 6687.3 vvc_avg_8_8x64_avx2: 487.8 vvc_avg_8_8x128_c: 13178.6 vvc_avg_8_8x128_avx2: 1290.6 vvc_avg_8_16x2_c: 443.8 vvc_avg_8_16x2_avx2: 28.3 vvc_avg_8_16x4_c: 1253.3 vvc_avg_8_16x4_avx2: 32.1 vvc_avg_8_16x8_c: 2236.3 vvc_avg_8_16x8_avx2: 44.3 vvc_avg_8_16x16_c: 5127.8 vvc_avg_8_16x16_avx2: 63.3 vvc_avg_8_16x32_c: 6573.3 vvc_avg_8_16x32_avx2: 223.6 vvc_avg_8_16x64_c: 30311.8 vvc_avg_8_16x64_avx2: 437.8 vvc_avg_8_16x128_c: 25693.3 vvc_avg_8_16x128_avx2: 1266.8 vvc_avg_8_32x2_c: 954.6 vvc_avg_8_32x2_avx2: 32.1 vvc_avg_8_32x4_c: 2359.6 vvc_avg_8_32x4_avx2: 39.6 vvc_avg_8_32x8_c: 5703.6 vvc_avg_8_32x8_avx2: 57.1 vvc_avg_8_32x16_c: 9967.6 vvc_avg_8_32x16_avx2: 107.1 vvc_avg_8_32x32_c: 21327.6 vvc_avg_8_32x32_avx2: 272.6 vvc_avg_8_32x64_c: 39240.8 vvc_avg_8_32x64_avx2: 529.6 vvc_avg_8_32x128_c: 52580.8 vvc_avg_8_32x128_avx2: 1338.8 vvc_avg_8_64x2_c: 1647.3 vvc_avg_8_64x2_avx2: 38.8 
vvc_avg_8_64x4_c: 5130.1 vvc_avg_8_64x4_avx2: 58.8 vvc_avg_8_64x8_c: 6529.3 vvc_avg_8_64x8_avx2: 88.3 vvc_avg_8_64x16_c: 19913.6 vvc_avg_8_64x16_avx2: 162.3 vvc_avg_8_64x32_c: 39360.8 vvc_avg_8_64x32_avx2: 295.8 vvc_avg_8_64x64_c: 49658.3 vvc_avg_8_64x64_avx2: 784.1 vvc_avg_8_64x128_c: 108513.1 vvc_avg_8_64x128_avx2: 1977.1 vvc_avg_8_128x2_c: 3226.1 vvc_avg_8_128x2_avx2: 61.1 vvc_avg_8_128x4_c: 10280.3 vvc_avg_8_128x4_avx2: 94.6 vvc_avg_8_128x8_c: 18079.3 vvc_avg_8_128x8_avx2: 155.3 vvc_avg_8_128x16_c: 45121.8 vvc_avg_8_128x16_avx2: 285.3 vvc_avg_8_128x32_c: 48651.8 vvc_avg_8_128x32_avx2: 581.6 vvc_avg_8_128x64_c: 165078.6 vvc_avg_8_128x64_avx2: 1942.8 vvc_avg_8_128x128_c: 339103.1 vvc_avg_8_128x128_avx2: 4332.6 vvc_avg_10_2x2_c: 144.3 vvc_avg_10_2x2_avx2: 26.8 vvc_avg_10_2x4_c: 142.6 vvc_avg_10_2x4_avx2: 45.3 vvc_avg_10_2x8_c: 478.1 vvc_avg_10_2x8_avx2: 38.1 vvc_avg_10_2x16_c: 518.3 vvc_avg_10_2x16_avx2: 58.1 vvc_avg_10_2x32_c: 2059.8 vvc_avg_10_2x32_avx2: 93.1 vvc_avg_10_2x64_c: 2383.8 vvc_avg_10_2x64_avx2: 714.8 vvc_avg_10_2x128_c: 4498.3 vvc_avg_10_2x128_avx2: 1466.3 vvc_avg_10_4x2_c: 228.6 vvc_avg_10_4x2_avx2: 26.8 vvc_avg_10_4x4_c: 378.3 vvc_avg_10_4x4_avx2: 30.6 vvc_avg_10_4x8_c: 866.8 vvc_avg_10_4x8_avx2: 44.6 vvc_avg_10_4x16_c: 1018.1 vvc_avg_10_4x16_avx2: 58.1 vvc_avg_10_4x32_c: 3590.8 vvc_avg_10_4x32_avx2: 128.8 vvc_avg_10_4x64_c: 4200.8 vvc_avg_10_4x64_avx2: 663.6 vvc_avg_10_4x128_c: 8450.8 vvc_avg_10_4x128_avx2: 1531.8 vvc_avg_10_8x2_c: 369.3 vvc_avg_10_8x2_avx2: 28.3 vvc_avg_10_8x4_c: 513.8 vvc_avg_10_8x4_avx2: 32.1 vvc_avg_10_8x8_c: 1720.3 vvc_avg_10_8x8_avx2: 49.1 vvc_avg_10_8x16_c: 1894.8 vvc_avg_10_8x16_avx2: 71.6 vvc_avg_10_8x32_c: 3931.3 vvc_avg_10_8x32_avx2: 148.1 vvc_avg_10_8x64_c: 7964.3 vvc_avg_10_8x64_avx2: 613.1 vvc_avg_10_8x128_c: 15540.1 vvc_avg_10_8x128_avx2: 1585.1 vvc_avg_10_16x2_c: 877.3 vvc_avg_10_16x2_avx2: 27.6 vvc_avg_10_16x4_c: 955.8 vvc_avg_10_16x4_avx2: 29.8 vvc_avg_10_16x8_c: 3419.6 vvc_avg_10_16x8_avx2: 62.6 
vvc_avg_10_16x16_c: 3826.8 vvc_avg_10_16x16_avx2: 54.3 vvc_avg_10_16x32_c: 7655.3 vvc_avg_10_16x32_avx2: 86.3 vvc_avg_10_16x64_c: 30011.1 vvc_avg_10_16x64_avx2: 692.6 vvc_avg_10_16x128_c: 47894.8 vvc_avg_10_16x128_avx2: 1580.3 vvc_avg_10_32x2_c: 944.3 vvc_avg_10_32x2_avx2: 29.8 vvc_avg_10_32x4_c: 2022.6 vvc_avg_10_32x4_avx2: 35.1 vvc_avg_10_32x8_c: 6148.8 vvc_avg_10_32x8_avx2: 51.3 vvc_avg_10_32x16_c: 12601.6 vvc_avg_10_32x16_avx2: 70.8 vvc_avg_10_32x32_c: 15958.6 vvc_avg_10_32x32_avx2: 124.3 vvc_avg_10_32x64_c: 31784.6 vvc_avg_10_32x64_avx2: 757.3 vvc_avg_10_32x128_c: 63892.8 vvc_avg_10_32x128_avx2: 1711.3 vvc_avg_10_64x2_c: 1890.8 vvc_avg_10_64x2_avx2: 34.3 vvc_avg_10_64x4_c: 6267.3 vvc_avg_10_64x4_avx2: 42.6 vvc_avg_10_64x8_c: 12778.1 vvc_avg_10_64x8_avx2: 67.8 vvc_avg_10_64x16_c: 22304.3 vvc_avg_10_64x16_avx2: 116.8 vvc_avg_10_64x32_c: 30777.1 vvc_avg_10_64x32_avx2: 201.1 vvc_avg_10_64x64_c: 60169.1 vvc_avg_10_64x64_avx2: 1454.3 vvc_avg_10_64x128_c: 124392.8 vvc_avg_10_64x128_avx2: 3648.6 vvc_avg_10_128x2
[FFmpeg-devel] [PATCH v2 6/8] tests/checkasm: add checkasm_check_vvc_mc
From: Wu Jianhua Signed-off-by: Wu Jianhua --- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 + tests/checkasm/checkasm.h | 1 + tests/checkasm/vvc_mc.c | 270 ++ 4 files changed, 275 insertions(+) create mode 100644 tests/checkasm/vvc_mc.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 3b5b54352b..3562acb2b2 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -40,6 +40,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o AVCODECOBJS-$(CONFIG_VORBIS_DECODER)+= vorbisdsp.o AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o +AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_mc.o CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 87f24c77ca..36a97957e5 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -194,6 +194,9 @@ static const struct { #if CONFIG_VORBIS_DECODER { "vorbisdsp", checkasm_check_vorbisdsp }, #endif +#if CONFIG_VVC_DECODER +{ "vvc_mc", checkasm_check_vvc_mc }, +#endif #endif #if CONFIG_AVFILTER #if CONFIG_AFIR_FILTER diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 4db8c495ea..53cb3ccfbf 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -131,6 +131,7 @@ void checkasm_check_vp8dsp(void); void checkasm_check_vp9dsp(void); void checkasm_check_videodsp(void); void checkasm_check_vorbisdsp(void); +void checkasm_check_vvc_mc(void); struct CheckasmPerf; diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c new file mode 100644 index 00..711280deec --- /dev/null +++ b/tests/checkasm/vvc_mc.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2023-2024 Nuo Mi + * Copyright (c) 2023-2024 Wu Jianhua + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include + +#include "checkasm.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/vvc/vvc_ctu.h" +#include "libavcodec/vvc/vvc_data.h" + +#include "libavutil/common.h" +#include "libavutil/internal.h" +#include "libavutil/internal.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/mem_internal.h" + +static const uint32_t pixel_mask[] = { 0x, 0x03ff03ff, 0x0fff0fff, 0x3fff3fff, 0x }; +static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 }; + +#define PIXEL_STRIDE (MAX_CTU_SIZE * 2) +#define EXTRA_BEFORE 3 +#define EXTRA_AFTER 4 +#define SRC_EXTRA(EXTRA_BEFORE + EXTRA_AFTER) * 2 +#define SRC_BUF_SIZE (PIXEL_STRIDE + SRC_EXTRA) * (PIXEL_STRIDE + SRC_EXTRA) +#define DST_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE * 2) +#define SRC_OFFSET ((PIXEL_STRIDE + EXTRA_BEFORE * 2) * EXTRA_BEFORE) + +#define randomize_buffers(buf0, buf1, size, mask) \ +do {\ +int k; \ +for (k = 0; k < size; k += 4) { \ +uint32_t r = rnd() & mask; \ +AV_WN32A(buf0 + k, r); \ +AV_WN32A(buf1 + k, r); \ +} \ +} while (0) + +#define randomize_pixels(buf0, buf1, size) \ +do {\ +uint32_t mask = pixel_mask[(bit_depth - 8) >> 1]; \ +randomize_buffers(buf0, buf1, size, mask); \ +} while (0) + +#define randomize_avg_src(buf0, buf1, size) \ +do {\ +uint32_t mask = 0x3fff3fff; \ 
+randomize_buffers(buf0, buf1, size, mask); \ +} while (0) + +static void check_put_vvc_luma(void) +{ +LOCAL_ALIGNED_32(int16_t, dst0, [DST_BUF_SIZE / 2]); +LOCAL_ALIGNED_32(int16_t, dst1, [DST_BUF_SIZE / 2]); +LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]); +VVCDSPContext c; + +declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, int16_t *dst, const uint8_t *src, const ptrdiff_t src_stride, +
[FFmpeg-devel] [PATCH v2 5/8] avcodec/vvcdec: reuse h26x/h2656_inter.asm to enable x86 optimizations
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/Makefile | 1 + libavcodec/vvc/vvcdsp.c | 4 + libavcodec/vvc/vvcdsp.h | 2 + libavcodec/x86/vvc/Makefile | 6 + libavcodec/x86/vvc/vvcdsp_init.c | 202 +++ 5 files changed, 215 insertions(+) create mode 100644 libavcodec/x86/vvc/Makefile create mode 100644 libavcodec/x86/vvc/vvcdsp_init.c diff --git a/libavcodec/Makefile b/libavcodec/Makefile index bb42095165..ce33631b60 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -65,6 +65,7 @@ OBJS = ac3_parser.o \ # subsystems include $(SRC_PATH)/libavcodec/vvc/Makefile +include $(SRC_PATH)/libavcodec/x86/vvc/Makefile OBJS-$(CONFIG_AANDCTTABLES)+= aandcttab.o OBJS-$(CONFIG_AC3DSP) += ac3dsp.o ac3.o ac3tab.o OBJS-$(CONFIG_ADTS_HEADER) += adts_header.o mpeg4audio_sample_rates.o diff --git a/libavcodec/vvc/vvcdsp.c b/libavcodec/vvc/vvcdsp.c index c82ea7be30..c542be5258 100644 --- a/libavcodec/vvc/vvcdsp.c +++ b/libavcodec/vvc/vvcdsp.c @@ -138,4 +138,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth) VVC_DSP(8); break; } + +#if ARCH_X86 +ff_vvc_dsp_init_x86(vvcdsp, bit_depth); +#endif } diff --git a/libavcodec/vvc/vvcdsp.h b/libavcodec/vvc/vvcdsp.h index b5a63c5833..6f59e73654 100644 --- a/libavcodec/vvc/vvcdsp.h +++ b/libavcodec/vvc/vvcdsp.h @@ -167,4 +167,6 @@ typedef struct VVCDSPContext { void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth); +void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth); + #endif /* AVCODEC_VVC_VVCDSP_H */ diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile new file mode 100644 index 00..b4acc22501 --- /dev/null +++ b/libavcodec/x86/vvc/Makefile @@ -0,0 +1,6 @@ +clean:: + $(RM) $(CLEANSUFFIXES:%=libavcodec/x86/vvc/%) + +OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o +X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/h26x/h2656dsp.o \ + x86/h26x/h2656_inter.o diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c new file mode 100644 index 00..c197cdb4cc --- 
/dev/null +++ b/libavcodec/x86/vvc/vvcdsp_init.c @@ -0,0 +1,202 @@ +/* + * VVC DSP init for x86 + * + * Copyright (C) 2022-2024 Nuo Mi + * Copyright (c) 2023-2024 Wu Jianhua + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/vvc/vvcdec.h" +#include "libavcodec/vvc/vvc_ctu.h" +#include "libavcodec/vvc/vvcdsp.h" +#include "libavcodec/x86/h26x/h2656dsp.h" + +#define FW_PUT(name, depth, opt) \ +static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \ + int height, const int8_t *hf, const int8_t *vf, int width)\ +{ \ +ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, srcstride, height, hf, vf, width); \ +} + +#define FW_PUT_TAP(fname, bitd, opt ) \ +FW_PUT(fname##4, bitd, opt ); \ +FW_PUT(fname##8, bitd, opt ); \ +FW_PUT(fname##16, bitd, opt ); \ +FW_PUT(fname##32, bitd, opt ); \ +FW_PUT(fname##64, bitd, opt ); \ +FW_PUT(fname##128, bitd, opt ); \ + +#define FW_PUT_4TAP(fname, bitd, opt) \ +FW_PUT(fname ## 2, bitd, opt) \ +FW_PUT_TAP(fname, bitd, opt) + +#define FW_PUT_4TAP_SSE4(bitd) \ +FW_PUT_4TAP(pixels, bitd, sse4) \ +FW_PUT_4TAP(4tap_h, 
bitd, sse4) \ +FW_PUT_4TAP(4tap_v, bitd, sse4) \ +FW_PUT_4TAP(4tap_hv, bitd, sse4) + +#define FW_PUT_8TAP_SSE4(bitd) \ +FW_PUT_TAP(8tap_h, bitd, sse4) \ +FW_PUT_TAP(8tap_v, bitd, sse4) \ +FW_PUT_TAP(8tap_hv, bitd, sse4) + +#define FW_PUT_SSE4(bitd) \ +FW_PUT_4TAP_SSE4(bitd) \ +FW_PUT_8TAP_SSE4(bitd) + +FW_PUT_SSE4( 8); +FW_PUT_SSE4(10); +FW_PUT_SSE4(12); + +#define FW_PUT_T
[FFmpeg-devel] [PATCH v2 1/8] avcodec/vvc/vvc_inter_template: move put/put_luma/put_chroma template to h2656_inter_template.c
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/h26x/h2656_inter_template.c | 577 + libavcodec/vvc/vvc_inter_template.c| 559 +--- 2 files changed, 578 insertions(+), 558 deletions(-) create mode 100644 libavcodec/h26x/h2656_inter_template.c diff --git a/libavcodec/h26x/h2656_inter_template.c b/libavcodec/h26x/h2656_inter_template.c new file mode 100644 index 00..864f6c7e7d --- /dev/null +++ b/libavcodec/h26x/h2656_inter_template.c @@ -0,0 +1,577 @@ +/* + * inter prediction template for HEVC/VVC + * + * Copyright (C) 2022 Nuo Mi + * Copyright (C) 2024 Wu Jianhua + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define CHROMA_EXTRA_BEFORE 1 +#define CHROMA_EXTRA3 +#define LUMA_EXTRA_BEFORE 3 +#define LUMA_EXTRA 7 + +static void FUNC(put_pixels)(int16_t *dst, +const uint8_t *_src, const ptrdiff_t _src_stride, +const int height, const int8_t *hf, const int8_t *vf, const int width) +{ +const pixel *src= (const pixel *)_src; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + +for (int y = 0; y < height; y++) { +for (int x = 0; x < width; x++) +dst[x] = src[x] << (14 - BIT_DEPTH); +src += src_stride; +dst += MAX_PB_SIZE; +} +} + +static void FUNC(put_uni_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride, +const uint8_t *_src, const ptrdiff_t _src_stride, const int height, + const int8_t *hf, const int8_t *vf, const int width) +{ +const pixel *src= (const pixel *)_src; +pixel *dst = (pixel *)_dst; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); +const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + +for (int y = 0; y < height; y++) { +memcpy(dst, src, width * sizeof(pixel)); +src += src_stride; +dst += dst_stride; +} +} + +static void FUNC(put_uni_w_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride, +const uint8_t *_src, const ptrdiff_t _src_stride, const int height, +const int denom, const int wx, const int _ox, const int8_t *hf, const int8_t *vf, +const int width) +{ +const pixel *src= (const pixel *)_src; +pixel *dst = (pixel *)_dst; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); +const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); +const int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 +const int offset= 1 << (shift - 1); +#else +const int offset= 0; +#endif +const int ox= _ox * (1 << (BIT_DEPTH - 8)); + +for (int y = 0; y < height; y++) { +for (int x = 0; x < width; x++) { +const int v = (src[x] 
<< (14 - BIT_DEPTH)); +dst[x] = av_clip_pixel(((v * wx + offset) >> shift) + ox); +} +src += src_stride; +dst += dst_stride; +} +} + +#define LUMA_FILTER(src, stride) \ +(filter[0] * src[x - 3 * stride] + \ + filter[1] * src[x - 2 * stride] + \ + filter[2] * src[x - stride] + \ + filter[3] * src[x ] + \ + filter[4] * src[x + stride] + \ + filter[5] * src[x + 2 * stride] + \ + filter[6] * src[x + 3 * stride] + \ + filter[7] * src[x + 4 * stride]) + +static void FUNC(put_luma_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, +const int height, const int8_t *hf, const int8_t *vf, const int width) +{ +const pixel *src = (const pixel*)_src; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); +const int8_t *filter = hf; + +for (int y = 0; y < height; y++) { +for (int x = 0; x < width; x++) +dst[x] = LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8); +src += src_stride; +dst += MAX_PB_SIZE; +} +} + +static void FUNC(put_luma_v)(int
[FFmpeg-devel] [PATCH v2 4/8] avcodec/x86/h26x/h2656_inter: add dststride to put
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/x86/h26x/h2656_inter.asm | 32 ++--- libavcodec/x86/h26x/h2656dsp.c | 4 ++-- libavcodec/x86/h26x/h2656dsp.h | 2 +- libavcodec/x86/hevcdsp_init.c | 2 +- 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm index 4316c8ae3d..68f88832a6 100644 --- a/libavcodec/x86/h26x/h2656_inter.asm +++ b/libavcodec/x86/h26x/h2656_inter.asm @@ -22,8 +22,6 @@ ; */ %include "libavutil/x86/x86util.asm" -%define MAX_PB_SIZE 64 - SECTION_RODATA 32 cextern pw_255 cextern pw_512 @@ -332,7 +330,7 @@ SECTION .text %endmacro %macro LOOP_END 3 -add %1q, 2*MAX_PB_SIZE ; dst += dststride +add %1q, dststrideq ; dst += dststride add %2q, %3q; src += srcstride dec heightd ; cmp height jnz .loop ; height loop @@ -529,7 +527,7 @@ SECTION .text ; ** -; void %1_put_pixels(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride, +; void %1_put_pixels(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, ; int height, const int8_t *hf, const int8_t *vf, int width) ; ** @@ -539,7 +537,7 @@ SECTION .text %endmacro %macro MC_PIXELS 3 -cglobal %1_put_pixels%2_%3, 4, 4, 3, dst, src, srcstride, height +cglobal %1_put_pixels%2_%3, 5, 5, 3, dst, dststride, src, srcstride, height pxor m2, m2 .loop: SIMPLE_LOAD %2, %3, srcq, m0 @@ -569,10 +567,10 @@ cglobal %1_put_uni_pixels%2_%3, 5, 5, 2, dst, dststride, src, srcstride, height %endif ; ** -; void %1_put_4tap_hX(int16_t *dst, +; void %1_put_4tap_hX(int16_t *dst, ptrdiff_t dststride, ; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width); ; ** -cglobal %1_put_4tap_h%2_%3, 5, 5, XMM_REGS, dst, src, srcstride, height, hf +cglobal %1_put_4tap_h%2_%3, 6, 6, XMM_REGS, dst, dststride, src, srcstride, height, hf %assign %%stride ((%3 + 7)/8) MC_4TAP_FILTER %3, hf, m4, m5 .loop: @@ -602,10 +600,10 @@ cglobal %1_put_uni_4tap_h%2_%3, 6, 7, XMM_REGS, dst, dststride, src, 
srcstride, RET ; ** -; void %1_put_4tap_v(int16_t *dst, +; void %1_put_4tap_v(int16_t *dst, ptrdiff_t dststride, ; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width) ; ** -cglobal %1_put_4tap_v%2_%3, 6, 6, XMM_REGS, dst, src, srcstride, height, r3src, vf +cglobal %1_put_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, vf sub srcq, srcstrideq MC_4TAP_FILTER%3, vf, m4, m5 lea r3srcq, [srcstrideq*3] @@ -639,10 +637,10 @@ cglobal %1_put_uni_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride, %macro PUT_4TAP_HV 3 ; ** -; void put_4tap_hv(int16_t *dst, +; void put_4tap_hv(int16_t *dst, ptrdiff_t dststride, ; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width) ; ** -cglobal %1_put_4tap_hv%2_%3, 6, 7, 16 , dst, src, srcstride, height, hf, vf, r3src +cglobal %1_put_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, height, hf, vf, r3src %assign %%stride ((%3 + 7)/8) sub srcq, srcstrideq MC_4TAP_HV_FILTER%3 @@ -774,12 +772,12 @@ cglobal %1_put_uni_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, heig %endmacro ; ** -; void put_8tap_hX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride, +; void put_8tap_hX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, ; int height, const int8_t *hf, const int8_t *vf, int width) ; ** %macro PUT_8TAP 3 -cglobal %1_put_8tap_h%2_%3, 5, 5, 16, dst, src, srcstride, height, hf +cglobal %1_put_8tap_h%2_%3, 6, 6, 16, dst, dststride, src, srcstride, height, hf MC_8TAP_FILTER %3, hf .loop: MC_8TAP_H_LOAD %3, srcq, %2, 10 @@ -814,10 +812,10 @@ cglobal %1_put_uni_8tap_h%2_%3, 6, 7, 16 , dst, dststride, src, srcstride, heigh ; ** -; void put_8tap_vX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride, +; void put_8tap_vX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, ; int height, const int8_t *hf, const int8_t *vf, int width) ; ** -cglobal 
%1_put_8tap_v%2_%3, 6, 8, 16, dst, src, srcstride, height, r3src, vf +cglobal %1_put_8tap_v%2_%3, 7, 8, 16, dst, dststride, src, srcstride, height,
[FFmpeg-devel] [PATCH v2 3/8] avcodec/x86/hevc_mc: move put/put_uni to h26x/h2656_inter.asm
From: Wu Jianhua This enables the asm optimizations to be reused by VVC. Signed-off-by: Wu Jianhua --- libavcodec/x86/Makefile |1 + libavcodec/x86/h26x/h2656_inter.asm | 1135 +++ libavcodec/x86/h26x/h2656dsp.c | 98 +++ libavcodec/x86/h26x/h2656dsp.h | 103 +++ libavcodec/x86/hevc_mc.asm | 462 +-- libavcodec/x86/hevcdsp_init.c | 108 ++- 6 files changed, 1461 insertions(+), 446 deletions(-) create mode 100644 libavcodec/x86/h26x/h2656_inter.asm create mode 100644 libavcodec/x86/h26x/h2656dsp.c create mode 100644 libavcodec/x86/h26x/h2656dsp.h diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index d5fb30645a..8098cd840c 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -167,6 +167,7 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o\ x86/hevc_deblock.o\ x86/hevc_idct.o \ x86/hevc_mc.o \ + x86/h26x/h2656_inter.o\ x86/hevc_sao.o\ x86/hevc_sao_10bit.o X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm new file mode 100644 index 00..4316c8ae3d --- /dev/null +++ b/libavcodec/x86/h26x/h2656_inter.asm @@ -0,0 +1,1135 @@ +; /* +; * Provide SSE luma and chroma mc functions for HEVC/VVC decoding +; * Copyright (c) 2013 Pierre-Edouard LEPERE +; * Copyright (c) 2023-2024 Nuo Mi +; * Copyright (c) 2023-2024 Wu Jianhua +; * +; * This file is part of FFmpeg. +; * +; * FFmpeg is free software; you can redistribute it and/or +; * modify it under the terms of the GNU Lesser General Public +; * License as published by the Free Software Foundation; either +; * version 2.1 of the License, or (at your option) any later version. +; * +; * FFmpeg is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; * Lesser General Public License for more details. 
+; * +; * You should have received a copy of the GNU Lesser General Public +; * License along with FFmpeg; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +; */ +%include "libavutil/x86/x86util.asm" + +%define MAX_PB_SIZE 64 + +SECTION_RODATA 32 +cextern pw_255 +cextern pw_512 +cextern pw_2048 +cextern pw_1023 +cextern pw_1024 +cextern pw_4096 +cextern pw_8192 +%define scale_8 pw_512 +%define scale_10 pw_2048 +%define scale_12 pw_8192 +%define max_pixels_8 pw_255 +%define max_pixels_10 pw_1023 +max_pixels_12: times 16 dw ((1 << 12)-1) +cextern pb_0 + +SECTION .text +%macro SIMPLE_LOAD 4;width, bitd, tab, r1 +%if %1 == 2 || (%2 == 8 && %1 <= 4) +movd %4, [%3] ; load data from source +%elif %1 == 4 || (%2 == 8 && %1 <= 8) +movq %4, [%3] ; load data from source +%elif notcpuflag(avx) +movu %4, [%3] ; load data from source +%elif %1 <= 8 || (%2 == 8 && %1 <= 16) +movdqu %4, [%3] +%else +movu %4, [%3] +%endif +%endmacro + +%macro MC_4TAP_FILTER 4 ; bitdepth, filter, a, b, +vpbroadcastw %3, [%2q + 0 * 2] ; coeff 0, 1 +vpbroadcastw %4, [%2q + 1 * 2] ; coeff 2, 3 +%if %1 != 8 +pmovsxbw %3, xmm%3 +pmovsxbw %4, xmm%4 +%endif +%endmacro + +%macro MC_4TAP_HV_FILTER 1 +vpbroadcastw m12, [vfq + 0 * 2] ; vf 0, 1 +vpbroadcastw m13, [vfq + 1 * 2] ; vf 2, 3 +vpbroadcastw m14, [hfq + 0 * 2] ; hf 0, 1 +vpbroadcastw m15, [hfq + 1 * 2] ; hf 2, 3 + +pmovsxbw m12, xm12 +pmovsxbw m13, xm13 +%if %1 != 8 +pmovsxbw m14, xm14 +pmovsxbw m15, xm15 +%endif +lea r3srcq, [srcstrideq*3] +%endmacro + +%macro MC_8TAP_SAVE_FILTER 5;offset, mm registers +mova [rsp + %1 + 0*mmsize], %2 +mova [rsp + %1 + 1*mmsize], %3 +mova [rsp + %1 + 2*mmsize], %4 +mova [rsp + %1 + 3*mmsize], %5 +%endmacro + +%macro MC_8TAP_FILTER 2-3 ;bitdepth, filter, offset +vpbroadcastw m12, [%2q + 0 * 2] ; coeff 0, 1 +vpbroadcastw m13, [%2q + 1 * 2] ; coeff 2, 3 +vpbroadcastw m14, [%2q + 2 * 2] ; coeff 4, 5 +vpbroadcastw m15, [%2q + 3 * 2] ; coeff 6, 7 +%if %0 == 
3 +MC_8TAP_SAVE_FILTER%3, m12, m13, m14, m15 +%endif + +%if %1 != 8 +pmovsxbw m12, xm12 +pmovsxbw
[FFmpeg-devel] [PATCH v2 2/8] avcodec/hevcdsp_template: reuse put/put_luma/put_chroma from h2656_inter_template
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/hevcdsp_template.c | 594 +++--- 1 file changed, 46 insertions(+), 548 deletions(-) diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c index 0de14e9dcf..9b48bdf08e 100644 --- a/libavcodec/hevcdsp_template.c +++ b/libavcodec/hevcdsp_template.c @@ -26,6 +26,7 @@ #include "bit_depth_template.c" #include "hevcdsp.h" #include "h26x/h2656_sao_template.c" +#include "h26x/h2656_inter_template.c" static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height, GetBitContext *gb, int pcm_bit_depth) @@ -299,37 +300,51 @@ IDCT_DC(32) // -static void FUNC(put_hevc_pel_pixels)(int16_t *dst, - const uint8_t *_src, ptrdiff_t _srcstride, - int height, intptr_t mx, intptr_t my, int width) -{ -int x, y; -const pixel *src= (const pixel *)_src; -ptrdiff_t srcstride = _srcstride / sizeof(pixel); - -for (y = 0; y < height; y++) { -for (x = 0; x < width; x++) -dst[x] = src[x] << (14 - BIT_DEPTH); -src += srcstride; -dst += MAX_PB_SIZE; -} -} - -static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, - int height, intptr_t mx, intptr_t my, int width) -{ -int y; -const pixel *src= (const pixel *)_src; -ptrdiff_t srcstride = _srcstride / sizeof(pixel); -pixel *dst = (pixel *)_dst; -ptrdiff_t dststride = _dststride / sizeof(pixel); - -for (y = 0; y < height; y++) { -memcpy(dst, src, width * sizeof(pixel)); -src += srcstride; -dst += dststride; -} -} +#define ff_hevc_pel_filters ff_hevc_qpel_filters +#define DECL_HV_FILTER(f) \ +const uint8_t *hf = ff_hevc_ ## f ## _filters[mx - 1]; \ +const uint8_t *vf = ff_hevc_ ## f ## _filters[my - 1]; + +#define FW_PUT(p, f, t) \ +static void FUNC(put_hevc_## f)(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, int height,\ + intptr_t mx, intptr_t my, int width) \ +{ \ +DECL_HV_FILTER(p) \ +FUNC(put_ ## t)(dst, src, srcstride, height, hf, vf, width); \ +} + +#define FW_PUT_UNI(p, f, t) \ 
+static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \ + ptrdiff_t srcstride, int height, intptr_t mx, intptr_t my, int width) \ +{ \ +DECL_HV_FILTER(p) \ +FUNC(put_ ## t)(dst, dststride, src, srcstride, height, hf, vf, width); \ +} + +#define FW_PUT_UNI_W(p, f, t) \ +static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \ + ptrdiff_t srcstride,int height, int denom, int wx, int ox, \ + intptr_t mx, intptr_t my, int width) \ +{ \ +DECL_HV_FILTER(p) \ +FUNC(put_ ## t)(dst, dststride, src, srcstride, height, denom, wx, ox, hf, vf, width);\ +} + +#define FW_PUT_FUNCS(f, t, dir) \ +FW_PUT(f, f ## _ ## dir, t ## _ ## dir) \ +FW_PUT_UNI(f, f ## _uni_ ## dir, uni_ ## t ## _ ## dir)\ +FW_PUT_UNI_W(f, f ## _uni_w_ ## dir, uni_## t ## _w_ ## dir) + +FW_PUT(pel, pel_pixels, pixels) +FW_PUT_UNI(pel, pel_uni_pixels, uni_pixels) +FW_PUT_UNI_W(pel, pel_uni_w_pixels, uni_w_pixels) + +FW_PUT_FUNCS(qpel, luma, h ) +FW_PUT_FUNCS(qpel, luma, v ) +FW_PUT_FUNCS(qpel, luma, hv) +FW_PUT_FUNCS(epel, chroma, h ) +FW_PUT_FUNCS(epel, chroma, v ) +FW_PUT_FUNCS(epel, chroma, hv) static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_
[FFmpeg-devel] [PATCH 5/8] avcodec/vvcdec: reuse h26x/h2656_inter.asm to enable x86 optimizations
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/Makefile | 1 + libavcodec/vvc/vvcdsp.c | 4 + libavcodec/vvc/vvcdsp.h | 2 + libavcodec/x86/vvc/Makefile | 6 + libavcodec/x86/vvc/vvcdsp_init.c | 200 +++ 5 files changed, 213 insertions(+) create mode 100644 libavcodec/x86/vvc/Makefile create mode 100644 libavcodec/x86/vvc/vvcdsp_init.c diff --git a/libavcodec/Makefile b/libavcodec/Makefile index bb42095165..ce33631b60 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -65,6 +65,7 @@ OBJS = ac3_parser.o \ # subsystems include $(SRC_PATH)/libavcodec/vvc/Makefile +include $(SRC_PATH)/libavcodec/x86/vvc/Makefile OBJS-$(CONFIG_AANDCTTABLES)+= aandcttab.o OBJS-$(CONFIG_AC3DSP) += ac3dsp.o ac3.o ac3tab.o OBJS-$(CONFIG_ADTS_HEADER) += adts_header.o mpeg4audio_sample_rates.o diff --git a/libavcodec/vvc/vvcdsp.c b/libavcodec/vvc/vvcdsp.c index c82ea7be30..c542be5258 100644 --- a/libavcodec/vvc/vvcdsp.c +++ b/libavcodec/vvc/vvcdsp.c @@ -138,4 +138,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth) VVC_DSP(8); break; } + +#if ARCH_X86 +ff_vvc_dsp_init_x86(vvcdsp, bit_depth); +#endif } diff --git a/libavcodec/vvc/vvcdsp.h b/libavcodec/vvc/vvcdsp.h index b5a63c5833..6f59e73654 100644 --- a/libavcodec/vvc/vvcdsp.h +++ b/libavcodec/vvc/vvcdsp.h @@ -167,4 +167,6 @@ typedef struct VVCDSPContext { void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth); +void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth); + #endif /* AVCODEC_VVC_VVCDSP_H */ diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile new file mode 100644 index 00..b4acc22501 --- /dev/null +++ b/libavcodec/x86/vvc/Makefile @@ -0,0 +1,6 @@ +clean:: + $(RM) $(CLEANSUFFIXES:%=libavcodec/x86/vvc/%) + +OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o +X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/h26x/h2656dsp.o \ + x86/h26x/h2656_inter.o diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c new file mode 100644 index 00..69bbd07c80 --- 
/dev/null +++ b/libavcodec/x86/vvc/vvcdsp_init.c @@ -0,0 +1,200 @@ +/* + * VVC DSP init for x86 + * + * Copyright (C) 2022-2024 Nuo Mi + * Copyright (c) 2023-2024 Wu Jianhua + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/vvc/vvcdec.h" +#include "libavcodec/vvc/vvc_ctu.h" +#include "libavcodec/vvc/vvcdsp.h" +#include "libavcodec/x86/h26x/h2656dsp.h" + +#define FW_PUT(name, depth, opt) \ +static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \ + int height, const int8_t *hf, const int8_t *vf, int width)\ +{ \ +ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, srcstride, height, hf, vf, width); \ +} + +#define FW_PUT_TAP(fname, bitd, opt ) \ +FW_PUT(fname##4, bitd, opt ); \ +FW_PUT(fname##8, bitd, opt ); \ +FW_PUT(fname##16, bitd, opt ); \ +FW_PUT(fname##32, bitd, opt ); \ +FW_PUT(fname##64, bitd, opt ); \ +FW_PUT(fname##128, bitd, opt ); \ + +#define FW_PUT_4TAP(fname, bitd, opt) \ +FW_PUT(fname ## 2, bitd, opt) \ +FW_PUT_TAP(fname, bitd, opt) + +#define FW_PUT_4TAP_SSE4(bitd) \ +FW_PUT_4TAP(pixels, bitd, sse4) \ +FW_PUT_4TAP(4tap_h, 
bitd, sse4) \ +FW_PUT_4TAP(4tap_v, bitd, sse4) \ +FW_PUT_4TAP(4tap_hv, bitd, sse4) + +#define FW_PUT_8TAP_SSE4(bitd) \ +FW_PUT_TAP(8tap_h, bitd, sse4) \ +FW_PUT_TAP(8tap_v, bitd, sse4) \ +FW_PUT_TAP(8tap_hv, bitd, sse4) + +#define FW_PUT_SSE4(bitd) \ +FW_PUT_4TAP_SSE4(bitd) \ +FW_PUT_8TAP_SSE4(bitd) + +FW_PUT_SSE4( 8); +FW_PUT_SSE4(10); +FW_PUT_SSE4(12); + +#define FW_PUT_T
[FFmpeg-devel] [PATCH 8/8] tests/checkasm/vvc_mc: add check_avg
From: Wu Jianhua Signed-off-by: Wu Jianhua --- tests/checkasm/vvc_mc.c | 64 + 1 file changed, 64 insertions(+) diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c index 711280deec..8adb00573f 100644 --- a/tests/checkasm/vvc_mc.c +++ b/tests/checkasm/vvc_mc.c @@ -35,6 +35,7 @@ static const uint32_t pixel_mask[] = { 0x, 0x03ff03ff, 0x0fff0fff, 0x3fff3fff, 0x }; static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 }; +#define SIZEOF_PIXEL ((bit_depth + 7) / 8) #define PIXEL_STRIDE (MAX_CTU_SIZE * 2) #define EXTRA_BEFORE 3 #define EXTRA_AFTER 4 @@ -261,10 +262,73 @@ static void check_put_vvc_chroma_uni(void) report("put_uni_chroma"); } +#define AVG_SRC_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE) +#define AVG_DST_BUF_SIZE (MAX_PB_SIZE * MAX_PB_SIZE * 2) + +static void check_avg(void) +{ +LOCAL_ALIGNED_32(int16_t, src00, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int16_t, src01, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int16_t, src10, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(int16_t, src11, [AVG_SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, dst0, [AVG_DST_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, dst1, [AVG_DST_BUF_SIZE]); +VVCDSPContext c; + +for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) { +randomize_avg_src((uint8_t*)src00, (uint8_t*)src10, AVG_SRC_BUF_SIZE * sizeof(int16_t)); +randomize_avg_src((uint8_t*)src01, (uint8_t*)src11, AVG_SRC_BUF_SIZE * sizeof(int16_t)); +ff_vvc_dsp_init(&c, bit_depth); +for (int h = 2; h <= MAX_CTU_SIZE; h *= 2) { +for (int w = 2; w <= MAX_CTU_SIZE; w *= 2) { +{ + declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride, +const int16_t *src0, const int16_t *src1, int width, int height); +if (check_func(c.inter.avg, "avg_%d_%dx%d", bit_depth, w, h)) { +memset(dst0, 0, AVG_DST_BUF_SIZE); +memset(dst1, 0, AVG_DST_BUF_SIZE); +call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h); +call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, src11, w, h); +if (memcmp(dst0, dst1, DST_BUF_SIZE)) +fail(); 
+if (w == h) +bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h); +} +} +{ +declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride, +const int16_t *src0, const int16_t *src1, int width, int height, +int denom, int w0, int w1, int o0, int o1); +{ +const int denom = rnd() % 8; +const int w0= rnd() % 256 - 128; +const int w1= rnd() % 256 - 128; +const int o0= rnd() % 256 - 128; +const int o1= rnd() % 256 - 128; +if (check_func(c.inter.w_avg, "w_avg_%d_%dx%d", bit_depth, w, h)) { +memset(dst0, 0, AVG_DST_BUF_SIZE); +memset(dst1, 0, AVG_DST_BUF_SIZE); + +call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h, denom, w0, w1, o0, o1); +call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, src11, w, h, denom, w0, w1, o0, o1); +if (memcmp(dst0, dst1, DST_BUF_SIZE)) +fail(); +if (w == h) +bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h, denom, w0, w1, o0, o1); +} +} +} +} +} +} +report("avg"); +} + void checkasm_check_vvc_mc(void) { check_put_vvc_luma(); check_put_vvc_luma_uni(); check_put_vvc_chroma(); check_put_vvc_chroma_uni(); +check_avg(); } -- 2.34.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations
From: Wu Jianhua The avg/avg_w is based on dav1d. See https://code.videolan.org/videolan/dav1d/-/blob/master/src/x86/mc_avx2.asm vvc_avg_8_2x2_c: 71.6 vvc_avg_8_2x2_avx2: 26.8 vvc_avg_8_2x4_c: 140.8 vvc_avg_8_2x4_avx2: 34.6 vvc_avg_8_2x8_c: 410.3 vvc_avg_8_2x8_avx2: 41.3 vvc_avg_8_2x16_c: 769.3 vvc_avg_8_2x16_avx2: 60.3 vvc_avg_8_2x32_c: 1669.6 vvc_avg_8_2x32_avx2: 105.1 vvc_avg_8_2x64_c: 1978.3 vvc_avg_8_2x64_avx2: 425.8 vvc_avg_8_2x128_c: 6536.8 vvc_avg_8_2x128_avx2: 1315.1 vvc_avg_8_4x2_c: 155.6 vvc_avg_8_4x2_avx2: 26.1 vvc_avg_8_4x4_c: 250.3 vvc_avg_8_4x4_avx2: 31.3 vvc_avg_8_4x8_c: 831.8 vvc_avg_8_4x8_avx2: 41.3 vvc_avg_8_4x16_c: 1461.1 vvc_avg_8_4x16_avx2: 57.1 vvc_avg_8_4x32_c: 2821.6 vvc_avg_8_4x32_avx2: 105.1 vvc_avg_8_4x64_c: 3615.8 vvc_avg_8_4x64_avx2: 412.6 vvc_avg_8_4x128_c: 11962.6 vvc_avg_8_4x128_avx2: 1274.3 vvc_avg_8_8x2_c: 215.8 vvc_avg_8_8x2_avx2: 29.1 vvc_avg_8_8x4_c: 430.6 vvc_avg_8_8x4_avx2: 37.6 vvc_avg_8_8x8_c: 1463.3 vvc_avg_8_8x8_avx2: 51.8 vvc_avg_8_8x16_c: 2630.1 vvc_avg_8_8x16_avx2: 97.6 vvc_avg_8_8x32_c: 5813.8 vvc_avg_8_8x32_avx2: 196.6 vvc_avg_8_8x64_c: 6687.3 vvc_avg_8_8x64_avx2: 487.8 vvc_avg_8_8x128_c: 13178.6 vvc_avg_8_8x128_avx2: 1290.6 vvc_avg_8_16x2_c: 443.8 vvc_avg_8_16x2_avx2: 28.3 vvc_avg_8_16x4_c: 1253.3 vvc_avg_8_16x4_avx2: 32.1 vvc_avg_8_16x8_c: 2236.3 vvc_avg_8_16x8_avx2: 44.3 vvc_avg_8_16x16_c: 5127.8 vvc_avg_8_16x16_avx2: 63.3 vvc_avg_8_16x32_c: 6573.3 vvc_avg_8_16x32_avx2: 223.6 vvc_avg_8_16x64_c: 30311.8 vvc_avg_8_16x64_avx2: 437.8 vvc_avg_8_16x128_c: 25693.3 vvc_avg_8_16x128_avx2: 1266.8 vvc_avg_8_32x2_c: 954.6 vvc_avg_8_32x2_avx2: 32.1 vvc_avg_8_32x4_c: 2359.6 vvc_avg_8_32x4_avx2: 39.6 vvc_avg_8_32x8_c: 5703.6 vvc_avg_8_32x8_avx2: 57.1 vvc_avg_8_32x16_c: 9967.6 vvc_avg_8_32x16_avx2: 107.1 vvc_avg_8_32x32_c: 21327.6 vvc_avg_8_32x32_avx2: 272.6 vvc_avg_8_32x64_c: 39240.8 vvc_avg_8_32x64_avx2: 529.6 vvc_avg_8_32x128_c: 52580.8 vvc_avg_8_32x128_avx2: 1338.8 vvc_avg_8_64x2_c: 1647.3 vvc_avg_8_64x2_avx2: 38.8 
vvc_avg_8_64x4_c: 5130.1 vvc_avg_8_64x4_avx2: 58.8 vvc_avg_8_64x8_c: 6529.3 vvc_avg_8_64x8_avx2: 88.3 vvc_avg_8_64x16_c: 19913.6 vvc_avg_8_64x16_avx2: 162.3 vvc_avg_8_64x32_c: 39360.8 vvc_avg_8_64x32_avx2: 295.8 vvc_avg_8_64x64_c: 49658.3 vvc_avg_8_64x64_avx2: 784.1 vvc_avg_8_64x128_c: 108513.1 vvc_avg_8_64x128_avx2: 1977.1 vvc_avg_8_128x2_c: 3226.1 vvc_avg_8_128x2_avx2: 61.1 vvc_avg_8_128x4_c: 10280.3 vvc_avg_8_128x4_avx2: 94.6 vvc_avg_8_128x8_c: 18079.3 vvc_avg_8_128x8_avx2: 155.3 vvc_avg_8_128x16_c: 45121.8 vvc_avg_8_128x16_avx2: 285.3 vvc_avg_8_128x32_c: 48651.8 vvc_avg_8_128x32_avx2: 581.6 vvc_avg_8_128x64_c: 165078.6 vvc_avg_8_128x64_avx2: 1942.8 vvc_avg_8_128x128_c: 339103.1 vvc_avg_8_128x128_avx2: 4332.6 vvc_avg_10_2x2_c: 144.3 vvc_avg_10_2x2_avx2: 26.8 vvc_avg_10_2x4_c: 142.6 vvc_avg_10_2x4_avx2: 45.3 vvc_avg_10_2x8_c: 478.1 vvc_avg_10_2x8_avx2: 38.1 vvc_avg_10_2x16_c: 518.3 vvc_avg_10_2x16_avx2: 58.1 vvc_avg_10_2x32_c: 2059.8 vvc_avg_10_2x32_avx2: 93.1 vvc_avg_10_2x64_c: 2383.8 vvc_avg_10_2x64_avx2: 714.8 vvc_avg_10_2x128_c: 4498.3 vvc_avg_10_2x128_avx2: 1466.3 vvc_avg_10_4x2_c: 228.6 vvc_avg_10_4x2_avx2: 26.8 vvc_avg_10_4x4_c: 378.3 vvc_avg_10_4x4_avx2: 30.6 vvc_avg_10_4x8_c: 866.8 vvc_avg_10_4x8_avx2: 44.6 vvc_avg_10_4x16_c: 1018.1 vvc_avg_10_4x16_avx2: 58.1 vvc_avg_10_4x32_c: 3590.8 vvc_avg_10_4x32_avx2: 128.8 vvc_avg_10_4x64_c: 4200.8 vvc_avg_10_4x64_avx2: 663.6 vvc_avg_10_4x128_c: 8450.8 vvc_avg_10_4x128_avx2: 1531.8 vvc_avg_10_8x2_c: 369.3 vvc_avg_10_8x2_avx2: 28.3 vvc_avg_10_8x4_c: 513.8 vvc_avg_10_8x4_avx2: 32.1 vvc_avg_10_8x8_c: 1720.3 vvc_avg_10_8x8_avx2: 49.1 vvc_avg_10_8x16_c: 1894.8 vvc_avg_10_8x16_avx2: 71.6 vvc_avg_10_8x32_c: 3931.3 vvc_avg_10_8x32_avx2: 148.1 vvc_avg_10_8x64_c: 7964.3 vvc_avg_10_8x64_avx2: 613.1 vvc_avg_10_8x128_c: 15540.1 vvc_avg_10_8x128_avx2: 1585.1 vvc_avg_10_16x2_c: 877.3 vvc_avg_10_16x2_avx2: 27.6 vvc_avg_10_16x4_c: 955.8 vvc_avg_10_16x4_avx2: 29.8 vvc_avg_10_16x8_c: 3419.6 vvc_avg_10_16x8_avx2: 62.6 
vvc_avg_10_16x16_c: 3826.8 vvc_avg_10_16x16_avx2: 54.3 vvc_avg_10_16x32_c: 7655.3 vvc_avg_10_16x32_avx2: 86.3 vvc_avg_10_16x64_c: 30011.1 vvc_avg_10_16x64_avx2: 692.6 vvc_avg_10_16x128_c: 47894.8 vvc_avg_10_16x128_avx2: 1580.3 vvc_avg_10_32x2_c: 944.3 vvc_avg_10_32x2_avx2: 29.8 vvc_avg_10_32x4_c: 2022.6 vvc_avg_10_32x4_avx2: 35.1 vvc_avg_10_32x8_c: 6148.8 vvc_avg_10_32x8_avx2: 51.3 vvc_avg_10_32x16_c: 12601.6 vvc_avg_10_32x16_avx2: 70.8 vvc_avg_10_32x32_c: 15958.6 vvc_avg_10_32x32_avx2: 124.3 vvc_avg_10_32x64_c: 31784.6 vvc_avg_10_32x64_avx2: 757.3 vvc_avg_10_32x128_c: 63892.8 vvc_avg_10_32x128_avx2: 1711.3 vvc_avg_10_64x2_c: 1890.8 vvc_avg_10_64x2_avx2: 34.3 vvc_avg_10_64x4_c: 6267.3 vvc_avg_10_64x4_avx2: 42.6 vvc_avg_10_64x8_c: 12778.1 vvc_avg_10_64x8_avx2: 67.8 vvc_avg_10_64x16_c: 22304.3 vvc_avg_10_64x16_avx2: 116.8 vvc_avg_10_64x32_c: 30777.1 vvc_avg_10_64x32_avx2: 201.1 vvc_avg_10_64x64_c: 60169.1 vvc_avg_10_64x64_avx2: 1454.3 vvc_avg_10_64x128_c: 124392.8 vvc_avg_10_64x128_avx2: 3648.6 vvc_avg_10_128x2
[FFmpeg-devel] [PATCH 6/8] tests/checkasm: add checkasm_check_vvc_mc
From: Wu Jianhua Signed-off-by: Wu Jianhua --- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 + tests/checkasm/checkasm.h | 1 + tests/checkasm/vvc_mc.c | 270 ++ 4 files changed, 275 insertions(+) create mode 100644 tests/checkasm/vvc_mc.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 3b5b54352b..3562acb2b2 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -40,6 +40,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o AVCODECOBJS-$(CONFIG_VORBIS_DECODER)+= vorbisdsp.o AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o +AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_mc.o CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 87f24c77ca..36a97957e5 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -194,6 +194,9 @@ static const struct { #if CONFIG_VORBIS_DECODER { "vorbisdsp", checkasm_check_vorbisdsp }, #endif +#if CONFIG_VVC_DECODER +{ "vvc_mc", checkasm_check_vvc_mc }, +#endif #endif #if CONFIG_AVFILTER #if CONFIG_AFIR_FILTER diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 4db8c495ea..53cb3ccfbf 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -131,6 +131,7 @@ void checkasm_check_vp8dsp(void); void checkasm_check_vp9dsp(void); void checkasm_check_videodsp(void); void checkasm_check_vorbisdsp(void); +void checkasm_check_vvc_mc(void); struct CheckasmPerf; diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c new file mode 100644 index 00..711280deec --- /dev/null +++ b/tests/checkasm/vvc_mc.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2023-2024 Nuo Mi + * Copyright (c) 2023-2024 Wu Jianhua + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include + +#include "checkasm.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/vvc/vvc_ctu.h" +#include "libavcodec/vvc/vvc_data.h" + +#include "libavutil/common.h" +#include "libavutil/internal.h" +#include "libavutil/internal.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/mem_internal.h" + +static const uint32_t pixel_mask[] = { 0x, 0x03ff03ff, 0x0fff0fff, 0x3fff3fff, 0x }; +static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 }; + +#define PIXEL_STRIDE (MAX_CTU_SIZE * 2) +#define EXTRA_BEFORE 3 +#define EXTRA_AFTER 4 +#define SRC_EXTRA(EXTRA_BEFORE + EXTRA_AFTER) * 2 +#define SRC_BUF_SIZE (PIXEL_STRIDE + SRC_EXTRA) * (PIXEL_STRIDE + SRC_EXTRA) +#define DST_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE * 2) +#define SRC_OFFSET ((PIXEL_STRIDE + EXTRA_BEFORE * 2) * EXTRA_BEFORE) + +#define randomize_buffers(buf0, buf1, size, mask) \ +do {\ +int k; \ +for (k = 0; k < size; k += 4) { \ +uint32_t r = rnd() & mask; \ +AV_WN32A(buf0 + k, r); \ +AV_WN32A(buf1 + k, r); \ +} \ +} while (0) + +#define randomize_pixels(buf0, buf1, size) \ +do {\ +uint32_t mask = pixel_mask[(bit_depth - 8) >> 1]; \ +randomize_buffers(buf0, buf1, size, mask); \ +} while (0) + +#define randomize_avg_src(buf0, buf1, size) \ +do {\ +uint32_t mask = 0x3fff3fff; \ 
+randomize_buffers(buf0, buf1, size, mask); \ +} while (0) + +static void check_put_vvc_luma(void) +{ +LOCAL_ALIGNED_32(int16_t, dst0, [DST_BUF_SIZE / 2]); +LOCAL_ALIGNED_32(int16_t, dst1, [DST_BUF_SIZE / 2]); +LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]); +LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]); +VVCDSPContext c; + +declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, int16_t *dst, const uint8_t *src, const ptrdiff_t src_stride, +
[FFmpeg-devel] [PATCH 3/8] avcodec/x86/hevc_mc: move put/put_uni to h26x/h2656_inter.asm
From: Wu Jianhua This enable that the asm optimization can be reused by VVC Signed-off-by: Wu Jianhua --- libavcodec/x86/Makefile |1 + libavcodec/x86/h26x/h2656_inter.asm | 1135 +++ libavcodec/x86/h26x/h2656dsp.c | 98 +++ libavcodec/x86/h26x/h2656dsp.h | 105 +++ libavcodec/x86/hevc_mc.asm | 462 +-- libavcodec/x86/hevcdsp_init.c | 108 ++- 6 files changed, 1463 insertions(+), 446 deletions(-) create mode 100644 libavcodec/x86/h26x/h2656_inter.asm create mode 100644 libavcodec/x86/h26x/h2656dsp.c create mode 100644 libavcodec/x86/h26x/h2656dsp.h diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index d5fb30645a..8098cd840c 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -167,6 +167,7 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o\ x86/hevc_deblock.o\ x86/hevc_idct.o \ x86/hevc_mc.o \ + x86/h26x/h2656_inter.o\ x86/hevc_sao.o\ x86/hevc_sao_10bit.o X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm new file mode 100644 index 00..4316c8ae3d --- /dev/null +++ b/libavcodec/x86/h26x/h2656_inter.asm @@ -0,0 +1,1135 @@ +; /* +; * Provide SSE luma and chroma mc functions for HEVC/VVC decoding +; * Copyright (c) 2013 Pierre-Edouard LEPERE +; * Copyright (c) 2023-2024 Nuo Mi +; * Copyright (c) 2023-2024 Wu Jianhua +; * +; * This file is part of FFmpeg. +; * +; * FFmpeg is free software; you can redistribute it and/or +; * modify it under the terms of the GNU Lesser General Public +; * License as published by the Free Software Foundation; either +; * version 2.1 of the License, or (at your option) any later version. +; * +; * FFmpeg is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; * Lesser General Public License for more details. 
+; * +; * You should have received a copy of the GNU Lesser General Public +; * License along with FFmpeg; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +; */ +%include "libavutil/x86/x86util.asm" + +%define MAX_PB_SIZE 64 + +SECTION_RODATA 32 +cextern pw_255 +cextern pw_512 +cextern pw_2048 +cextern pw_1023 +cextern pw_1024 +cextern pw_4096 +cextern pw_8192 +%define scale_8 pw_512 +%define scale_10 pw_2048 +%define scale_12 pw_8192 +%define max_pixels_8 pw_255 +%define max_pixels_10 pw_1023 +max_pixels_12: times 16 dw ((1 << 12)-1) +cextern pb_0 + +SECTION .text +%macro SIMPLE_LOAD 4;width, bitd, tab, r1 +%if %1 == 2 || (%2 == 8 && %1 <= 4) +movd %4, [%3] ; load data from source +%elif %1 == 4 || (%2 == 8 && %1 <= 8) +movq %4, [%3] ; load data from source +%elif notcpuflag(avx) +movu %4, [%3] ; load data from source +%elif %1 <= 8 || (%2 == 8 && %1 <= 16) +movdqu %4, [%3] +%else +movu %4, [%3] +%endif +%endmacro + +%macro MC_4TAP_FILTER 4 ; bitdepth, filter, a, b, +vpbroadcastw %3, [%2q + 0 * 2] ; coeff 0, 1 +vpbroadcastw %4, [%2q + 1 * 2] ; coeff 2, 3 +%if %1 != 8 +pmovsxbw %3, xmm%3 +pmovsxbw %4, xmm%4 +%endif +%endmacro + +%macro MC_4TAP_HV_FILTER 1 +vpbroadcastw m12, [vfq + 0 * 2] ; vf 0, 1 +vpbroadcastw m13, [vfq + 1 * 2] ; vf 2, 3 +vpbroadcastw m14, [hfq + 0 * 2] ; hf 0, 1 +vpbroadcastw m15, [hfq + 1 * 2] ; hf 2, 3 + +pmovsxbw m12, xm12 +pmovsxbw m13, xm13 +%if %1 != 8 +pmovsxbw m14, xm14 +pmovsxbw m15, xm15 +%endif +lea r3srcq, [srcstrideq*3] +%endmacro + +%macro MC_8TAP_SAVE_FILTER 5;offset, mm registers +mova [rsp + %1 + 0*mmsize], %2 +mova [rsp + %1 + 1*mmsize], %3 +mova [rsp + %1 + 2*mmsize], %4 +mova [rsp + %1 + 3*mmsize], %5 +%endmacro + +%macro MC_8TAP_FILTER 2-3 ;bitdepth, filter, offset +vpbroadcastw m12, [%2q + 0 * 2] ; coeff 0, 1 +vpbroadcastw m13, [%2q + 1 * 2] ; coeff 2, 3 +vpbroadcastw m14, [%2q + 2 * 2] ; coeff 4, 5 +vpbroadcastw m15, [%2q + 3 * 2] ; coeff 6, 7 +%if %0 == 
3 +MC_8TAP_SAVE_FILTER%3, m12, m13, m14, m15 +%endif + +%if %1 != 8 +pmovsxbw m12, xm12 +pmovsxbw
[FFmpeg-devel] [PATCH 4/8] avcodec/x86/h26x/h2656_inter: add dststride to put
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/x86/h26x/h2656_inter.asm | 32 ++--- libavcodec/x86/h26x/h2656dsp.c | 4 ++-- libavcodec/x86/h26x/h2656dsp.h | 2 +- libavcodec/x86/hevcdsp_init.c | 2 +- 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm index 4316c8ae3d..68f88832a6 100644 --- a/libavcodec/x86/h26x/h2656_inter.asm +++ b/libavcodec/x86/h26x/h2656_inter.asm @@ -22,8 +22,6 @@ ; */ %include "libavutil/x86/x86util.asm" -%define MAX_PB_SIZE 64 - SECTION_RODATA 32 cextern pw_255 cextern pw_512 @@ -332,7 +330,7 @@ SECTION .text %endmacro %macro LOOP_END 3 -add %1q, 2*MAX_PB_SIZE ; dst += dststride +add %1q, dststrideq ; dst += dststride add %2q, %3q; src += srcstride dec heightd ; cmp height jnz .loop ; height loop @@ -529,7 +527,7 @@ SECTION .text ; ** -; void %1_put_pixels(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride, +; void %1_put_pixels(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, ; int height, const int8_t *hf, const int8_t *vf, int width) ; ** @@ -539,7 +537,7 @@ SECTION .text %endmacro %macro MC_PIXELS 3 -cglobal %1_put_pixels%2_%3, 4, 4, 3, dst, src, srcstride, height +cglobal %1_put_pixels%2_%3, 5, 5, 3, dst, dststride, src, srcstride, height pxor m2, m2 .loop: SIMPLE_LOAD %2, %3, srcq, m0 @@ -569,10 +567,10 @@ cglobal %1_put_uni_pixels%2_%3, 5, 5, 2, dst, dststride, src, srcstride, height %endif ; ** -; void %1_put_4tap_hX(int16_t *dst, +; void %1_put_4tap_hX(int16_t *dst, ptrdiff_t dststride, ; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width); ; ** -cglobal %1_put_4tap_h%2_%3, 5, 5, XMM_REGS, dst, src, srcstride, height, hf +cglobal %1_put_4tap_h%2_%3, 6, 6, XMM_REGS, dst, dststride, src, srcstride, height, hf %assign %%stride ((%3 + 7)/8) MC_4TAP_FILTER %3, hf, m4, m5 .loop: @@ -602,10 +600,10 @@ cglobal %1_put_uni_4tap_h%2_%3, 6, 7, XMM_REGS, dst, dststride, src, 
srcstride, RET ; ** -; void %1_put_4tap_v(int16_t *dst, +; void %1_put_4tap_v(int16_t *dst, ptrdiff_t dststride, ; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width) ; ** -cglobal %1_put_4tap_v%2_%3, 6, 6, XMM_REGS, dst, src, srcstride, height, r3src, vf +cglobal %1_put_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, vf sub srcq, srcstrideq MC_4TAP_FILTER%3, vf, m4, m5 lea r3srcq, [srcstrideq*3] @@ -639,10 +637,10 @@ cglobal %1_put_uni_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride, %macro PUT_4TAP_HV 3 ; ** -; void put_4tap_hv(int16_t *dst, +; void put_4tap_hv(int16_t *dst, ptrdiff_t dststride, ; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width) ; ** -cglobal %1_put_4tap_hv%2_%3, 6, 7, 16 , dst, src, srcstride, height, hf, vf, r3src +cglobal %1_put_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, height, hf, vf, r3src %assign %%stride ((%3 + 7)/8) sub srcq, srcstrideq MC_4TAP_HV_FILTER%3 @@ -774,12 +772,12 @@ cglobal %1_put_uni_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, heig %endmacro ; ** -; void put_8tap_hX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride, +; void put_8tap_hX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, ; int height, const int8_t *hf, const int8_t *vf, int width) ; ** %macro PUT_8TAP 3 -cglobal %1_put_8tap_h%2_%3, 5, 5, 16, dst, src, srcstride, height, hf +cglobal %1_put_8tap_h%2_%3, 6, 6, 16, dst, dststride, src, srcstride, height, hf MC_8TAP_FILTER %3, hf .loop: MC_8TAP_H_LOAD %3, srcq, %2, 10 @@ -814,10 +812,10 @@ cglobal %1_put_uni_8tap_h%2_%3, 6, 7, 16 , dst, dststride, src, srcstride, heigh ; ** -; void put_8tap_vX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride, +; void put_8tap_vX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride, ; int height, const int8_t *hf, const int8_t *vf, int width) ; ** -cglobal 
%1_put_8tap_v%2_%3, 6, 8, 16, dst, src, srcstride, height, r3src, vf +cglobal %1_put_8tap_v%2_%3, 7, 8, 16, dst, dststride, src, srcstride, height,
[FFmpeg-devel] [PATCH 2/8] avcodec/hevcdsp_template: reuse put/put_luma/put_chroma from h2656_inter_template
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/hevcdsp_template.c | 594 +++--- 1 file changed, 46 insertions(+), 548 deletions(-) diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c index 0de14e9dcf..9b48bdf08e 100644 --- a/libavcodec/hevcdsp_template.c +++ b/libavcodec/hevcdsp_template.c @@ -26,6 +26,7 @@ #include "bit_depth_template.c" #include "hevcdsp.h" #include "h26x/h2656_sao_template.c" +#include "h26x/h2656_inter_template.c" static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height, GetBitContext *gb, int pcm_bit_depth) @@ -299,37 +300,51 @@ IDCT_DC(32) // -static void FUNC(put_hevc_pel_pixels)(int16_t *dst, - const uint8_t *_src, ptrdiff_t _srcstride, - int height, intptr_t mx, intptr_t my, int width) -{ -int x, y; -const pixel *src= (const pixel *)_src; -ptrdiff_t srcstride = _srcstride / sizeof(pixel); - -for (y = 0; y < height; y++) { -for (x = 0; x < width; x++) -dst[x] = src[x] << (14 - BIT_DEPTH); -src += srcstride; -dst += MAX_PB_SIZE; -} -} - -static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, - int height, intptr_t mx, intptr_t my, int width) -{ -int y; -const pixel *src= (const pixel *)_src; -ptrdiff_t srcstride = _srcstride / sizeof(pixel); -pixel *dst = (pixel *)_dst; -ptrdiff_t dststride = _dststride / sizeof(pixel); - -for (y = 0; y < height; y++) { -memcpy(dst, src, width * sizeof(pixel)); -src += srcstride; -dst += dststride; -} -} +#define ff_hevc_pel_filters ff_hevc_qpel_filters +#define DECL_HV_FILTER(f) \ +const uint8_t *hf = ff_hevc_ ## f ## _filters[mx - 1]; \ +const uint8_t *vf = ff_hevc_ ## f ## _filters[my - 1]; + +#define FW_PUT(p, f, t) \ +static void FUNC(put_hevc_## f)(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, int height,\ + intptr_t mx, intptr_t my, int width) \ +{ \ +DECL_HV_FILTER(p) \ +FUNC(put_ ## t)(dst, src, srcstride, height, hf, vf, width); \ +} + +#define FW_PUT_UNI(p, f, t) \ 
+static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \ + ptrdiff_t srcstride, int height, intptr_t mx, intptr_t my, int width) \ +{ \ +DECL_HV_FILTER(p) \ +FUNC(put_ ## t)(dst, dststride, src, srcstride, height, hf, vf, width); \ +} + +#define FW_PUT_UNI_W(p, f, t) \ +static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \ + ptrdiff_t srcstride,int height, int denom, int wx, int ox, \ + intptr_t mx, intptr_t my, int width) \ +{ \ +DECL_HV_FILTER(p) \ +FUNC(put_ ## t)(dst, dststride, src, srcstride, height, denom, wx, ox, hf, vf, width);\ +} + +#define FW_PUT_FUNCS(f, t, dir) \ +FW_PUT(f, f ## _ ## dir, t ## _ ## dir) \ +FW_PUT_UNI(f, f ## _uni_ ## dir, uni_ ## t ## _ ## dir)\ +FW_PUT_UNI_W(f, f ## _uni_w_ ## dir, uni_## t ## _w_ ## dir) + +FW_PUT(pel, pel_pixels, pixels) +FW_PUT_UNI(pel, pel_uni_pixels, uni_pixels) +FW_PUT_UNI_W(pel, pel_uni_w_pixels, uni_w_pixels) + +FW_PUT_FUNCS(qpel, luma, h ) +FW_PUT_FUNCS(qpel, luma, v ) +FW_PUT_FUNCS(qpel, luma, hv) +FW_PUT_FUNCS(epel, chroma, h ) +FW_PUT_FUNCS(epel, chroma, v ) +FW_PUT_FUNCS(epel, chroma, hv) static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_
[FFmpeg-devel] [PATCH 1/8] avcodec/vvc/vvc_inter_template: move put/put_luma/put_chroma template to h2656_inter_template.c
From: Wu Jianhua Signed-off-by: Wu Jianhua --- libavcodec/h26x/h2656_inter_template.c | 577 + libavcodec/vvc/vvc_inter_template.c| 559 +--- 2 files changed, 578 insertions(+), 558 deletions(-) create mode 100644 libavcodec/h26x/h2656_inter_template.c diff --git a/libavcodec/h26x/h2656_inter_template.c b/libavcodec/h26x/h2656_inter_template.c new file mode 100644 index 00..864f6c7e7d --- /dev/null +++ b/libavcodec/h26x/h2656_inter_template.c @@ -0,0 +1,577 @@ +/* + * inter prediction template for HEVC/VVC + * + * Copyright (C) 2022 Nuo Mi + * Copyright (C) 2024 Wu Jianhua + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define CHROMA_EXTRA_BEFORE 1 +#define CHROMA_EXTRA3 +#define LUMA_EXTRA_BEFORE 3 +#define LUMA_EXTRA 7 + +static void FUNC(put_pixels)(int16_t *dst, +const uint8_t *_src, const ptrdiff_t _src_stride, +const int height, const int8_t *hf, const int8_t *vf, const int width) +{ +const pixel *src= (const pixel *)_src; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); + +for (int y = 0; y < height; y++) { +for (int x = 0; x < width; x++) +dst[x] = src[x] << (14 - BIT_DEPTH); +src += src_stride; +dst += MAX_PB_SIZE; +} +} + +static void FUNC(put_uni_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride, +const uint8_t *_src, const ptrdiff_t _src_stride, const int height, + const int8_t *hf, const int8_t *vf, const int width) +{ +const pixel *src= (const pixel *)_src; +pixel *dst = (pixel *)_dst; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); +const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); + +for (int y = 0; y < height; y++) { +memcpy(dst, src, width * sizeof(pixel)); +src += src_stride; +dst += dst_stride; +} +} + +static void FUNC(put_uni_w_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride, +const uint8_t *_src, const ptrdiff_t _src_stride, const int height, +const int denom, const int wx, const int _ox, const int8_t *hf, const int8_t *vf, +const int width) +{ +const pixel *src= (const pixel *)_src; +pixel *dst = (pixel *)_dst; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); +const ptrdiff_t dst_stride = _dst_stride / sizeof(pixel); +const int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 +const int offset= 1 << (shift - 1); +#else +const int offset= 0; +#endif +const int ox= _ox * (1 << (BIT_DEPTH - 8)); + +for (int y = 0; y < height; y++) { +for (int x = 0; x < width; x++) { +const int v = (src[x] 
<< (14 - BIT_DEPTH)); +dst[x] = av_clip_pixel(((v * wx + offset) >> shift) + ox); +} +src += src_stride; +dst += dst_stride; +} +} + +#define LUMA_FILTER(src, stride) \ +(filter[0] * src[x - 3 * stride] + \ + filter[1] * src[x - 2 * stride] + \ + filter[2] * src[x - stride] + \ + filter[3] * src[x ] + \ + filter[4] * src[x + stride] + \ + filter[5] * src[x + 2 * stride] + \ + filter[6] * src[x + 3 * stride] + \ + filter[7] * src[x + 4 * stride]) + +static void FUNC(put_luma_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, +const int height, const int8_t *hf, const int8_t *vf, const int width) +{ +const pixel *src = (const pixel*)_src; +const ptrdiff_t src_stride = _src_stride / sizeof(pixel); +const int8_t *filter = hf; + +for (int y = 0; y < height; y++) { +for (int x = 0; x < width; x++) +dst[x] = LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8); +src += src_stride; +dst += MAX_PB_SIZE; +} +} + +static void FUNC(put_luma_v)(int