About 16% faster on large clips (>1200px width), more than 2x slower on small clips (352px). So use a heuristic to select which one to use. --- libavcodec/huffyuvenc.c | 6 +++--- libavcodec/huffyuvencdsp.c | 4 ++-- libavcodec/huffyuvencdsp.h | 4 ++-- libavcodec/pngenc.c | 2 +- libavcodec/utvideoenc.c | 2 +- libavcodec/x86/huffyuvencdsp.asm | 5 +++++ libavcodec/x86/huffyuvencdsp_mmx.c | 9 ++++++++- 7 files changed, 22 insertions(+), 10 deletions(-)
diff --git a/libavcodec/huffyuvenc.c b/libavcodec/huffyuvenc.c index 49d711a..7e133b5 100644 --- a/libavcodec/huffyuvenc.c +++ b/libavcodec/huffyuvenc.c @@ -60,12 +60,12 @@ static inline int sub_left_prediction(HYuvContext *s, uint8_t *dst, } return left; } else { - for (i = 0; i < 16; i++) { + for (i = 0; i < 32; i++) { const int temp = src[i]; dst[i] = temp - left; left = temp; } - s->hencdsp.diff_bytes(dst + 16, src + 16, src + 15, w - 16); + s->hencdsp.diff_bytes(dst + 32, src + 32, src + 31, w - 32); return src[w-1]; } } else { @@ -217,7 +217,7 @@ static av_cold int encode_init(AVCodecContext *avctx) const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt); ff_huffyuv_common_init(avctx); - ff_huffyuvencdsp_init(&s->hencdsp); + ff_huffyuvencdsp_init(&s->hencdsp, s->width); avctx->extradata = av_mallocz(3*MAX_N + 4); if (s->flags&AV_CODEC_FLAG_PASS1) { diff --git a/libavcodec/huffyuvencdsp.c b/libavcodec/huffyuvencdsp.c index fdcd0b0..08bfd63 100644 --- a/libavcodec/huffyuvencdsp.c +++ b/libavcodec/huffyuvencdsp.c @@ -74,11 +74,11 @@ static void sub_hfyu_median_pred_c(uint8_t *dst, const uint8_t *src1, *left_top = lt; } -av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c) +av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, int w) { c->diff_bytes = diff_bytes_c; c->sub_hfyu_median_pred = sub_hfyu_median_pred_c; if (ARCH_X86) - ff_huffyuvencdsp_init_x86(c); + ff_huffyuvencdsp_init_x86(c, w); } diff --git a/libavcodec/huffyuvencdsp.h b/libavcodec/huffyuvencdsp.h index 9d09095..d66590b 100644 --- a/libavcodec/huffyuvencdsp.h +++ b/libavcodec/huffyuvencdsp.h @@ -35,7 +35,7 @@ typedef struct HuffYUVEncDSPContext { int *left, int *left_top); } HuffYUVEncDSPContext; -void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c); -void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c); +void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, int w); +void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int w); #endif /* AVCODEC_HUFFYUVENCDSP_H */ diff 
--git a/libavcodec/pngenc.c b/libavcodec/pngenc.c index 4204df2..26cde92 100644 --- a/libavcodec/pngenc.c +++ b/libavcodec/pngenc.c @@ -981,7 +981,7 @@ FF_DISABLE_DEPRECATION_WARNINGS FF_ENABLE_DEPRECATION_WARNINGS #endif - ff_huffyuvencdsp_init(&s->hdsp); + ff_huffyuvencdsp_init(&s->hdsp, avctx->width); s->filter_type = av_clip(avctx->prediction_method, PNG_FILTER_VALUE_NONE, diff --git a/libavcodec/utvideoenc.c b/libavcodec/utvideoenc.c index b8e1cc3..4753cfa 100644 --- a/libavcodec/utvideoenc.c +++ b/libavcodec/utvideoenc.c @@ -109,7 +109,7 @@ static av_cold int utvideo_encode_init(AVCodecContext *avctx) } ff_bswapdsp_init(&c->bdsp); - ff_huffyuvencdsp_init(&c->hdsp); + ff_huffyuvencdsp_init(&c->hdsp, avctx->width); /* Check the prediction method, and error out if unsupported */ if (avctx->prediction_method < 0 || avctx->prediction_method > 4) { diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm index 9625fbe..85a6616 100644 --- a/libavcodec/x86/huffyuvencdsp.asm +++ b/libavcodec/x86/huffyuvencdsp.asm @@ -65,3 +65,8 @@ DIFF_BYTES INIT_XMM sse2 DIFF_BYTES + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +DIFF_BYTES +%endif diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c b/libavcodec/x86/huffyuvencdsp_mmx.c index 9af5305..3eda0ba 100644 --- a/libavcodec/x86/huffyuvencdsp_mmx.c +++ b/libavcodec/x86/huffyuvencdsp_mmx.c @@ -33,6 +33,8 @@ void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w); void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w); +void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, + intptr_t w); #if HAVE_INLINE_ASM @@ -78,7 +80,7 @@ static void sub_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *src1, #endif /* HAVE_INLINE_ASM */ -av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c) +av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int w) { av_unused int cpu_flags = av_get_cpu_flags(); @@ -93,4 
+95,9 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c) if (EXTERNAL_SSE2(cpu_flags)) { c->diff_bytes = ff_diff_bytes_sse2; } + + // avx2 version only faster than sse2 when width is sufficiently large + if (EXTERNAL_AVX2(cpu_flags) && w > 1200) { + c->diff_bytes = ff_diff_bytes_avx2; + } } -- 1.9.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel