About 16% faster on large clips (>1200px width), more than 2x slower on small
clips (352px). So use a heuristic to select which one to use.
---
 libavcodec/huffyuvenc.c            | 6 +++---
 libavcodec/huffyuvencdsp.c         | 4 ++--
 libavcodec/huffyuvencdsp.h         | 4 ++--
 libavcodec/pngenc.c                | 2 +-
 libavcodec/utvideoenc.c            | 2 +-
 libavcodec/x86/huffyuvencdsp.asm   | 5 +++++
 libavcodec/x86/huffyuvencdsp_mmx.c | 9 ++++++++-
 7 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/libavcodec/huffyuvenc.c b/libavcodec/huffyuvenc.c
index 49d711a..7e133b5 100644
--- a/libavcodec/huffyuvenc.c
+++ b/libavcodec/huffyuvenc.c
@@ -60,12 +60,12 @@ static inline int sub_left_prediction(HYuvContext *s, uint8_t *dst,
             }
             return left;
         } else {
-            for (i = 0; i < 16; i++) {
+            for (i = 0; i < 32; i++) {
                 const int temp = src[i];
                 dst[i] = temp - left;
                 left   = temp;
             }
-            s->hencdsp.diff_bytes(dst + 16, src + 16, src + 15, w - 16);
+            s->hencdsp.diff_bytes(dst + 32, src + 32, src + 31, w - 32);
             return src[w-1];
         }
     } else {
@@ -217,7 +217,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
 
     ff_huffyuv_common_init(avctx);
-    ff_huffyuvencdsp_init(&s->hencdsp);
+    ff_huffyuvencdsp_init(&s->hencdsp, s->width);
 
     avctx->extradata = av_mallocz(3*MAX_N + 4);
     if (s->flags&AV_CODEC_FLAG_PASS1) {
diff --git a/libavcodec/huffyuvencdsp.c b/libavcodec/huffyuvencdsp.c
index fdcd0b0..08bfd63 100644
--- a/libavcodec/huffyuvencdsp.c
+++ b/libavcodec/huffyuvencdsp.c
@@ -74,11 +74,11 @@ static void sub_hfyu_median_pred_c(uint8_t *dst, const uint8_t *src1,
     *left_top = lt;
 }
 
-av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c)
+av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, int w)
 {
     c->diff_bytes           = diff_bytes_c;
     c->sub_hfyu_median_pred = sub_hfyu_median_pred_c;
 
     if (ARCH_X86)
-        ff_huffyuvencdsp_init_x86(c);
+        ff_huffyuvencdsp_init_x86(c, w);
 }
diff --git a/libavcodec/huffyuvencdsp.h b/libavcodec/huffyuvencdsp.h
index 9d09095..d66590b 100644
--- a/libavcodec/huffyuvencdsp.h
+++ b/libavcodec/huffyuvencdsp.h
@@ -35,7 +35,7 @@ typedef struct HuffYUVEncDSPContext {
                                  int *left, int *left_top);
 } HuffYUVEncDSPContext;
 
-void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c);
-void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c);
+void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, int w);
+void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int w);
 
 #endif /* AVCODEC_HUFFYUVENCDSP_H */
diff --git a/libavcodec/pngenc.c b/libavcodec/pngenc.c
index 4204df2..26cde92 100644
--- a/libavcodec/pngenc.c
+++ b/libavcodec/pngenc.c
@@ -981,7 +981,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    ff_huffyuvencdsp_init(&s->hdsp);
+    ff_huffyuvencdsp_init(&s->hdsp, avctx->width);
 
     s->filter_type = av_clip(avctx->prediction_method,
                              PNG_FILTER_VALUE_NONE,
diff --git a/libavcodec/utvideoenc.c b/libavcodec/utvideoenc.c
index b8e1cc3..4753cfa 100644
--- a/libavcodec/utvideoenc.c
+++ b/libavcodec/utvideoenc.c
@@ -109,7 +109,7 @@ static av_cold int utvideo_encode_init(AVCodecContext *avctx)
     }
 
     ff_bswapdsp_init(&c->bdsp);
-    ff_huffyuvencdsp_init(&c->hdsp);
+    ff_huffyuvencdsp_init(&c->hdsp, avctx->width);
 
     /* Check the prediction method, and error out if unsupported */
     if (avctx->prediction_method < 0 || avctx->prediction_method > 4) {
diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
index 9625fbe..85a6616 100644
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -65,3 +65,8 @@ DIFF_BYTES
 
 INIT_XMM sse2
 DIFF_BYTES
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+DIFF_BYTES
+%endif
diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c b/libavcodec/x86/huffyuvencdsp_mmx.c
index 9af5305..3eda0ba 100644
--- a/libavcodec/x86/huffyuvencdsp_mmx.c
+++ b/libavcodec/x86/huffyuvencdsp_mmx.c
@@ -33,6 +33,8 @@ void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                        intptr_t w);
 void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                         intptr_t w);
+void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                        intptr_t w);
 
 #if HAVE_INLINE_ASM
 
@@ -78,7 +80,7 @@ static void sub_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
 
 #endif /* HAVE_INLINE_ASM */
 
-av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c)
+av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int w)
 {
     av_unused int cpu_flags = av_get_cpu_flags();
 
@@ -93,4 +95,9 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c)
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->diff_bytes = ff_diff_bytes_sse2;
     }
+
+    // avx2 version only faster than sse2 when width is sufficiently large
+    if (EXTERNAL_AVX2(cpu_flags) && w > 1200) {
+        c->diff_bytes = ff_diff_bytes_avx2;
+    }
 }
-- 
1.9.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to