For unusual widths, 2 or 3 calls to smaller-width functions are
currently performed. Instead, perform 2 calls to functions of different
widths to split the workload.

The 8+16 split (for 8 bits) and the 4+8 split (for more than 8 bits)
can't be processed that way without modifications: some of the calls
would use unaligned buffers, and adding branches to handle this
resulted in no micro-benchmark benefit.

For block_w == 12 (around 1% of the pixels of the sequence):
Before:
12758 decicycles in epel_uni, 4093 runs, 3 skips
19389 decicycles in qpel_uni, 8187 runs, 5 skips
22699 decicycles in epel_bi, 32743 runs, 25 skips
34736 decicycles in qpel_bi, 32733 runs, 35 skips

After:
11929 decicycles in epel_uni, 4096 runs, 0 skips
18131 decicycles in qpel_uni, 8184 runs, 8 skips
20065 decicycles in epel_bi, 32750 runs, 18 skips
31458 decicycles in qpel_bi, 32753 runs, 15 skips
---
 libavcodec/x86/hevcdsp_init.c | 43 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 3b7cc51..6bcced6 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -123,6 +123,45 @@ void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dst
     mc_rep_uni_func(name, bitd, step, W, opt);        \
     mc_rep_bi_func(name, bitd, step, W, opt)
 
+#define mc_rep_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *dst,                                                  \
+                                                 uint8_t *src, ptrdiff_t _srcstride, int height,                \
+                                                 intptr_t mx, intptr_t my, int width)                           \
+{                                                                                                               \
+    ff_hevc_put_hevc_##name##step1##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width);               \
+    ff_hevc_put_hevc_##name##step2##_##bitd##_##opt(dst + step1, src + (step1 * ((bitd + 7) / 8)),              \
+                                                    _srcstride, height, mx, my, width);                         \
+}
+#define mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride,                         \
+                                                     uint8_t *src, ptrdiff_t _srcstride, int height,            \
+                                                     intptr_t mx, intptr_t my, int width)                       \
+{                                                                                                               \
+    ff_hevc_put_hevc_uni_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, height, mx, my, width);\
+    ff_hevc_put_hevc_uni_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride,            \
+                                                        src + (step1 * ((bitd + 7) / 8)), _srcstride,           \
+                                                        height, mx, my, width);                                 \
+}
+#define mc_rep_bi_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,            \
+                                                    ptrdiff_t _srcstride, int16_t* src2,                        \
+                                                    int height, intptr_t mx, intptr_t my, int width)            \
+{                                                                                                               \
+    ff_hevc_put_hevc_bi_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, height, mx, my, width);\
+    ff_hevc_put_hevc_bi_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride,             \
+                                                       src + (step1 * ((bitd + 7) / 8)), _srcstride,            \
+                                                       src2 + step1, height, mx, my, width);                    \
+}
+
+#define mc_rep_funcs(name, bitd, step, W, opt)        \
+    mc_rep_func(name, bitd, step, W, opt);            \
+    mc_rep_uni_func(name, bitd, step, W, opt);        \
+    mc_rep_bi_func(name, bitd, step, W, opt)
+
+#define mc_rep_funcs2(name, bitd, step1, step2, W, opt) \
+    mc_rep_func2(name, bitd, step1, step2, W, opt);     \
+    mc_rep_uni_func2(name, bitd, step1, step2, W, opt); \
+    mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
 
 #if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
 
@@ -180,7 +219,7 @@ mc_rep_funcs(epel_hv, 8,  8, 48, sse4);
 mc_rep_funcs(epel_hv, 8,  8, 32, sse4);
 mc_rep_funcs(epel_hv, 8,  8, 24, sse4);
 mc_rep_funcs(epel_hv, 8,  8, 16, sse4);
-mc_rep_funcs(epel_hv, 8,  4, 12, sse4);
+mc_rep_funcs2(epel_hv,8,  8,  4, 12, sse4);
 mc_rep_funcs(epel_hv,10,  8, 64, sse4);
 mc_rep_funcs(epel_hv,10,  8, 48, sse4);
 mc_rep_funcs(epel_hv,10,  8, 32, sse4);
@@ -231,7 +270,7 @@ mc_rep_funcs(qpel_hv, 8,  8, 48, sse4);
 mc_rep_funcs(qpel_hv, 8,  8, 32, sse4);
 mc_rep_funcs(qpel_hv, 8,  8, 24, sse4);
 mc_rep_funcs(qpel_hv, 8,  8, 16, sse4);
-mc_rep_funcs(qpel_hv, 8,  4, 12, sse4);
+mc_rep_funcs2(qpel_hv,8,  8,  4, 12, sse4);
 mc_rep_funcs(qpel_hv,10,  8, 64, sse4);
 mc_rep_funcs(qpel_hv,10,  8, 48, sse4);
 mc_rep_funcs(qpel_hv,10,  8, 32, sse4);
-- 
1.9.2.msysgit.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to