On Win64:

Before: 155576b
64765 decicycles in qpel_bi_w, 8185 runs, 7 skips
13676 decicycles in epel_bi_w, 16378 runs, 6 skips
54402 decicycles in qpel_uni_w, 1023 runs, 1 skips
12328 decicycles in epel_uni_w, 2048 runs, 0 skips

After: 94260b
65037 decicycles in qpel_bi_w, 8185 runs, 7 skips
13752 decicycles in epel_bi_w, 16380 runs, 4 skips
54709 decicycles in qpel_uni_w, 1021 runs, 3 skips
12037 decicycles in epel_uni_w, 2047 runs, 1 skips
---
 libavcodec/x86/hevcdsp_init.c | 542 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 461 insertions(+), 81 deletions(-)

diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 4c536ac..a8284db 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -550,9 +550,23 @@ mc_rep_proxies(qpel_hv,12,  8, sse4);
 #define ff_hevc_put_hevc_bi_qpel_hv16_12_sse4  proxy_bi_qpel_hv8_12_sse4
 mc_rep_funcs(qpel_hv,12,  4, 12, sse4);
 
+#define mc_rep_uni_w_proxy(bitd, step, opt) \
+static void proxy_uni_w##step##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t 
dststride, int16_t *_src, ptrdiff_t _srcstride, \
+                                               int height, int denom,  int 
_wx, int _ox, int width)                     \
+{                                                                              
                                         \
+    int i;                                                                     
                                         \
+    int16_t *src;                                                              
                                         \
+    uint8_t *dst;                                                              
                                         \
+    for (i = 0; i < width; i += step) {                                        
                                         \
+        src= _src + i;                                                         
                                         \
+        dst= _dst + (i * ((bitd + 7) / 8));                                    
                                         \
+        ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src, 
_srcstride, height, denom, _wx, _ox);        \
+    }                                                                          
                                         \
+}
+
 #define mc_rep_uni_w(bitd, step, W, opt) \
-void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t 
dststride, int16_t *_src, ptrdiff_t _srcstride,\
-                                               int height, int denom,  int 
_wx, int _ox)                                \
+static void no_proxy_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t 
dststride, int16_t *_src, ptrdiff_t _srcstride, \
+                                              int height, int denom,  int _wx, 
int _ox, int width)                      \
 {                                                                              
                                         \
     int i;                                                                     
                                         \
     int16_t *src;                                                              
                                         \
@@ -560,36 +574,84 @@ void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t 
*_dst, ptrdiff_t dststri
     for (i = 0; i < W; i += step) {                                            
                                         \
         src= _src + i;                                                         
                                         \
         dst= _dst + (i * ((bitd + 7) / 8));                                    
                                         \
-        ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src, 
_srcstride,                                  \
-                                                     height, denom, _wx, _ox); 
                                         \
+        ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src, 
_srcstride, height, denom, _wx, _ox);        \
     }                                                                          
                                         \
 }
 
+#define mc_rep_uni_w_unproxy(bitd, W, opt) \
+static void unproxy_uni_w##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t 
dststride, int16_t *src, ptrdiff_t srcstride,     \
+                                              int height, int denom,  int _wx, 
int _ox, int width)                      \
+{                                                                              
                                         \
+    ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(dst, dststride, src, srcstride, 
height, denom, _wx, _ox);                \
+}
+
 mc_rep_uni_w(8, 6, 12, sse4);
-mc_rep_uni_w(8, 8, 16, sse4);
-mc_rep_uni_w(8, 8, 24, sse4);
-mc_rep_uni_w(8, 8, 32, sse4);
-mc_rep_uni_w(8, 8, 48, sse4);
-mc_rep_uni_w(8, 8, 64, sse4);
+#define ff_hevc_put_hevc_uni_w12_8_sse4   no_proxy_uni_w12_8_sse4
+mc_rep_uni_w_proxy(8, 8, sse4);
+#define ff_hevc_put_hevc_uni_w64_8_sse4   proxy_uni_w8_8_sse4
+#define ff_hevc_put_hevc_uni_w48_8_sse4   proxy_uni_w8_8_sse4
+#define ff_hevc_put_hevc_uni_w32_8_sse4   proxy_uni_w8_8_sse4
+#define ff_hevc_put_hevc_uni_w24_8_sse4   proxy_uni_w8_8_sse4
+#define ff_hevc_put_hevc_uni_w16_8_sse4   proxy_uni_w8_8_sse4
+mc_rep_uni_w_unproxy(8, 4, sse4);
+mc_rep_uni_w_unproxy(8, 6, sse4);
+mc_rep_uni_w_unproxy(8, 8, sse4);
+#define ff_hevc_put_hevc_uni_w4_8_sse4    unproxy_uni_w4_8_sse4
+#define ff_hevc_put_hevc_uni_w6_8_sse4    unproxy_uni_w6_8_sse4
+#define ff_hevc_put_hevc_uni_w8_8_sse4    unproxy_uni_w8_8_sse4
 
 mc_rep_uni_w(10, 6, 12, sse4);
-mc_rep_uni_w(10, 8, 16, sse4);
-mc_rep_uni_w(10, 8, 24, sse4);
-mc_rep_uni_w(10, 8, 32, sse4);
-mc_rep_uni_w(10, 8, 48, sse4);
-mc_rep_uni_w(10, 8, 64, sse4);
+#define ff_hevc_put_hevc_uni_w12_10_sse4  no_proxy_uni_w12_10_sse4
+mc_rep_uni_w_proxy(10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w64_10_sse4  proxy_uni_w8_10_sse4
+#define ff_hevc_put_hevc_uni_w48_10_sse4  proxy_uni_w8_10_sse4
+#define ff_hevc_put_hevc_uni_w32_10_sse4  proxy_uni_w8_10_sse4
+#define ff_hevc_put_hevc_uni_w24_10_sse4  proxy_uni_w8_10_sse4
+#define ff_hevc_put_hevc_uni_w16_10_sse4  proxy_uni_w8_10_sse4
+mc_rep_uni_w_unproxy(10, 4, sse4);
+mc_rep_uni_w_unproxy(10, 6, sse4);
+mc_rep_uni_w_unproxy(10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w4_10_sse4   unproxy_uni_w4_10_sse4
+#define ff_hevc_put_hevc_uni_w6_10_sse4   unproxy_uni_w6_10_sse4
+#define ff_hevc_put_hevc_uni_w8_10_sse4   unproxy_uni_w8_10_sse4
 
 mc_rep_uni_w(12, 6, 12, sse4);
-mc_rep_uni_w(12, 8, 16, sse4);
-mc_rep_uni_w(12, 8, 24, sse4);
-mc_rep_uni_w(12, 8, 32, sse4);
-mc_rep_uni_w(12, 8, 48, sse4);
-mc_rep_uni_w(12, 8, 64, sse4);
+#define ff_hevc_put_hevc_uni_w12_12_sse4  no_proxy_uni_w12_12_sse4
+mc_rep_uni_w_proxy(12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w64_12_sse4  proxy_uni_w8_12_sse4
+#define ff_hevc_put_hevc_uni_w48_12_sse4  proxy_uni_w8_12_sse4
+#define ff_hevc_put_hevc_uni_w32_12_sse4  proxy_uni_w8_12_sse4
+#define ff_hevc_put_hevc_uni_w24_12_sse4  proxy_uni_w8_12_sse4
+#define ff_hevc_put_hevc_uni_w16_12_sse4  proxy_uni_w8_12_sse4
+mc_rep_uni_w_unproxy(12, 4, sse4);
+mc_rep_uni_w_unproxy(12, 6, sse4);
+mc_rep_uni_w_unproxy(12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w4_12_sse4   unproxy_uni_w4_12_sse4
+#define ff_hevc_put_hevc_uni_w6_12_sse4   unproxy_uni_w6_12_sse4
+#define ff_hevc_put_hevc_uni_w8_12_sse4   unproxy_uni_w8_12_sse4
+
+#define mc_rep_bi_w_proxy(bitd, step, opt) \
+static void proxy_bi_w##step##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t 
dststride, int16_t *_src, ptrdiff_t _srcstride,  \
+                                              int16_t *_src2, int height,      
                                         \
+                                              int denom,  int _wx0,  int _wx1, 
int _ox0, int _ox1, int width)           \
+{                                                                              
                                         \
+    int i;                                                                     
                                         \
+    int16_t *src;                                                              
                                         \
+    int16_t *src2;                                                             
                                         \
+    uint8_t *dst;                                                              
                                         \
+    for (i = 0; i < width; i += step) {                                        
                                         \
+        src  = _src  + i;                                                      
                                         \
+        src2 = _src2 + i;                                                      
                                         \
+        dst  = _dst  + (i * ((bitd + 7) / 8));                                 
                                         \
+        ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(dst, dststride, src, 
_srcstride, src2,                             \
+                                                     height, denom, _wx0, 
_wx1, _ox0, _ox1);                            \
+    }                                                                          
                                         \
+}
 
 #define mc_rep_bi_w(bitd, step, W, opt) \
-void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t 
dststride, int16_t *_src, ptrdiff_t _srcstride, \
+static void no_proxy_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t 
dststride, int16_t *_src, ptrdiff_t _srcstride,  \
                                               int16_t *_src2, int height,      
                                         \
-                                              int denom,  int _wx0,  int _wx1, 
int _ox0, int _ox1)                      \
+                                              int denom,  int _wx0,  int _wx1, 
int _ox0, int _ox1, int width)           \
 {                                                                              
                                         \
     int i;                                                                     
                                         \
     int16_t *src;                                                              
                                         \
@@ -604,26 +666,69 @@ void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t 
*_dst, ptrdiff_t dststrid
     }                                                                          
                                         \
 }
 
+#define mc_rep_bi_w_unproxy(bitd, W, opt) \
+static void unproxy_bi_w##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dstride, 
int16_t *src, ptrdiff_t sstride,          \
+                                             int16_t *src2, int h, int denom, 
int w0, int w1, int o0, int o1, int w)    \
+{                                                                              
                                         \
+    ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(dst, dstride, src, sstride, 
src2, h, denom, w0, w1, o0, o1);              \
+}
+
 mc_rep_bi_w(8, 6, 12, sse4);
-mc_rep_bi_w(8, 8, 16, sse4);
-mc_rep_bi_w(8, 8, 24, sse4);
-mc_rep_bi_w(8, 8, 32, sse4);
-mc_rep_bi_w(8, 8, 48, sse4);
-mc_rep_bi_w(8, 8, 64, sse4);
+#define ff_hevc_put_hevc_bi_w12_8_sse4   no_proxy_bi_w12_8_sse4
+mc_rep_bi_w_proxy(8, 8, sse4);
+#define ff_hevc_put_hevc_bi_w64_8_sse4   proxy_bi_w8_8_sse4
+#define ff_hevc_put_hevc_bi_w48_8_sse4   proxy_bi_w8_8_sse4
+#define ff_hevc_put_hevc_bi_w32_8_sse4   proxy_bi_w8_8_sse4
+#define ff_hevc_put_hevc_bi_w24_8_sse4   proxy_bi_w8_8_sse4
+#define ff_hevc_put_hevc_bi_w16_8_sse4   proxy_bi_w8_8_sse4
+mc_rep_bi_w_unproxy(8, 4, sse4);
+mc_rep_bi_w_unproxy(8, 6, sse4);
+mc_rep_bi_w_unproxy(8, 8, sse4);
+#define ff_hevc_put_hevc_bi_w4_8_sse4    unproxy_bi_w4_8_sse4
+#define ff_hevc_put_hevc_bi_w6_8_sse4    unproxy_bi_w6_8_sse4
+#define ff_hevc_put_hevc_bi_w8_8_sse4    unproxy_bi_w8_8_sse4
 
 mc_rep_bi_w(10, 6, 12, sse4);
-mc_rep_bi_w(10, 8, 16, sse4);
-mc_rep_bi_w(10, 8, 24, sse4);
-mc_rep_bi_w(10, 8, 32, sse4);
-mc_rep_bi_w(10, 8, 48, sse4);
-mc_rep_bi_w(10, 8, 64, sse4);
+#define ff_hevc_put_hevc_bi_w12_10_sse4  no_proxy_bi_w12_10_sse4
+mc_rep_bi_w_proxy(10, 8, sse4);
+#define ff_hevc_put_hevc_bi_w64_10_sse4  proxy_bi_w8_10_sse4
+#define ff_hevc_put_hevc_bi_w48_10_sse4  proxy_bi_w8_10_sse4
+#define ff_hevc_put_hevc_bi_w32_10_sse4  proxy_bi_w8_10_sse4
+#define ff_hevc_put_hevc_bi_w24_10_sse4  proxy_bi_w8_10_sse4
+#define ff_hevc_put_hevc_bi_w16_10_sse4  proxy_bi_w8_10_sse4
+mc_rep_bi_w_unproxy(10, 4, sse4);
+mc_rep_bi_w_unproxy(10, 6, sse4);
+mc_rep_bi_w_unproxy(10, 8, sse4);
+#define ff_hevc_put_hevc_bi_w4_10_sse4   unproxy_bi_w4_10_sse4
+#define ff_hevc_put_hevc_bi_w6_10_sse4   unproxy_bi_w6_10_sse4
+#define ff_hevc_put_hevc_bi_w8_10_sse4   unproxy_bi_w8_10_sse4
 
 mc_rep_bi_w(12, 6, 12, sse4);
-mc_rep_bi_w(12, 8, 16, sse4);
-mc_rep_bi_w(12, 8, 24, sse4);
-mc_rep_bi_w(12, 8, 32, sse4);
-mc_rep_bi_w(12, 8, 48, sse4);
-mc_rep_bi_w(12, 8, 64, sse4);
+#define ff_hevc_put_hevc_bi_w12_12_sse4  no_proxy_bi_w12_12_sse4
+mc_rep_bi_w_proxy(12, 8, sse4);
+#define ff_hevc_put_hevc_bi_w64_12_sse4  proxy_bi_w8_12_sse4
+#define ff_hevc_put_hevc_bi_w48_12_sse4  proxy_bi_w8_12_sse4
+#define ff_hevc_put_hevc_bi_w32_12_sse4  proxy_bi_w8_12_sse4
+#define ff_hevc_put_hevc_bi_w24_12_sse4  proxy_bi_w8_12_sse4
+#define ff_hevc_put_hevc_bi_w16_12_sse4  proxy_bi_w8_12_sse4
+mc_rep_bi_w_unproxy(12, 4, sse4);
+mc_rep_bi_w_unproxy(12, 6, sse4);
+mc_rep_bi_w_unproxy(12, 8, sse4);
+#define ff_hevc_put_hevc_bi_w4_12_sse4   unproxy_bi_w4_12_sse4
+#define ff_hevc_put_hevc_bi_w6_12_sse4   unproxy_bi_w6_12_sse4
+#define ff_hevc_put_hevc_bi_w8_12_sse4   unproxy_bi_w8_12_sse4
+
+#define mc_uni_w_func_proxy(name, bitd, step, opt) \
+static void proxy_uni_w_##name##step##_##bitd##_##opt(uint8_t *dst, ptrdiff_t 
dststride,           \
+                                                      uint8_t *src, ptrdiff_t 
srcstride,           \
+                                                      int height, int denom,   
                    \
+                                                      int wx, int ox,          
                    \
+                                                      intptr_t mx, intptr_t 
my, int width)         \
+{                                                                              
                    \
+    LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]);                       
                    \
+    proxy_##name##step##_##bitd##_##opt(temp, src, srcstride, height, mx, my, 
width);              \
+    proxy_uni_w8##_##bitd##_##opt(dst, dststride, temp, MAX_PB_SIZE, height, 
denom, wx, ox, width);\
+}
 
 #define mc_uni_w_func(name, bitd, W, opt) \
 void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, 
ptrdiff_t _dststride,         \
@@ -634,54 +739,199 @@ void 
ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t
 {                                                                              
                     \
     LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]);                       
                     \
     ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, 
height, mx, my, width);     \
-    ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, 
MAX_PB_SIZE, height, denom, _wx, _ox);\
+    ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, 
MAX_PB_SIZE, height, denom, _wx, _ox, width);\
 }
 
 #define mc_uni_w_funcs(name, bitd, opt)       \
         mc_uni_w_func(name, bitd, 4, opt);    \
         mc_uni_w_func(name, bitd, 8, opt);    \
         mc_uni_w_func(name, bitd, 12, opt);   \
-        mc_uni_w_func(name, bitd, 16, opt);   \
         mc_uni_w_func(name, bitd, 24, opt);   \
+        mc_uni_w_func(name, bitd, 16, opt);   \
         mc_uni_w_func(name, bitd, 32, opt);   \
         mc_uni_w_func(name, bitd, 48, opt);   \
         mc_uni_w_func(name, bitd, 64, opt)
 
-mc_uni_w_funcs(pel_pixels, 8, sse4);
+#define mc_uni_w_proxy_funcs(name, bitd, step, opt) \
+        mc_uni_w_func(name, bitd, 4, opt);    \
+        mc_uni_w_func(name, bitd, 8, opt);    \
+        mc_uni_w_func(name, bitd, 12, opt);   \
+        mc_uni_w_func_proxy(name, bitd, step, opt)
+
+
+mc_uni_w_proxy_funcs(pel_pixels, 8, 16, sse4);
+#define ff_hevc_put_hevc_uni_w_pel_pixels16_8_sse4  
proxy_uni_w_pel_pixels16_8_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels32_8_sse4  
proxy_uni_w_pel_pixels16_8_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels48_8_sse4  
proxy_uni_w_pel_pixels16_8_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels64_8_sse4  
proxy_uni_w_pel_pixels16_8_sse4
+mc_uni_w_func(pel_pixels, 8, 24, sse4);
 mc_uni_w_func(pel_pixels, 8, 6, sse4);
-mc_uni_w_funcs(epel_h, 8, sse4);
+
+mc_uni_w_proxy_funcs(epel_h, 8, 16, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_h16_8_sse4      proxy_uni_w_epel_h16_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h32_8_sse4      proxy_uni_w_epel_h16_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h48_8_sse4      proxy_uni_w_epel_h16_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h64_8_sse4      proxy_uni_w_epel_h16_8_sse4
+mc_uni_w_func(epel_h, 8, 24, sse4);
 mc_uni_w_func(epel_h, 8, 6, sse4);
-mc_uni_w_funcs(epel_v, 8, sse4);
+
+mc_uni_w_proxy_funcs(epel_v, 8, 16, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_v16_8_sse4      proxy_uni_w_epel_v16_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v32_8_sse4      proxy_uni_w_epel_v16_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v48_8_sse4      proxy_uni_w_epel_v16_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v64_8_sse4      proxy_uni_w_epel_v16_8_sse4
+mc_uni_w_func(epel_v, 8, 24, sse4);
 mc_uni_w_func(epel_v, 8, 6, sse4);
-mc_uni_w_funcs(epel_hv, 8, sse4);
+
+mc_uni_w_proxy_funcs(epel_hv, 8, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_hv16_8_sse4     proxy_uni_w_epel_hv8_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv24_8_sse4     proxy_uni_w_epel_hv8_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv32_8_sse4     proxy_uni_w_epel_hv8_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv48_8_sse4     proxy_uni_w_epel_hv8_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv64_8_sse4     proxy_uni_w_epel_hv8_8_sse4
 mc_uni_w_func(epel_hv, 8, 6, sse4);
-mc_uni_w_funcs(qpel_h, 8, sse4);
-mc_uni_w_funcs(qpel_v, 8, sse4);
-mc_uni_w_funcs(qpel_hv, 8, sse4);
 
-mc_uni_w_funcs(pel_pixels, 10, sse4);
+mc_uni_w_proxy_funcs(qpel_h, 8, 16, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_h16_8_sse4      proxy_uni_w_qpel_h16_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h32_8_sse4      proxy_uni_w_qpel_h16_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h48_8_sse4      proxy_uni_w_qpel_h16_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h64_8_sse4      proxy_uni_w_qpel_h16_8_sse4
+mc_uni_w_func(qpel_h, 8, 24, sse4);
+
+mc_uni_w_proxy_funcs(qpel_v, 8, 16, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_v16_8_sse4      proxy_uni_w_qpel_v16_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v32_8_sse4      proxy_uni_w_qpel_v16_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v48_8_sse4      proxy_uni_w_qpel_v16_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v64_8_sse4      proxy_uni_w_qpel_v16_8_sse4
+mc_uni_w_func(qpel_v, 8, 24, sse4);
+
+mc_uni_w_proxy_funcs(qpel_hv, 8, 8, sse4);
+mc_uni_w_func(qpel_hv, 8, 16, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_hv24_8_sse4     proxy_uni_w_qpel_hv8_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv32_8_sse4     proxy_uni_w_qpel_hv8_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv48_8_sse4     proxy_uni_w_qpel_hv8_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv64_8_sse4     proxy_uni_w_qpel_hv8_8_sse4
+
+mc_uni_w_proxy_funcs(pel_pixels, 10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_pel_pixels16_10_sse4 
proxy_uni_w_pel_pixels8_10_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels24_10_sse4 
proxy_uni_w_pel_pixels8_10_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels32_10_sse4 
proxy_uni_w_pel_pixels8_10_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels48_10_sse4 
proxy_uni_w_pel_pixels8_10_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels64_10_sse4 
proxy_uni_w_pel_pixels8_10_sse4
 mc_uni_w_func(pel_pixels, 10, 6, sse4);
-mc_uni_w_funcs(epel_h, 10, sse4);
+
+mc_uni_w_proxy_funcs(epel_h, 10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_h16_10_sse4     proxy_uni_w_epel_h8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h24_10_sse4     proxy_uni_w_epel_h8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h32_10_sse4     proxy_uni_w_epel_h8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h48_10_sse4     proxy_uni_w_epel_h8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h64_10_sse4     proxy_uni_w_epel_h8_10_sse4
 mc_uni_w_func(epel_h, 10, 6, sse4);
-mc_uni_w_funcs(epel_v, 10, sse4);
+
+mc_uni_w_proxy_funcs(epel_v, 10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_v16_10_sse4     proxy_uni_w_epel_v8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v24_10_sse4     proxy_uni_w_epel_v8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v32_10_sse4     proxy_uni_w_epel_v8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v48_10_sse4     proxy_uni_w_epel_v8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v64_10_sse4     proxy_uni_w_epel_v8_10_sse4
 mc_uni_w_func(epel_v, 10, 6, sse4);
-mc_uni_w_funcs(epel_hv, 10, sse4);
+
+mc_uni_w_proxy_funcs(epel_hv, 10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_hv16_10_sse4    
proxy_uni_w_epel_hv8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv24_10_sse4    
proxy_uni_w_epel_hv8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv32_10_sse4    
proxy_uni_w_epel_hv8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv48_10_sse4    
proxy_uni_w_epel_hv8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv64_10_sse4    
proxy_uni_w_epel_hv8_10_sse4
 mc_uni_w_func(epel_hv, 10, 6, sse4);
-mc_uni_w_funcs(qpel_h, 10, sse4);
-mc_uni_w_funcs(qpel_v, 10, sse4);
-mc_uni_w_funcs(qpel_hv, 10, sse4);
 
-mc_uni_w_funcs(pel_pixels, 12, sse4);
+mc_uni_w_proxy_funcs(qpel_h, 10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_h16_10_sse4     proxy_uni_w_qpel_h8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h24_10_sse4     proxy_uni_w_qpel_h8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h32_10_sse4     proxy_uni_w_qpel_h8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h48_10_sse4     proxy_uni_w_qpel_h8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h64_10_sse4     proxy_uni_w_qpel_h8_10_sse4
+
+mc_uni_w_proxy_funcs(qpel_v, 10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_v16_10_sse4     proxy_uni_w_qpel_v8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v24_10_sse4     proxy_uni_w_qpel_v8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v32_10_sse4     proxy_uni_w_qpel_v8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v48_10_sse4     proxy_uni_w_qpel_v8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v64_10_sse4     proxy_uni_w_qpel_v8_10_sse4
+
+mc_uni_w_proxy_funcs(qpel_hv, 10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_hv16_10_sse4    
proxy_uni_w_qpel_hv8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv24_10_sse4    
proxy_uni_w_qpel_hv8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv32_10_sse4    
proxy_uni_w_qpel_hv8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv48_10_sse4    
proxy_uni_w_qpel_hv8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv64_10_sse4    
proxy_uni_w_qpel_hv8_10_sse4
+
+mc_uni_w_proxy_funcs(pel_pixels, 12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_pel_pixels16_12_sse4 
proxy_uni_w_pel_pixels8_12_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels24_12_sse4 
proxy_uni_w_pel_pixels8_12_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels32_12_sse4 
proxy_uni_w_pel_pixels8_12_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels48_12_sse4 
proxy_uni_w_pel_pixels8_12_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels64_12_sse4 
proxy_uni_w_pel_pixels8_12_sse4
 mc_uni_w_func(pel_pixels, 12, 6, sse4);
-mc_uni_w_funcs(epel_h, 12, sse4);
+
+mc_uni_w_proxy_funcs(epel_h, 12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_h16_12_sse4     proxy_uni_w_epel_h8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h24_12_sse4     proxy_uni_w_epel_h8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h32_12_sse4     proxy_uni_w_epel_h8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h48_12_sse4     proxy_uni_w_epel_h8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h64_12_sse4     proxy_uni_w_epel_h8_12_sse4
 mc_uni_w_func(epel_h, 12, 6, sse4);
-mc_uni_w_funcs(epel_v, 12, sse4);
+
+mc_uni_w_proxy_funcs(epel_v, 12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_v16_12_sse4     proxy_uni_w_epel_v8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v24_12_sse4     proxy_uni_w_epel_v8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v32_12_sse4     proxy_uni_w_epel_v8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v48_12_sse4     proxy_uni_w_epel_v8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v64_12_sse4     proxy_uni_w_epel_v8_12_sse4
 mc_uni_w_func(epel_v, 12, 6, sse4);
-mc_uni_w_funcs(epel_hv, 12, sse4);
+
+mc_uni_w_proxy_funcs(epel_hv, 12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_hv16_12_sse4    
proxy_uni_w_epel_hv8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv24_12_sse4    
proxy_uni_w_epel_hv8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv32_12_sse4    
proxy_uni_w_epel_hv8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv48_12_sse4    
proxy_uni_w_epel_hv8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv64_12_sse4    
proxy_uni_w_epel_hv8_12_sse4
 mc_uni_w_func(epel_hv, 12, 6, sse4);
-mc_uni_w_funcs(qpel_h, 12, sse4);
-mc_uni_w_funcs(qpel_v, 12, sse4);
-mc_uni_w_funcs(qpel_hv, 12, sse4);
+
+mc_uni_w_proxy_funcs(qpel_h, 12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_h16_12_sse4     proxy_uni_w_qpel_h8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h24_12_sse4     proxy_uni_w_qpel_h8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h32_12_sse4     proxy_uni_w_qpel_h8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h48_12_sse4     proxy_uni_w_qpel_h8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h64_12_sse4     proxy_uni_w_qpel_h8_12_sse4
+
+mc_uni_w_proxy_funcs(qpel_v, 12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_v16_12_sse4     proxy_uni_w_qpel_v8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v24_12_sse4     proxy_uni_w_qpel_v8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v32_12_sse4     proxy_uni_w_qpel_v8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v48_12_sse4     proxy_uni_w_qpel_v8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v64_12_sse4     proxy_uni_w_qpel_v8_12_sse4
+
+mc_uni_w_proxy_funcs(qpel_hv, 12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_hv16_12_sse4    
proxy_uni_w_qpel_hv8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv24_12_sse4    
proxy_uni_w_qpel_hv8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv32_12_sse4    
proxy_uni_w_qpel_hv8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv48_12_sse4    
proxy_uni_w_qpel_hv8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv64_12_sse4    
proxy_uni_w_qpel_hv8_12_sse4
+
+// 'step' only selects the first proxy call (proxy_<name><step>_...); the second call always uses proxy_bi_w8_...
+#define mc_bi_w_func_proxy(name, bitd, step, opt) \
+static void proxy_bi_w_##name##step##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t 
_dststride,           \
+                                                     uint8_t *_src, ptrdiff_t 
_srcstride,            \
+                                                     int16_t *_src2,           
                      \
+                                                     int height, int denom,    
                      \
+                                                     int _wx0, int _wx1, int 
_ox0, int _ox1,         \
+                                                     intptr_t mx, intptr_t my, 
int width)            \
+{                                                                              
                    \
+    LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]);                       
                    \
+    proxy_##name##step##_##bitd##_##opt(temp, _src, _srcstride, height, mx, 
my, width);      \
+    proxy_bi_w8##_##bitd##_##opt(_dst, _dststride, temp, MAX_PB_SIZE, _src2,   
         \
+                                 height, denom, _wx0, _wx1, _ox0, _ox1, 
width);          \
+}
 
 #define mc_bi_w_func(name, bitd, W, opt) \
 void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t 
_dststride,           \
@@ -694,7 +944,7 @@ void 
ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _
     LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]);                       
                      \
     ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, 
height, mx, my, width);      \
     ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, temp, 
MAX_PB_SIZE, _src2,            \
-                                             height, denom, _wx0, _wx1, _ox0, 
_ox1);                 \
+                                             height, denom, _wx0, _wx1, _ox0, 
_ox1, width);          \
 }
 
 #define mc_bi_w_funcs(name, bitd, opt)       \
@@ -707,41 +957,171 @@ void 
ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _
         mc_bi_w_func(name, bitd, 48, opt);   \
         mc_bi_w_func(name, bitd, 64, opt)
 
-mc_bi_w_funcs(pel_pixels, 8, sse4);
+#define mc_bi_w_proxy_funcs(name, bitd, step, opt) /*
+ * Instantiates the bi-weighted wrappers for one (name, bitd, opt) family:
+ * mc_bi_w_func for the fixed widths 4, 8 and 12, plus a single
+ * mc_bi_w_func_proxy for the given step. The expansion ends without a
+ * semicolon, so the invocation supplies the final one.
+ */ \
+        mc_bi_w_func(name, bitd, 4, opt);    \
+        mc_bi_w_func(name, bitd, 8, opt);    \
+        mc_bi_w_func(name, bitd, 12, opt);   \
+        mc_bi_w_func_proxy(name, bitd, step, opt)
+
+mc_bi_w_proxy_funcs(pel_pixels, 8, 16, sse4); /*
+ * 8-bit pel_pixels / epel_h / epel_v: widths 4, 8, 12 plus a step-16 proxy.
+ * The 16/32/48/64 names are #defined to that proxy; widths 24 and 6 get
+ * their own mc_bi_w_func instance.
+ * 8-bit epel_hv uses a step-8 proxy instead, with the 16/24/32/48/64 names
+ * #defined to it; width 6 still gets its own mc_bi_w_func instance.
+ */
+#define ff_hevc_put_hevc_bi_w_pel_pixels16_8_sse4 
proxy_bi_w_pel_pixels16_8_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels32_8_sse4 
proxy_bi_w_pel_pixels16_8_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels48_8_sse4 
proxy_bi_w_pel_pixels16_8_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels64_8_sse4 
proxy_bi_w_pel_pixels16_8_sse4
+mc_bi_w_func(pel_pixels, 8, 24, sse4);
 mc_bi_w_func(pel_pixels, 8, 6, sse4);
-mc_bi_w_funcs(epel_h, 8, sse4);
+
+mc_bi_w_proxy_funcs(epel_h, 8, 16, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_h16_8_sse4     proxy_bi_w_epel_h16_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h32_8_sse4     proxy_bi_w_epel_h16_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h48_8_sse4     proxy_bi_w_epel_h16_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h64_8_sse4     proxy_bi_w_epel_h16_8_sse4
+mc_bi_w_func(epel_h, 8, 24, sse4);
 mc_bi_w_func(epel_h, 8, 6, sse4);
-mc_bi_w_funcs(epel_v, 8, sse4);
+
+mc_bi_w_proxy_funcs(epel_v, 8, 16, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_v16_8_sse4     proxy_bi_w_epel_v16_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v32_8_sse4     proxy_bi_w_epel_v16_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v48_8_sse4     proxy_bi_w_epel_v16_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v64_8_sse4     proxy_bi_w_epel_v16_8_sse4
+mc_bi_w_func(epel_v, 8, 24, sse4);
 mc_bi_w_func(epel_v, 8, 6, sse4);
-mc_bi_w_funcs(epel_hv, 8, sse4);
+
+mc_bi_w_proxy_funcs(epel_hv, 8, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_hv16_8_sse4     proxy_bi_w_epel_hv8_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv24_8_sse4     proxy_bi_w_epel_hv8_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv32_8_sse4     proxy_bi_w_epel_hv8_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv48_8_sse4     proxy_bi_w_epel_hv8_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv64_8_sse4     proxy_bi_w_epel_hv8_8_sse4
 mc_bi_w_func(epel_hv, 8, 6, sse4);
-mc_bi_w_funcs(qpel_h, 8, sse4);
-mc_bi_w_funcs(qpel_v, 8, sse4);
-mc_bi_w_funcs(qpel_hv, 8, sse4);
 
-mc_bi_w_funcs(pel_pixels, 10, sse4);
+mc_bi_w_proxy_funcs(qpel_h, 8, 16, sse4); /*
+ * 8-bit qpel_h / qpel_v: widths 4, 8, 12 plus a step-16 proxy. The
+ * 16/32/48/64 names are #defined to that proxy and width 24 gets its own
+ * mc_bi_w_func instance.
+ * 8-bit qpel_hv uses a step-8 proxy, with the 16/24/32/48/64 names
+ * #defined to it.
+ */
+#define ff_hevc_put_hevc_bi_w_qpel_h16_8_sse4     proxy_bi_w_qpel_h16_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h32_8_sse4     proxy_bi_w_qpel_h16_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h48_8_sse4     proxy_bi_w_qpel_h16_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h64_8_sse4     proxy_bi_w_qpel_h16_8_sse4
+mc_bi_w_func(qpel_h, 8, 24, sse4);
+
+mc_bi_w_proxy_funcs(qpel_v, 8, 16, sse4);
+#define ff_hevc_put_hevc_bi_w_qpel_v16_8_sse4     proxy_bi_w_qpel_v16_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v32_8_sse4     proxy_bi_w_qpel_v16_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v48_8_sse4     proxy_bi_w_qpel_v16_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v64_8_sse4     proxy_bi_w_qpel_v16_8_sse4
+mc_bi_w_func(qpel_v, 8, 24, sse4);
+
+mc_bi_w_proxy_funcs(qpel_hv, 8, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_qpel_hv16_8_sse4     proxy_bi_w_qpel_hv8_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv24_8_sse4     proxy_bi_w_qpel_hv8_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv32_8_sse4     proxy_bi_w_qpel_hv8_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv48_8_sse4     proxy_bi_w_qpel_hv8_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv64_8_sse4     proxy_bi_w_qpel_hv8_8_sse4
+
+mc_bi_w_proxy_funcs(pel_pixels, 10, 8, sse4); /*
+ * 10-bit pel_pixels / epel_h / epel_v / epel_hv: widths 4, 8, 12 plus a
+ * step-8 proxy. The 16/24/32/48/64 names are #defined to that proxy and
+ * width 6 gets its own mc_bi_w_func instance.
+ */
+#define ff_hevc_put_hevc_bi_w_pel_pixels16_10_sse4 
proxy_bi_w_pel_pixels8_10_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels24_10_sse4 
proxy_bi_w_pel_pixels8_10_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels32_10_sse4 
proxy_bi_w_pel_pixels8_10_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels48_10_sse4 
proxy_bi_w_pel_pixels8_10_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels64_10_sse4 
proxy_bi_w_pel_pixels8_10_sse4
 mc_bi_w_func(pel_pixels, 10, 6, sse4);
-mc_bi_w_funcs(epel_h, 10, sse4);
+
+mc_bi_w_proxy_funcs(epel_h, 10, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_h16_10_sse4     proxy_bi_w_epel_h8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h24_10_sse4     proxy_bi_w_epel_h8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h32_10_sse4     proxy_bi_w_epel_h8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h48_10_sse4     proxy_bi_w_epel_h8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h64_10_sse4     proxy_bi_w_epel_h8_10_sse4
 mc_bi_w_func(epel_h, 10, 6, sse4);
-mc_bi_w_funcs(epel_v, 10, sse4);
+
+mc_bi_w_proxy_funcs(epel_v, 10, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_v16_10_sse4     proxy_bi_w_epel_v8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v24_10_sse4     proxy_bi_w_epel_v8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v32_10_sse4     proxy_bi_w_epel_v8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v48_10_sse4     proxy_bi_w_epel_v8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v64_10_sse4     proxy_bi_w_epel_v8_10_sse4
 mc_bi_w_func(epel_v, 10, 6, sse4);
-mc_bi_w_funcs(epel_hv, 10, sse4);
+
+mc_bi_w_proxy_funcs(epel_hv, 10, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_hv16_10_sse4     proxy_bi_w_epel_hv8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv24_10_sse4     proxy_bi_w_epel_hv8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv32_10_sse4     proxy_bi_w_epel_hv8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv48_10_sse4     proxy_bi_w_epel_hv8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv64_10_sse4     proxy_bi_w_epel_hv8_10_sse4
 mc_bi_w_func(epel_hv, 10, 6, sse4);
-mc_bi_w_funcs(qpel_h, 10, sse4);
-mc_bi_w_funcs(qpel_v, 10, sse4);
-mc_bi_w_funcs(qpel_hv, 10, sse4);
 
-mc_bi_w_funcs(pel_pixels, 12, sse4);
+mc_bi_w_proxy_funcs(qpel_h, 10, 8, sse4); /*
+ * 10-bit qpel_h / qpel_v / qpel_hv: widths 4, 8, 12 plus a step-8 proxy.
+ * The 16/24/32/48/64 names are #defined to that proxy.
+ */
+#define ff_hevc_put_hevc_bi_w_qpel_h16_10_sse4     proxy_bi_w_qpel_h8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h24_10_sse4     proxy_bi_w_qpel_h8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h32_10_sse4     proxy_bi_w_qpel_h8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h48_10_sse4     proxy_bi_w_qpel_h8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h64_10_sse4     proxy_bi_w_qpel_h8_10_sse4
+
+mc_bi_w_proxy_funcs(qpel_v, 10, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_qpel_v16_10_sse4     proxy_bi_w_qpel_v8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v24_10_sse4     proxy_bi_w_qpel_v8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v32_10_sse4     proxy_bi_w_qpel_v8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v48_10_sse4     proxy_bi_w_qpel_v8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v64_10_sse4     proxy_bi_w_qpel_v8_10_sse4
+
+mc_bi_w_proxy_funcs(qpel_hv, 10, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_qpel_hv16_10_sse4     proxy_bi_w_qpel_hv8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv24_10_sse4     proxy_bi_w_qpel_hv8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv32_10_sse4     proxy_bi_w_qpel_hv8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv48_10_sse4     proxy_bi_w_qpel_hv8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv64_10_sse4     proxy_bi_w_qpel_hv8_10_sse4
+
+mc_bi_w_proxy_funcs(pel_pixels, 12, 8, sse4); /*
+ * 12-bit pel_pixels / epel_h / epel_v / epel_hv: widths 4, 8, 12 plus a
+ * step-8 proxy. The 16/24/32/48/64 names are #defined to that proxy and
+ * width 6 gets its own mc_bi_w_func instance.
+ */
+#define ff_hevc_put_hevc_bi_w_pel_pixels16_12_sse4 
proxy_bi_w_pel_pixels8_12_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels24_12_sse4 
proxy_bi_w_pel_pixels8_12_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels32_12_sse4 
proxy_bi_w_pel_pixels8_12_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels48_12_sse4 
proxy_bi_w_pel_pixels8_12_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels64_12_sse4 
proxy_bi_w_pel_pixels8_12_sse4
 mc_bi_w_func(pel_pixels, 12, 6, sse4);
-mc_bi_w_funcs(epel_h, 12, sse4);
+
+mc_bi_w_proxy_funcs(epel_h, 12, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_h16_12_sse4     proxy_bi_w_epel_h8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h24_12_sse4     proxy_bi_w_epel_h8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h32_12_sse4     proxy_bi_w_epel_h8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h48_12_sse4     proxy_bi_w_epel_h8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h64_12_sse4     proxy_bi_w_epel_h8_12_sse4
 mc_bi_w_func(epel_h, 12, 6, sse4);
-mc_bi_w_funcs(epel_v, 12, sse4);
+
+mc_bi_w_proxy_funcs(epel_v, 12, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_v16_12_sse4     proxy_bi_w_epel_v8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v24_12_sse4     proxy_bi_w_epel_v8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v32_12_sse4     proxy_bi_w_epel_v8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v48_12_sse4     proxy_bi_w_epel_v8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v64_12_sse4     proxy_bi_w_epel_v8_12_sse4
 mc_bi_w_func(epel_v, 12, 6, sse4);
-mc_bi_w_funcs(epel_hv, 12, sse4);
+
+mc_bi_w_proxy_funcs(epel_hv, 12, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_hv16_12_sse4     proxy_bi_w_epel_hv8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv24_12_sse4     proxy_bi_w_epel_hv8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv32_12_sse4     proxy_bi_w_epel_hv8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv48_12_sse4     proxy_bi_w_epel_hv8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv64_12_sse4     proxy_bi_w_epel_hv8_12_sse4
 mc_bi_w_func(epel_hv, 12, 6, sse4);
-mc_bi_w_funcs(qpel_h, 12, sse4);
-mc_bi_w_funcs(qpel_v, 12, sse4);
-mc_bi_w_funcs(qpel_hv, 12, sse4);
+
+mc_bi_w_proxy_funcs(qpel_h, 12, 8, sse4); /*
+ * 12-bit qpel_h / qpel_v / qpel_hv: widths 4, 8, 12 plus a step-8 proxy.
+ * The 16/24/32/48/64 names are #defined to that proxy.
+ */
+#define ff_hevc_put_hevc_bi_w_qpel_h16_12_sse4     proxy_bi_w_qpel_h8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h24_12_sse4     proxy_bi_w_qpel_h8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h32_12_sse4     proxy_bi_w_qpel_h8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h48_12_sse4     proxy_bi_w_qpel_h8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h64_12_sse4     proxy_bi_w_qpel_h8_12_sse4
+
+mc_bi_w_proxy_funcs(qpel_v, 12, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_qpel_v16_12_sse4     proxy_bi_w_qpel_v8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v24_12_sse4     proxy_bi_w_qpel_v8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v32_12_sse4     proxy_bi_w_qpel_v8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v48_12_sse4     proxy_bi_w_qpel_v8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v64_12_sse4     proxy_bi_w_qpel_v8_12_sse4
+
+mc_bi_w_proxy_funcs(qpel_hv, 12, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_qpel_hv16_12_sse4     proxy_bi_w_qpel_hv8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv24_12_sse4     proxy_bi_w_qpel_hv8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv32_12_sse4     proxy_bi_w_qpel_hv8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv48_12_sse4     proxy_bi_w_qpel_hv8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv64_12_sse4     proxy_bi_w_qpel_hv8_12_sse4
+
 #endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
 
 
-- 
1.9.2.msysgit.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to