From: James Almer <jamr...@gmail.com>

~20% faster than AVX.

Integrated into Libav by Josh de Kock <josh at itanimul.li>.

Reviewed-by: Michael Niedermayer <michae...@gmx.at>
Signed-off-by: James Almer <jamr...@gmail.com>
---
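
Note for reviewers (not part of the commit): a rough C sketch of what the
new routine computes, mirroring the behaviour of the 8-bit C fallback.
av_clip_uint8() is from libavutil/common.h; the function name is
illustrative only.

    #include <stddef.h>
    #include <stdint.h>
    #include "libavutil/common.h"   /* av_clip_uint8() */

    /* Add the decoded residual to the prediction in dst, clipping each
     * sample to [0, 255].  Coefficients are contiguous, 32 int16 values
     * (64 bytes) per row, which is why the asm below advances coeffsq by
     * 256 bytes per group of four rows. */
    static void add_residual_32x32_8_ref(uint8_t *dst, const int16_t *coeffs,
                                         ptrdiff_t stride)
    {
        for (int y = 0; y < 32; y++) {
            for (int x = 0; x < 32; x++)
                dst[x] = av_clip_uint8(dst[x] + coeffs[x]);
            coeffs += 32;
            dst    += stride;
        }
    }
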
 libavcodec/x86/hevc_res_add.asm | 35 +++++++++++++++++++++++++++++++----
 libavcodec/x86/hevcdsp_init.c   |  4 ++++
 2 files changed, 35 insertions(+), 4 deletions(-)
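
A note on the interleaved loads in the asm hunk below (hedged sketch, not
part of the patch): with ymm registers packuswb packs within each 128-bit
lane, so each source register is built from two non-adjacent 16-byte
halves of the row; the packed bytes then come out in source order across
the full 32-byte register. In intrinsics form (illustrative names):

    #include <immintrin.h>
    #include <stdint.h>

    /* Build one ymm from coeffs[0..7] (low lane) and coeffs[16..23]
     * (high lane), as the vinserti128 pairs in the hunk below do. */
    static __m256i load_interleaved(const int16_t *coeffs)
    {
        __m128i lo = _mm_loadu_si128((const __m128i *)coeffs);
        __m128i hi = _mm_loadu_si128((const __m128i *)(coeffs + 16));
        return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
    }

    /* _mm256_packus_epi16() packs per 128-bit lane, so packing
     * (coeffs 0..7 | 16..23) with (coeffs 8..15 | 24..31) yields the 32
     * saturated bytes in order 0..31, i.e. one full output row. */
    static void pack_row_example(const int16_t *coeffs, uint8_t *out)
    {
        __m256i a = load_interleaved(coeffs);      /* 0..7  | 16..23 */
        __m256i b = load_interleaved(coeffs + 8);  /* 8..15 | 24..31 */
        _mm256_storeu_si256((__m256i *)out, _mm256_packus_epi16(a, b));
    }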

diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
index 0faa7af..c3fba74 100644
--- a/libavcodec/x86/hevc_res_add.asm
+++ b/libavcodec/x86/hevc_res_add.asm
@@ -89,8 +89,12 @@ cglobal hevc_add_residual_4_8, 3, 5, 6, dst, coeffs, stride
 %endmacro
 
 %macro RES_ADD_SSE_16_32_8 3
-    mova               m2, [coeffsq + %1]
-    mova               m6, [coeffsq + %1 + 16]
+    mova              xm2, [coeffsq + %1]
+    mova              xm6, [coeffsq + %1 + 16]
+%if cpuflag(avx2)
+    vinserti128        m2, m2, [coeffsq + %1 + 32], 1
+    vinserti128        m6, m6, [coeffsq + %1 + 48], 1
+%endif
 %if cpuflag(avx)
     psubw              m1, m0, m2
     psubw              m5, m0, m6
@@ -103,8 +107,12 @@ cglobal hevc_add_residual_4_8, 3, 5, 6, dst, coeffs, stride
     packuswb           m2, m6
     packuswb           m1, m5
 
-    mova               m4, [coeffsq + %1 + 32]
-    mova               m6, [coeffsq + %1 + 48]
+    mova              xm4, [coeffsq + %1 + mmsize*2]
+    mova              xm6, [coeffsq + %1 + mmsize*2+16]
+%if cpuflag(avx2)
+    vinserti128        m4, m4, [coeffsq + %1 + 96],  1
+    vinserti128        m6, m6, [coeffsq + %1 + 112], 1
+%endif
 %if cpuflag(avx)
     psubw              m3, m0, m4
     psubw              m5, m0, m6
@@ -174,6 +182,25 @@ RESIDUAL_ADD_8
 INIT_XMM avx
 RESIDUAL_ADD_8
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual_32_8, 3, 5, 7
+    pxor                 m0, m0
+    lea                  r3, [strideq*3]
+    RES_ADD_SSE_16_32_8   0, dstq,      dstq+strideq
+    RES_ADD_SSE_16_32_8 128, dstq+strideq*2, dstq+r3
+    mov r4d, 7
+.loop:
+    add             coeffsq, 256
+    lea                dstq, [dstq+strideq*4]
+    RES_ADD_SSE_16_32_8   0, dstq,      dstq+strideq
+    RES_ADD_SSE_16_32_8 128, dstq+strideq*2, dstq+r3
+    dec r4d
+    jnz .loop
+    RET
+%endif
+
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_add_residual_10(pixel *dst, int16_t *coeffs, ptrdiff_t stride)
 ;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index b97f4c8..c84c222 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -92,6 +92,8 @@ void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t strid
 void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 
+void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
 void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 
@@ -364,6 +366,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         if (EXTERNAL_AVX2(cpu_flags)) {
             c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
             c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
+
+            c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_SSSE3(cpu_flags)) {
-- 
2.7.4 (Apple Git-66)
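
P.S. (reviewer context, not part of the patch): the new table entry is the
one the 8-bit reconstruction path picks for 32x32 transform blocks; the
call site is roughly the following (hedged paraphrase, variable names may
differ in hevc.c):

    /* log2_trafo_size is 5 for a 32x32 TU, so index 3 of add_residual[] */
    s->hevcdsp.add_residual[log2_trafo_size - 2](dst, coeffs, stride);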
