From: James Almer <jamr...@gmail.com>

~20% faster than AVX.
Integration to Libav by Josh de Kock <josh at itanimul.li>.

Reviewed-by: Michael Niedermayer <michae...@gmx.at>
Signed-off-by: James Almer <jamr...@gmail.com>
---
 libavcodec/x86/hevc_res_add.asm | 35 +++++++++++++++++++++++++++++++----
 libavcodec/x86/hevcdsp_init.c   |  4 ++++
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
index 0faa7af..c3fba74 100644
--- a/libavcodec/x86/hevc_res_add.asm
+++ b/libavcodec/x86/hevc_res_add.asm
@@ -89,8 +89,12 @@ cglobal hevc_add_residual_4_8, 3, 5, 6, dst, coeffs, stride
 %endmacro
 
 %macro RES_ADD_SSE_16_32_8 3
-    mova              m2, [coeffsq + %1]
-    mova              m6, [coeffsq + %1 + 16]
+    mova             xm2, [coeffsq + %1]
+    mova             xm6, [coeffsq + %1 + 16]
+%if cpuflag(avx2)
+    vinserti128       m2, m2, [coeffsq + %1 + 32], 1
+    vinserti128       m6, m6, [coeffsq + %1 + 48], 1
+%endif
 %if cpuflag(avx)
     psubw             m1, m0, m2
     psubw             m5, m0, m6
@@ -103,8 +107,12 @@ cglobal hevc_add_residual_4_8, 3, 5, 6, dst, coeffs, stride
     packuswb          m2, m6
     packuswb          m1, m5
 
-    mova              m4, [coeffsq + %1 + 32]
-    mova              m6, [coeffsq + %1 + 48]
+    mova             xm4, [coeffsq + %1 + mmsize*2]
+    mova             xm6, [coeffsq + %1 + mmsize*2+16]
+%if cpuflag(avx2)
+    vinserti128       m4, m4, [coeffsq + %1 + 96], 1
+    vinserti128       m6, m6, [coeffsq + %1 + 112], 1
+%endif
 %if cpuflag(avx)
     psubw             m3, m0, m4
     psubw             m5, m0, m6
@@ -174,6 +182,25 @@ RESIDUAL_ADD_8
 INIT_XMM avx
 RESIDUAL_ADD_8
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_add_residual_32_8, 3, 5, 7
+    pxor              m0, m0
+    lea               r3, [strideq*3]
+    RES_ADD_SSE_16_32_8   0, dstq, dstq+strideq
+    RES_ADD_SSE_16_32_8 128, dstq+strideq*2, dstq+r3
+    mov              r4d, 7
+.loop:
+    add          coeffsq, 256
+    lea             dstq, [dstq+strideq*4]
+    RES_ADD_SSE_16_32_8   0, dstq, dstq+strideq
+    RES_ADD_SSE_16_32_8 128, dstq+strideq*2, dstq+r3
+    dec              r4d
+    jnz .loop
+    RET
+%endif
+
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_add_residual_10(pixel *dst, int16_t *coeffs, ptrdiff_t stride)
 ;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index b97f4c8..c84c222 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -92,6 +92,8 @@ void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t strid
 void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 
+void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
 void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 
@@ -364,6 +366,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         if (EXTERNAL_AVX2(cpu_flags)) {
             c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
             c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
+
+            c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_SSSE3(cpu_flags)) {
-- 
2.7.4 (Apple Git-66)

_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel