[FFmpeg-devel] [PATCH] Revised ff_v210_planar_unpack AVX2
Replaced VSHUFPS with VPBLENDD to relieve the port 5 bottleneck. AVX2 is now 1.4x faster than AVX. Tested on a Broadwell CPU, Ubuntu 18.10 x86_64: ~/FFmpeg$ tests/checkasm/checkasm --bench --test=v210dec benchmarking with native FFmpeg timers nop: 94.1 checkasm: using random seed 3963743306 SSSE3: - v210dec.v210_unpack [OK] AVX: - v210dec.v210_unpack [OK] AVX2: - v210dec.v210_unpack [OK] checkasm: all 3 tests passed v210_unpack_c: 1625.2 v210_unpack_ssse3: 604.2 v210_unpack_avx: 592.2 v210_unpack_avx2: 422.2 --- libavcodec/v210dec.c | 10 +- libavcodec/x86/v210-init.c | 8 + libavcodec/x86/v210.asm| 72 +- 3 files changed, 73 insertions(+), 17 deletions(-) diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c index ddc5dbe8be..26954c0df3 100644 --- a/libavcodec/v210dec.c +++ b/libavcodec/v210dec.c @@ -119,7 +119,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, const uint32_t *src = (const uint32_t*)psrc; uint32_t val; -w = (avctx->width / 6) * 6; +w = (avctx->width / 12) * 12; s->unpack_frame(src, y, u, v, w); y += w; @@ -127,6 +127,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, v += w >> 1; src += (w << 1) / 3; +if (w < avctx->width - 5) { + READ_PIXELS(u, y, v); + READ_PIXELS(y, u, y); + READ_PIXELS(v, y, u); + READ_PIXELS(y, v, y); +w += 6; +} + if (w < avctx->width - 1) { READ_PIXELS(u, y, v); diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c index d64dbca1a8..cb9a6cbd6a 100644 --- a/libavcodec/x86/v210-init.c +++ b/libavcodec/x86/v210-init.c @@ -21,9 +21,11 @@ extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +extern void ff_v210_planar_unpack_unaligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, 
uint16_t *y, uint16_t *u, uint16_t *v, int width); extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +extern void ff_v210_planar_unpack_aligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); av_cold void ff_v210_x86_init(V210DecContext *s) { @@ -36,6 +38,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s) if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX) s->unpack_frame = ff_v210_planar_unpack_aligned_avx; + +if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2) +s->unpack_frame = ff_v210_planar_unpack_aligned_avx2; } else { if (cpu_flags & AV_CPU_FLAG_SSSE3) @@ -43,6 +48,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s) if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX) s->unpack_frame = ff_v210_planar_unpack_unaligned_avx; + +if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2) +s->unpack_frame = ff_v210_planar_unpack_unaligned_avx2; } #endif } diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm index c24c765e5b..706712313d 100644 --- a/libavcodec/x86/v210.asm +++ b/libavcodec/x86/v210.asm @@ -22,9 +22,14 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 + +; for AVX2 version only +v210_luma_permute: dd 0,1,2,4,5,6,7,7 ; 32-byte alignment required +v210_chroma_shuf2: db 0,1,2,3,4,5,8,9,10,11,12,13,-1,-1,-1,-1 +v210_luma_shuf_avx2: db 0,1,4,5,6,7,8,9,12,13,14,15,-1,-1,-1,-1 +v210_chroma_shuf_avx2: db 0,1,4,5,10,11,-1,-1,2,3,8,9,12,13,-1,-1 -v210_mask: times 4 dd 0x3ff v210_mult: dw 64,4,64,4,64,4,64,4 v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1 v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1 @@ -34,40 +39,65 @@ SECTION .text %macro v210_planar_unpack 1 ; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width) -cglobal v210_planar_unpack_%1, 5, 5, 7 +cglobal v210_planar_unpack_%1, 5, 5, 8 movsxdifnidn r4, r4d lear1, [r1+2*r4] addr2, r4 addr3, r4 negr4 -mova 
m3, [v210_mult] -mova m4, [v210_mask] -mova m5, [v210_luma_shuf] -mova m6, [v210_chroma_shuf] +VBROADCASTI128 m3, [v210_mult] +VBROADCASTI128 m5, [v210_chroma_shuf] + +%if cpuflag(avx2) +VBROADCASTI128 m4, [v210_luma_shuf_avx2] +VBROADCASTI128 m5, [v210_chroma_shuf_avx2] +mova m6, [v210_luma_permute] +VBROADCASTI128 m7, [v210_chroma_shuf2] +%else +VBROADCASTI128 m4, [v210_luma_shuf] +VBROADCASTI128 m5, [v210_chroma_shuf]
Re: [FFmpeg-devel] [PATCH] Revised ff_v210_planar_unpack AVX2
I am submitting another patch. Please disregard this one.

-Mike

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] Revised ff_v210_planar_unpack AVX2
--- libavcodec/v210dec.c | 10 +- libavcodec/x86/v210-init.c | 8 + libavcodec/x86/v210.asm| 63 -- 3 files changed, 64 insertions(+), 17 deletions(-) diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c index ddc5dbe8be..26954c0df3 100644 --- a/libavcodec/v210dec.c +++ b/libavcodec/v210dec.c @@ -119,7 +119,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, const uint32_t *src = (const uint32_t*)psrc; uint32_t val; -w = (avctx->width / 6) * 6; +w = (avctx->width / 12) * 12; s->unpack_frame(src, y, u, v, w); y += w; @@ -127,6 +127,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, v += w >> 1; src += (w << 1) / 3; +if (w < avctx->width - 5) { + READ_PIXELS(u, y, v); + READ_PIXELS(y, u, y); + READ_PIXELS(v, y, u); + READ_PIXELS(y, v, y); +w += 6; +} + if (w < avctx->width - 1) { READ_PIXELS(u, y, v); diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c index d64dbca1a8..cb9a6cbd6a 100644 --- a/libavcodec/x86/v210-init.c +++ b/libavcodec/x86/v210-init.c @@ -21,9 +21,11 @@ extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +extern void ff_v210_planar_unpack_unaligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +extern void ff_v210_planar_unpack_aligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); av_cold void ff_v210_x86_init(V210DecContext *s) { @@ -36,6 +38,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s) if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX) s->unpack_frame = 
ff_v210_planar_unpack_aligned_avx; + +if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2) +s->unpack_frame = ff_v210_planar_unpack_aligned_avx2; } else { if (cpu_flags & AV_CPU_FLAG_SSSE3) @@ -43,6 +48,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s) if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX) s->unpack_frame = ff_v210_planar_unpack_unaligned_avx; + +if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2) +s->unpack_frame = ff_v210_planar_unpack_unaligned_avx2; } #endif } diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm index c24c765e5b..064185354f 100644 --- a/libavcodec/x86/v210.asm +++ b/libavcodec/x86/v210.asm @@ -22,9 +22,12 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 + +; for AVX2 version only +v210_luma_permute: dd 0,1,2,4,5,6,7,7 ; 32-byte alignment required +v210_chroma_shuf2: db 0,1,2,3,4,5,8,9,10,11,12,13,-1,-1,-1,-1 -v210_mask: times 4 dd 0x3ff v210_mult: dw 64,4,64,4,64,4,64,4 v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1 v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1 @@ -34,40 +37,58 @@ SECTION .text %macro v210_planar_unpack 1 ; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width) -cglobal v210_planar_unpack_%1, 5, 5, 7 +cglobal v210_planar_unpack_%1, 5, 5, 8 movsxdifnidn r4, r4d lear1, [r1+2*r4] addr2, r4 addr3, r4 negr4 -mova m3, [v210_mult] -mova m4, [v210_mask] -mova m5, [v210_luma_shuf] -mova m6, [v210_chroma_shuf] +VBROADCASTI128 m3, [v210_mult] +VBROADCASTI128 m4, [v210_luma_shuf] +VBROADCASTI128 m5, [v210_chroma_shuf] + +%if cpuflag(avx2) +mova m6, [v210_luma_permute] +VBROADCASTI128 m7, [v210_chroma_shuf2] +%endif + .loop: %ifidn %1, unaligned -movu m0, [r0] +movu m0, [r0]; yB v5 yA u5 y9 v4 y8 u4 y7 v3 y6 u3 y5 v2 y4 u2 y3 v1 y2 u1 y1 v0 y0 u0 %else mova m0, [r0] %endif pmullw m1, m0, m3 -psrld m0, 10 -psrlw m1, 6 ; u0 v0 y1 y2 v1 u2 y4 y5 -pand m0, m4 ; y0 __ u1 __ y3 __ v2 __ +pslld m0, 12 +psrlw m1, 6 ; yB yA u5 v4 
y8 y7 v3 u3 y5 y4 u2 v1 y2 y1 v0 u0 +psrld m0, 22 ; 00 v5 00 y9 00 u4 00 y6 00 v2 00 y3 00 u1 00 y0 + +shufps m2, m1, m0, 0x8d; 00 y9 00 y6 yB yA y8 y7 00 y3 00 y0 y5 y4 y2 y1 +pshufb m2, m4 ; 00 00 yB yA y9 y8 y7 y6 00 00 y5 y4 y3 y2 y1 y0 + +%if cpuflag(avx2) +vpermd m2, m6, m2 ; 00 00 00 00 yB yA y9 y8 y7