Re: [FFmpeg-devel] [PATCH] avcodec/v210: add avx2 version of the line encoder
On 2016-01-14 21:42, Henrik Gramner wrote: > On Thu, Jan 14, 2016 at 9:27 PM, James Darnley > wrote: >> On 2016-01-14 20:21, Henrik Gramner wrote: >>> xmN can be used unconditionally which gets rid of the %else. E.g. >>> >>> movu xm1, [yq+widthq*2] >>> %if cpuflag(avx2) >>> vinserti128 m1, m1, [yq+widthq*2+12], 1 >>> %endif >> >> I can change that. I slightly prefer to not mix register sizes like >> that but it seems unavoidable with avx2. > > The xmN notation was invented for code like this and I don't really > think that there's anything negative with using it. Reducing the > amount of %if/%else makes stuff easier to read. > > It assembles into mmN with INIT_MMX, and xmmN with INIT_XMM and INIT_YMM. (I shouldn't have said anything.) I do know why it is there. It is just my opinion that the mixing of "styles" looks bad. signature.asc Description: OpenPGP digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avcodec/v210: add avx2 version of the line encoder
On Thu, Jan 14, 2016 at 9:27 PM, James Darnley wrote: > On 2016-01-14 20:21, Henrik Gramner wrote: >> xmN can be used unconditionally which gets rid of the %else. E.g. >> >> movu xm1, [yq+widthq*2] >> %if cpuflag(avx2) >> vinserti128 m1, m1, [yq+widthq*2+12], 1 >> %endif > > I can change that. I slightly prefer to not mix register sizes like > that but it seems unavoidable with avx2. The xmN notation was invented for code like this and I don't really think that there's anything negative with using it. Reducing the amount of %if/%else makes stuff easier to read. It assembles into mmN with INIT_MMX, and xmmN with INIT_XMM and INIT_YMM. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avcodec/v210: add avx2 version of the line encoder
On 2016-01-14 20:21, Henrik Gramner wrote: > On Wed, Jan 13, 2016 at 4:55 PM, James Darnley > wrote: >> diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm >> index 859e2d9..a8f3d3c 100644 >> --- a/libavcodec/x86/v210enc.asm >> +++ b/libavcodec/x86/v210enc.asm >> -cextern pb_FE >> -%define v210_enc_max_8 pb_FE >> +;cextern pb_FE >> +local_pb_FE: times 32 db 0xfe >> +%define v210_enc_max_8 local_pb_FE > > You could change ff_pb_FE to be 32-byte instead of duplicating it. I can change that. >> +%if cpuflag(avx2) >> +movuxm1, [yq+widthq*2] >> +vinserti128 m1, m1, [yq+widthq*2+12], 1 >> +%else >> movum1, [yq+2*widthq] >> +%endif > > xmN can be used unconditionally which gets rid of the %else. E.g. > > movu xm1, [yq+widthq*2] > %if cpuflag(avx2) > vinserti128 m1, m1, [yq+widthq*2+12], 1 > %endif I can change that. I slightly prefer to not mix register sizes like that but it seems unavoidable with avx2. >> +%if cpuflag(avx2) >> +movq xm3, [uq+widthq] >> +movhps xm3, [vq+widthq] >> +movq xm7, [uq+widthq+6] >> +movhps xm7, [vq+widthq+6] >> +vinserti128 m3, m3, xm7, 1 >> +%else >> movqm3, [uq+widthq] >> movhps m3, [vq+widthq] >> +%endif > > Ditto. Also use xm2 instead of xm7 since it's unused at this point and > it avoids having to use an extra vector register in the AVX2 version. Thanks. >> +%if cpuflag(avx2) >> +movu [dstq],xm0 >> +movu [dstq+16], xm1 >> +vextracti128 [dstq+32], m0, 1 >> +vextracti128 [dstq+48], m1, 1 >> +%else >> movu[dstq], m0 >> movu[dstq+mmsize], m1 >> +%endif > > Ditto. > > Otherwise LGTM. Noted. signature.asc Description: OpenPGP digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avcodec/v210: add avx2 version of the line encoder
On Wed, Jan 13, 2016 at 4:55 PM, James Darnley wrote: > diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm > index 859e2d9..a8f3d3c 100644 > --- a/libavcodec/x86/v210enc.asm > +++ b/libavcodec/x86/v210enc.asm > -cextern pb_FE > -%define v210_enc_max_8 pb_FE > +;cextern pb_FE > +local_pb_FE: times 32 db 0xfe > +%define v210_enc_max_8 local_pb_FE You could change ff_pb_FE to be 32-byte instead of duplicating it. > +%if cpuflag(avx2) > +movuxm1, [yq+widthq*2] > +vinserti128 m1, m1, [yq+widthq*2+12], 1 > +%else > movum1, [yq+2*widthq] > +%endif xmN can be used unconditionally which gets rid of the %else. E.g. movu xm1, [yq+widthq*2] %if cpuflag(avx2) vinserti128 m1, m1, [yq+widthq*2+12], 1 %endif > +%if cpuflag(avx2) > +movq xm3, [uq+widthq] > +movhps xm3, [vq+widthq] > +movq xm7, [uq+widthq+6] > +movhps xm7, [vq+widthq+6] > +vinserti128 m3, m3, xm7, 1 > +%else > movqm3, [uq+widthq] > movhps m3, [vq+widthq] > +%endif Ditto. Also use xm2 instead of xm7 since it's unused at this point and it avoids having to use an extra vector register in the AVX2 version. > +%if cpuflag(avx2) > +movu [dstq],xm0 > +movu [dstq+16], xm1 > +vextracti128 [dstq+32], m0, 1 > +vextracti128 [dstq+48], m1, 1 > +%else > movu[dstq], m0 > movu[dstq+mmsize], m1 > +%endif Ditto. Otherwise LGTM. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] avcodec/v210: add avx2 version of the line encoder
Around 35% faster than the avx version. --- libavcodec/v210enc.c | 5 ++-- libavcodec/v210enc.h | 1 + libavcodec/x86/v210enc.asm| 53 +++ libavcodec/x86/v210enc_init.c | 7 ++ 4 files changed, 49 insertions(+), 17 deletions(-) diff --git a/libavcodec/v210enc.c b/libavcodec/v210enc.c index 0612248..ec63864 100644 --- a/libavcodec/v210enc.c +++ b/libavcodec/v210enc.c @@ -86,6 +86,7 @@ av_cold void ff_v210enc_init(V210EncContext *s) { s->pack_line_8 = v210_planar_pack_8_c; s->pack_line_10 = v210_planar_pack_10_c; +s->sample_factor = 1; if (ARCH_X86) ff_v210enc_init_x86(s); @@ -172,13 +173,13 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt, const uint8_t *v = pic->data[2]; for (h = 0; h < avctx->height; h++) { uint32_t val; -w = (avctx->width / 12) * 12; +w = (avctx->width / (12 * s->sample_factor)) * 12 * s->sample_factor; s->pack_line_8(y, u, v, dst, w); y += w; u += w >> 1; v += w >> 1; -dst += (w / 12) * 32; +dst += (w / (12 * s->sample_factor)) * 32 * s->sample_factor; for (; w < avctx->width - 5; w += 6) { WRITE_PIXELS8(u, y, v); diff --git a/libavcodec/v210enc.h b/libavcodec/v210enc.h index a205427..85f84f1 100644 --- a/libavcodec/v210enc.h +++ b/libavcodec/v210enc.h @@ -28,6 +28,7 @@ typedef struct V210EncContext { const uint8_t *v, uint8_t *dst, ptrdiff_t width); void (*pack_line_10)(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width); +int sample_factor; } V210EncContext; void ff_v210enc_init(V210EncContext *s); diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm index 859e2d9..a8f3d3c 100644 --- a/libavcodec/x86/v210enc.asm +++ b/libavcodec/x86/v210enc.asm @@ -21,30 +21,31 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 cextern pw_4 %define v210_enc_min_10 pw_4 -v210_enc_max_10: times 8 dw 0x3fb +v210_enc_max_10: times 16 dw 0x3fb -v210_enc_luma_mult_10: dw 4,1,16,4,1,16,0,0 -v210_enc_luma_shuf_10: db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11 +v210_enc_luma_mult_10: times 2 dw 4,1,16,4,1,16,0,0 +v210_enc_luma_shuf_10: times 2 db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11 -v210_enc_chroma_mult_10: dw 1,4,16,0,16,1,4,0 -v210_enc_chroma_shuf_10: db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1 +v210_enc_chroma_mult_10: times 2 dw 1,4,16,0,16,1,4,0 +v210_enc_chroma_shuf_10: times 2 db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1 cextern pb_1 %define v210_enc_min_8 pb_1 -cextern pb_FE -%define v210_enc_max_8 pb_FE +;cextern pb_FE +local_pb_FE: times 32 db 0xfe +%define v210_enc_max_8 local_pb_FE -v210_enc_luma_shuf_8: db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1 -v210_enc_luma_mult_8: dw 16,4,64,16,4,64,0,0 +v210_enc_luma_shuf_8: times 2 db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1 +v210_enc_luma_mult_8: times 2 dw 16,4,64,16,4,64,0,0 -v210_enc_chroma_shuf1_8: db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1 -v210_enc_chroma_shuf2_8: db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1 +v210_enc_chroma_shuf1_8: times 2 db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1 +v210_enc_chroma_shuf2_8: times 2 db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1 -v210_enc_chroma_mult_8: dw 4,16,64,0,64,4,16,0 +v210_enc_chroma_mult_8: times 2 dw 4,16,64,0,64,4,16,0 SECTION .text @@ -91,7 +92,7 @@ v210_planar_pack_10 %macro v210_planar_pack_8 0 ; v210_planar_pack_8(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width) -cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width +cglobal v210_planar_pack_8, 5, 5, 7+cpuflag(avx2), y, u, v, dst, width add yq, widthq shr widthq, 1 add uq, widthq @@ -103,7 +104,12 @@ cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width pxorm6, m6 .loop: +%if cpuflag(avx2) +movuxm1, [yq+widthq*2] +vinserti128 m1, m1, [yq+widthq*2+12], 1 +%else movum1, [yq+2*widthq] +%endif CLIPUB m1, m4, m5 punpcklbw m0, m1, m6 @@ -116,8 +122,16 @@ cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width pshufb m0, [v210_enc_luma_shuf_10] pshufb m1, [v210_enc_luma_shuf_10] +%if cpuflag(avx2) +movq xm3, [uq+widthq] +movhps xm3, [vq+widthq] +movq xm7, [uq+widthq+6] +movhps xm7, [vq+widthq+6] +vinserti128 m3, m3, xm7, 1 +%else movqm3, [uq+widthq] movhps m3, [vq+widthq] +%endif CLIPUB m3, m4, m5 ; shuffle and multiply to get the same packing as in 10-bit @@ -132,11 +146,18 @@ cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width por m0, m2 por m1, m3 +%if cpuflag(avx2) +movu [dstq],xm0 +movu [dstq+16], xm1 +vextracti128 [dstq+32], m0,