--- libavcodec/v210enc.c | 151 +++++++++++++++++++++++++++++++----------- libavcodec/v210enc.h | 4 +- libavcodec/x86/v210enc.asm | 104 ++++++++++++++++++++++++----- libavcodec/x86/v210enc_init.c | 13 +++- libavutil/x86/x86util.asm | 5 ++ 5 files changed, 218 insertions(+), 59 deletions(-)
diff --git a/libavcodec/v210enc.c b/libavcodec/v210enc.c index 55bcaea..d06e903 100644 --- a/libavcodec/v210enc.c +++ b/libavcodec/v210enc.c @@ -27,6 +27,7 @@ #include "v210enc.h" #define CLIP(v) av_clip(v, 4, 1019) +#define CLIP8(v) av_clip(v, 1, 254) #define WRITE_PIXELS(a, b, c) \ do { \ @@ -37,8 +38,36 @@ dst += 4; \ } while (0) -static void v210_planar_pack_c(const uint16_t *y, const uint16_t *u, - const uint16_t *v, uint8_t *dst, ptrdiff_t width) +#define WRITE_PIXELS8(a, b, c) \ + do { \ + val = (CLIP8(*a++) << 2); \ + val |= (CLIP8(*b++) << 12) | \ + (CLIP8(*c++) << 22); \ + AV_WL32(dst, val); \ + dst += 4; \ + } while (0) + +static void v210_planar_pack_8_c(const uint8_t *y, const uint8_t *u, + const uint8_t *v, uint8_t *dst, ptrdiff_t width) +{ + uint32_t val; + int i; + + /* unroll this to match the assembly */ + for( i = 0; i < width-11; i += 12 ){ + WRITE_PIXELS8(u, y, v); + WRITE_PIXELS8(y, u, y); + WRITE_PIXELS8(v, y, u); + WRITE_PIXELS8(y, v, y); + WRITE_PIXELS8(u, y, v); + WRITE_PIXELS8(y, u, y); + WRITE_PIXELS8(v, y, u); + WRITE_PIXELS8(y, v, y); + } +} + +static void v210_planar_pack_10_c(const uint16_t *y, const uint16_t *u, + const uint16_t *v, uint8_t *dst, ptrdiff_t width) { uint32_t val; int i; @@ -60,17 +89,14 @@ static av_cold int encode_init(AVCodecContext *avctx) return AVERROR(EINVAL); } - if (avctx->bits_per_raw_sample != 10) - av_log(avctx, AV_LOG_WARNING, "bits per raw sample: %d != 10-bit\n", - avctx->bits_per_raw_sample); - avctx->coded_frame = av_frame_alloc(); if (!avctx->coded_frame) return AVERROR(ENOMEM); avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I; - s->pack_line = v210_planar_pack_c; + s->pack_line_8 = v210_planar_pack_8_c; + s->pack_line_10 = v210_planar_pack_10_c; if (ARCH_X86) v210enc_x86_init(s); @@ -87,9 +113,6 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt, int stride = aligned_width * 8 / 3; int line_padding = stride - ((avctx->width * 8 + 11) / 12) * 4; int h, w, ret; - const uint16_t *y = 
(const uint16_t*)pic->data[0]; - const uint16_t *u = (const uint16_t*)pic->data[1]; - const uint16_t *v = (const uint16_t*)pic->data[2]; uint8_t *dst; if ((ret = ff_alloc_packet(pkt, avctx->height * stride)) < 0) { @@ -99,40 +122,92 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt, dst = pkt->data; - for (h = 0; h < avctx->height; h++) { - uint32_t val; - w = (avctx->width / 6) * 6; - s->pack_line(y, u, v, dst, w); - - y += w; - u += w >> 1; - v += w >> 1; - dst += (w / 6) * 16; - if (w < avctx->width - 1) { - WRITE_PIXELS(u, y, v); + if (pic->format == AV_PIX_FMT_YUV422P10) { + const uint16_t *y = (const uint16_t*)pic->data[0]; + const uint16_t *u = (const uint16_t*)pic->data[1]; + const uint16_t *v = (const uint16_t*)pic->data[2]; + for (h = 0; h < avctx->height; h++) { + uint32_t val; + w = (avctx->width / 6) * 6; + s->pack_line_10(y, u, v, dst, w); + + y += w; + u += w >> 1; + v += w >> 1; + dst += (w / 6) * 16; + if (w < avctx->width - 1) { + WRITE_PIXELS(u, y, v); + + val = CLIP(*y++); + if (w == avctx->width - 2) { + AV_WL32(dst, val); + dst += 4; + } + } + if (w < avctx->width - 3) { + val |= (CLIP(*u++) << 10) | (CLIP(*y++) << 20); + AV_WL32(dst, val); + dst += 4; - val = CLIP(*y++); - if (w == avctx->width - 2) { + val = CLIP(*v++) | (CLIP(*y++) << 10); AV_WL32(dst, val); dst += 4; } + + memset(dst, 0, line_padding); + dst += line_padding; + + y += pic->linesize[0] / 2 - avctx->width; + u += pic->linesize[1] / 2 - avctx->width / 2; + v += pic->linesize[2] / 2 - avctx->width / 2; } - if (w < avctx->width - 3) { - val |= (CLIP(*u++) << 10) | (CLIP(*y++) << 20); - AV_WL32(dst, val); - dst += 4; - - val = CLIP(*v++) | (CLIP(*y++) << 10); - AV_WL32(dst, val); - dst += 4; - } + } + else if(pic->format == AV_PIX_FMT_YUV422P) { + const uint8_t *y = pic->data[0]; + const uint8_t *u = pic->data[1]; + const uint8_t *v = pic->data[2]; + for (h = 0; h < avctx->height; h++) { + uint32_t val; + w = (avctx->width / 12) * 12; + s->pack_line_8(y, u, v, dst, 
w); + + y += w; + u += w >> 1; + v += w >> 1; + dst += (w / 12) * 32; + + for( ; w < avctx->width-5; w += 6 ){ + WRITE_PIXELS8(u, y, v); + WRITE_PIXELS8(y, u, y); + WRITE_PIXELS8(v, y, u); + WRITE_PIXELS8(y, v, y); + } + if (w < avctx->width - 1) { + WRITE_PIXELS8(u, y, v); + + val = CLIP8(*y++) << 2; + if (w == avctx->width - 2) { + AV_WL32(dst, val); + dst += 4; + } + } + if (w < avctx->width - 3) { + val |= (CLIP8(*u++) << 12) | (CLIP8(*y++) << 22); + AV_WL32(dst, val); + dst += 4; - memset(dst, 0, line_padding); - dst += line_padding; + val = (CLIP8(*v++) << 2) | (CLIP8(*y++) << 12); + AV_WL32(dst, val); + dst += 4; + } + + memset(dst, 0, line_padding); + dst += line_padding; - y += pic->linesize[0] / 2 - avctx->width; - u += pic->linesize[1] / 2 - avctx->width / 2; - v += pic->linesize[2] / 2 - avctx->width / 2; + y += pic->linesize[0] - avctx->width; + u += pic->linesize[1] - avctx->width / 2; + v += pic->linesize[2] - avctx->width / 2; + } } pkt->flags |= AV_PKT_FLAG_KEY; @@ -156,5 +231,5 @@ AVCodec ff_v210_encoder = { .init = encode_init, .encode2 = encode_frame, .close = encode_close, - .pix_fmts = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV422P10, AV_PIX_FMT_NONE }, + .pix_fmts = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV422P, AV_PIX_FMT_NONE }, }; diff --git a/libavcodec/v210enc.h b/libavcodec/v210enc.h index b8b6143..df6903b 100644 --- a/libavcodec/v210enc.h +++ b/libavcodec/v210enc.h @@ -21,9 +21,11 @@ #include "libavutil/log.h" #include "libavutil/opt.h" +#include "libavutil/pixfmt.h" typedef struct { - void (*pack_line)(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width); + void (*pack_line_8)(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width); + void (*pack_line_10)(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width); } V210EncContext; void v210enc_x86_init(V210EncContext *s); diff --git a/libavcodec/x86/v210enc.asm 
b/libavcodec/x86/v210enc.asm
index ddc2d1a..21d2b3a 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -23,42 +23,53 @@
 
 SECTION_RODATA
 
-v210_enc_min: times 8 dw 0x4
-v210_enc_max: times 8 dw 0x3fb
+v210_enc_min_10: times 8 dw 0x4
+v210_enc_max_10: times 8 dw 0x3fb
 
-v210_enc_luma_mult: dw 4,1,16,4,1,16,0,0
-v210_enc_luma_shuf: db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11
+v210_enc_luma_mult_10: dw 4,1,16,4,1,16,0,0
+v210_enc_luma_shuf_10: db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11
 
-v210_enc_chroma_mult: dw 1,4,16,0,16,1,4,0
-v210_enc_chroma_shuf: db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1
+v210_enc_chroma_mult_10: dw 1,4,16,0,16,1,4,0
+v210_enc_chroma_shuf_10: db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1
+
+v210_enc_min_8: times 16 db 0x1
+v210_enc_max_8: times 16 db 0xfe
+
+v210_enc_luma_shuf_8: db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1
+v210_enc_luma_mult_8: dw 16,4,64,16,4,64,0,0
+
+v210_enc_chroma_shuf1_8: db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1
+v210_enc_chroma_shuf2_8: db 4,-1,5,-1,6,-1,7,-1,12,-1,13,-1,14,-1,15,-1
+
+v210_enc_chroma_mult_8: dw 4,16,64,0,64,4,16,0
 
 SECTION .text
 
-%macro v210_planar_pack 0
+%macro v210_planar_pack_10 0
 
-; v210_planar_pack(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width)
-cglobal v210_planar_pack, 5, 5, 4, y, u, v, dst, width
+; v210_planar_pack_10(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width)
+cglobal v210_planar_pack_10, 5, 5, 4, y, u, v, dst, width
     lea     r0, [yq+2*widthq]
     add     uq, widthq
     add     vq, widthq
     neg     widthq
 
-    mova    m2, [v210_enc_min]
-    mova    m3, [v210_enc_max]
+    mova    m2, [v210_enc_min_10]
+    mova    m3, [v210_enc_max_10]
 
 .loop
     movu    m0, [yq+2*widthq]
     CLIPW   m0, m2, m3
 
     movq    m1, [uq+widthq]
     movhps  m1, [vq+widthq]
     CLIPW   m1, m2, m3
 
-    pmullw  m0, [v210_enc_luma_mult]
-    pshufb  m0, [v210_enc_luma_shuf]
+    pmullw  m0, [v210_enc_luma_mult_10]
+    pshufb  m0, [v210_enc_luma_shuf_10]
 
-    pmullw  m1, 
[v210_enc_chroma_mult] - pshufb m1, [v210_enc_chroma_shuf] + pmullw m1, [v210_enc_chroma_mult_10] + pshufb m1, [v210_enc_chroma_shuf_10] por m0, m1 @@ -72,5 +83,64 @@ cglobal v210_planar_pack, 5, 5, 4, y, u, v, dst, width %endmacro INIT_XMM ssse3 -v210_planar_pack +v210_planar_pack_10 + +%macro v210_planar_pack_8 0 + +; v210_planar_pack_8(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width) +cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width + add yq, widthq + shr widthq, 1 + add uq, widthq + add vq, widthq + neg widthq + + mova m4, [v210_enc_min_8] + mova m5, [v210_enc_max_8] + pxor m6, m6 + +.loop + movu m1, [yq+2*widthq] + CLIPUB m1, m4, m5 + + punpcklbw m0, m1, m6 + ; can't unpack high bytes in the same way because we process + ; only six bytes at a time + pshufb m1, [v210_enc_luma_shuf_8] + + pmullw m0, [v210_enc_luma_mult_8] + pmullw m1, [v210_enc_luma_mult_8] + pshufb m0, [v210_enc_luma_shuf_10] + pshufb m1, [v210_enc_luma_shuf_10] + + movq m3, [uq+widthq] + movhps m3, [vq+widthq] + CLIPUB m3, m4, m5 + + ; shuffle and multiply to get the same packing as in 10-bit + pshufb m2, m3, [v210_enc_chroma_shuf1_8] + pshufb m3, [v210_enc_chroma_shuf2_8] + + pmullw m2, [v210_enc_chroma_mult_8] + pmullw m3, [v210_enc_chroma_mult_8] + pshufb m2, [v210_enc_chroma_shuf_10] + pshufb m3, [v210_enc_chroma_shuf_10] + + por m0, m2 + por m1, m3 + + movu [dstq], m0 + movu [dstq+mmsize], m1 + + add dstq, 2*mmsize + add widthq, 6 + jl .loop + + RET +%endmacro + +INIT_XMM ssse3 +v210_planar_pack_8 +INIT_XMM avx +v210_planar_pack_8 diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c index 5d87c7b..6420464 100644 --- a/libavcodec/x86/v210enc_init.c +++ b/libavcodec/x86/v210enc_init.c @@ -20,12 +20,19 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/v210enc.h" -void ff_v210_planar_pack_ssse3(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width); +void ff_v210_planar_pack_8_ssse3(const 
uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width); +void ff_v210_planar_pack_8_avx(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width); +void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width); av_cold void v210enc_x86_init(V210EncContext *s) { int cpu_flags = av_get_cpu_flags(); - if( EXTERNAL_SSSE3(cpu_flags) ) - s->pack_line = ff_v210_planar_pack_ssse3; + if( EXTERNAL_SSSE3(cpu_flags) ) { + s->pack_line_8 = ff_v210_planar_pack_8_ssse3; + s->pack_line_10 = ff_v210_planar_pack_10_ssse3; + } + + if( EXTERNAL_AVX(cpu_flags) ) + s->pack_line_8 = ff_v210_planar_pack_8_avx; } diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm index 11779cf..9f64dd1 100644 --- a/libavutil/x86/x86util.asm +++ b/libavutil/x86/x86util.asm @@ -584,6 +584,11 @@ %endif %endmacro +%macro CLIPUB 3 ;(dst, min, max) + pmaxub %1, %2 + pminub %1, %3 +%endmacro + %macro CLIPW 3 ;(dst, min, max) pmaxsw %1, %2 pminsw %1, %3 -- 1.9.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel