From: Kieran Kunhya <kie...@kunhya.com> --- libswscale/swscale.c | 227 ++++++++++++---------------------------- libswscale/swscale_internal.h | 42 ++++++-- 2 files changed, 99 insertions(+), 170 deletions(-)
diff --git a/libswscale/swscale.c b/libswscale/swscale.c index 278c81e..a75012e 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -196,141 +196,67 @@ DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] = { 64, 64, 64, 64, 64, 64, 64, 64 }; static av_always_inline void -yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc, - int lumFilterSize, const int16_t *chrFilter, - const int32_t **chrUSrc, const int32_t **chrVSrc, - int chrFilterSize, const int32_t **alpSrc, - uint16_t *dest[4], int dstW, int chrDstW, +yuv2yuvX16_c_template(const int16_t *Filter, int FilterSize, + const int32_t **Src, uint16_t *Dest, int dstW, int big_endian, int output_bits) { - //FIXME Optimize (just quickly written not optimized..) - int i; - uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], - *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; - int shift = 15 + 16 - output_bits - 1; - #define output_pixel(pos, val) \ if (big_endian) { \ AV_WB16(pos, av_clip_uint16(val >> shift)); \ } else { \ AV_WL16(pos, av_clip_uint16(val >> shift)); \ } + + int i; + int shift = 15 + 16 - output_bits - 1; + for (i = 0; i < dstW; i++) { int val = 1 << (30-output_bits - 1); int j; - for (j = 0; j < lumFilterSize; j++) - val += (lumSrc[j][i] * lumFilter[j]) >> 1; + for (j = 0; j < FilterSize; j++) + val += (Src[j][i] * Filter[j]) >> 1; - output_pixel(&yDest[i], val); - } - - if (uDest) { - for (i = 0; i < chrDstW; i++) { - int u = 1 << (30-output_bits - 1); - int v = 1 << (30-output_bits - 1); - int j; - - for (j = 0; j < chrFilterSize; j++) { - u += (chrUSrc[j][i] * chrFilter[j]) >> 1; - v += (chrVSrc[j][i] * chrFilter[j]) >> 1; - } - - output_pixel(&uDest[i], u); - output_pixel(&vDest[i], v); - } - } - - if (CONFIG_SWSCALE_ALPHA && aDest) { - for (i = 0; i < dstW; i++) { - int val = 1 << (30-output_bits - 1); - int j; - - for (j = 0; j < lumFilterSize; j++) - val += (alpSrc[j][i] * lumFilter[j]) >> 1; - - output_pixel(&aDest[i], val); - } + output_pixel(&Dest[i], val); } #undef output_pixel } -static av_always_inline void -yuv2yuvX10_c_template(const int16_t *lumFilter, const int16_t **lumSrc, - int lumFilterSize, const int16_t *chrFilter, - const int16_t **chrUSrc, const int16_t **chrVSrc, - int chrFilterSize, const int16_t **alpSrc, - uint16_t *dest[4], int dstW, int chrDstW, - int big_endian, int output_bits) -{ - //FIXME Optimize (just quickly written not optimized..) - int i; - uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], - *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; - int shift = 11 + 16 - output_bits; - #define output_pixel(pos, val) \ if (big_endian) { \ AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \ } else { \ AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \ } + +static av_always_inline void +yuv2yuvX10_c_template(const int16_t *Filter, int FilterSize, + const int16_t **Src, uint16_t *Dest, int dstW, + int big_endian, int output_bits) +{ + int i; + int shift = 11 + 16 - output_bits; + for (i = 0; i < dstW; i++) { int val = 1 << (26-output_bits); int j; - for (j = 0; j < lumFilterSize; j++) - val += lumSrc[j][i] * lumFilter[j]; - - output_pixel(&yDest[i], val); - } - - if (uDest) { - for (i = 0; i < chrDstW; i++) { - int u = 1 << (26-output_bits); - int v = 1 << (26-output_bits); - int j; + for (j = 0; j < FilterSize; j++) + val += Src[j][i] * Filter[j]; - for (j = 0; j < chrFilterSize; j++) { - u += chrUSrc[j][i] * chrFilter[j]; - v += chrVSrc[j][i] * chrFilter[j]; - } - - output_pixel(&uDest[i], u); - output_pixel(&vDest[i], v); - } + output_pixel(&Dest[i], val); } +} - if (CONFIG_SWSCALE_ALPHA && aDest) { - for (i = 0; i < dstW; i++) { - int val = 1 << (26-output_bits); - int j; - - for (j = 0; j < lumFilterSize; j++) - val += alpSrc[j][i] * lumFilter[j]; - - output_pixel(&aDest[i], val); - } - } #undef output_pixel -} #define yuv2NBPS(bits, BE_LE, is_be, yuv2yuvX_template_fn, typeX_t) \ -static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \ - const int16_t **_lumSrc, int lumFilterSize, \ - const int16_t *chrFilter, const int16_t **_chrUSrc, \ - const int16_t **_chrVSrc, \ - int chrFilterSize, const int16_t **_alpSrc, \ - uint8_t *_dest[4], int dstW, int chrDstW) \ +static void yuv2yuvX ## bits ## BE_LE ## _c(const int16_t *Filter, int FilterSize, \ + const int16_t **Src, uint16_t *Dest, int dstW, \ + const uint8_t *Dither, int offset)\ { \ - const typeX_t **lumSrc = (const typeX_t **) _lumSrc, \ - **chrUSrc = (const typeX_t **) _chrUSrc, \ - **chrVSrc = (const typeX_t **) _chrVSrc, \ - **alpSrc = (const typeX_t **) _alpSrc; \ - yuv2yuvX_template_fn(lumFilter, lumSrc, lumFilterSize, \ - chrFilter, chrUSrc, chrVSrc, chrFilterSize, \ - alpSrc, (uint16_t **) _dest, \ - dstW, chrDstW, is_be, bits); \ + yuv2yuvX_template_fn(Filter, FilterSize, (const typeX_t **) Src, \ + Dest, dstW, is_be, bits); \ } yuv2NBPS( 9, BE, 1, yuv2yuvX10_c_template, int16_t); yuv2NBPS( 9, LE, 0, yuv2yuvX10_c_template, int16_t); @@ -339,51 +265,19 @@ yuv2NBPS(10, LE, 0, yuv2yuvX10_c_template, int16_t); yuv2NBPS(16, BE, 1, yuv2yuvX16_c_template, int32_t); yuv2NBPS(16, LE, 0, yuv2yuvX16_c_template, int32_t); -static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter, - const int16_t **lumSrc, int lumFilterSize, - const int16_t *chrFilter, const int16_t **chrUSrc, - const int16_t **chrVSrc, - int chrFilterSize, const int16_t **alpSrc, - uint8_t *dest[4], int dstW, int chrDstW) +static void yuv2yuvX_c(const int16_t *Filter, int FilterSize, + const int16_t **Src, uint8_t *Dest, int dstW, + const uint8_t *Dither, int offset) { - uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], - *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; int i; - const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; - - //FIXME Optimize (just quickly written not optimized..) for (i=0; i<dstW; i++) { - int val = lumDither[i & 7] << 12; + int val = Dither[(i + offset) & 7] << 12; int j; - for (j=0; j<lumFilterSize; j++) - val += lumSrc[j][i] * lumFilter[j]; + for (j=0; j<FilterSize; j++) + val += Src[j][i] * Filter[j]; - yDest[i]= av_clip_uint8(val>>19); + Dest[i]= av_clip_uint8(val>>19); } - - if (uDest) - for (i=0; i<chrDstW; i++) { - int u = chrDither[i & 7] << 12; - int v = chrDither[(i + 3) & 7] << 12; - int j; - for (j=0; j<chrFilterSize; j++) { - u += chrUSrc[j][i] * chrFilter[j]; - v += chrVSrc[j][i] * chrFilter[j]; - } - - uDest[i]= av_clip_uint8(u>>19); - vDest[i]= av_clip_uint8(v>>19); - } - - if (CONFIG_SWSCALE_ALPHA && aDest) - for (i=0; i<dstW; i++) { - int val = lumDither[i & 7] << 12; - int j; - for (j=0; j<lumFilterSize; j++) - val += alpSrc[j][i] * lumFilter[j]; - - aDest[i]= av_clip_uint8(val>>19); - } } static void yuv2yuv1_c(const int16_t *Src, uint8_t *Dest, int dstW, @@ -2117,26 +2011,28 @@ static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2 static av_always_inline void find_c_packed_planar_out_funcs(SwsContext *c, - yuv2planar1_fn *yuv2yuv1, yuv2planarX_fn *yuv2yuvX, + yuv2planar1_fn *yuv2yuv1, yuv2planarX_fn *yuv2plane_luma, + yuv2planarX_fn *yuv2plane_chroma, yuv2interleavedX_fn *yuv2nv12X, yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2, yuv2packedX_fn *yuv2packedX) { enum PixelFormat dstFormat = c->dstFormat; if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) { - *yuv2yuvX = yuv2nv12X_c; + *yuv2nv12X = yuv2nv12X_c; } else if (is16BPS(dstFormat)) { - *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX16BE_c : yuv2yuvX16LE_c; + *yuv2plane_luma = *yuv2plane_chroma = isBE(dstFormat) ? yuv2yuvX16BE_c : yuv2yuvX16LE_c; } else if (is9_OR_10BPS(dstFormat)) { if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) { - *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c : yuv2yuvX9LE_c; + *yuv2plane_luma = *yuv2plane_chroma = isBE(dstFormat) ? yuv2yuvX9BE_c : yuv2yuvX9LE_c; } else { - *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c; + *yuv2plane_luma = *yuv2plane_chroma = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c; } } else { *yuv2yuv1 = yuv2yuv1_c; - *yuv2yuvX = yuv2yuvX_c; + *yuv2plane_luma = *yuv2plane_chroma = yuv2yuvX_c; } + if(c->flags & SWS_FULL_CHR_H_INT) { switch (dstFormat) { case PIX_FMT_RGBA: @@ -2396,7 +2292,9 @@ static int swScale(SwsContext *c, const uint8_t* src[], int lastDstY; uint32_t *pal=c->pal_yuv; yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1; - yuv2planarX_fn yuv2yuvX = c->yuv2yuvX; + yuv2planarX_fn yuv2plane_luma = c->yuv2plane_luma; + yuv2planarX_fn yuv2plane_chroma = c->yuv2plane_chroma; + yuv2interleavedX_fn yuv2nv12X = c->yuv2nv12X; yuv2packed1_fn yuv2packed1 = c->yuv2packed1; yuv2packed2_fn yuv2packed2 = c->yuv2packed2; yuv2packedX_fn yuv2packedX = c->yuv2packedX; @@ -2548,8 +2446,8 @@ static int swScale(SwsContext *c, const uint8_t* src[], } if (dstY >= dstH-2) { // hmm looks like we can't use MMX here without overwriting this array's tail - find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX, - &yuv2packed1, &yuv2packed2, + find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2plane_luma, &yuv2plane_chroma, + &yuv2nv12X, &yuv2packed1, &yuv2packed2, &yuv2packedX); } @@ -2562,8 +2460,15 @@ static int swScale(SwsContext *c, const uint8_t* src[], const int chrSkipMask= (1<<c->chrDstVSubSample)-1; if ((dstY&chrSkipMask) || isGray(dstFormat)) dest[1] = dest[2] = NULL; //FIXME split functions in lumi / chromi - if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12 - const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL; + const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL; + + if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) { + yuv2nv12X(c, vLumFilter + dstY * vLumFilterSize, + lumSrcPtr, vLumFilterSize, + vChrFilter + chrDstY * vChrFilterSize, + chrUSrcPtr, chrVSrcPtr, vChrFilterSize, + alpSrcPtr, dest, dstW, chrDstW); + } else if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12 yuv2yuv1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0); if (dest[1]){ @@ -2571,14 +2476,18 @@ static int swScale(SwsContext *c, const uint8_t* src[], yuv2yuv1(chrVSrcPtr[0], dest[2], chrDstW, c->chrDither8, 3); } - if (alpBuf) + if (alpBuf && dest[3]) yuv2yuv1(alpBuf, dest[3], dstW, c->lumDither8, 0); } else { //General YV12 - yuv2yuvX(c, vLumFilter + dstY * vLumFilterSize, - lumSrcPtr, vLumFilterSize, - vChrFilter + chrDstY * vChrFilterSize, - chrUSrcPtr, chrVSrcPtr, vChrFilterSize, - alpSrcPtr, dest, dstW, chrDstW); + yuv2plane_luma(vLumFilter + dstY * vLumFilterSize, vLumFilterSize, lumSrcPtr, dest[0], dstW, c->lumDither8, 0); + + if (dest[1]){ + yuv2plane_chroma(vChrFilter + chrDstY * vChrFilterSize, vChrFilterSize, chrUSrcPtr, dest[1], chrDstW, c->chrDither8, 0); + yuv2plane_chroma(vChrFilter + chrDstY * vChrFilterSize, vChrFilterSize, chrVSrcPtr, dest[2], chrDstW, c->chrDither8, 3); + } + + if (alpBuf && dest[3]) + yuv2plane_luma(vLumFilter + dstY * vLumFilterSize, vLumFilterSize, alpSrcPtr, dest[3], dstW, c->lumDither8, 0); } } else { assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); @@ -2632,8 +2541,8 @@ static av_cold void sws_init_swScale_c(SwsContext *c) { enum PixelFormat srcFormat = c->srcFormat; - find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX, - &c->yuv2packed1, &c->yuv2packed2, + find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2plane_luma, &c->yuv2plane_chroma, + &c->yuv2nv12X, &c->yuv2packed1, &c->yuv2packed2, &c->yuv2packedX); c->chrToYV12 = NULL; diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index 8a8ad86..8cf1d45 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -66,17 +66,34 @@ typedef int (*SwsFunc)(struct SwsContext *context, const uint8_t* src[], * * @param Src scaled source data, 15bit for 8-10bit output, * 19-bit for 16bit output (in int32_t) - * @param dest pointer to the 4 output planes (Y/U/V/A). For >8bit + * @param Dest pointer to the output plane. For >8bit * output, this is in uint16_t - * @param dstW width of dest[0], dest[3], lumSrc and alpSrc in pixels - * @param Dither TODO + * @param dstW width of destination in pixels + * @param Dither ordered dither array of type int16_t and size 8 * @param offset Dither offset */ typedef void (*yuv2planar1_fn) (const int16_t *Src, uint8_t *Dest, int dstW, const uint8_t *Dither, int offset); /** - * Write one line of horizontally scaled Y/U/V/A to planar output + * Write one line of horizontally scaled data to planar output + * with multi-point vertical scaling between input pixels. + * + * @param Filter vertical luma/alpha scaling coefficients, 12bit [0,4096] + * @param Src scaled luma (Y) or alpha (A) source data, 15bit for 8-10bit output, + * 19-bit for 16bit output (in int32_t) + * @param FilterSize number of vertical input lines to scale + * @param dest pointer to output plane. For >8bit + * output, this is in uint16_t + * @param dstW width of destination pixels + * @param offset Dither offset + */ +typedef void (*yuv2planarX_fn) (const int16_t *Filter, int FilterSize, + const int16_t **Src, uint8_t *Dest, int dstW, + const uint8_t *Dither, int offset); + +/** + * Write one line of horizontally scaled Y/U/V/A to interleaved output * with multi-point vertical scaling between input pixels. * * @param c SWS scaling context @@ -97,12 +114,13 @@ typedef void (*yuv2planar1_fn) (const int16_t *Src, uint8_t *Dest, int dstW, * @param dstW width of dest[0], dest[3], lumSrc and alpSrc in pixels * @param chrDstW width of dest[1], dest[2], chrUSrc and chrVSrc */ -typedef void (*yuv2planarX_fn) (struct SwsContext *c, const int16_t *lumFilter, - const int16_t **lumSrc, int lumFilterSize, - const int16_t *chrFilter, const int16_t **chrUSrc, - const int16_t **chrVSrc, int chrFilterSize, - const int16_t **alpSrc, uint8_t *dest[4], - int dstW, int chrDstW); +typedef void (*yuv2interleavedX_fn) (struct SwsContext *c, const int16_t *lumFilter, + const int16_t **lumSrc, int lumFilterSize, + const int16_t *chrFilter, const int16_t **chrUSrc, + const int16_t **chrVSrc, int chrFilterSize, + const int16_t **alpSrc, uint8_t *dest[4], + int dstW, int chrDstW); + /** * Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB * output without any additional vertical scaling (or point-scaling). Note @@ -405,7 +423,9 @@ typedef struct SwsContext { /* function pointers for swScale() */ yuv2planar1_fn yuv2yuv1; - yuv2planarX_fn yuv2yuvX; + yuv2planarX_fn yuv2plane_luma; + yuv2planarX_fn yuv2plane_chroma; + yuv2interleavedX_fn yuv2nv12X; yuv2packed1_fn yuv2packed1; yuv2packed2_fn yuv2packed2; yuv2packedX_fn yuv2packedX; -- 1.7.4.1 _______________________________________________ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel