[libav-devel] [PATCH 2/2] Split up yuv2yuvX functions.

Kieran Kunhya Tue, 27 Sep 2011 09:13:06 -0700

From: Kieran Kunhya <kie...@kunhya.com>

---
 libswscale/swscale.c          |  227 ++++++++++++----------------------------
 libswscale/swscale_internal.h |   42 ++++++--
 2 files changed, 99 insertions(+), 170 deletions(-)


diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 278c81e..a75012e 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -196,141 +196,67 @@ DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] =
 {  64, 64, 64, 64, 64, 64, 64, 64 };
 
 static av_always_inline void
-yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc,
-                      int lumFilterSize, const int16_t *chrFilter,
-                      const int32_t **chrUSrc, const int32_t **chrVSrc,
-                      int chrFilterSize, const int32_t **alpSrc,
-                      uint16_t *dest[4], int dstW, int chrDstW,
+yuv2yuvX16_c_template(const int16_t *Filter, int FilterSize,
+                      const int32_t **Src, uint16_t *Dest, int dstW,
                       int big_endian, int output_bits)
 {
-    //FIXME Optimize (just quickly written not optimized..)
-    int i;
-    uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
-             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
-    int shift = 15 + 16 - output_bits - 1;
-
 #define output_pixel(pos, val) \
     if (big_endian) { \
         AV_WB16(pos, av_clip_uint16(val >> shift)); \
     } else { \
         AV_WL16(pos, av_clip_uint16(val >> shift)); \
     }
+
+    int i;
+    int shift = 15 + 16 - output_bits - 1;
+
     for (i = 0; i < dstW; i++) {
         int val = 1 << (30-output_bits - 1);
         int j;
 
-        for (j = 0; j < lumFilterSize; j++)
-            val += (lumSrc[j][i] * lumFilter[j]) >> 1;
+        for (j = 0; j < FilterSize; j++)
+            val += (Src[j][i] * Filter[j]) >> 1;
 
-        output_pixel(&yDest[i], val);
-    }
-
-    if (uDest) {
-        for (i = 0; i < chrDstW; i++) {
-            int u = 1 << (30-output_bits - 1);
-            int v = 1 << (30-output_bits - 1);
-            int j;
-
-            for (j = 0; j < chrFilterSize; j++) {
-                u += (chrUSrc[j][i] * chrFilter[j]) >> 1;
-                v += (chrVSrc[j][i] * chrFilter[j]) >> 1;
-            }
-
-            output_pixel(&uDest[i], u);
-            output_pixel(&vDest[i], v);
-        }
-    }
-
-    if (CONFIG_SWSCALE_ALPHA && aDest) {
-        for (i = 0; i < dstW; i++) {
-            int val = 1 << (30-output_bits - 1);
-            int j;
-
-            for (j = 0; j < lumFilterSize; j++)
-                val += (alpSrc[j][i] * lumFilter[j]) >> 1;
-
-            output_pixel(&aDest[i], val);
-        }
+        output_pixel(&Dest[i], val);
     }
 #undef output_pixel
 }
 
-static av_always_inline void
-yuv2yuvX10_c_template(const int16_t *lumFilter, const int16_t **lumSrc,
-                      int lumFilterSize, const int16_t *chrFilter,
-                      const int16_t **chrUSrc, const int16_t **chrVSrc,
-                      int chrFilterSize, const int16_t **alpSrc,
-                      uint16_t *dest[4], int dstW, int chrDstW,
-                      int big_endian, int output_bits)
-{
-    //FIXME Optimize (just quickly written not optimized..)
-    int i;
-    uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
-             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
-    int shift = 11 + 16 - output_bits;
-
 #define output_pixel(pos, val) \
     if (big_endian) { \
         AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
     } else { \
         AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
     }
+
+static av_always_inline void
+yuv2yuvX10_c_template(const int16_t *Filter, int FilterSize,
+                      const int16_t **Src, uint16_t *Dest, int dstW,
+                      int big_endian, int output_bits)
+{
+    int i;
+    int shift = 11 + 16 - output_bits;
+
     for (i = 0; i < dstW; i++) {
         int val = 1 << (26-output_bits);
         int j;
 
-        for (j = 0; j < lumFilterSize; j++)
-            val += lumSrc[j][i] * lumFilter[j];
-
-        output_pixel(&yDest[i], val);
-    }
-
-    if (uDest) {
-        for (i = 0; i < chrDstW; i++) {
-            int u = 1 << (26-output_bits);
-            int v = 1 << (26-output_bits);
-            int j;
+        for (j = 0; j < FilterSize; j++)
+            val += Src[j][i] * Filter[j];
 
-            for (j = 0; j < chrFilterSize; j++) {
-                u += chrUSrc[j][i] * chrFilter[j];
-                v += chrVSrc[j][i] * chrFilter[j];
-            }
-
-            output_pixel(&uDest[i], u);
-            output_pixel(&vDest[i], v);
-        }
+        output_pixel(&Dest[i], val);
     }
+}
 
-    if (CONFIG_SWSCALE_ALPHA && aDest) {
-        for (i = 0; i < dstW; i++) {
-            int val = 1 << (26-output_bits);
-            int j;
-
-            for (j = 0; j < lumFilterSize; j++)
-                val += alpSrc[j][i] * lumFilter[j];
-
-            output_pixel(&aDest[i], val);
-        }
-    }
 #undef output_pixel
-}
 
 #define yuv2NBPS(bits, BE_LE, is_be, yuv2yuvX_template_fn, typeX_t) \
-static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t 
*lumFilter, \
-                              const int16_t **_lumSrc, int lumFilterSize, \
-                              const int16_t *chrFilter, const int16_t 
**_chrUSrc, \
-                              const int16_t **_chrVSrc, \
-                              int chrFilterSize, const int16_t **_alpSrc, \
-                              uint8_t *_dest[4], int dstW, int chrDstW) \
+static void yuv2yuvX ## bits ## BE_LE ## _c(const int16_t *Filter, int 
FilterSize, \
+                              const int16_t **Src, uint8_t *Dest, int dstW, \
+                              const uint8_t *Dither, int offset)\
 { \
-    const typeX_t **lumSrc  = (const typeX_t **) _lumSrc, \
-                  **chrUSrc = (const typeX_t **) _chrUSrc, \
-                  **chrVSrc = (const typeX_t **) _chrVSrc, \
-                  **alpSrc  = (const typeX_t **) _alpSrc; \
-    yuv2yuvX_template_fn(lumFilter, lumSrc, lumFilterSize, \
-                         chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
-                         alpSrc, (uint16_t **) _dest, \
-                         dstW, chrDstW, is_be, bits); \
+    yuv2yuvX_template_fn(Filter, FilterSize, (const typeX_t **) Src, \
+                         (uint16_t *) Dest, dstW, is_be, bits); /* signature matches yuv2planarX_fn; >8bit planes hold uint16_t */ \
 }
 yuv2NBPS( 9, BE, 1, yuv2yuvX10_c_template, int16_t);
 yuv2NBPS( 9, LE, 0, yuv2yuvX10_c_template, int16_t);
@@ -339,51 +265,19 @@ yuv2NBPS(10, LE, 0, yuv2yuvX10_c_template, int16_t);
 yuv2NBPS(16, BE, 1, yuv2yuvX16_c_template, int32_t);
 yuv2NBPS(16, LE, 0, yuv2yuvX16_c_template, int32_t);
 
-static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
-                       const int16_t **lumSrc, int lumFilterSize,
-                       const int16_t *chrFilter, const int16_t **chrUSrc,
-                       const int16_t **chrVSrc,
-                       int chrFilterSize, const int16_t **alpSrc,
-                       uint8_t *dest[4], int dstW, int chrDstW)
+static void yuv2yuvX_c(const int16_t *Filter, int FilterSize,
+                       const int16_t **Src, uint8_t *Dest, int dstW,
+                       const uint8_t *Dither, int offset)
 {
-    uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
-            *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
     int i;
-    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
-
-    //FIXME Optimize (just quickly written not optimized..)
     for (i=0; i<dstW; i++) {
-        int val = lumDither[i & 7] << 12;
+        int val = Dither[(i + offset) & 7] << 12;
         int j;
-        for (j=0; j<lumFilterSize; j++)
-            val += lumSrc[j][i] * lumFilter[j];
+        for (j=0; j<FilterSize; j++)
+            val += Src[j][i] * Filter[j];
 
-        yDest[i]= av_clip_uint8(val>>19);
+        Dest[i]= av_clip_uint8(val>>19);
     }
-
-    if (uDest)
-        for (i=0; i<chrDstW; i++) {
-            int u = chrDither[i & 7] << 12;
-            int v = chrDither[(i + 3) & 7] << 12;
-            int j;
-            for (j=0; j<chrFilterSize; j++) {
-                u += chrUSrc[j][i] * chrFilter[j];
-                v += chrVSrc[j][i] * chrFilter[j];
-            }
-
-            uDest[i]= av_clip_uint8(u>>19);
-            vDest[i]= av_clip_uint8(v>>19);
-        }
-
-    if (CONFIG_SWSCALE_ALPHA && aDest)
-        for (i=0; i<dstW; i++) {
-            int val = lumDither[i & 7] << 12;
-            int j;
-            for (j=0; j<lumFilterSize; j++)
-                val += alpSrc[j][i] * lumFilter[j];
-
-            aDest[i]= av_clip_uint8(val>>19);
-        }
 }
 
 static void yuv2yuv1_c(const int16_t *Src, uint8_t *Dest, int dstW,
@@ -2117,26 +2011,28 @@ static av_always_inline void hcscale(SwsContext *c, 
int16_t *dst1, int16_t *dst2
 
 static av_always_inline void
 find_c_packed_planar_out_funcs(SwsContext *c,
-                               yuv2planar1_fn *yuv2yuv1,    yuv2planarX_fn 
*yuv2yuvX,
+                               yuv2planar1_fn *yuv2yuv1, yuv2planarX_fn 
*yuv2plane_luma,
+                               yuv2planarX_fn *yuv2plane_chroma, 
yuv2interleavedX_fn *yuv2nv12X,
                                yuv2packed1_fn *yuv2packed1, yuv2packed2_fn 
*yuv2packed2,
                                yuv2packedX_fn *yuv2packedX)
 {
     enum PixelFormat dstFormat = c->dstFormat;
 
     if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
-        *yuv2yuvX     = yuv2nv12X_c;
+        *yuv2nv12X     = yuv2nv12X_c;
     } else if (is16BPS(dstFormat)) {
-        *yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
+        *yuv2plane_luma = *yuv2plane_chroma = isBE(dstFormat) ? yuv2yuvX16BE_c 
 : yuv2yuvX16LE_c;
     } else if (is9_OR_10BPS(dstFormat)) {
         if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
-            *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
+            *yuv2plane_luma = *yuv2plane_chroma = isBE(dstFormat) ? 
yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
         } else {
-            *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
+            *yuv2plane_luma = *yuv2plane_chroma = isBE(dstFormat) ? 
yuv2yuvX10BE_c : yuv2yuvX10LE_c;
         }
     } else {
         *yuv2yuv1     = yuv2yuv1_c;
-        *yuv2yuvX     = yuv2yuvX_c;
+        *yuv2plane_luma = *yuv2plane_chroma = yuv2yuvX_c;
     }
+
     if(c->flags & SWS_FULL_CHR_H_INT) {
         switch (dstFormat) {
             case PIX_FMT_RGBA:
@@ -2396,7 +2292,9 @@ static int swScale(SwsContext *c, const uint8_t* src[],
     int lastDstY;
     uint32_t *pal=c->pal_yuv;
     yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
-    yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
+    yuv2planarX_fn yuv2plane_luma = c->yuv2plane_luma;
+    yuv2planarX_fn yuv2plane_chroma = c->yuv2plane_chroma;
+    yuv2interleavedX_fn yuv2nv12X = c->yuv2nv12X;
     yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
     yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
     yuv2packedX_fn yuv2packedX = c->yuv2packedX;
@@ -2548,8 +2446,8 @@ static int swScale(SwsContext *c, const uint8_t* src[],
         }
         if (dstY >= dstH-2) {
             // hmm looks like we can't use MMX here without overwriting this 
array's tail
-            find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
-                                           &yuv2packed1, &yuv2packed2,
+            find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2plane_luma, 
&yuv2plane_chroma,
+                                           &yuv2nv12X, &yuv2packed1, 
&yuv2packed2,
                                            &yuv2packedX);
         }
 
@@ -2562,8 +2460,15 @@ static int swScale(SwsContext *c, const uint8_t* src[],
                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                 if ((dstY&chrSkipMask) || isGray(dstFormat))
                     dest[1] = dest[2] = NULL; //FIXME split functions in lumi 
/ chromi
-                if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) 
{ // unscaled YV12
-                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) 
? alpSrcPtr[0] : NULL;
+                const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? 
alpSrcPtr[0] : NULL;
+
+                if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
+                   yuv2nv12X(c, vLumFilter + dstY * vLumFilterSize,
+                             lumSrcPtr, vLumFilterSize,
+                             vChrFilter + chrDstY * vChrFilterSize,
+                             chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
+                             alpSrcPtr, dest, dstW, chrDstW);
+                } else if (c->yuv2yuv1 && vLumFilterSize == 1 && 
vChrFilterSize == 1) { // unscaled YV12
                     yuv2yuv1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0);
 
                     if (dest[1]){
@@ -2571,14 +2476,18 @@ static int swScale(SwsContext *c, const uint8_t* src[],
                         yuv2yuv1(chrVSrcPtr[0], dest[2], chrDstW, 
c->chrDither8, 3);
                     }
 
-                    if (alpBuf)
+                    if (alpBuf && dest[3])
                         yuv2yuv1(alpBuf, dest[3], dstW, c->lumDither8, 0);
                 } else { //General YV12
-                    yuv2yuvX(c, vLumFilter + dstY * vLumFilterSize,
-                             lumSrcPtr, vLumFilterSize,
-                             vChrFilter + chrDstY * vChrFilterSize,
-                             chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
-                             alpSrcPtr, dest, dstW, chrDstW);
+                    yuv2plane_luma(vLumFilter + dstY * vLumFilterSize, 
vLumFilterSize, lumSrcPtr, dest[0], dstW, c->lumDither8, 0);
+
+                    if (dest[1]){
+                        yuv2plane_chroma(vChrFilter + chrDstY * 
vChrFilterSize, vChrFilterSize, chrUSrcPtr, dest[1], chrDstW, c->chrDither8, 0);
+                        yuv2plane_chroma(vChrFilter + chrDstY * 
vChrFilterSize, vChrFilterSize, chrVSrcPtr, dest[2], chrDstW, c->chrDither8, 3);
+                    }
+
+                    if (alpBuf && dest[3])
+                        yuv2plane_luma(vLumFilter + dstY * vLumFilterSize, 
vLumFilterSize, alpSrcPtr, dest[3], dstW, c->lumDither8, 0);
                 }
             } else {
                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + 
vLumBufSize*2);
@@ -2632,8 +2541,8 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
 {
     enum PixelFormat srcFormat = c->srcFormat;
 
-    find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX,
-                                   &c->yuv2packed1, &c->yuv2packed2,
+    find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2plane_luma, 
&c->yuv2plane_chroma,
+                                   &c->yuv2nv12X, &c->yuv2packed1, 
&c->yuv2packed2,
                                    &c->yuv2packedX);
 
     c->chrToYV12 = NULL;
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 8a8ad86..8cf1d45 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -66,17 +66,34 @@ typedef int (*SwsFunc)(struct SwsContext *context, const 
uint8_t* src[],
  *
  * @param Src     scaled source data, 15bit for 8-10bit output,
  *                19-bit for 16bit output (in int32_t)
- * @param dest    pointer to the 4 output planes (Y/U/V/A). For >8bit
+ * @param Dest    pointer to the output plane. For >8bit
  *                output, this is in uint16_t
- * @param dstW    width of dest[0], dest[3], lumSrc and alpSrc in pixels
- * @param Dither  TODO
+ * @param dstW    width of destination in pixels
+ * @param Dither  ordered dither array of type uint8_t and size 8
  * @param offset  Dither offset
  */
 typedef void (*yuv2planar1_fn) (const int16_t *Src, uint8_t *Dest, int dstW,
                                 const uint8_t *Dither, int offset);
 
 /**
- * Write one line of horizontally scaled Y/U/V/A to planar output
+ * Write one line of horizontally scaled data to planar output
+ * with multi-point vertical scaling between input pixels.
+ *
+ * @param Filter        vertical scaling coefficients, 12bit [0,4096]
+ * @param Src           scaled source data, 15bit for 8-10bit output,
+ *                      19-bit for 16bit output (in int32_t)
+ * @param FilterSize    number of vertical input lines to scale
+ * @param Dest          pointer to output plane. For >8bit
+ *                      output, this is in uint16_t
+ * @param dstW          width of destination in pixels
+ * @param offset        Dither offset, added to the pixel index into Dither
+ */
+typedef void (*yuv2planarX_fn) (const int16_t *Filter, int FilterSize,
+                                const int16_t **Src, uint8_t *Dest, int dstW,
+                                const uint8_t *Dither, int offset);
+
+/**
+ * Write one line of horizontally scaled Y/U/V/A to interleaved output
  * with multi-point vertical scaling between input pixels.
  *
  * @param c             SWS scaling context
@@ -97,12 +114,13 @@ typedef void (*yuv2planar1_fn) (const int16_t *Src, 
uint8_t *Dest, int dstW,
  * @param dstW          width of dest[0], dest[3], lumSrc and alpSrc in pixels
  * @param chrDstW       width of dest[1], dest[2], chrUSrc and chrVSrc
  */
-typedef void (*yuv2planarX_fn) (struct SwsContext *c, const int16_t *lumFilter,
-                                const int16_t **lumSrc, int lumFilterSize,
-                                const int16_t *chrFilter, const int16_t 
**chrUSrc,
-                                const int16_t **chrVSrc,  int chrFilterSize,
-                                const int16_t **alpSrc, uint8_t *dest[4],
-                                int dstW, int chrDstW);
+typedef void (*yuv2interleavedX_fn) (struct SwsContext *c, const int16_t 
*lumFilter,
+                                     const int16_t **lumSrc, int lumFilterSize,
+                                     const int16_t *chrFilter, const int16_t 
**chrUSrc,
+                                     const int16_t **chrVSrc, int 
chrFilterSize,
+                                     const int16_t **alpSrc, uint8_t *dest[4],
+                                     int dstW, int chrDstW);
+
 /**
  * Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
  * output without any additional vertical scaling (or point-scaling). Note
@@ -405,7 +423,9 @@ typedef struct SwsContext {
 
     /* function pointers for swScale() */
     yuv2planar1_fn yuv2yuv1;
-    yuv2planarX_fn yuv2yuvX;
+    yuv2planarX_fn yuv2plane_luma;
+    yuv2planarX_fn yuv2plane_chroma;
+    yuv2interleavedX_fn yuv2nv12X;
     yuv2packed1_fn yuv2packed1;
     yuv2packed2_fn yuv2packed2;
     yuv2packedX_fn yuv2packedX;
-- 
1.7.4.1

_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 2/2] Split up yuv2yuvX functions.

Reply via email to