Re: [FFmpeg-devel] [PATCH] lavfi: add xbr filter

Clément Bœsch Tue, 28 Oct 2014 11:17:18 -0700

On Tue, Oct 28, 2014 at 06:30:34PM +0100, Stefano Sabatini wrote:
[...]
> How much effort would it take to implement the remaining scaling modes?
>


According to
https://ffmpeg.org/pipermail/ffmpeg-devel/2014-October/164574.html

"I think 4x can be done fast enough, but 3x will take time."

[...]
> > +typedef struct {
> > +    uint32_t rgbtoyuv[1<<24];
> 
> We should avoid this 64MiB. Also the table should be possibly static,
> so you don't have to fill it per each xBR instance.
> 

So, I requested that it be done exactly the same way as in HQx because
this part is common according to the specifications. This should be kept
the same as in vf_hqx, and then factorized.

Now, about removing this allocation: I did benchmark the LUT vs. direct
computation (see the attached patch for the computed version). The problem
is that the computed version is slightly slower, probably due to the /1000.

I wasn't able to make it bitexact with the current code using bithacks,
and while this sounds like a tolerable inaccuracy, it actually isn't: it
has a visible impact on the output. For example, doing this (on top of the
attached patch):

diff --git a/libavfilter/vf_hqx.c b/libavfilter/vf_hqx.c
index 41a77cf..f4d8006 100644
--- a/libavfilter/vf_hqx.c
+++ b/libavfilter/vf_hqx.c
@@ -29,6 +29,7 @@
 
 #include "libavutil/opt.h"
 #include "libavutil/avassert.h"
+#include "libavutil/colorspace.h"
 #include "libavutil/pixdesc.h"
 #include "internal.h"
 
@@ -58,9 +59,9 @@ static av_always_inline uint32_t rgb2yuv(uint32_t c)
     const int r = c >> 16 & 0xff;
     const int g = c >>  8 & 0xff;
     const int b = c       & 0xff;
-    const uint32_t y = (uint32_t)(( 299*r + 587*g + 114*b)/1000);
-    const uint32_t u = (uint32_t)((-169*r - 331*g + 500*b)/1000) + 128;
-    const uint32_t v = (uint32_t)(( 500*r - 419*g -  81*b)/1000) + 128;
+    const uint32_t y = RGB_TO_Y(r, g, b);
+    const uint32_t u = RGB_TO_U(r, g, b, 0);
+    const uint32_t v = RGB_TO_V(r, g, b, 0);
     return y<<16 | u<<8 | v;
 }
 

...leads to this: https://lut.im/S9sJXgGU/ttB0B1j1 vs
https://lut.im/9iRC6VMx/ef3PKqYd (look at the sorcerers, for instance, or
bomberman)

Even with a higher bit depth and checking the rounding, I had differences.

So for now, I prefer to keep the LUT unless someone has a better idea. And
anyway, this is orthogonal to this patch.

[...]

-- 
Clément B.

From 411b4e217f893a4ca1077d9814af02cf5349054a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20B=C5=93sch?= <u...@pkh.me>
Date: Mon, 27 Oct 2014 23:49:47 +0100
Subject: [PATCH] avfilter/hqx/WIP: remove LUT

---
 libavfilter/vf_hqx.c | 88 ++++++++++++++++++++--------------------------------
 1 file changed, 34 insertions(+), 54 deletions(-)

diff --git a/libavfilter/vf_hqx.c b/libavfilter/vf_hqx.c
index 4783381..41a77cf 100644
--- a/libavfilter/vf_hqx.c
+++ b/libavfilter/vf_hqx.c
@@ -38,12 +38,10 @@ typedef struct {
     const AVClass *class;
     int n;
     hqxfunc_t func;
-    uint32_t rgbtoyuv[1<<24];
 } HQXContext;
 
 typedef struct ThreadData {
     AVFrame *in, *out;
-    const uint32_t *rgbtoyuv;
 } ThreadData;
 
 #define OFFSET(x) offsetof(HQXContext, x)
@@ -55,9 +53,15 @@ static const AVOption hqx_options[] = {
 
 AVFILTER_DEFINE_CLASS(hqx);
 
-static av_always_inline uint32_t rgb2yuv(const uint32_t *r2y, uint32_t c)
+static av_always_inline uint32_t rgb2yuv(uint32_t c)
 {
-    return r2y[c & 0xffffff];
+    const int r = c >> 16 & 0xff;
+    const int g = c >>  8 & 0xff;
+    const int b = c       & 0xff;
+    const uint32_t y = (uint32_t)(( 299*r + 587*g + 114*b)/1000);
+    const uint32_t u = (uint32_t)((-169*r - 331*g + 500*b)/1000) + 128;
+    const uint32_t v = (uint32_t)(( 500*r - 419*g -  81*b)/1000) + 128;
+    return y<<16 | u<<8 | v;
 }
 
 static av_always_inline int yuv_diff(uint32_t yuv1, uint32_t yuv2)
@@ -97,7 +101,7 @@ static av_always_inline uint32_t interp_3px(uint32_t c1, int w1, uint32_t c2, in
 #define SHF(x, rot, n) (((x) >> ((rot) ? 7-DROP4(n) : DROP4(n)) & 1) << DROP4(p##n))
 
 /* used to check if there is YUV difference between 2 pixels */
-#define WDIFF(c1, c2) yuv_diff(rgb2yuv(r2y, c1), rgb2yuv(r2y, c2))
+#define WDIFF(c1, c2) yuv_diff(rgb2yuv(c1), rgb2yuv(c2))
 
 /* bootstrap template for every interpolation code. It defines the shuffled
  * masks and surrounding pixels. The rot flag is used to indicate if it's a
@@ -114,8 +118,7 @@ static av_always_inline uint32_t interp_3px(uint32_t c1, int w1, uint32_t c2, in
 /* Assuming p0..p8 is mapped to pixels 0..8, this function interpolates the
  * top-left pixel in the total of the 2x2 pixels to interpolates. The function
  * is also used for the 3 other pixels */
-static av_always_inline uint32_t hq2x_interp_1x1(const uint32_t *r2y, int k,
-                                                 const uint32_t *w,
+static av_always_inline uint32_t hq2x_interp_1x1(int k, const uint32_t *w,
                                                  int p0, int p1, int p2,
                                                  int p3, int p4, int p5,
                                                  int p6, int p7, int p8)
@@ -165,8 +168,7 @@ static av_always_inline uint32_t hq2x_interp_1x1(const uint32_t *r2y, int k,
  * defining the outline. The center pixel is not defined through this function,
  * since it's just the same as the original value. */
 static av_always_inline void hq3x_interp_2x1(uint32_t *dst, int dst_linesize,
-                                             const uint32_t *r2y, int k,
-                                             const uint32_t *w,
+                                             int k, const uint32_t *w,
                                              int pos00, int pos01,
                                              int p0, int p1, int p2,
                                              int p3, int p4, int p5,
@@ -231,8 +233,7 @@ static av_always_inline void hq3x_interp_2x1(uint32_t *dst, int dst_linesize,
  * interpolates. The function is also used for the 3 other blocks of 2x2
  * pixels. */
 static av_always_inline void hq4x_interp_2x2(uint32_t *dst, int dst_linesize,
-                                             const uint32_t *r2y, int k,
-                                             const uint32_t *w,
+                                             int k, const uint32_t *w,
                                              int pos00, int pos01,
                                              int pos10, int pos11,
                                              int p0, int p1, int p2,
@@ -382,7 +383,6 @@ static av_always_inline void hqx_filter(const ThreadData *td, int jobnr, int nb_
 {
     int x, y;
     AVFrame *in = td->in, *out = td->out;
-    const uint32_t *r2y = td->rgbtoyuv;
     const int height = in->height;
     const int width  = in->width;
     const int slice_start = (height *  jobnr   ) / nb_jobs;
@@ -409,32 +409,32 @@ static av_always_inline void hqx_filter(const ThreadData *td, int jobnr, int nb_
                 src32[prevcol           ], src32[       0], src32[           nextcol],
                 src32[prevcol + nextline], src32[nextline], src32[nextline + nextcol]
             };
-            const uint32_t yuv1 = rgb2yuv(r2y, w[4]);
-            const int pattern = (w[4] != w[0] ? (yuv_diff(yuv1, rgb2yuv(r2y, w[0]))) : 0)
-                              | (w[4] != w[1] ? (yuv_diff(yuv1, rgb2yuv(r2y, w[1]))) : 0) << 1
-                              | (w[4] != w[2] ? (yuv_diff(yuv1, rgb2yuv(r2y, w[2]))) : 0) << 2
-                              | (w[4] != w[3] ? (yuv_diff(yuv1, rgb2yuv(r2y, w[3]))) : 0) << 3
-                              | (w[4] != w[5] ? (yuv_diff(yuv1, rgb2yuv(r2y, w[5]))) : 0) << 4
-                              | (w[4] != w[6] ? (yuv_diff(yuv1, rgb2yuv(r2y, w[6]))) : 0) << 5
-                              | (w[4] != w[7] ? (yuv_diff(yuv1, rgb2yuv(r2y, w[7]))) : 0) << 6
-                              | (w[4] != w[8] ? (yuv_diff(yuv1, rgb2yuv(r2y, w[8]))) : 0) << 7;
+            const uint32_t yuv1 = rgb2yuv(w[4]);
+            const int pattern = (w[4] != w[0] ? (yuv_diff(yuv1, rgb2yuv(w[0]))) : 0)
+                              | (w[4] != w[1] ? (yuv_diff(yuv1, rgb2yuv(w[1]))) : 0) << 1
+                              | (w[4] != w[2] ? (yuv_diff(yuv1, rgb2yuv(w[2]))) : 0) << 2
+                              | (w[4] != w[3] ? (yuv_diff(yuv1, rgb2yuv(w[3]))) : 0) << 3
+                              | (w[4] != w[5] ? (yuv_diff(yuv1, rgb2yuv(w[5]))) : 0) << 4
+                              | (w[4] != w[6] ? (yuv_diff(yuv1, rgb2yuv(w[6]))) : 0) << 5
+                              | (w[4] != w[7] ? (yuv_diff(yuv1, rgb2yuv(w[7]))) : 0) << 6
+                              | (w[4] != w[8] ? (yuv_diff(yuv1, rgb2yuv(w[8]))) : 0) << 7;
 
             if (n == 2) {
-                dst32[dst32_linesize*0 + 0] = hq2x_interp_1x1(r2y, pattern, w, 0,1,2,3,4,5,6,7,8);  // 00
-                dst32[dst32_linesize*0 + 1] = hq2x_interp_1x1(r2y, pattern, w, 2,1,0,5,4,3,8,7,6);  // 01 (vert mirrored)
-                dst32[dst32_linesize*1 + 0] = hq2x_interp_1x1(r2y, pattern, w, 6,7,8,3,4,5,0,1,2);  // 10 (horiz mirrored)
-                dst32[dst32_linesize*1 + 1] = hq2x_interp_1x1(r2y, pattern, w, 8,7,6,5,4,3,2,1,0);  // 11 (center mirrored)
+                dst32[dst32_linesize*0 + 0] = hq2x_interp_1x1(pattern, w, 0,1,2,3,4,5,6,7,8);  // 00
+                dst32[dst32_linesize*0 + 1] = hq2x_interp_1x1(pattern, w, 2,1,0,5,4,3,8,7,6);  // 01 (vert mirrored)
+                dst32[dst32_linesize*1 + 0] = hq2x_interp_1x1(pattern, w, 6,7,8,3,4,5,0,1,2);  // 10 (horiz mirrored)
+                dst32[dst32_linesize*1 + 1] = hq2x_interp_1x1(pattern, w, 8,7,6,5,4,3,2,1,0);  // 11 (center mirrored)
             } else if (n == 3) {
-                hq3x_interp_2x1(dst32,                        dst32_linesize, r2y, pattern, w, 0,1, 0,1,2,3,4,5,6,7,8, 0);  // 00 01
-                hq3x_interp_2x1(dst32 + 1,                    dst32_linesize, r2y, pattern, w, 1,3, 2,5,8,1,4,7,0,3,6, 1);  // 02 12 (rotated to the right)
-                hq3x_interp_2x1(dst32 + 1*dst32_linesize,     dst32_linesize, r2y, pattern, w, 2,0, 6,3,0,7,4,1,8,5,2, 1);  // 20 10 (rotated to the left)
-                hq3x_interp_2x1(dst32 + 1*dst32_linesize + 1, dst32_linesize, r2y, pattern, w, 3,2, 8,7,6,5,4,3,2,1,0, 0);  // 22 21 (center mirrored)
-                dst32[dst32_linesize + 1] = w[4];                                                                           // 11
+                hq3x_interp_2x1(dst32,                        dst32_linesize, pattern, w, 0,1, 0,1,2,3,4,5,6,7,8, 0);  // 00 01
+                hq3x_interp_2x1(dst32 + 1,                    dst32_linesize, pattern, w, 1,3, 2,5,8,1,4,7,0,3,6, 1);  // 02 12 (rotated to the right)
+                hq3x_interp_2x1(dst32 + 1*dst32_linesize,     dst32_linesize, pattern, w, 2,0, 6,3,0,7,4,1,8,5,2, 1);  // 20 10 (rotated to the left)
+                hq3x_interp_2x1(dst32 + 1*dst32_linesize + 1, dst32_linesize, pattern, w, 3,2, 8,7,6,5,4,3,2,1,0, 0);  // 22 21 (center mirrored)
+                dst32[dst32_linesize + 1] = w[4];                                                                      // 11
             } else if (n == 4) {
-                hq4x_interp_2x2(dst32,                        dst32_linesize, r2y, pattern, w, 0,1,2,3, 0,1,2,3,4,5,6,7,8); // 00 01 10 11
-                hq4x_interp_2x2(dst32 + 2,                    dst32_linesize, r2y, pattern, w, 1,0,3,2, 2,1,0,5,4,3,8,7,6); // 02 03 12 13 (vert mirrored)
-                hq4x_interp_2x2(dst32 + 2*dst32_linesize,     dst32_linesize, r2y, pattern, w, 2,3,0,1, 6,7,8,3,4,5,0,1,2); // 20 21 30 31 (horiz mirrored)
-                hq4x_interp_2x2(dst32 + 2*dst32_linesize + 2, dst32_linesize, r2y, pattern, w, 3,2,1,0, 8,7,6,5,4,3,2,1,0); // 22 23 32 33 (center mirrored)
+                hq4x_interp_2x2(dst32,                        dst32_linesize, pattern, w, 0,1,2,3, 0,1,2,3,4,5,6,7,8); // 00 01 10 11
+                hq4x_interp_2x2(dst32 + 2,                    dst32_linesize, pattern, w, 1,0,3,2, 2,1,0,5,4,3,8,7,6); // 02 03 12 13 (vert mirrored)
+                hq4x_interp_2x2(dst32 + 2*dst32_linesize,     dst32_linesize, pattern, w, 2,3,0,1, 6,7,8,3,4,5,0,1,2); // 20 21 30 31 (horiz mirrored)
+                hq4x_interp_2x2(dst32 + 2*dst32_linesize + 2, dst32_linesize, pattern, w, 3,2,1,0, 8,7,6,5,4,3,2,1,0); // 22 23 32 33 (center mirrored)
             } else {
                 av_assert0(0);
             }
@@ -497,7 +497,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 
     td.in = in;
     td.out = out;
-    td.rgbtoyuv = hqx->rgbtoyuv;
     ctx->internal->execute(ctx, hqx->func, &td, NULL, FFMIN(inlink->h, ctx->graph->nb_threads));
 
     av_frame_free(&in);
@@ -508,25 +507,6 @@ static av_cold int init(AVFilterContext *ctx)
 {
     HQXContext *hqx = ctx->priv;
     static const hqxfunc_t hqxfuncs[] = {hq2x, hq3x, hq4x};
-
-    uint32_t c;
-    int bg, rg, g;
-
-    for (bg=-255; bg<256; bg++) {
-        for (rg=-255; rg<256; rg++) {
-            const uint32_t u = (uint32_t)((-169*rg + 500*bg)/1000) + 128;
-            const uint32_t v = (uint32_t)(( 500*rg -  81*bg)/1000) + 128;
-            int startg = FFMAX3(-bg, -rg, 0);
-            int endg = FFMIN3(255-bg, 255-rg, 255);
-            uint32_t y = (uint32_t)(( 299*rg + 1000*startg + 114*bg)/1000);
-            c = bg + (rg<<16) + 0x010101 * startg;
-            for (g = startg; g <= endg; g++) {
-                hqx->rgbtoyuv[c] = ((y++) << 16) + (u << 8) + v;
-                c+= 0x010101;
-            }
-        }
-    }
-
     hqx->func = hqxfuncs[hqx->n - 2];
     return 0;
 }
-- 
2.1.2

pgpkqHDwXtaeI.pgp
Description: PGP signature

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH] lavfi: add xbr filter

Reply via email to