[Pixman] [PATCH 03/14] More general BILINEAR=>NEAREST reduction

2016-04-11 Thread Søren Sandmann Pedersen
Generalize and simplify the code that reduces BILINEAR to NEAREST so
that the reduction happens for all affine transformations where
t00...t12 are integers and (t00 + t01) and (t10 + t11) are both
odd. This is a sufficient condition for the resulting transformed
coordinates to be exactly at the center of a pixel so that BILINEAR
becomes identical to NEAREST.

V2: Address some comments by Bill Spitzak

Signed-off-by: Søren Sandmann 
---
 pixman/pixman-image.c | 66 +--
 1 file changed, 38 insertions(+), 28 deletions(-)

diff --git a/pixman/pixman-image.c b/pixman/pixman-image.c
index 1ff1a49..681864e 100644
--- a/pixman/pixman-image.c
+++ b/pixman/pixman-image.c
@@ -335,37 +335,47 @@ compute_image_info (pixman_image_t *image)
{
flags |= FAST_PATH_NEAREST_FILTER;
}
-   else if (
-   /* affine and integer translation components in matrix ... */
-   ((flags & FAST_PATH_AFFINE_TRANSFORM) &&
-!pixman_fixed_frac (image->common.transform->matrix[0][2] |
-image->common.transform->matrix[1][2])) &&
-   (
-   /* ... combined with a simple rotation */
-   (flags & (FAST_PATH_ROTATE_90_TRANSFORM |
- FAST_PATH_ROTATE_180_TRANSFORM |
- FAST_PATH_ROTATE_270_TRANSFORM)) ||
-   /* ... or combined with a simple non-rotated translation */
-   (image->common.transform->matrix[0][0] == pixman_fixed_1 &&
-image->common.transform->matrix[1][1] == pixman_fixed_1 &&
-image->common.transform->matrix[0][1] == 0 &&
-image->common.transform->matrix[1][0] == 0)
-   )
-   )
+   else if (flags & FAST_PATH_AFFINE_TRANSFORM)
{
-   /* FIXME: there are some affine-test failures, showing that
-* handling of BILINEAR and NEAREST filter is not quite
-* equivalent when getting close to 32K for the translation
-* components of the matrix. That's likely some bug, but for
-* now just skip BILINEAR->NEAREST optimization in this case.
+   /* Suppose the transform is
+*
+*[ t00, t01, t02 ]
+*[ t10, t11, t12 ]
+*[   0,   0,   1 ]
+*
+* and the destination coordinates are (n + 0.5, m + 0.5). Then
+* the transformed x coordinate is:
+*
+* tx = t00 * (n + 0.5) + t01 * (m + 0.5) + t02
+*= t00 * n + t01 * m + t02 + (t00 + t01) * 0.5
+*
+* which implies that if t00, t01 and t02 are all integers
+* and (t00 + t01) is odd, then tx will be an integer plus 0.5,
+* which means a BILINEAR filter will reduce to NEAREST. The same
+* applies in the y direction
 */
-   pixman_fixed_t magic_limit = pixman_int_to_fixed (3);
-   if (image->common.transform->matrix[0][2] <= magic_limit  &&
-   image->common.transform->matrix[1][2] <= magic_limit  &&
-   image->common.transform->matrix[0][2] >= -magic_limit &&
-   image->common.transform->matrix[1][2] >= -magic_limit)
+   pixman_fixed_t (*t)[3] = image->common.transform->matrix;
+
+   if ((pixman_fixed_frac (
+t[0][0] | t[0][1] | t[0][2] |
+t[1][0] | t[1][1] | t[1][2]) == 0) &&
+   (pixman_fixed_to_int (
+   (t[0][0] + t[0][1]) & (t[1][0] + t[1][1])) % 2) == 1)
{
-   flags |= FAST_PATH_NEAREST_FILTER;
+   /* FIXME: there are some affine-test failures, showing that
+* handling of BILINEAR and NEAREST filter is not quite
+* equivalent when getting close to 32K for the translation
+* components of the matrix. That's likely some bug, but for
+* now just skip BILINEAR->NEAREST optimization in this case.
+*/
+   pixman_fixed_t magic_limit = pixman_int_to_fixed (3);
+   if (image->common.transform->matrix[0][2] <= magic_limit  &&
+   image->common.transform->matrix[1][2] <= magic_limit  &&
+   image->common.transform->matrix[0][2] >= -magic_limit &&
+   image->common.transform->matrix[1][2] >= -magic_limit)
+   {
+   flags |= FAST_PATH_NEAREST_FILTER;
+   }
}
}
break;
-- 
1.7.11.7

___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


[Pixman] [PATCH 13/14] pixman-filter: Nested polynomial for cubic

2016-04-11 Thread Søren Sandmann Pedersen
From: Bill Spitzak 

v11: Restored range checks

Signed-off-by: Bill Spitzak 
Reviewed-by: Oded Gabbay 
---
 pixman/pixman-filter.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/pixman/pixman-filter.c b/pixman/pixman-filter.c
index 4abd05f..db4ab6e 100644
--- a/pixman/pixman-filter.c
+++ b/pixman/pixman-filter.c
@@ -109,14 +109,16 @@ general_cubic (double x, double B, double C)
 
 if (ax < 1)
 {
-   return ((12 - 9 * B - 6 * C) * ax * ax * ax +
-   (-18 + 12 * B + 6 * C) * ax * ax + (6 - 2 * B)) / 6;
+   return (((12 - 9 * B - 6 * C) * ax +
+(-18 + 12 * B + 6 * C)) * ax * ax +
+   (6 - 2 * B)) / 6;
 }
-else if (ax >= 1 && ax < 2)
+else if (ax < 2)
 {
-   return ((-B - 6 * C) * ax * ax * ax +
-   (6 * B + 30 * C) * ax * ax + (-12 * B - 48 * C) *
-   ax + (8 * B + 24 * C)) / 6;
+   return ((((-B - 6 * C) * ax +
+ (6 * B + 30 * C)) * ax +
+(-12 * B - 48 * C)) * ax +
+   (8 * B + 24 * C)) / 6;
 }
 else
 {
-- 
1.7.11.7

___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


[Pixman] Bill Spitzak patches

2016-04-11 Thread Søren Sandmann Pedersen
Hi,

The following patch series contains those of Bill's patches that I
think are ready to be pushed to master, plus some other related
changes that I also think are ready.

01-03: These are patches to do more BILINEAR->NEAREST filter
       reductions. They were inspired by a similar patch in Bill's
       series, but these patches do the reduction in more cases and
       include tests.

04:    Compute the filter size from a transformed ellipse.

05-06: UI fixes to demos/scale.

07:    gnuplot output

       This is based on Bill's gnuplot patch, but I rewrote the logic
       in pixman-filter.c to generate correct coordinates, and added a
       big comment explaining how the phase interleaving works.

08:    Reduce malloc()/free()/memcpy()

09:    Correct Simpson's integration

10:    Integral splitting is only necessary for the LINEAR filter

       I rebased this so that it doesn't depend on the changes to the
       integral() from Bill's series, and made the comment in the code
       match the new code.

11:    Speed up BOX/BOX

       I rebased this and removed the normalization

12:    Fix several issues related to normalization

       This patch fixes several normalization issues including the one
       fixed in Bill's series.

13:    Nested polynomial for cubic

14:    Made Gaussian a bit wider

In the patches where I made changes, I have generally retained Bill as
the author if the patch still contained a substantial block of code
that was written by Bill. Those that I rewrote completely (the
BILINEAR=>NEAREST and the normalization ones), I have put myself as
author.

However, in all cases I'm happy enough to put either me or Bill as the
author. If anyone has strong opinions about this, let me know.

In all the patches I have also reformatted the commit log so that it
fits within 80 characters.

With the exception of the scale->rscale one, I think the remaining
patches in Bill's series should not be accepted, though it is possible
that a new series rebased on top of this will reveal that I
missed something.


Søren
___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


[Pixman] [PATCH 11/14] pixman-filter: Speed up BOX/BOX filter

2016-04-11 Thread Søren Sandmann Pedersen
The convolution of two BOX filters is simply the length of the
interval where both are non-zero, so we can simply return width from
the integral() function because the integration region has already
been restricted to be such that both functions are non-zero on it.

This is both faster and more accurate than doing numerical integration.

This patch is based on one by Bill Spitzak

https://lists.freedesktop.org/archives/pixman/2016-March/004446.html

with these changes:

- Rebased to not assume any changes in the arguments to integral().

- Dropped the multiplication by scale

- Added more details in the commit message.

Signed-off-by: Søren Sandmann 
---
 pixman/pixman-filter.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pixman/pixman-filter.c b/pixman/pixman-filter.c
index c868723..32aaa9a 100644
--- a/pixman/pixman-filter.c
+++ b/pixman/pixman-filter.c
@@ -160,11 +160,15 @@ integral (pixman_kernel_t kernel1, double x1,
  pixman_kernel_t kernel2, double scale, double x2,
  double width)
 {
+if (kernel1 == PIXMAN_KERNEL_BOX && kernel2 == PIXMAN_KERNEL_BOX)
+{
+   return width;
+}
 /* The LINEAR filter is not differentiable at 0, so if the
  * integration interval crosses zero, break it into two
  * separate integrals.
  */
-if (kernel1 == PIXMAN_KERNEL_LINEAR && x1 < 0 && x1 + width > 0)
+else if (kernel1 == PIXMAN_KERNEL_LINEAR && x1 < 0 && x1 + width > 0)
 {
return
integral (kernel1, x1, kernel2, scale, x2, - x1) +
-- 
1.7.11.7

___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


[Pixman] [PATCH 02/14] Add new test of filter reduction from BILINEAR to NEAREST

2016-04-11 Thread Søren Sandmann Pedersen
This new test tests a bunch of bilinear downscalings, where many have
a transformation such that the BILINEAR filter can be reduced to
NEAREST (and many don't).

A CRC32 is computed for all the resulting images and compared to a
known-good value for both 4-bit and 7-bit interpolation.

V2: Remove leftover comment, some minor formatting fixes, use a
timestamp as the PRNG seed.

Signed-off-by: Søren Sandmann 
---
 test/Makefile.sources|   1 +
 test/filter-reduction-test.c | 112 +++
 2 files changed, 113 insertions(+)
 create mode 100644 test/filter-reduction-test.c

diff --git a/test/Makefile.sources b/test/Makefile.sources
index 5d55e67..0a56231 100644
--- a/test/Makefile.sources
+++ b/test/Makefile.sources
@@ -21,6 +21,7 @@ TESTPROGRAMS =  \
gradient-crash-test   \
pixel-test\
matrix-test   \
+   filter-reduction-test \
composite-traps-test  \
region-contains-test  \
glyph-test\
diff --git a/test/filter-reduction-test.c b/test/filter-reduction-test.c
new file mode 100644
index 0000000..705fa4b
--- /dev/null
+++ b/test/filter-reduction-test.c
@@ -0,0 +1,112 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+static const pixman_fixed_t entries[] =
+{
+pixman_double_to_fixed (-1.0),
+pixman_double_to_fixed (-0.5),
+pixman_double_to_fixed (-1/3.0),
+pixman_double_to_fixed (0.0),
+pixman_double_to_fixed (0.5),
+pixman_double_to_fixed (1.0),
+pixman_double_to_fixed (1.5),
+pixman_double_to_fixed (2.0),
+pixman_double_to_fixed (3.0),
+};
+
+#define SIZE 12
+
+static uint32_t
+test_scale (const pixman_transform_t *xform, uint32_t crc)
+{
+uint32_t *srcbuf, *dstbuf;
+pixman_image_t *src, *dest;
+
+srcbuf = malloc (SIZE * SIZE * 4);
+prng_randmemset (srcbuf, SIZE * SIZE * 4, 0);
+src = pixman_image_create_bits (
+   PIXMAN_a8r8g8b8, SIZE, SIZE, srcbuf, SIZE * 4);
+
+dstbuf = malloc (SIZE * SIZE * 4);
+prng_randmemset (dstbuf, SIZE * SIZE * 4, 0);
+dest = pixman_image_create_bits (
+   PIXMAN_a8r8g8b8, SIZE, SIZE, dstbuf, SIZE * 4);
+
+pixman_image_set_transform (src, xform);
+pixman_image_set_repeat (src, PIXMAN_REPEAT_NORMAL);
+pixman_image_set_filter (src, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+image_endian_swap (src);
+image_endian_swap (dest);
+
+pixman_image_composite (PIXMAN_OP_SRC,
+   src, NULL, dest,
+   0, 0, 0, 0, 0, 0,
+   SIZE, SIZE);
+
+crc = compute_crc32_for_image (crc, dest);
+
+pixman_image_unref (src);
+pixman_image_unref (dest);
+
+free (srcbuf);
+free (dstbuf);
+
+return crc;
+}
+
+#if BILINEAR_INTERPOLATION_BITS == 7
+#define CHECKSUM 0x02169677
+#elif BILINEAR_INTERPOLATION_BITS == 4
+#define CHECKSUM 0xE44B29AC
+#else
+#define CHECKSUM 0x00000000
+#endif
+
+int
+main (int argc, const char *argv[])
+{
+const pixman_fixed_t *end = entries + ARRAY_LENGTH (entries);
+const pixman_fixed_t *t0, *t1, *t2, *t3, *t4, *t5;
+uint32_t crc = 0;
+
+prng_srand (0x56EA1DBD);
+
+for (t0 = entries; t0 < end; ++t0)
+{
+   for (t1 = entries; t1 < end; ++t1)
+   {
+   for (t2 = entries; t2 < end; ++t2)
+   {
+   for (t3 = entries; t3 < end; ++t3)
+   {
+   for (t4 = entries; t4 < end; ++t4)
+   {
+   for (t5 = entries; t5 < end; ++t5)
+   {
+   pixman_transform_t xform = {
+   { { *t0, *t1, *t2 },
+ { *t3, *t4, *t5 },
+ { 0, 0, pixman_fixed_1 } }
+   };
+
+   crc = test_scale (&xform, crc);
+   }
+   }
+   }
+   }
+   }
+}
+
+if (crc != CHECKSUM)
+{
+   printf ("filter-reduction-test failed! (checksum=0x%08X, expected 0x%08X)\n", crc, CHECKSUM);
+   return 1;
+}
+else
+{
+   printf ("filter-reduction-test passed (checksum=0x%08X)\n", crc);
+   return 0;
+}
+}
-- 
1.7.11.7

___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


[Pixman] [PATCH 07/14] pixman-image: Added enable-gnuplot config to view filters in gnuplot

2016-04-11 Thread Søren Sandmann Pedersen
From: Bill Spitzak 

If enable-gnuplot is configured, then you can pipe the output of a
pixman-using program to gnuplot and get a continuously-updated plot of
the horizontal filter. This works well with demos/scale to test the
filter generation.

The plot is all the different subposition filters shuffled
together. This is misleading in a few cases:

  IMPULSE.BOX - goes up and down as the subfilters have different
numbers of non-zero samples

  IMPULSE.TRIANGLE - somewhat crooked for the same reason

  1-wide filters - looks triangular, but a 1-wide box would be more
   accurate

Changes by Søren: Rewrote the pixman-filter.c part to
 - make it generate correct coordinates
 - add a comment on how coordinates are generated
 - in rounding.txt, add a ceil() variant of the first-sample
   formula
 - make the gnuplot output slightly prettier

v7: First time this ability was included

v8: Use config option
Moved code to the filter generator
Modified scale demo to not call filter generator a second time.

v10: Only print if successful generation of plots
 Use #ifdef, not #if

v11: small whitespace fixes

Signed-off-by: Bill Spitzak 
Signed-off-by: Søren Sandmann 
---
 configure.ac   |  13 ++
 pixman/pixman-filter.c | 115 +
 pixman/rounding.txt|   1 +
 3 files changed, 129 insertions(+)

diff --git a/configure.ac b/configure.ac
index 6b2134e..e833e45 100644
--- a/configure.ac
+++ b/configure.ac
@@ -834,6 +834,19 @@ fi
 AC_SUBST(PIXMAN_TIMERS)
 
 dnl ===
+dnl gnuplot
+
+AC_ARG_ENABLE(gnuplot,
+   [AC_HELP_STRING([--enable-gnuplot],
+   [enable output of filters that can be piped to gnuplot [default=no]])],
+   [enable_gnuplot=$enableval], [enable_gnuplot=no])
+
+if test $enable_gnuplot = yes ; then
+   AC_DEFINE(PIXMAN_GNUPLOT, 1, [enable output that can be piped to gnuplot])
+fi
+AC_SUBST(PIXMAN_GNUPLOT)
+
+dnl ===
 dnl GTK+
 
 AC_ARG_ENABLE(gtk,
diff --git a/pixman/pixman-filter.c b/pixman/pixman-filter.c
index b2bf53f..af46a43 100644
--- a/pixman/pixman-filter.c
+++ b/pixman/pixman-filter.c
@@ -297,6 +297,117 @@ create_1d_filter (int *width,
 return params;
 }
 
+#ifdef PIXMAN_GNUPLOT
+
+/* If enable-gnuplot is configured, then you can pipe the output of a
+ * pixman-using program to gnuplot and get a continuously-updated plot
+ * of the horizontal filter. This works well with demos/scale to test
+ * the filter generation.
+ *
+ * The plot is all the different subposition filters shuffled
+ * together. This is misleading in a few cases:
+ *
+ *  IMPULSE.BOX - goes up and down as the subfilters have different
+ *   numbers of non-zero samples
+ *  IMPULSE.TRIANGLE - somewhat crooked for the same reason
+ *  1-wide filters - looks triangular, but a 1-wide box would be more
+ *  accurate
+ */
+static void
+gnuplot_filter (int width, int n_phases, const pixman_fixed_t* p)
+{
+double step;
+int i, j;
+int first;
+
+step = 1.0 / n_phases;
+
+printf ("set style line 1 lc rgb '#0060ad' lt 1 lw 0.5 pt 7 pi 1 ps 0.5\n");
+printf ("plot '-' with linespoints ls 1\n");
+
+/* The position of the first sample of the phase corresponding to
+ * frac is given by:
+ * 
+ * ceil (frac - width / 2.0 - 0.5) + 0.5 - frac
+ * 
+ * We have to find the frac that minimizes this expression.
+ * 
+ * For odd widths, we have
+ * 
+ * ceil (frac - width / 2.0 - 0.5) + 0.5 - frac
+ *   = ceil (frac) + K - frac
+ *   = 1 + K - frac
+ * 
+ * for some K, so this is minimized when frac is maximized and
+ * strictly growing with frac. So for odd widths, we can simply
+ * start at the last phase and go backwards.
+ * 
+ * For even widths, we have
+ * 
+ * ceil (frac - width / 2.0 - 0.5) + 0.5 - frac
+ *   = ceil (frac - 0.5) + K - frac
+ * 
+ * The graph for this function (ignoring K) looks like this:
+ * 
+ *        0.5
+ *           |    |\ 
+ *           |    | \ 
+ *           |    |  \ 
+ *         0 |    |   \ 
+ *           |\   |
+ *           | \  |
+ *           |  \ |
+ *      -0.5 |   \|
+ *           -----------
+ *           0   0.5   1
+ * 
+ * So in this case we need to start with the phase whose frac is
+ * less than, but as close as possible to 0.5, then go backwards
+ * until we hit the first phase, then wrap around to the last
+ * phase and continue backwards.
+ * 
+ * Which phase is as close as possible to 0.5? The location of the
+ * sampling point corresponding to the kth phase is given by
+ * 1/(2 * n_phases) + k / n_phases:
+ * 
+ * 1/(2 * n_phases) + k / n_phases = 0.5
+ 

[Pixman] [PATCH 10/14] pixman-filter: integral splitting is only needed for triangle filter

2016-04-11 Thread Søren Sandmann Pedersen
From: Bill Spitzak 

Only the triangle (LINEAR) filter is non-differentiable at 0. The other
filters resemble a cubic closely enough that Simpson's integration
works without splitting.

Changes by Søren: Rebase without the changes to the integral function,
update comment to match the new code.

Signed-off-by: Bill Spitzak 
Signed-off-by: Søren Sandmann 
Reviewed-by: Søren Sandmann 
---
 pixman/pixman-filter.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/pixman/pixman-filter.c b/pixman/pixman-filter.c
index 8d4872a..c868723 100644
--- a/pixman/pixman-filter.c
+++ b/pixman/pixman-filter.c
@@ -160,18 +160,17 @@ integral (pixman_kernel_t kernel1, double x1,
  pixman_kernel_t kernel2, double scale, double x2,
  double width)
 {
-/* If the integration interval crosses zero, break it into
- * two separate integrals. This ensures that filters such
- * as LINEAR that are not differentiable at 0 will still
- * integrate properly.
+/* The LINEAR filter is not differentiable at 0, so if the
+ * integration interval crosses zero, break it into two
+ * separate integrals.
  */
-if (x1 < 0 && x1 + width > 0)
+if (kernel1 == PIXMAN_KERNEL_LINEAR && x1 < 0 && x1 + width > 0)
 {
return
integral (kernel1, x1, kernel2, scale, x2, - x1) +
integral (kernel1, 0, kernel2, scale, x2 - x1, width + x1);
 }
-else if (x2 < 0 && x2 + width > 0)
+else if (kernel2 == PIXMAN_KERNEL_LINEAR && x2 < 0 && x2 + width > 0)
 {
return
integral (kernel1, x1, kernel2, scale, x2, - x2) +
-- 
1.7.11.7

___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


[Pixman] [PATCH 04/14] demos/scale: Compute filter size using boundary of xformed ellipse

2016-04-11 Thread Søren Sandmann Pedersen
From: Bill Spitzak 

Instead of using the boundary of xformed rectangle, use the boundary
of xformed ellipse. This is much more accurate and less blurry. In
particular the filtering does not change as the image is rotated.

Signed-off-by: Bill Spitzak 
Reviewed-by: Oded Gabbay 
Reviewed-by: Soren Sandmann 
---
 demos/scale.c | 102 +++---
 1 file changed, 61 insertions(+), 41 deletions(-)

diff --git a/demos/scale.c b/demos/scale.c
index d00307e..0995ad0 100644
--- a/demos/scale.c
+++ b/demos/scale.c
@@ -55,50 +55,70 @@ get_widget (app_t *app, const char *name)
 return widget;
 }
 
-static double
-min4 (double a, double b, double c, double d)
-{
-double m1, m2;
-
-m1 = MIN (a, b);
-m2 = MIN (c, d);
-return MIN (m1, m2);
-}
-
-static double
-max4 (double a, double b, double c, double d)
-{
-double m1, m2;
-
-m1 = MAX (a, b);
-m2 = MAX (c, d);
-return MAX (m1, m2);
-}
-
+/* Figure out the boundary of a diameter=1 circle transformed into an ellipse
+ * by trans. Proof that this is the correct calculation:
+ *
+ * Transform x,y to u,v by this matrix calculation:
+ *
+ *  |u|   |a c| |x|
+ *  |v| = |b d|*|y|
+ *
+ * Horizontal component:
+ *
+ *  u = ax+cy (1)
+ *
+ * For each x,y on a radius-1 circle (p is angle to the point):
+ *
+ *  x^2+y^2 = 1
+ *  x = cos(p)
+ *  y = sin(p)
+ *  dx/dp = -sin(p) = -y
+ *  dy/dp = cos(p) = x
+ *
+ * Figure out derivative of (1) relative to p:
+ *
+ *  du/dp = a(dx/dp) + c(dy/dp)
+ *= -ay + cx
+ *
+ * The min and max u are when du/dp is zero:
+ *
+ *  -ay + cx = 0
+ *  cx = ay
+ *  c = ay/x  (2)
+ *  y = cx/a  (3)
+ *
+ * Substitute (2) into (1) and simplify:
+ *
+ *  u = ax + ay^2/x
+ *= a(x^2+y^2)/x
+ *= a/x (because x^2+y^2 = 1)
+ *  x = a/u (4)
+ *
+ * Substitute (4) into (3) and simplify:
+ *
+ *  y = c(a/u)/a
+ *  y = c/u (5)
+ *
+ * Square (4) and (5) and add:
+ *
+ *  x^2+y^2 = (a^2+c^2)/u^2
+ *
+ * But x^2+y^2 is 1:
+ *
+ *  1 = (a^2+c^2)/u^2
+ *  u^2 = a^2+c^2
+ *  u = hypot(a,c)
+ *
+ * Similarly the max/min of v is at:
+ *
+ *  v = hypot(b,d)
+ *
+ */
 static void
 compute_extents (pixman_f_transform_t *trans, double *sx, double *sy)
 {
-double min_x, max_x, min_y, max_y;
-pixman_f_vector_t v[4] =
-{
-   { { 1, 1, 1 } },
-   { { -1, 1, 1 } },
-   { { -1, -1, 1 } },
-   { { 1, -1, 1 } },
-};
-
-pixman_f_transform_point (trans, &v[0]);
-pixman_f_transform_point (trans, &v[1]);
-pixman_f_transform_point (trans, &v[2]);
-pixman_f_transform_point (trans, &v[3]);
-
-min_x = min4 (v[0].v[0], v[1].v[0], v[2].v[0], v[3].v[0]);
-max_x = max4 (v[0].v[0], v[1].v[0], v[2].v[0], v[3].v[0]);
-min_y = min4 (v[0].v[1], v[1].v[1], v[2].v[1], v[3].v[1]);
-max_y = max4 (v[0].v[1], v[1].v[1], v[2].v[1], v[3].v[1]);
-
-*sx = (max_x - min_x) / 2.0;
-*sy = (max_y - min_y) / 2.0;
+*sx = hypot (trans->m[0][0], trans->m[0][1]) / trans->m[2][2];
+*sy = hypot (trans->m[1][0], trans->m[1][1]) / trans->m[2][2];
 }
 
 typedef struct
-- 
1.7.11.7

___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


[Pixman] [PATCH 06/14] demos/scale: Default to locked axis

2016-04-11 Thread Søren Sandmann Pedersen
From: Bill Spitzak 

Signed-off-by: Bill Spitzak 
Reviewed-by: Søren Sandmann 
---
 demos/scale.ui | 1 +
 1 file changed, 1 insertion(+)

diff --git a/demos/scale.ui b/demos/scale.ui
index f6f6e89..d498d26 100644
--- a/demos/scale.ui
+++ b/demos/scale.ui
@@ -177,6 +177,7 @@
  id="lock_checkbutton">
Lock X and Y 
Dimensions
0.0
+   True
  
   
 False
-- 
1.7.11.7

___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


[Pixman] [PATCH 09/14] pixman-filter: Correct Simpsons integration

2016-04-11 Thread Søren Sandmann Pedersen
From: Bill Spitzak 

Simpson's rule uses cubic curve fitting, with 3 samples defining each
cubic. This gives the samples weights in the pattern
1,4,2,4,2...4,1, with the result then divided by 3.

The previous code was using weights of 1,2,0,6,0,6...,2,1.

With this fix the integration is accurate enough that the number of
samples could be reduced a lot. Multiples of 12 seem to work best.

v7: Merged with patch to reduce from 128 samples to 16
v9: Changed samples from 16 to 12
v10: Fixed rebase error that made it not compile
v11: minor whitespace change
v14: more whitespace changes

Signed-off-by: Bill Spitzak 
Reviewed-by: Oded Gabbay 
Reviewed-by: Søren Sandmann 
---
 pixman/pixman-filter.c | 21 +++--
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/pixman/pixman-filter.c b/pixman/pixman-filter.c
index dd5176d..8d4872a 100644
--- a/pixman/pixman-filter.c
+++ b/pixman/pixman-filter.c
@@ -189,13 +189,19 @@ integral (pixman_kernel_t kernel1, double x1,
 }
 else
 {
-   /* Integration via Simpson's rule */
-#define N_SEGMENTS 128
+   /* Integration via Simpson's rule
+* See http://www.intmath.com/integration/6-simpsons-rule.php
+* 12 segments (6 cubic approximations) seems to produce best
+* result for lanczos3.linear, which was the combination that
+* showed the most errors.  This makes sense as the lanczos3
+* filter is 6 wide.
+*/
+#define N_SEGMENTS 12
 #define SAMPLE(a1, a2) \
(filters[kernel1].func ((a1)) * filters[kernel2].func ((a2) * scale))

double s = 0.0;
-   double h = width / (double)N_SEGMENTS;
+   double h = width / N_SEGMENTS;
int i;
 
s = SAMPLE (x1, x2);
@@ -204,11 +210,14 @@ integral (pixman_kernel_t kernel1, double x1,
{
double a1 = x1 + h * i;
double a2 = x2 + h * i;
+   s += 4 * SAMPLE (a1, a2);
+   }
 
+   for (i = 2; i < N_SEGMENTS; i += 2)
+   {
+   double a1 = x1 + h * i;
+   double a2 = x2 + h * i;
s += 2 * SAMPLE (a1, a2);
-
-   if (i >= 2 && i < N_SEGMENTS - 1)
-   s += 4 * SAMPLE (a1, a2);
}
 
s += SAMPLE (x1 + width, x2 + width);
-- 
1.7.11.7

___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


[Pixman] [PATCH 01/14] pixman-fast-path.c: Pick NEAREST affine fast paths before BILINEAR ones

2016-04-11 Thread Søren Sandmann Pedersen
When a BILINEAR filter is reduced to NEAREST, it is possible for both
types of fast paths to run; in this case, the NEAREST ones should be
preferred as that is the simpler filter.

Signed-off-by: Soren Sandmann 
---
 pixman/pixman-fast-path.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index 53d4a1f..b4daa26 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -3258,9 +3258,9 @@ static const pixman_iter_info_t fast_iters[] =
 },
 
 #define AFFINE_FAST_PATHS(name, format, repeat) \
-SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat)   \
+NEAREST_AFFINE_FAST_PATH(name, format, repeat) \
 BILINEAR_AFFINE_FAST_PATH(name, format, repeat)\
-NEAREST_AFFINE_FAST_PATH(name, format, repeat)
+SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat)
 
 AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
 AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
-- 
1.7.11.7

___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


[Pixman] [PATCH 08/14] pixman-filter: reduce amount of malloc/free/memcpy to generate filter

2016-04-11 Thread Søren Sandmann Pedersen
From: Bill Spitzak 

Rearranged so that the entire block of memory for the filter pair
is allocated first, and then filled in. Previous version allocated
and freed two temporary buffers for each filter and did an extra
memcpy.

v8: small refactor to remove the filter_width function

v10: Restored filter_width function but with arguments changed to
 match later patches

v11: Removed unused arg and pointer from filter_width function
 Whitespace fixes.

Signed-off-by: Bill Spitzak 
Reviewed-by: Oded Gabbay 
Acked-by: Søren Sandmann 
---
 pixman/pixman-filter.c | 56 +-
 1 file changed, 23 insertions(+), 33 deletions(-)

diff --git a/pixman/pixman-filter.c b/pixman/pixman-filter.c
index af46a43..dd5176d 100644
--- a/pixman/pixman-filter.c
+++ b/pixman/pixman-filter.c
@@ -217,25 +217,17 @@ integral (pixman_kernel_t kernel1, double x1,
 }
 }
 
-static pixman_fixed_t *
-create_1d_filter (int *width,
+static void
+create_1d_filter (int  width,
  pixman_kernel_t  reconstruct,
  pixman_kernel_t  sample,
  double   scale,
- int  n_phases)
+ int  n_phases,
+ pixman_fixed_t *p)
 {
-pixman_fixed_t *params, *p;
 double step;
-double size;
 int i;
 
-size = scale * filters[sample].width + filters[reconstruct].width;
-*width = ceil (size);
-
-p = params = malloc (*width * n_phases * sizeof (pixman_fixed_t));
-if (!params)
-return NULL;
-
 step = 1.0 / n_phases;
 
 for (i = 0; i < n_phases; ++i)
@@ -250,8 +242,8 @@ create_1d_filter (int *width,
 * and sample positions.
 */
 
-   x1 = ceil (frac - *width / 2.0 - 0.5);
-x2 = x1 + *width;
+   x1 = ceil (frac - width / 2.0 - 0.5);
+   x2 = x1 + width;
 
total = 0;
 for (x = x1; x < x2; ++x)
@@ -279,7 +271,7 @@ create_1d_filter (int *width,
 }
 
/* Normalize */
-   p -= *width;
+   p -= width;
 total = 1 / total;
 new_total = 0;
for (x = x1; x < x2; ++x)
@@ -291,10 +283,15 @@ create_1d_filter (int *width,
}
 
if (new_total != pixman_fixed_1)
-   *(p - *width / 2) += (pixman_fixed_1 - new_total);
+   *(p - width / 2) += (pixman_fixed_1 - new_total);
 }
+}
 
-return params;
+
+static int
+filter_width (pixman_kernel_t reconstruct, pixman_kernel_t sample, double size)
+{
+return ceil (filters[reconstruct].width + size * filters[sample].width);
 }
 
 #ifdef PIXMAN_GNUPLOT
@@ -424,38 +421,31 @@ pixman_filter_create_separable_convolution (int   
  *n_values,
 {
 double sx = fabs (pixman_fixed_to_double (scale_x));
 double sy = fabs (pixman_fixed_to_double (scale_y));
-pixman_fixed_t *horz = NULL, *vert = NULL, *params = NULL;
+pixman_fixed_t *params;
 int subsample_x, subsample_y;
 int width, height;
 
+width = filter_width (reconstruct_x, sample_x, sx);
 subsample_x = (1 << subsample_bits_x);
-subsample_y = (1 << subsample_bits_y);
 
-horz = create_1d_filter (&width, reconstruct_x, sample_x, sx, subsample_x);
-vert = create_1d_filter (&height, reconstruct_y, sample_y, sy, 
subsample_y);
+height = filter_width (reconstruct_y, sample_y, sy);
+subsample_y = (1 << subsample_bits_y);
 
-if (!horz || !vert)
-goto out;
-
 *n_values = 4 + width * subsample_x + height * subsample_y;
 
 params = malloc (*n_values * sizeof (pixman_fixed_t));
 if (!params)
-goto out;
+   return NULL;
 
 params[0] = pixman_int_to_fixed (width);
 params[1] = pixman_int_to_fixed (height);
 params[2] = pixman_int_to_fixed (subsample_bits_x);
 params[3] = pixman_int_to_fixed (subsample_bits_y);
 
-memcpy (params + 4, horz,
-   width * subsample_x * sizeof (pixman_fixed_t));
-memcpy (params + 4 + width * subsample_x, vert,
-   height * subsample_y * sizeof (pixman_fixed_t));
-
-out:
-free (horz);
-free (vert);
+create_1d_filter (width, reconstruct_x, sample_x, sx, subsample_x,
+ params + 4);
+create_1d_filter (height, reconstruct_y, sample_y, sy, subsample_y,
+ params + 4 + width * subsample_x);
 
 #ifdef PIXMAN_GNUPLOT
 gnuplot_filter(width, subsample_x, params + 4);
-- 
1.7.11.7

___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


Re: [Pixman] [PATCH v14 12/22] pixman-filter: fix subsample_bits == 0

2016-04-11 Thread Bill Spitzak
On Mon, Apr 11, 2016 at 12:35 PM, Søren Sandmann 
wrote:

> On Mon, Apr 11, 2016 at 2:43 PM, Bill Spitzak  wrote:
>
>
>> I feel this can be fixed. It is already correct for subsample_bits==0.
>> Since both the filter generator and filtering code would be changed in the
>> same version, only programs that generate their own filters would actually
>> be incompatible. And as you point out the error is very tiny for large
>> numbers of subsamples, and Cairo is already using an excessively large
>> number of subsamples (likely because I was trying to remove the difference
>> between identity filters and nearest filtering, and did not realize this
>> was the underlying problem).
>>
>
> It is technically an ABI break, and cairo 1.14 did ship with a
> copied-and-pasted filter generator that assumes the current subpixel
> positioning.
>
> But yeah, maybe we can just ignore that, if we make sure there is a
> cairo 1.14.x update that uses pixman_filter_create_separable_convolution()
> instead of the copied-and-pasted filter generator.
>
> Other than that, the fix should be straight-forward enough.
>

As I wrote that Cairo patch I can state that I think it would be perfectly
fine to make this change. The subsamples are set unnaturally high in that
patch and hide the problem. Also it is possible my filter generator is
producing centered samples so this could be an improvement, need to check.
In addition, other than the (abandoned) idea of getting good/best into
pixman, this patch series is also designed so that Cairo can use the pixman
filter generator rather than its own.
___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


Re: [Pixman] [PATCH v14 12/22] pixman-filter: fix subsample_bits == 0

2016-04-11 Thread Søren Sandmann
On Mon, Apr 11, 2016 at 2:43 PM, Bill Spitzak  wrote:


> I feel this can be fixed. It is already correct for subsample_bits==0.
> Since both the filter generator and filtering code would be changed in the
> same version, only programs that generate their own filters would actually
> be incompatible. And as you point out the error is very tiny for large
> numbers of subsamples, and Cairo is already using an excessively large
> number of subsamples (likely because I was trying to remove the difference
> between identity filters and nearest filtering, and did not realize this
> was the underlying problem).
>

It is technically an ABI break, and cairo 1.14 did ship with a
copied-and-pasted filter generator that assumes the current subpixel
positioning.

But yeah, maybe we can just ignore that, if we make sure there is a cairo
1.14.x update that uses pixman_filter_create_separable_convolution() instead
of the copied-and-pasted filter generator.

Other than that, the fix should be straight-forward enough.


Søren


On Sun, Apr 10, 2016 at 10:01 PM, Søren Sandmann 
wrote:

>
> It does look like there is something really wrong. I compared and (except
>> for the subsample_bits==0 case) my version produces the same output as the
>> current git head.
>>
>> I think your intention is that there is a sample at offset=0 whether the
>> filter width is even or odd. However (except when subsample_bits==0) the
>> filter generator makes a symmetric filter for even sizes, with two equal
>> samples around the maximum center value. If a sample was at offset==0 then
>> it would be unique and larger than all the other samples.
>>
>
> The root of this confusion is probably that when subsample_bits = k, the
> subpixel positions used are:
>
> 0.5 / 2^k, 1.5 / 2^k, ..., (2^k-0.5)/2^k
>
> For example, for subsample_bits = 2:
>
> 0.125, 0.375, 0.625, 0.875
>
> and for subsample_bits = 0:
>
> 0.5
>
> That is, they are regularly spaced, but centered within the pixel. When
> there is an even number of them, this means there will not be a filter
> position at 0.5, and therefore no sample at offset 0. And the only case
> where number of subpixel locations is odd, is when subsample_bits = 0.
>
> I'm pretty sure that the existing code gets the filter generation right
> for these subpixel positions.
>
>
> [ You can argue that it would be better to use the sampling positions
>
> 0, 0.25, 0.5, 0.75
>
> for subsample_bits = 2, as Owen did here:
>
> https://lists.cairographics.org/archives/cairo/2014-March/025105.html
>
> and I agree that that would have been better. ]
>
>
> Søren
>
___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


Re: [Pixman] [PATCH v14 12/22] pixman-filter: fix subsample_bits == 0

2016-04-11 Thread Bill Spitzak
Okay the actual bug is that the gnuplot output is wrong for
subsample_bits==0. It apparently is correct for other values of
subsampling. I will try to get an updated version posted soon.

But I very much agree with Owen (and you?) that the current behavior is
incorrect and should be fixed exactly as stated. The current version
produces different images from fallbacks to impulse, bilinear, or
integer-sized box filters, when the filter math says the fallback should
produce an identical result. Scaling down by an integer produces a slightly
blurry and offset result, when a perfect result could be achieved at the
same speed. Increasing the subsamples moves the existing samples, thus may
increase artifacts, rather than always reducing them. All of this is not a
good situation.

I feel this can be fixed. It is already correct for subsample_bits==0.
Since both the filter generator and filtering code would be changed in the
same version, only programs that generate their own filters would actually
be incompatible. And as you point out the error is very tiny for large
numbers of subsamples, and Cairo is already using an excessively large
number of subsamples (likely because I was trying to remove the difference
between identity filters and nearest filtering, and did not realize this
was the underlying problem).

On Sun, Apr 10, 2016 at 10:01 PM, Søren Sandmann 
wrote:

>
> It does look like there is something really wrong. I compared and (except
>> for the subsample_bits==0 case) my version produces the same output as the
>> current git head.
>>
>> I think your intention is that there is a sample at offset=0 whether the
>> filter width is even or odd. However (except when subsample_bits==0) the
>> filter generator makes a symmetric filter for even sizes, with two equal
>> samples around the maximum center value. If a sample was at offset==0 then
>> it would be unique and larger than all the other samples.
>>
>
> The root of this confusion is probably that when subsample_bits = k, the
> subpixel positions used are:
>
> 0.5 / 2^k, 1.5 / 2^k, ..., (2^k-0.5)/2^k
>
> For example, for subsample_bits = 2:
>
> 0.125, 0.375, 0.625, 0.875
>
> and for subsample_bits = 0:
>
> 0.5
>
> That is, they are regularly spaced, but centered within the pixel. When
> there is an even number of them, this means there will not be a filter
> position at 0.5, and therefore no sample at offset 0. And the only case
> where number of subpixel locations is odd, is when subsample_bits = 0.
>
> I'm pretty sure that the existing code gets the filter generation right
> for these subpixel positions.
>
>
> [ You can argue that it would be better to use the sampling positions
>
> 0, 0.25, 0.5, 0.75
>
> for subsample_bits = 2, as Owen did here:
>
> https://lists.cairographics.org/archives/cairo/2014-March/025105.html
>
> and I agree that that would have been better. ]
>
>
> Søren
>
___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


[Pixman] [PATCH 6/9] armv7: Use aligned memory writes in both copies of bilinear code

2016-04-11 Thread Ben Avison
pixman-arm-neon-asm-bilinear.S contains duplicates of some macro
definitions from pixman-arm-neon-asm.S, but they were taken before
commit 9638af9 added the aligns and they have never been brought back
into line.

An equivalent macro to load from the destination buffer (not applicable
to the operations implemented in pixman-arm-neon-asm.S) benefits from
the same alignment hints.

Verified that the aligned versions don't cause memory faults for the
fast paths defined in pixman-arm-neon-asm-bilinear.S.

Signed-off-by: Ben Avison 
---
 pixman/pixman-arm-neon-asm-bilinear.S |   16 
 1 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm-bilinear.S 
b/pixman/pixman-arm-neon-asm-bilinear.S
index 0fd92d6..aba8d00 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -186,9 +186,9 @@
 
 .macro bilinear_store_8888 numpix, tmp1, tmp2
 .if numpix == 4
-vst1.32   {d0, d1}, [OUT]!
+vst1.32   {d0, d1}, [OUT, :128]!
 .elseif numpix == 2
-vst1.32   {d0}, [OUT]!
+vst1.32   {d0}, [OUT, :64]!
 .elseif numpix == 1
 vst1.32   {d0[0]}, [OUT, :32]!
 .else
@@ -203,11 +203,11 @@
 vuzp.u8 d0, d2
 convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
 .if numpix == 4
-vst1.16   {d2}, [OUT]!
+vst1.16   {d2}, [OUT, :64]!
 .elseif numpix == 2
-vst1.32   {d2[0]}, [OUT]!
+vst1.32   {d2[0]}, [OUT, :32]!
 .elseif numpix == 1
-vst1.16   {d2[0]}, [OUT]!
+vst1.16   {d2[0]}, [OUT, :16]!
 .else
 .error bilinear_store_0565 numpix is unsupported
 .endif
@@ -251,11 +251,11 @@
 
 .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
 .if numpix == 4
-vld1.32 {dst0, dst1}, [OUT]
+vld1.32 {dst0, dst1}, [OUT, :128]
 .elseif numpix == 2
-vld1.32 {dst0}, [OUT]
+vld1.32 {dst0}, [OUT, :64]
 .elseif numpix == 1
-vld1.32 {dst0[0]}, [OUT]
+vld1.32 {dst0[0]}, [OUT, :32]
 .else
 .error bilinear_load_dst_8888 numpix is unsupported
 .endif
-- 
1.7.5.4

___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


[Pixman] [PATCH 4/9] armv7: Simplify constant load

2016-04-11 Thread Ben Avison
A minor point, but 0xFF000000 is already a valid immediate constant for
NEON, there's no need to construct it in two steps.

Signed-off-by: Ben Avison 
---
 pixman/pixman-arm-neon-asm.S |3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 9a5d85a..97315d4 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1186,8 +1186,7 @@ generate_composite_function \
 .endm
 
 .macro pixman_composite_src_x888_8888_init
-vmov.u8  q2, #0xFF
-vshl.u32 q2, q2, #24
+vmov.u32 q2, #0xFF000000
 .endm
 
 generate_composite_function \
-- 
1.7.5.4

___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


[Pixman] [PATCH 8/9 v2] armv7: More use of fast paths with localized destination alpha

2016-04-11 Thread Ben Avison
There are a group of combiner types - SRC, OVER, IN_REVERSE, OUT_REVERSE
and ADD - where the destination alpha component is only used (if at all) to
determine the destination alpha component. This means that any such fast
paths with an a8r8g8b8 destination can also be applied to an x8r8g8b8
destination just by updating the fast path table, and likewise with
a8b8g8r8 and x8b8g8r8. The following operations are affected:

over_8888_8888_x888
add_n_8_x888
add_8888_8_x888
add_8888_8888_x888
add_8888_n_x888
add_8888_x888
out_reverse_8_x888

v2: Changed summary line to make it distinct from similar patch relating
to localized source alpha

Signed-off-by: Ben Avison 
---
 pixman/pixman-arm-neon.c |   12 
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index be761c9..5f0561a 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -331,6 +331,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,   b5g6r5,   
neon_composite_over__8_0565),
 PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   a8,   r5g6b5,   
neon_composite_over_0565_8_0565),
 PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   a8,   b5g6r5,   
neon_composite_over_0565_8_0565),
+PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, x8r8g8b8, 
neon_composite_over___),
 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, 
neon_composite_over___),
 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5,   
neon_composite_over__0565),
 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5,   
neon_composite_over__0565),
@@ -341,17 +342,26 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, a8r8g8b8, 
neon_composite_src_x888_),
 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, a8b8g8r8, 
neon_composite_src_x888_),
 PIXMAN_STD_FAST_PATH (ADD,  solid,a8,   a8,   
neon_composite_add_n_8_8),
+PIXMAN_STD_FAST_PATH (ADD,  solid,a8,   x8r8g8b8, 
neon_composite_add_n_8_),
 PIXMAN_STD_FAST_PATH (ADD,  solid,a8,   a8r8g8b8, 
neon_composite_add_n_8_),
+PIXMAN_STD_FAST_PATH (ADD,  solid,a8,   x8b8g8r8, 
neon_composite_add_n_8_),
 PIXMAN_STD_FAST_PATH (ADD,  solid,a8,   a8b8g8r8, 
neon_composite_add_n_8_),
 PIXMAN_STD_FAST_PATH (ADD,  a8,   a8,   a8,   
neon_composite_add_8_8_8),
 PIXMAN_STD_FAST_PATH (ADD,  r5g6b5,   a8,   r5g6b5,   
neon_composite_add_0565_8_0565),
 PIXMAN_STD_FAST_PATH (ADD,  b5g6r5,   a8,   b5g6r5,   
neon_composite_add_0565_8_0565),
+PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8,   x8r8g8b8, 
neon_composite_add__8_),
+PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, a8,   x8b8g8r8, 
neon_composite_add__8_),
 PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8,   a8r8g8b8, 
neon_composite_add__8_),
 PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, a8,   a8b8g8r8, 
neon_composite_add__8_),
+PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, x8r8g8b8, 
neon_composite_add___),
 PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, a8r8g8b8, 
neon_composite_add___),
+PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, solid,x8r8g8b8, 
neon_composite_add__n_),
+PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, solid,x8b8g8r8, 
neon_composite_add__n_),
 PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, solid,a8r8g8b8, 
neon_composite_add__n_),
 PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, solid,a8b8g8r8, 
neon_composite_add__n_),
 PIXMAN_STD_FAST_PATH (ADD,  a8,   null, a8,   
neon_composite_add_8_8),
+PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null, x8r8g8b8, 
neon_composite_add__),
+PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null, x8b8g8r8, 
neon_composite_add__),
 PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null, a8r8g8b8, 
neon_composite_add__),
 PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null, a8b8g8r8, 
neon_composite_add__),
 PIXMAN_STD_FAST_PATH (IN,   solid,null, a8,   
neon_composite_in_n_8),
@@ -359,7 +369,9 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, 
neon_composite_over_reverse_n_),
 PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,null, r5g6b5,   
neon_composite_out_reverse_8_0565),
 PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,null, b5g6r5,   
neon_composite_out_reverse_8_0565),
+PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,null, x8r8g8b8, 
neon_composite_out_reverse_8_),
 PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,null, a8r8g8b8, 
neon_composite_out_reverse_8_),
+PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,null, x8b8g8r8, 
neon_composite_out_reverse_8_),
 PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,

[Pixman] [PATCH 3/9 repost] armv7: Use VLD-to-all-lanes

2016-04-11 Thread Ben Avison
I noticed in passing that a number of opportunities to use the all-lanes
variant of VLD have been missed. I don't expect any measurable speedup because
these are all in init code, but this simplifies the code a bit.

Signed-off-by: Ben Avison 
---
 pixman/pixman-arm-neon-asm.S |  142 +-
 1 files changed, 58 insertions(+), 84 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 7e949a3..9a5d85a 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -396,11 +396,10 @@ generate_composite_function \
 
 .macro pixman_composite_over_n_0565_init
 add DUMMY, sp, #ARGS_STACK_OFFSET
-vld1.32 {d3[0]}, [DUMMY]
-vdup.8  d0, d3[0]
-vdup.8  d1, d3[1]
-vdup.8  d2, d3[2]
-vdup.8  d3, d3[3]
+vld1.8  {d0[]}, [DUMMY]!
+vld1.8  {d1[]}, [DUMMY]!
+vld1.8  {d2[]}, [DUMMY]!
+vld1.8  {d3[]}, [DUMMY]!
 vmvn.8  d3, d3  /* invert source alpha */
 .endm
 
@@ -761,11 +760,10 @@ generate_composite_function_single_scanline \
 
 .macro pixman_composite_over_n__init
 add DUMMY, sp, #ARGS_STACK_OFFSET
-vld1.32 {d3[0]}, [DUMMY]
-vdup.8  d0, d3[0]
-vdup.8  d1, d3[1]
-vdup.8  d2, d3[2]
-vdup.8  d3, d3[3]
+vld1.8  {d0[]}, [DUMMY]!
+vld1.8  {d1[]}, [DUMMY]!
+vld1.8  {d2[]}, [DUMMY]!
+vld1.8  {d3[]}, [DUMMY]!
 vmvn.8  d24, d3  /* get inverted alpha */
 .endm
 
@@ -813,11 +811,10 @@ generate_composite_function \
 
 .macro pixman_composite_over_reverse_n__init
 add DUMMY, sp, #ARGS_STACK_OFFSET
-vld1.32 {d7[0]}, [DUMMY]
-vdup.8  d4, d7[0]
-vdup.8  d5, d7[1]
-vdup.8  d6, d7[2]
-vdup.8  d7, d7[3]
+vld1.8  {d4[]}, [DUMMY]!
+vld1.8  {d5[]}, [DUMMY]!
+vld1.8  {d6[]}, [DUMMY]!
+vld1.8  {d7[]}, [DUMMY]!
 .endm
 
 generate_composite_function \
@@ -956,11 +953,10 @@ generate_composite_function \
 .macro pixman_composite_over_n_8_0565_init
 add DUMMY, sp, #ARGS_STACK_OFFSET
 vpush   {d8-d15}
-vld1.32 {d11[0]}, [DUMMY]
-vdup.8  d8, d11[0]
-vdup.8  d9, d11[1]
-vdup.8  d10, d11[2]
-vdup.8  d11, d11[3]
+vld1.8  {d8[]}, [DUMMY]!
+vld1.8  {d9[]}, [DUMMY]!
+vld1.8  {d10[]}, [DUMMY]!
+vld1.8  {d11[]}, [DUMMY]!
 .endm
 
 .macro pixman_composite_over_n_8_0565_cleanup
@@ -981,10 +977,9 @@ generate_composite_function \
 
/**/
 
 .macro pixman_composite_over__n_0565_init
-add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+add DUMMY, sp, #(ARGS_STACK_OFFSET + 11)
 vpush   {d8-d15}
-vld1.32 {d24[0]}, [DUMMY]
-vdup.8  d24, d24[3]
+vld1.8  {d24[]}, [DUMMY]
 .endm
 
 .macro pixman_composite_over__n_0565_cleanup
@@ -1049,12 +1044,8 @@ generate_composite_function \
 
 .macro pixman_composite_src_n_8_init
 add DUMMY, sp, #ARGS_STACK_OFFSET
-vld1.32 {d0[0]}, [DUMMY]
-vsli.u64d0, d0, #8
-vsli.u64d0, d0, #16
-vsli.u64d0, d0, #32
-vorrd1, d0, d0
-vorrq1, q0, q0
+vld1.8  {d0[],d1[]}, [DUMMY]
+vld1.8  {d2[],d3[]}, [DUMMY]
 .endm
 
 .macro pixman_composite_src_n_8_cleanup
@@ -1089,11 +1080,8 @@ generate_composite_function \
 
 .macro pixman_composite_src_n_0565_init
 add DUMMY, sp, #ARGS_STACK_OFFSET
-vld1.32 {d0[0]}, [DUMMY]
-vsli.u64d0, d0, #16
-vsli.u64d0, d0, #32
-vorrd1, d0, d0
-vorrq1, q0, q0
+vld1.16 {d0[],d1[]}, [DUMMY]
+vld1.16 {d2[],d3[]}, [DUMMY]
 .endm
 
 .macro pixman_composite_src_n_0565_cleanup
@@ -1128,10 +1116,8 @@ generate_composite_function \
 
 .macro pixman_composite_src_n__init
 add DUMMY, sp, #ARGS_STACK_OFFSET
-vld1.32 {d0[0]}, [DUMMY]
-vsli.u64d0, d0, #32
-vorrd1, d0, d0
-vorrq1, q0, q0
+vld1.32 {d0[],d1[]}, [DUMMY]
+vld1.32 {d2[],d3[]}, [DUMMY]
 .endm
 
 .macro pixman_composite_src_n__cleanup
@@ -1271,11 +1257,10 @@ generate_composite_function \
 
 .macro pixman_composite_src_n_8__init
 add DUMMY, sp, #ARGS_STACK_OFFSET
-vld1.32 {d3[0]}, [DUMMY]
-vdup.8  d0, d3[0]
-vdup.8  d1, d3[1]
-vdup.8  d2, d3[2]
-vdup.8  d3, d3[3]
+vld1.8  {d0[]}, [DUMMY]!
+vld1.8  {d1[]}, [DUMMY]!
+vld1.8  {d2[]}, [DUMMY]!
+vld1.8  {d3[]}, [DUMMY]!
 .endm
 
 .macro pixman_composite_src_n_8__cleanup
@@ -1339,9 +1324,8 @@ generate_composite_function \
 .endm
 
 .macro pixman_composite_src_n_8_8_init
-add DUMMY, sp, #ARGS_STACK_OFFSET
-vld1.32 {d16[0]}, [DUMMY]
-vdup.8  d16, d16[3]
+add DUMMY, sp, 

[Pixman] [PATCH 0/9] Changes to existing ARMv7 routines

2016-04-11 Thread Ben Avison
Since there are a few people around on the list at the moment who are
familiar with NEON, I'm hoping someone will be able to review my work so it
can make it into git.

To keep the number of patches manageable, here are a group which improve
incrementally upon existing ARMv7 routines, without adding any new ones yet.
Most of these are reposts which have had no review of the technical content.
The patch numbers have been reassigned within this series of 9 patches, and
won't match the numbers used when originally posted.

Ben Avison (9):
  armv7: Coalesce scalar accesses where possible
  armv7: Faster fill operations
  armv7: Use VLD-to-all-lanes
  armv7: Simplify constant load
  armv7: Use prefetch for small-width images too
  armv7: Use aligned memory writes in both copies of bilinear code
  armv7: Move common bilinear macro definitions to a new header file
  armv7: More use of fast paths with localized destination alpha
  armv7: More use of fast paths with localized source alpha

 pixman/Makefile.am|3 +-
 pixman/pixman-arm-neon-asm-bilinear.S |  153 +-
 pixman/pixman-arm-neon-asm-bilinear.h |  165 +++
 pixman/pixman-arm-neon-asm.S  |  280 +++--
 pixman/pixman-arm-neon-asm.h  |   20 +++
 pixman/pixman-arm-neon.c  |   21 +++
 6 files changed, 272 insertions(+), 370 deletions(-)
 create mode 100644 pixman/pixman-arm-neon-asm-bilinear.h

-- 
1.7.5.4

___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


[Pixman] [PATCH 5/9 repost] armv7: Use prefetch for small-width images too

2016-04-11 Thread Ben Avison
After discovering that the ARMv6 optimised fast paths often out-performed
the ARMv7 ones on a Cortex-A7, particularly on the RT benchmark, I found
that the problem was due to the fact that the ARMv7 macros didn't attempt
any sort of prefetch for small images (fewer than pixblock_size * 2 pixels
across).

Since a pixblock is chosen to be no larger than a cacheline, and is in many
cases smaller, it seemed a reasonable compromise to avoid adding a lot of
complexity by simply doing one prefetch for the start of a pixel row when
starting to process the preceding one, and that is what this patch does.

I compared the effect of using LDRB (which is what is currently used at the
end of each long pixel row) against PLD for each of the source and
destination buffers for a selection of common operations: src_8888_8888,
over_8888_8888 and add_8888_8888, and in each case PLD of both buffers was
the most beneficial. PLDW didn't make any measurable difference.

The overall effect of this patch on the three operations is as follows
(L1, L2 and M tests can be ignored because they're known not to involve the
use of short rows):

src_8888_8888

Before  After
Mean   StdDev   Mean   StdDev  Confidence  Change
HT  60.8   0.1  61.1   0.1 100.0%  +0.6%
VT  61.0   0.3  62.6   0.2 100.0%  +2.6%
R   45.5   0.2  46.2   0.2 100.0%  +1.5%
RT  19.8   0.0  21.4   0.0 100.0%  +7.8%

over_8888_8888

Before  After
Mean   StdDev   Mean   StdDev  Confidence  Change
HT  40.2   0.1  40.7   0.4 100.0%  +1.0%
VT  35.5   0.2  37.9   0.3 100.0%  +6.7%
R   32.8   0.0  33.8   0.3 100.0%  +3.0%
RT  12.9   0.0  15.6   0.2 100.0%  +21.4%

add_8888_8888

Before  After
Mean   StdDev   Mean   StdDev  Confidence  Change
HT  51.0   0.6  51.9   0.5 100.0%  +1.7%
VT  44.0   0.4  46.8   0.5 100.0%  +6.3%
R   39.6   0.5  41.0   0.4 100.0%  +3.5%
RT  15.2   0.2  18.0   0.2 100.0%  +18.5%

Signed-off-by: Ben Avison 
---
 pixman/pixman-arm-neon-asm.h |9 +
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index 03257cc..a116e47 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -881,6 +881,15 @@ local skip1
  * nor prefetch are used.
  */
 8:
+.if src_bpp_shift >= 0
+PF pld, [SRC, SRC_STRIDE, lsl #src_bpp_shift]
+.endif
+.if dst_r_bpp != 0
+PF pld, [DST_R, DST_STRIDE, lsl #dst_bpp_shift]
+.endif
+.if mask_bpp_shift >= 0
+PF pld, [MASK, MASK_STRIDE, lsl #mask_bpp_shift]
+.endif
 /* Process exactly pixblock_size pixels if needed */
 tst W, #pixblock_size
 beq 1f
-- 
1.7.5.4

___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


[Pixman] [PATCH 1/9 repost] armv7: Coalesce scalar accesses where possible

2016-04-11 Thread Ben Avison
Where the alignment of a block of elements is known to equal the size of the
block, but the block is smaller than 8 bytes, it is safe to use a larger
element size in a scalar VLD or VST without risking an alignment exception.
Typically the effect of this can be seen when accessing leading or trailing
halfwords or words in the destination buffer for long scanlines.

Sadly, the effect of this is too small to be measured, but it seems like a
good idea anyway.

Signed-off-by: Ben Avison 
---
 pixman/pixman-arm-neon-asm.h |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index bdcf6a9..76b3985 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -183,6 +183,10 @@
 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
 .elseif (bpp == 24) && (numpix == 1)
 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.elseif numpix * bpp == 32 && abits == 32
+pixldst 4, vst1, 32, basereg, mem_operand, abits
+.elseif numpix * bpp == 16 && abits == 16
+pixldst 2, vst1, 16, basereg, mem_operand, abits
 .else
 pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
 .endif
-- 
1.7.5.4

___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


[Pixman] [PATCH 2/9 repost] armv7: Faster fill operations

2016-04-11 Thread Ben Avison
This eliminates a number of branches over blocks of code that are either
empty or can be trivially combined with a separate code block at the start
and end of each scanline. This has a surprisingly big effect, at least on
Cortex-A7, for src_n_8:

        Before          After
        Mean    StdDev  Mean    StdDev  Confidence  Change
L1      1570.4  133.1   1639.6  110.7   100.0%      +4.4%
L2      1042.6  19.9    1086.6  23.4    100.0%      +4.2%
M       1030.8  7.2     1036.8  3.2     100.0%      +0.6%
HT      287.4   3.5     303.3   2.9     100.0%      +5.5%
VT      262.0   2.6     263.3   2.6     99.9%       +0.5%
R       206.5   2.4     209.9   2.4     100.0%      +1.7%
RT      56.5    1.0     59.2    0.5     100.0%      +4.7%

Signed-off-by: Ben Avison 
---
 pixman/pixman-arm-neon-asm.h |7 +++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index 76b3985..03257cc 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -468,6 +468,7 @@
 tst DST_R, #0xF
 beq 2f
 
+.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
 .irp lowbit, 1, 2, 4, 8, 16
 local skip1
 .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
@@ -487,6 +488,7 @@ local skip1
 1:
 .endif
 .endr
+.endif
 pixdeinterleave src_bpp, src_basereg
 pixdeinterleave mask_bpp, mask_basereg
 pixdeinterleave dst_r_bpp, dst_r_basereg
@@ -503,6 +505,9 @@ local skip1
 tst DST_W, #lowbit
 beq 1f
 .endif
+.if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0
+sub W, W, #(lowbit * 8 / dst_w_bpp)
+.endif
 pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
 1:
 .endif
@@ -533,6 +538,7 @@ local skip1
process_pixblock_tail_head
 tst W, #(pixblock_size - 1)
 beq 2f
+.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
 .irp chunk_size, 16, 8, 4, 2, 1
 .if pixblock_size > chunk_size
 tst W, #chunk_size
@@ -550,6 +556,7 @@ local skip1
 1:
 .endif
 .endr
+.endif
 pixdeinterleave src_bpp, src_basereg
 pixdeinterleave mask_bpp, mask_basereg
 pixdeinterleave dst_r_bpp, dst_r_basereg
-- 
1.7.5.4

___
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman


[Pixman] [PATCH 7/9] armv7: Move common bilinear macro definitions to a new header file

2016-04-11 Thread Ben Avison
This reduces code duplication.

Signed-off-by: Ben Avison 
---
 pixman/Makefile.am|3 +-
 pixman/pixman-arm-neon-asm-bilinear.S |  147 +-
 pixman/pixman-arm-neon-asm-bilinear.h |  165 +
 pixman/pixman-arm-neon-asm.S  |  135 +--
 4 files changed, 169 insertions(+), 281 deletions(-)
 create mode 100644 pixman/pixman-arm-neon-asm-bilinear.h

diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index 581b6f6..b0cffaf 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -88,7 +88,8 @@ libpixman_arm_neon_la_SOURCES = \
 pixman-arm-neon-asm.S  \
pixman-arm-neon-asm-bilinear.S \
 pixman-arm-asm.h   \
-pixman-arm-neon-asm.h
+pixman-arm-neon-asm.h  \
+pixman-arm-neon-asm-bilinear.h
 libpixman_1_la_LIBADD += libpixman-arm-neon.la
 
 ASM_CFLAGS_arm_neon=
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S 
b/pixman/pixman-arm-neon-asm-bilinear.S
index aba8d00..1194d2d 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -67,152 +67,7 @@
 #include "pixman-private.h"
 #include "pixman-arm-asm.h"
 #include "pixman-arm-neon-asm.h"
-
-/*
- * Bilinear macros from pixman-arm-neon-asm.S
- */
-
-/*
- * Bilinear scaling support code which tries to provide pixel fetching, color
- * format conversion, and interpolation as separate macros which can be used
- * as the basic building blocks for constructing bilinear scanline functions.
- */
-
-.macro bilinear_load_ reg1, reg2, tmp
-mov   TMP1, X, asr #16
-add   X, X, UX
-add   TMP1, TOP, TMP1, asl #2
-vld1.32   {reg1}, [TMP1], STRIDE
-vld1.32   {reg2}, [TMP1]
-.endm
-
-.macro bilinear_load_0565 reg1, reg2, tmp
-mov   TMP1, X, asr #16
-add   X, X, UX
-add   TMP1, TOP, TMP1, asl #1
-vld1.32   {reg2[0]}, [TMP1], STRIDE
-vld1.32   {reg2[1]}, [TMP1]
-convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
-.endm
-
-.macro bilinear_load_and_vertical_interpolate_two_ \
-acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
-
-bilinear_load_ reg1, reg2, tmp1
-vmull.u8  acc1, reg1, d28
-vmlal.u8  acc1, reg2, d29
-bilinear_load_ reg3, reg4, tmp2
-vmull.u8  acc2, reg3, d28
-vmlal.u8  acc2, reg4, d29
-.endm
-
-.macro bilinear_load_and_vertical_interpolate_four_ \
-xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
-yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
-
-bilinear_load_and_vertical_interpolate_two_ \
-xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
-bilinear_load_and_vertical_interpolate_two_ \
-yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
-.endm
-
-.macro bilinear_load_and_vertical_interpolate_two_0565 \
-acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
-
-mov   TMP1, X, asr #16
-add   X, X, UX
-add   TMP1, TOP, TMP1, asl #1
-mov   TMP2, X, asr #16
-add   X, X, UX
-add   TMP2, TOP, TMP2, asl #1
-vld1.32   {acc2lo[0]}, [TMP1], STRIDE
-vld1.32   {acc2hi[0]}, [TMP2], STRIDE
-vld1.32   {acc2lo[1]}, [TMP1]
-vld1.32   {acc2hi[1]}, [TMP2]
-convert_0565_to_x888 acc2, reg3, reg2, reg1
-vzip.u8   reg1, reg3
-vzip.u8   reg2, reg4
-vzip.u8   reg3, reg4
-vzip.u8   reg1, reg2
-vmull.u8  acc1, reg1, d28
-vmlal.u8  acc1, reg2, d29
-vmull.u8  acc2, reg3, d28
-vmlal.u8  acc2, reg4, d29
-.endm
-
-.macro bilinear_load_and_vertical_interpolate_four_0565 \
-xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
-yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
-
-mov   TMP1, X, asr #16
-add   X, X, UX
-add   TMP1, TOP, TMP1, asl #1
-mov   TMP2, X, asr #16
-add   X, X, UX
-add   TMP2, TOP, TMP2, asl #1
-vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
-vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
-vld1.32   {xacc2lo[1]}, [TMP1]
-vld1.32   {xacc2hi[1]}, [TMP2]
-convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
-mov   TMP1, X, asr #16
-add   X, X, UX
-add   TMP1, TOP, TMP1, asl #1
-mov   TMP2, X, asr #16
-add   X, X, UX
-add   TMP2, TOP, TMP2, asl #1
-vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
-vzip.u8   xreg1, xreg3
-vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
-vzip.u8   xreg2, xreg4
-vld1.32   {yacc2lo[1]}, [TMP1]
-vzip.u8   xreg3, xreg4
-vld1.32   {yacc2hi[1]}, [TMP2]
-vzip.u8   xreg1, xreg2
-convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
-vmull.u8  xacc1, xreg1, d28
-vzip.u8   yreg1, yreg3
-vmlal.u8  xacc1, xreg2, d29
-vzip.u8   yreg2, yreg4
-vmull.u8  xacc2, xreg3, d28
-vzip.u8   yreg3, yreg4
-