Once upon a time this improved firefox-tron on an IVB i7-3720qm
from 68.6s to 45.2s. However, we have since accomplished the same goal
with earlier tuning...

Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
---
 pixman/pixman-sse2.c |  113 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 110 insertions(+), 3 deletions(-)

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 9558e9c..e680c07 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5679,6 +5679,62 @@ FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
                               NORMAL, FLAG_NONE)
 
 static force_inline void
+scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t *       dst,         /* output pixels; advanced as they are written */
+                                            const uint32_t * mask,        /* not named in the statements below */
+                                            const uint32_t * src_top,
+                                            const uint32_t * src_bottom,
+                                            int32_t          w,           /* number of pixels to produce */
+                                            int              wt,
+                                            int              wb,
+                                            pixman_fixed_t   vx,
+                                            pixman_fixed_t   unit_x,
+                                            pixman_fixed_t   max_vx,      /* not named in the statements below */
+                                            pixman_bool_t    zero_src)    /* not named in the statements below */
+{ /* Writes w pixels to dst: each value produced by BILINEAR_INTERPOLATE_ONE_PIXEL is stored with its top byte forced to 0xff. */
+    BILINEAR_DECLARE_VARIABLES; /* presumably sets up state from wt/wb/vx/unit_x; macro body not visible here -- TODO confirm */
+    uint32_t pix1, pix2, pix3, pix4;
+
+    while ((w -= 4) >= 0) /* four pixels per pass; for w >= 0 on entry this exits with w in [-4, -1] */
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+       *dst++ = pix1 | 0xff000000; /* x888 source: overwrite the unused top byte with 0xff */
+       *dst++ = pix2 | 0xff000000;
+       *dst++ = pix3 | 0xff000000;
+       *dst++ = pix4 | 0xff000000;
+    }
+
+    if (w & 2) /* w is negative here, but in two's complement its low two bits still equal the leftover pixel count */
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+       *dst++ = pix1 | 0xff000000;
+       *dst++ = pix2 | 0xff000000;
+    }
+
+    if (w & 1) /* one final leftover pixel */
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+       *dst = pix1 | 0xff000000; /* last store, so dst is not advanced */
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC,
+                              scaled_bilinear_scanline_sse2_x888_8888_SRC,
+                              uint32_t, uint32_t, uint32_t,
+                              COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC,
+                              scaled_bilinear_scanline_sse2_x888_8888_SRC,
+                              uint32_t, uint32_t, uint32_t,
+                              PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC,
+                              scaled_bilinear_scanline_sse2_x888_8888_SRC,
+                              uint32_t, uint32_t, uint32_t,
+                              NORMAL, FLAG_NONE)
+
+static force_inline void
 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
                                              const uint32_t * mask,
                                              const uint32_t * src_top,
@@ -6186,6 +6242,13 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
 
+    SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
+
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
@@ -6323,12 +6386,16 @@ sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
        w -= 16;
     }
 
-    while (w)
+    while (w >= 2) /* expand two a8 pixels per pass */
     {
-       *dst++ = *(src++) << 24;
-       w--;
+       *(uint64_t *)dst = (uint64_t)src[0] << 24 | (uint64_t)src[1] << 56; /* one 64-bit store: src[0] -> low word, src[1] -> high word (little-endian) */
+       dst += 2; src += 2; /* assumes dst is uint32_t * (declared outside this hunk): two pixels were written */
+       w -= 2;
     }
 
+    if (w) /* at most one pixel remains */
+       *dst = (uint32_t)*src << 24; /* widen before shifting so bit 31 is never shifted into a signed int */
+
     return iter->buffer;
 }
 
@@ -6605,6 +6672,26 @@ sse2_fetch_bilinear (pixman_iter_t *iter,
            x_bottom = x_top = x += ux * width;
            mask += mask_inc * width;
        }
+       else if (format == PIXMAN_x8r8g8b8 && ux_top == ux && ux_bottom == ux && x < w)
+       {
+           int width;
+
+           width = end - buffer;
+           if (width * ux + x > w)
+               width = (w - x + ux - 1) / ux;
+
+           scaled_bilinear_scanline_sse2_x888_8888_SRC (buffer, NULL,
+                                                        (uint32_t *)top_row,
+                                                        (uint32_t *)bottom_row,
+                                                        width,
+                                                        BILINEAR_INTERPOLATION_RANGE - disty, disty,
+                                                        x, ux,
+                                                        0, 0);
+
+           buffer += width;
+           x_bottom = x_top = x += ux * width;
+           mask += mask_inc * width;
+       }
        else
        {
            while (buffer < end && x < w)
@@ -6722,6 +6809,26 @@ sse2_fetch_bilinear (pixman_iter_t *iter,
            x += ux * width;
            mask += mask_inc * width;
        }
+       else if (format == PIXMAN_x8r8g8b8 && x < w)
+       {
+           int width;
+
+           width = end - buffer;
+           if (width * ux > w - x)
+               width = (w - x + ux - 1) / ux;
+
+           scaled_bilinear_scanline_sse2_x888_8888_SRC (buffer, NULL,
+                                                        (uint32_t *)top_row,
+                                                        (uint32_t *)bottom_row,
+                                                        width,
+                                                        BILINEAR_INTERPOLATION_RANGE - disty, disty,
+                                                        x, ux,
+                                                        0, 0);
+
+           buffer += width;
+           x += ux * width;
+           mask += mask_inc * width;
+       }
        else
        {
            while (buffer < end && x < w)
-- 
1.7.10.4

_______________________________________________
Pixman mailing list
Pixman@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/pixman

Reply via email to