Rebased ref, commits from common ancestor:
commit ee790044b08e3b668e6aa5d9229f46ed7295ebf0
Author: Oded Gabbay <oded.gab...@gmail.com>
Date:   Sat Aug 1 22:34:53 2015 +0300

    Pre-release version bump to 0.33.2
    
    Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>

diff --git a/configure.ac b/configure.ac
index 0339494..554b6d2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -54,7 +54,7 @@ AC_PREREQ([2.57])
 
 m4_define([pixman_major], 0)
 m4_define([pixman_minor], 33)
-m4_define([pixman_micro], 1)
+m4_define([pixman_micro], 2)
 
 m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
 

commit 8d9be3619a906855a3e3a1e052317833cb24cabe
Author: Oded Gabbay <oded.gab...@gmail.com>
Date:   Wed Jul 1 14:34:07 2015 +0300

    vmx: implement fast path iterator vmx_fetch_a8
    
    No changes were observed when running the cairo trimmed benchmarks.
    
    Running "lowlevel-blt-bench src_8_8888" on POWER8, 8 cores,
    3.4GHz, RHEL 7.1 ppc64le gave the following results:
    
    reference memcpy speed = 25197.2MB/s (6299.3MP/s for 32bpp fills)
    
                    Before          After           Change
                  --------------------------------------------
    L1              965.34          3936           +307.73%
    L2              942.99          3436.29        +264.40%
    M               902.24          2757.77        +205.66%
    HT              448.46          784.99         +75.04%
    VT              430.05          819.78         +90.62%
    R               412.9           717.04         +73.66%
    RT              168.93          220.63         +30.60%
    Kops/s          1025            1303           +27.12%
    
    It was benchmarked against commit id e2d211a from pixman/master
    
    Siarhei Siamashka reported that on PlayStation 3 it shows the following
    results:
    
    == before ==
    
                  src_8_8888 =  L1: 194.37  L2: 198.46  M:155.90 (148.35%)
                  HT: 59.18  VT: 36.71  R: 38.93  RT: 12.79 ( 106Kops/s)
    
    == after ==
    
                  src_8_8888 =  L1: 373.96  L2: 391.10  M:245.81 (233.88%)
                  HT: 80.81  VT: 44.33  R: 48.10  RT: 14.79 ( 122Kops/s)
    
    Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 773ad76..a9bd024 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -3139,6 +3139,49 @@ vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
     return iter->buffer;
 }
 
+static uint32_t *
+vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint8_t *src = iter->bits;
+    vector unsigned int vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6;
+
+    iter->bits += iter->stride;
+
+    while (w && (((uintptr_t)dst) & 15))
+    {
+        *dst++ = *(src++) << 24;
+        w--;
+    }
+
+    while (w >= 16)
+    {
+       vmx0 = load_128_unaligned((uint32_t *) src);
+
+       unpack_128_2x128((vector unsigned int) AVV(0), vmx0, &vmx1, &vmx2);
+       unpack_128_2x128_16((vector unsigned int) AVV(0), vmx1, &vmx3, &vmx4);
+       unpack_128_2x128_16((vector unsigned int) AVV(0), vmx2, &vmx5, &vmx6);
+
+       save_128_aligned(dst, vmx6);
+       save_128_aligned((dst +  4), vmx5);
+       save_128_aligned((dst +  8), vmx4);
+       save_128_aligned((dst + 12), vmx3);
+
+       dst += 16;
+       src += 16;
+       w -= 16;
+    }
+
+    while (w)
+    {
+       *dst++ = *(src++) << 24;
+       w--;
+    }
+
+    return iter->buffer;
+}
+
 #define IMAGE_FLAGS                                                    \
     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |               \
      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
@@ -3148,6 +3191,9 @@ static const pixman_iter_info_t vmx_iters[] =
     { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
       _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL
     },
+    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, vmx_fetch_a8, NULL
+    },
     { PIXMAN_null },
 };
 

commit 47f74ca94637d79ee66c37a81eea0200e453fcc1
Author: Oded Gabbay <oded.gab...@gmail.com>
Date:   Mon Jun 29 15:31:02 2015 +0300

    vmx: implement fast path iterator vmx_fetch_x8r8g8b8
    
    It was benchmarked against commit id 2be523b from pixman/master
    
    POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.
    
    cairo trimmed benchmarks :
    
    Speedups
    ========
    t-firefox-asteroids  533.92  -> 489.94 :  1.09x
    
    Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 0950850..773ad76 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -3105,6 +3105,52 @@ static const pixman_fast_path_t vmx_fast_paths[] =
     {   PIXMAN_OP_NONE },
 };
 
+static uint32_t *
+vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    vector unsigned int ff000000 = mask_ff000000;
+    uint32_t *dst = iter->buffer;
+    uint32_t *src = (uint32_t *)iter->bits;
+
+    iter->bits += iter->stride;
+
+    while (w && ((uintptr_t)dst) & 0x0f)
+    {
+       *dst++ = (*src++) | 0xff000000;
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       save_128_aligned(dst, vec_or(load_128_unaligned(src), ff000000));
+
+       dst += 4;
+       src += 4;
+       w -= 4;
+    }
+
+    while (w)
+    {
+       *dst++ = (*src++) | 0xff000000;
+       w--;
+    }
+
+    return iter->buffer;
+}
+
+#define IMAGE_FLAGS                                                    \
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |               \
+     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
+
+static const pixman_iter_info_t vmx_iters[] =
+{
+    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL
+    },
+    { PIXMAN_null },
+};
+
 pixman_implementation_t *
 _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
 {
@@ -3147,5 +3193,7 @@ _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
 
     imp->fill = vmx_fill;
 
+    imp->iter_info = vmx_iters;
+
     return imp;
 }

commit fcbb97d4458d717b9c15858aedcbee2d33c8ac5a
Author: Oded Gabbay <oded.gab...@gmail.com>
Date:   Sun Jun 28 23:25:24 2015 +0300

    vmx: implement fast path scaled nearest vmx_8888_8888_OVER
    
    It was benchmarked against commit id 2be523b from pixman/master
    
    POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.
    reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills)
    
                    Before           After           Change
                  ---------------------------------------------
    L1              134.36          181.68          +35.22%
    L2              135.07          180.67          +33.76%
    M               134.6           180.51          +34.11%
    HT              121.77          128.79          +5.76%
    VT              120.49          145.07          +20.40%
    R               93.83           102.3           +9.03%
    RT              50.82           46.93           -7.65%
    Kops/s          448             422             -5.80%
    
    cairo trimmed benchmarks :
    
    Speedups
    ========
    t-firefox-asteroids  533.92 -> 497.92 :  1.07x
        t-midori-zoomed  692.98 -> 651.24 :  1.06x
    
    Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 64e9125..0950850 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2954,6 +2954,129 @@ vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
     }
 }
 
+static force_inline void
+scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t*       pd,
+                                            const uint32_t* ps,
+                                            int32_t         w,
+                                            pixman_fixed_t  vx,
+                                            pixman_fixed_t  unit_x,
+                                            pixman_fixed_t  src_width_fixed,
+                                            pixman_bool_t   fully_transparent_src)
+{
+    uint32_t s, d;
+    const uint32_t* pm = NULL;
+
+    vector unsigned int vmx_dst_lo, vmx_dst_hi;
+    vector unsigned int vmx_src_lo, vmx_src_hi;
+    vector unsigned int vmx_alpha_lo, vmx_alpha_hi;
+
+    if (fully_transparent_src)
+       return;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((uintptr_t)pd & 15))
+    {
+       d = *pd;
+       s = combine1 (ps + pixman_fixed_to_int (vx), pm);
+       vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
+
+       *pd++ = core_combine_over_u_pixel_vmx (s, d);
+       if (pm)
+           pm++;
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       vector unsigned int tmp;
+       uint32_t tmp1, tmp2, tmp3, tmp4;
+
+       tmp1 = *(ps + pixman_fixed_to_int (vx));
+       vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp2 = *(ps + pixman_fixed_to_int (vx));
+       vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp3 = *(ps + pixman_fixed_to_int (vx));
+       vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp4 = *(ps + pixman_fixed_to_int (vx));
+       vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
+
+       tmp[0] = tmp1;
+       tmp[1] = tmp2;
+       tmp[2] = tmp3;
+       tmp[3] = tmp4;
+
+       vmx_src_hi = combine4 ((const uint32_t *) &tmp, pm);
+
+       if (is_opaque (vmx_src_hi))
+       {
+           save_128_aligned (pd, vmx_src_hi);
+       }
+       else if (!is_zero (vmx_src_hi))
+       {
+           vmx_dst_hi = load_128_aligned (pd);
+
+           unpack_128_2x128 (vmx_src_hi, (vector unsigned int) AVV(0),
+                               &vmx_src_lo, &vmx_src_hi);
+
+           unpack_128_2x128 (vmx_dst_hi, (vector unsigned int) AVV(0),
+                               &vmx_dst_lo, &vmx_dst_hi);
+
+           expand_alpha_2x128 (
+               vmx_src_lo, vmx_src_hi, &vmx_alpha_lo, &vmx_alpha_hi);
+
+           over_2x128 (&vmx_src_lo, &vmx_src_hi,
+                       &vmx_alpha_lo, &vmx_alpha_hi,
+                       &vmx_dst_lo, &vmx_dst_hi);
+
+           /* rebuild the 4 pixel data and save */
+           save_128_aligned (pd, pack_2x128_128 (vmx_dst_lo, vmx_dst_hi));
+       }
+
+       w -= 4;
+       pd += 4;
+       if (pm)
+           pm += 4;
+    }
+
+    while (w)
+    {
+       d = *pd;
+       s = combine1 (ps + pixman_fixed_to_int (vx), pm);
+       vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
+
+       *pd++ = core_combine_over_u_pixel_vmx (s, d);
+       if (pm)
+           pm++;
+
+       w--;
+    }
+}
+
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_cover_OVER,
+                      scaled_nearest_scanline_vmx_8888_8888_OVER,
+                      uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_none_OVER,
+                      scaled_nearest_scanline_vmx_8888_8888_OVER,
+                      uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_pad_OVER,
+                      scaled_nearest_scanline_vmx_8888_8888_OVER,
+                      uint32_t, uint32_t, PAD)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER,
+                      scaled_nearest_scanline_vmx_8888_8888_OVER,
+                      uint32_t, uint32_t, NORMAL)
+
 static const pixman_fast_path_t vmx_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
@@ -2974,6 +3097,11 @@ static const pixman_fast_path_t vmx_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888),
     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888),
 
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, vmx_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, vmx_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, vmx_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, vmx_8888_8888),
+
     {   PIXMAN_OP_NONE },
 };
 

commit ad612c4205f0ae46fc72a50e0c90ccd05487fcba
Author: Oded Gabbay <oded.gab...@gmail.com>
Date:   Sun Jun 28 22:23:44 2015 +0300

    vmx: implement fast path vmx_composite_src_x888_8888
    
    It was benchmarked against commit id 2be523b from pixman/master
    
    POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.
    reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills)
    
                    Before           After           Change
                  ---------------------------------------------
    L1              1115.4          5006.49         +348.85%
    L2              1112.26         4338.01         +290.02%
    M               1110.54         2524.15         +127.29%
    HT              745.41          1140.03         +52.94%
    VT              749.03          1287.13         +71.84%
    R               423.91          547.6           +29.18%
    RT              205.79          194.98          -5.25%
    Kops/s          1414            1361            -3.75%
    
    cairo trimmed benchmarks :
    
    Speedups
    ========
    t-gnome-system-monitor  1402.62  -> 1212.75 :  1.16x
       t-firefox-asteroids   533.92  ->  474.50 :  1.13x
    
    Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 47393dc..64e9125 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2689,6 +2689,62 @@ vmx_fill (pixman_implementation_t *imp,
 }
 
 static void
+vmx_composite_src_x888_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int32_t w;
+    int dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w && (uintptr_t)dst & 15)
+       {
+           *dst++ = *src++ | 0xff000000;
+           w--;
+       }
+
+       while (w >= 16)
+       {
+           vector unsigned int vmx_src1, vmx_src2, vmx_src3, vmx_src4;
+
+           vmx_src1 = load_128_unaligned (src);
+           vmx_src2 = load_128_unaligned (src + 4);
+           vmx_src3 = load_128_unaligned (src + 8);
+           vmx_src4 = load_128_unaligned (src + 12);
+
+           save_128_aligned (dst, vec_or (vmx_src1, mask_ff000000));
+           save_128_aligned (dst + 4, vec_or (vmx_src2, mask_ff000000));
+           save_128_aligned (dst + 8, vec_or (vmx_src3, mask_ff000000));
+           save_128_aligned (dst + 12, vec_or (vmx_src4, mask_ff000000));
+
+           dst += 16;
+           src += 16;
+           w -= 16;
+       }
+
+       while (w)
+       {
+           *dst++ = *src++ | 0xff000000;
+           w--;
+       }
+    }
+}
+
+static void
 vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
 {
@@ -2914,6 +2970,10 @@ static const pixman_fast_path_t vmx_fast_paths[] =
     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888),
 
+    /* PIXMAN_OP_SRC */
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888),
+
     {   PIXMAN_OP_NONE },
 };
 

commit fafc1d403b8405727d3918bcb605cb98044af90a
Author: Oded Gabbay <oded.gab...@gmail.com>
Date:   Sun Jun 28 10:14:20 2015 +0300

    vmx: implement fast path vmx_composite_over_n_8888_8888_ca
    
    It was benchmarked against commit id 2be523b from pixman/master
    
    POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.
    
    reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills)
    
                    Before           After           Change
                  ---------------------------------------------
    L1              61.92            244.91          +295.53%
    L2              62.74            243.3           +287.79%
    M               63.03            241.94          +283.85%
    HT              59.91            144.22          +140.73%
    VT              59.4             174.39          +193.59%
    R               53.6             111.37          +107.78%
    RT              37.99            46.38           +22.08%
    Kops/s          436              506             +16.06%
    
    cairo trimmed benchmarks :
    
    Speedups
    ========
    t-xfce4-terminal-a1  1540.37 -> 1226.14 :  1.26x
    t-firefox-talos-gfx  1488.59 -> 1209.19 :  1.23x
    
    Slowdowns
    =========
            t-evolution  553.88  -> 581.63  :  1.05x
              t-poppler  364.99  -> 383.79  :  1.05x
    t-firefox-scrolling  1223.65 -> 1304.34 :  1.07x
    
    The slowdowns occur in cases where the images are small and not aligned
    to a 16-byte boundary. In that case, the function first processes the
    unaligned area, even if that means operating one byte at a time. For
    small images, the overhead of these scalar operations can outweigh the
    savings gained from using the vmx instructions on the aligned part of
    the image.
    
    In the C fast-path implementation there is no special treatment of the
    unaligned part, as it works in 4-byte quantities on the entire image.
    
    Because lowlevel-blt-bench (llbb) is a synthetic test, I would assume it
    has far fewer alignment issues than "real-world" scenarios such as the
    cairo benchmarks, which are essentially recorded traces of real
    application activity.
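    
    To illustrate the head/body/tail structure described above, here is a
    minimal plain-C sketch; the helper names (process_one,
    process_four_aligned) are hypothetical stand-ins for the real VMX
    primitives (in_over, save_128_aligned, etc.) used in the patches below:
    
    #include <stdint.h>
    #include <stdio.h>
    
    static void process_one (uint32_t *px)
    {
        *px |= 0xff000000;              /* one pixel, scalar */
    }
    
    static void process_four_aligned (uint32_t *px)
    {
        int i;
        /* stands in for a single 128-bit load/op/aligned store */
        for (i = 0; i < 4; i++)
            px[i] |= 0xff000000;
    }
    
    static void composite_row (uint32_t *dst, int w)
    {
        /* head: scalar loop until dst reaches a 16-byte boundary;
         * for small images this part can dominate the runtime */
        while (w && ((uintptr_t) dst & 15))
        {
            process_one (dst++);
            w--;
        }
    
        /* body: 4 pixels (16 bytes) per iteration, aligned stores */
        while (w >= 4)
        {
            process_four_aligned (dst);
            dst += 4;
            w -= 4;
        }
    
        /* tail: leftover pixels, scalar again */
        while (w)
        {
            process_one (dst++);
            w--;
        }
    }
    
    int main (void)
    {
        uint32_t row[8] = { 0 };
    
        /* the head loop only runs when the pointer is not already
         * 16-byte aligned */
        composite_row (row, 8);
        printf ("0x%08x\n", row[0]);
        return 0;
    }
    
    In the actual patches, the body loop is the part built on
    load_128_unaligned/save_128_aligned and the other VMX helpers.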
    
    Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 641c487..47393dc 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2715,6 +2715,114 @@ vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
 }
 
 static void
+vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, d;
+    uint32_t    *mask_line, m;
+    uint32_t pack_cmp;
+    int dst_stride, mask_stride;
+
+    vector unsigned int vsrc, valpha, vmask, vdest;
+
+    vector unsigned int vmx_dst, vmx_dst_lo, vmx_dst_hi;
+    vector unsigned int vmx_mask, vmx_mask_lo, vmx_mask_hi;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    vsrc = unpacklo_128_16x8(create_mask_1x32_128 (&src),
+                           (vector unsigned int) AVV(0));
+
+    valpha = expand_alpha_1x128(vsrc);
+
+    while (height--)
+    {
+       int w = width;
+       const uint32_t *pm = (uint32_t *)mask_line;
+       uint32_t *pd = (uint32_t *)dst_line;
+
+       dst_line += dst_stride;
+       mask_line += mask_stride;
+
+       while (w && (uintptr_t)pd & 15)
+       {
+           m = *pm++;
+
+           if (m)
+           {
+               d = *pd;
+               vmask = unpack_32_1x128(m);
+               vdest = unpack_32_1x128(d);
+
+               *pd = pack_1x128_32(in_over (vsrc, valpha, vmask, vdest));
+           }
+
+           pd++;
+           w--;
+       }
+
+       while (w >= 4)
+       {
+           /* pm is NOT necessarily 16-byte aligned */
+           vmx_mask = load_128_unaligned (pm);
+
+           pack_cmp = vec_all_eq(vmx_mask, (vector unsigned int) AVV(0));
+
+           /* if all bits in mask are zero, pack_cmp is not 0 */
+           if (pack_cmp == 0)
+           {
+               /* pd is 16-byte aligned */
+               vmx_dst = load_128_aligned (pd);
+
+               unpack_128_2x128 (vmx_mask, (vector unsigned int) AVV(0),
+                                   &vmx_mask_lo, &vmx_mask_hi);
+
+               unpack_128_2x128 (vmx_dst, (vector unsigned int) AVV(0),
+                                   &vmx_dst_lo, &vmx_dst_hi);
+
+               in_over_2x128 (&vsrc, &vsrc,
+                              &valpha, &valpha,
+                              &vmx_mask_lo, &vmx_mask_hi,
+                              &vmx_dst_lo, &vmx_dst_hi);
+
+               save_128_aligned(pd, pack_2x128_128(vmx_dst_lo, vmx_dst_hi));
+           }
+
+           pd += 4;
+           pm += 4;
+           w -= 4;
+       }
+
+       while (w)
+       {
+           m = *pm++;
+
+           if (m)
+           {
+               d = *pd;
+               vmask = unpack_32_1x128(m);
+               vdest = unpack_32_1x128(d);
+
+               *pd = pack_1x128_32(in_over (vsrc, valpha, vmask, vdest));
+           }
+
+           pd++;
+           w--;
+       }
+    }
+}
+
+static void
 vmx_composite_add_8_8 (pixman_implementation_t *imp,
             pixman_composite_info_t *info)
 {
@@ -2796,6 +2904,10 @@ static const pixman_fast_path_t vmx_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, vmx_composite_over_n_8888_8888_ca),
 
     /* PIXMAN_OP_ADD */
     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),

commit a3e914407e354df70b9200e263608f1fc2e686cf
Author: Oded Gabbay <oded.gab...@gmail.com>
Date:   Thu Jun 18 15:05:49 2015 +0300

    vmx: implement fast path composite_add_8888_8888
    
    Copied the implementation from the sse2 file and edited it to use vmx
    functions.
    
    It was benchmarked against commit id 2be523b from pixman/master
    
    POWER8, 16 cores, 3.4GHz, ppc64le :
    
    reference memcpy speed = 27036.4MB/s (6759.1MP/s for 32bpp fills)
    
                    Before           After           Change
                  ---------------------------------------------
    L1              248.76          3284.48         +1220.34%
    L2              264.09          2826.47         +970.27%
    M               261.24          2405.06         +820.63%
    HT              217.27          857.3           +294.58%
    VT              213.78          980.09          +358.46%
    R               176.61          442.95          +150.81%
    RT              107.54          150.08          +39.56%
    Kops/s          917             1125            +22.68%
    
    Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index e49e8aa..641c487 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2765,6 +2765,31 @@ vmx_composite_add_8_8 (pixman_implementation_t *imp,
     }
 }
 
+static void
+vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+
+       vmx_combine_add_u (imp, op, dst, src, NULL, width);
+    }
+}
+
 static const pixman_fast_path_t vmx_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
@@ -2774,6 +2799,8 @@ static const pixman_fast_path_t vmx_fast_paths[] =
 
     /* PIXMAN_OP_ADD */
     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888),
 
     {   PIXMAN_OP_NONE },
 };

commit d5b5343c7df99082597e0c37aec937dcf5b6602d
Author: Oded Gabbay <oded.gab...@gmail.com>
Date:   Thu Jun 18 14:56:47 2015 +0300

    vmx: implement fast path composite_add_8_8
    
    Copied the implementation from the sse2 file and edited it to use vmx
    functions.
    
    It was benchmarked against commit id 2be523b from pixman/master
    
    POWER8, 16 cores, 3.4GHz, ppc64le :
    
    reference memcpy speed = 27036.4MB/s (6759.1MP/s for 32bpp fills)
    
                    Before           After           Change
                  ---------------------------------------------
    L1              687.63          9140.84         +1229.33%
    L2              715             7495.78         +948.36%
    M               717.39          8460.14         +1079.29%
    HT              569.56          1020.12         +79.11%
    VT              520.3           1215.56         +133.63%
    R               514.81          874.35          +69.84%
    RT              341.28          305.42          -10.51%
    Kops/s          1621            1579            -2.59%
    
    Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 9eae31c..e49e8aa 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2714,12 +2714,67 @@ vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
     }
 }
 
+static void
+vmx_composite_add_8_8 (pixman_implementation_t *imp,
+            pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint16_t t;
+
+    PIXMAN_IMAGE_GET_LINE (
+    src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+    dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       src = src_line;
+
+       dst_line += dst_stride;
+       src_line += src_stride;
+       w = width;
+
+       /* Small head */
+       while (w && (uintptr_t)dst & 3)
+       {
+           t = (*dst) + (*src++);
+           *dst++ = t | (0 - (t >> 8));
+           w--;
+       }
+
+       vmx_combine_add_u (imp, op,
+                   (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+
+       /* Small tail */
+       dst += w & 0xfffc;
+       src += w & 0xfffc;
+
+       w &= 3;
+
+       while (w)
+       {
+           t = (*dst) + (*src++);
+           *dst++ = t | (0 - (t >> 8));
+           w--;
+       }
+    }
+}
+
 static const pixman_fast_path_t vmx_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
+
+    /* PIXMAN_OP_ADD */
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),
+
     {   PIXMAN_OP_NONE },
 };
 

commit 339eeaf095f949694d7f79a45171ac03a3b06f90
Author: Oded Gabbay <oded.gab...@gmail.com>
Date:   Thu Jun 18 14:12:05 2015 +0300

    vmx: implement fast path composite_over_8888_8888
    
    Copied the implementation from the sse2 file and edited it to use vmx
    functions.
    
    It was benchmarked against commit id 2be523b from pixman/master
    
    POWER8, 16 cores, 3.4GHz, ppc64le :
    
    reference memcpy speed = 27036.4MB/s (6759.1MP/s for 32bpp fills)
    
                    Before           After           Change
                  ---------------------------------------------
    L1              129.47          1054.62         +714.57%
    L2              138.31          1011.02         +630.98%
    M               139.99          1008.65         +620.52%
    HT              122.11          468.45          +283.63%
    VT              121.06          532.21          +339.62%
    R               108.48          240.5           +121.70%
    RT              77.87           116.7           +49.87%
    Kops/s          758             981             +29.42%
    
    Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 61fdb80..9eae31c 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2688,8 +2688,38 @@ vmx_fill (pixman_implementation_t *imp,
     return TRUE;
 }
 
+static void
+vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
+                               pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    int dst_stride, src_stride;
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+
+    PIXMAN_IMAGE_GET_LINE (
+    dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+    src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    dst = dst_line;
+    src = src_line;
+
+    while (height--)
+    {
+        vmx_combine_over_u (imp, op, dst, src, NULL, width);
+
+        dst += dst_stride;
+        src += src_stride;
+    }
+}
+
 static const pixman_fast_path_t vmx_fast_paths[] =
 {
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
     {   PIXMAN_OP_NONE },
 };
 

commit 0cc8a2e9714efcb7cdd7e2a94c9cba49c3e29e00
Author: Oded Gabbay <oded.gab...@gmail.com>
Date:   Sun Jun 28 09:42:19 2015 +0300

    vmx: implement fast path vmx_fill
    
    Based on sse2 impl.
    
    It was benchmarked against commit id e2d211a from pixman/master
    
    Tested cairo trimmed benchmarks on POWER8, 8 cores, 3.4GHz,
    RHEL 7.1 ppc64le :
    
    Speedups
    ========
         t-swfdec-giant-steps  1383.09 ->  718.63  :  1.92x speedup
       t-gnome-system-monitor  1403.53 ->  918.77  :  1.53x speedup
                  t-evolution  552.34  ->  415.24  :  1.33x speedup
          t-xfce4-terminal-a1  1573.97 ->  1351.46 :  1.16x speedup
          t-firefox-paintball  847.87  ->  734.50  :  1.15x speedup
          t-firefox-asteroids  565.99  ->  492.77  :  1.15x speedup
    t-firefox-canvas-swscroll  1656.87 ->  1447.48 :  1.14x speedup
              t-midori-zoomed  724.73  ->  642.16  :  1.13x speedup
       t-firefox-planet-gnome  975.78  ->  911.92  :  1.07x speedup
              t-chromium-tabs  292.12  ->  274.74  :  1.06x speedup
         t-firefox-chalkboard  690.78  ->  653.93  :  1.06x speedup
          t-firefox-talos-gfx  1375.30 ->  1303.74 :  1.05x speedup
       t-firefox-canvas-alpha  1016.79 ->  967.24  :  1.05x speedup
    
    Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
    Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 39d1a06..61fdb80 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2537,6 +2537,157 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
     }
 }
 
+static pixman_bool_t
+vmx_fill (pixman_implementation_t *imp,
+           uint32_t *               bits,
+           int                      stride,
+           int                      bpp,
+           int                      x,
+           int                      y,
+           int                      width,
+           int                      height,
+           uint32_t                filler)
+{
+    uint32_t byte_width;
+    uint8_t *byte_line;
+
+    vector unsigned int vfiller;
+
+    if (bpp == 8)
+    {
+       uint8_t b;
+       uint16_t w;
+
+       stride = stride * (int) sizeof (uint32_t) / 1;
+       byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+       byte_width = width;
