On Thu,  2 Jul 2015 13:04:09 +0300
Oded Gabbay <oded.gab...@gmail.com> wrote:

> No changes were observed when running cairo trimmed benchmarks.

Maybe mention that the performance improvements can only be
observed after applying the other "vmx: implement fast path
vmx_composite_copy_area" patch? Or even squash these two
patches.
 
> Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
>
> ---
>  pixman/pixman-vmx.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 124 insertions(+)
> 
> diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
> index b9acd6c..b42288b 100644
> --- a/pixman/pixman-vmx.c
> +++ b/pixman/pixman-vmx.c
> @@ -2708,6 +2708,128 @@ vmx_fill (pixman_implementation_t *imp,
>      return TRUE;
>  }
>  
> +static pixman_bool_t
> +vmx_blt (pixman_implementation_t *imp,
> +          uint32_t *               src_bits,
> +          uint32_t *               dst_bits,
> +          int                      src_stride,
> +          int                      dst_stride,
> +          int                      src_bpp,
> +          int                      dst_bpp,
> +          int                      src_x,
> +          int                      src_y,
> +          int                      dest_x,
> +          int                      dest_y,
> +          int                      width,
> +          int                      height)
> +{
> +    uint8_t *   src_bytes;
> +    uint8_t *   dst_bytes;
> +    int byte_width;
> +
> +    if (src_bpp != dst_bpp)
> +     return FALSE;
> +
> +    if (src_bpp == 16)
> +    {
> +     src_stride = src_stride * (int) sizeof (uint32_t) / 2;
> +     dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
> +     src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
> +     dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
> +     byte_width = 2 * width;
> +     src_stride *= 2;
> +     dst_stride *= 2;
> +    }
> +    else if (src_bpp == 32)
> +    {
> +     src_stride = src_stride * (int) sizeof (uint32_t) / 4;
> +     dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
> +     src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
> +     dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
> +     byte_width = 4 * width;
> +     src_stride *= 4;
> +     dst_stride *= 4;
> +    }
> +    else
> +    {
> +     return FALSE;
> +    }
> +
> +    while (height--)
> +    {
> +     int w;
> +     uint8_t *s = src_bytes;
> +     uint8_t *d = dst_bytes;
> +     src_bytes += src_stride;
> +     dst_bytes += dst_stride;
> +     w = byte_width;
> +
> +     while (w >= 2 && ((uintptr_t)d & 3))
> +     {
> +         *(uint16_t *)d = *(uint16_t *)s;
> +         w -= 2;
> +         s += 2;
> +         d += 2;
> +     }
> +
> +     while (w >= 4 && ((uintptr_t)d & 15))
> +     {
> +         *(uint32_t *)d = *(uint32_t *)s;
> +
> +         w -= 4;
> +         s += 4;
> +         d += 4;
> +     }
> +
> +     while (w >= 64)
> +     {
> +         vector unsigned int vmx0, vmx1, vmx2, vmx3;
> +
> +         vmx0 = load_128_unaligned ((uint32_t*) s);
> +         vmx1 = load_128_unaligned ((uint32_t*)(s + 16));
> +         vmx2 = load_128_unaligned ((uint32_t*)(s + 32));
> +         vmx3 = load_128_unaligned ((uint32_t*)(s + 48));

On big endian systems, the unaligned loads are not implemented very
efficiently. We get roughly twice as many loads as necessary. This
whole loop compiles into the following code on a 32-bit big endian
system with GCC 4.8.4:

    b170:       39 69 00 10     addi    r11,r9,16
    b174:       7c 80 48 0c     lvsl    v4,r0,r9
    b178:       38 a9 00 20     addi    r5,r9,32
    b17c:       7c a0 48 ce     lvx     v5,r0,r9
    b180:       38 c9 00 30     addi    r6,r9,48
    b184:       7d 87 48 ce     lvx     v12,r7,r9
    b188:       7c c0 58 0c     lvsl    v6,r0,r11
    b18c:       39 29 00 40     addi    r9,r9,64
    b190:       7c e0 58 ce     lvx     v7,r0,r11
    b194:       7d a7 58 ce     lvx     v13,r7,r11
    b198:       39 6a 00 10     addi    r11,r10,16
    b19c:       7d 00 28 0c     lvsl    v8,r0,r5
    b1a0:       7d 20 28 ce     lvx     v9,r0,r5
    b1a4:       7c 27 28 ce     lvx     v1,r7,r5
    b1a8:       38 aa 00 20     addi    r5,r10,32
    b1ac:       7d 40 30 0c     lvsl    v10,r0,r6
    b1b0:       7d 60 30 ce     lvx     v11,r0,r6
    b1b4:       7c 07 30 ce     lvx     v0,r7,r6
    b1b8:       38 ca 00 30     addi    r6,r10,48
    b1bc:       11 85 61 2b     vperm   v12,v5,v12,v4
    b1c0:       11 a7 69 ab     vperm   v13,v7,v13,v6
    b1c4:       10 29 0a 2b     vperm   v1,v9,v1,v8
    b1c8:       10 0b 02 ab     vperm   v0,v11,v0,v10
    b1cc:       7d 80 51 ce     stvx    v12,r0,r10
    b1d0:       39 4a 00 40     addi    r10,r10,64
    b1d4:       7d a0 59 ce     stvx    v13,r0,r11
    b1d8:       7c 20 29 ce     stvx    v1,r0,r5
    b1dc:       7c 00 31 ce     stvx    v0,r0,r6
    b1e0:       42 00 ff 90     bdnz+   b170 <vmx_blt.part.4+0x260>

As we can see, there are 8 LVX instructions and only 4 STVX. And it is
difficult even for a clever compiler to remove the redundant loads
because of the special 15-byte offset used for the second load in each
pair. This unaligned load handling is very PowerPC specific.
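
For reference, this is roughly the lvsl/lvx/vperm idiom that
load_128_unaligned expands to here (just an illustrative sketch of the
pattern, not pixman code; the function name is made up):

    #include <altivec.h>
    #include <stdint.h>

    /* Big endian VMX unaligned load: two aligned loads plus a permute.
     * The second vec_ld uses a 15-byte offset rather than 16 so that it
     * fetches the aligned block containing the last needed byte, s[15];
     * when s is already aligned, that is the same block as the first
     * load, so no aligned block (and hence no page) is touched beyond
     * the ones that actually contain the 16 wanted bytes. */
    static inline vector unsigned int
    load_unaligned_be (const uint32_t *src)
    {
        const unsigned char *s = (const unsigned char *) src;
        vector unsigned char perm = vec_lvsl (0, s); /* from low address bits */
        vector unsigned char lo   = vec_ld (0, s);   /* block containing s[0]  */
        vector unsigned char hi   = vec_ld (15, s);  /* block containing s[15] */
        return (vector unsigned int) vec_perm (lo, hi, perm);
    }

That is why every load_128_unaligned call costs one lvsl and two lvx
before anything can be stored.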

We had a similar discussion about ARM WMMX unaligned loads several
years ago:
    http://lists.freedesktop.org/archives/pixman/2011-July/001337.html
    http://lists.freedesktop.org/archives/pixman/2011-August/001361.html

Nothing was done at that time though, so there is still room for
improvement for WMMX too :-)
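
For VMX, the kind of restructuring that would avoid the duplicated
loads looks roughly like the sketch below: when the source advances in
whole 16-byte steps, the aligned block used as the "high" half of one
vperm is exactly the "low" half of the next one, so a 64-byte iteration
needs only 4 lvx instead of 8 (plus a single lvx before the loop). This
is only an illustration, not something I'm asking for in this patch:
the helper name is made up, and it assumes it is safe to read one extra
aligned block past the end of the source run, which real code would
have to guard against (e.g. by special-casing an already aligned
source):

    #include <altivec.h>
    #include <stdint.h>
    #include <stddef.h>

    static void
    copy_64byte_blocks_be (uint8_t *d, const uint8_t *s, size_t nblocks)
    {
        /* d is assumed 16-byte aligned, as after the alignment loops in
         * vmx_blt; one permute vector serves the whole run because s
         * only advances by multiples of 16 bytes. */
        vector unsigned char perm = vec_lvsl (0, s);
        vector unsigned char prev = vec_ld (0, s);

        while (nblocks--)
        {
            /* Each aligned source block is loaded exactly once and
             * reused as the low half of the next vperm. */
            vector unsigned char a = vec_ld (16, s);
            vector unsigned char b = vec_ld (32, s);
            vector unsigned char c = vec_ld (48, s);
            vector unsigned char e = vec_ld (64, s); /* may overrun, see above */

            vec_st (vec_perm (prev, a, perm),  0, d);
            vec_st (vec_perm (a,    b, perm), 16, d);
            vec_st (vec_perm (b,    c, perm), 32, d);
            vec_st (vec_perm (c,    e, perm), 48, d);

            prev = e;
            s += 64;
            d += 64;
        }
    }

Anyway, this is more of a possible follow-up than something that needs
to block the patch.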


> +         save_128_aligned ((uint32_t*)(d), vmx0);
> +         save_128_aligned ((uint32_t*)(d + 16), vmx1);
> +         save_128_aligned ((uint32_t*)(d + 32), vmx2);
> +         save_128_aligned ((uint32_t*)(d + 48), vmx3);
> +
> +         s += 64;
> +         d += 64;
> +         w -= 64;
> +     }
> +
> +     while (w >= 16)
> +     {
> +         save_128_aligned ((uint32_t*) d, load_128_unaligned ((uint32_t*) s));
> +
> +         w -= 16;
> +         d += 16;
> +         s += 16;
> +     }
> +
> +     while (w >= 4)
> +     {
> +         *(uint32_t *)d = *(uint32_t *)s;
> +
> +         w -= 4;
> +         s += 4;
> +         d += 4;
> +     }
> +
> +     if (w >= 2)
> +     {
> +         *(uint16_t *)d = *(uint16_t *)s;
> +         w -= 2;
> +         s += 2;
> +         d += 2;
> +     }
> +    }
> +
> +    return TRUE;
> +}
> +
>  static void
>  vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
>                                 pixman_composite_info_t *info)
> @@ -2812,6 +2934,7 @@ vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
>  
>  static const pixman_fast_path_t vmx_fast_paths[] =
>  {
> +    /* PIXMAN_OP_OVER */
>      PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
>      PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
>      PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
> @@ -2865,6 +2988,7 @@ _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
>      imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
>      imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;
>  
> +    imp->blt = vmx_blt;
>      imp->fill = vmx_fill;
>  
>      return imp;

Still, because the "vmx_composite_copy_area" benchmarks show a
performance improvement in practice (even with less-than-perfect
unaligned load handling on big endian systems), I see no reason
to complain. So

Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com>

(if the commit message is updated to include benchmark results)

-- 
Best regards,
Siarhei Siamashka