On Thu, 2 Jul 2015 13:04:09 +0300 Oded Gabbay <oded.gab...@gmail.com> wrote:
> No changes were observed when running cairo trimmed benchmarks.

Maybe mention that the performance improvements can be observed after
applying another patch, "vmx: implement fast path vmx_composite_copy_area"?
Or even squash these two patches.

> Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
>
> ---
>  pixman/pixman-vmx.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 124 insertions(+)
>
> diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
> index b9acd6c..b42288b 100644
> --- a/pixman/pixman-vmx.c
> +++ b/pixman/pixman-vmx.c
> @@ -2708,6 +2708,128 @@ vmx_fill (pixman_implementation_t *imp,
>      return TRUE;
>  }
>
> +static pixman_bool_t
> +vmx_blt (pixman_implementation_t *imp,
> +         uint32_t *               src_bits,
> +         uint32_t *               dst_bits,
> +         int                      src_stride,
> +         int                      dst_stride,
> +         int                      src_bpp,
> +         int                      dst_bpp,
> +         int                      src_x,
> +         int                      src_y,
> +         int                      dest_x,
> +         int                      dest_y,
> +         int                      width,
> +         int                      height)
> +{
> +    uint8_t *   src_bytes;
> +    uint8_t *   dst_bytes;
> +    int byte_width;
> +
> +    if (src_bpp != dst_bpp)
> +        return FALSE;
> +
> +    if (src_bpp == 16)
> +    {
> +        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
> +        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
> +        src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
> +        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
> +        byte_width = 2 * width;
> +        src_stride *= 2;
> +        dst_stride *= 2;
> +    }
> +    else if (src_bpp == 32)
> +    {
> +        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
> +        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
> +        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
> +        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
> +        byte_width = 4 * width;
> +        src_stride *= 4;
> +        dst_stride *= 4;
> +    }
> +    else
> +    {
> +        return FALSE;
> +    }
> +
> +    while (height--)
> +    {
> +        int w;
> +        uint8_t *s = src_bytes;
> +        uint8_t *d = dst_bytes;
> +        src_bytes += src_stride;
> +        dst_bytes += dst_stride;
> +        w = byte_width;
> +
> +        while (w >= 2 && ((uintptr_t)d & 3))
> +        {
> +            *(uint16_t *)d = *(uint16_t *)s;
> +            w -= 2;
> +            s += 2;
> +            d += 2;
> +        }
> +
> +        while (w >= 4 && ((uintptr_t)d & 15))
> +        {
> +            *(uint32_t *)d = *(uint32_t *)s;
> +
> +            w -= 4;
> +            s += 4;
> +            d += 4;
> +        }
> +
> +        while (w >= 64)
> +        {
> +            vector unsigned int vmx0, vmx1, vmx2, vmx3;
> +
> +            vmx0 = load_128_unaligned ((uint32_t*) s);
> +            vmx1 = load_128_unaligned ((uint32_t*)(s + 16));
> +            vmx2 = load_128_unaligned ((uint32_t*)(s + 32));
> +            vmx3 = load_128_unaligned ((uint32_t*)(s + 48));

On big endian systems the unaligned loads are not implemented very
efficiently. We get roughly twice as many loads as necessary.
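For reference, on big endian each load_128_unaligned expands to the classic
lvsl + lvx + lvx + vperm idiom, with the second lvx at a 15-byte offset so
that an already aligned pointer does not read past the quadword it needs.
Below is a minimal sketch of how the four consecutive loads in the loop
above could share their aligned quadwords, needing roughly five lvx instead
of eight per 64-byte block. The helper name copy_64_unaligned_to_aligned is
hypothetical, and it uses the raw vec_ld/vec_lvsl/vec_perm intrinsics rather
than the patch's load_128_unaligned, so treat it as an illustration only; it
assumes 'd' is already 16-byte aligned, which is what the loop guarantees at
this point:

#include <altivec.h>
#include <stdint.h>

/* Hypothetical helper (not from the patch): copy 64 bytes from a
 * possibly unaligned 's' to a 16-byte aligned 'd' on big endian VMX.
 * Consecutive unaligned 16-byte blocks share their aligned quadwords,
 * so five vec_ld calls are enough instead of four independent pairs.
 */
static void
copy_64_unaligned_to_aligned (uint8_t *d, const uint8_t *s)
{
    /* a single permute mask works for all four blocks,
     * it only depends on (uintptr_t)s & 15 */
    vector unsigned char perm = vec_lvsl (0, s);

    vector unsigned char q0 = vec_ld ( 0, s);
    vector unsigned char q1 = vec_ld (16, s);
    vector unsigned char q2 = vec_ld (32, s);
    vector unsigned char q3 = vec_ld (48, s);
    vector unsigned char q4 = vec_ld (63, s); /* quadword holding s[63] */

    /* each unaligned block is assembled from two neighbouring quadwords */
    vec_st (vec_perm (q0, q1, perm),  0, d);
    vec_st (vec_perm (q1, q2, perm), 16, d);
    vec_st (vec_perm (q2, q3, perm), 32, d);
    vec_st (vec_perm (q3, q4, perm), 48, d);
}

With something along these lines the inner 64-byte loop would issue about
five loads, one lvsl and four vperm per iteration instead of the code shown
in the disassembly below.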
This whole loop compiles into the following code on a 32-bit big endian
system with GCC 4.8.4:

    b170:   39 69 00 10     addi    r11,r9,16
    b174:   7c 80 48 0c     lvsl    v4,r0,r9
    b178:   38 a9 00 20     addi    r5,r9,32
    b17c:   7c a0 48 ce     lvx     v5,r0,r9
    b180:   38 c9 00 30     addi    r6,r9,48
    b184:   7d 87 48 ce     lvx     v12,r7,r9
    b188:   7c c0 58 0c     lvsl    v6,r0,r11
    b18c:   39 29 00 40     addi    r9,r9,64
    b190:   7c e0 58 ce     lvx     v7,r0,r11
    b194:   7d a7 58 ce     lvx     v13,r7,r11
    b198:   39 6a 00 10     addi    r11,r10,16
    b19c:   7d 00 28 0c     lvsl    v8,r0,r5
    b1a0:   7d 20 28 ce     lvx     v9,r0,r5
    b1a4:   7c 27 28 ce     lvx     v1,r7,r5
    b1a8:   38 aa 00 20     addi    r5,r10,32
    b1ac:   7d 40 30 0c     lvsl    v10,r0,r6
    b1b0:   7d 60 30 ce     lvx     v11,r0,r6
    b1b4:   7c 07 30 ce     lvx     v0,r7,r6
    b1b8:   38 ca 00 30     addi    r6,r10,48
    b1bc:   11 85 61 2b     vperm   v12,v5,v12,v4
    b1c0:   11 a7 69 ab     vperm   v13,v7,v13,v6
    b1c4:   10 29 0a 2b     vperm   v1,v9,v1,v8
    b1c8:   10 0b 02 ab     vperm   v0,v11,v0,v10
    b1cc:   7d 80 51 ce     stvx    v12,r0,r10
    b1d0:   39 4a 00 40     addi    r10,r10,64
    b1d4:   7d a0 59 ce     stvx    v13,r0,r11
    b1d8:   7c 20 29 ce     stvx    v1,r0,r5
    b1dc:   7c 00 31 ce     stvx    v0,r0,r6
    b1e0:   42 00 ff 90     bdnz+   b170 <vmx_blt.part.4+0x260>

As we can see, there are 8 LVX instructions and 4 STVX. And it is difficult
even for a clever compiler to remove the redundant loads, because of the
special 15-byte offset used for the second load in each pair. This is a very
PowerPC-specific way of handling unaligned loads.

We had a similar discussion about ARM WMMX unaligned loads several years ago:
    http://lists.freedesktop.org/archives/pixman/2011-July/001337.html
    http://lists.freedesktop.org/archives/pixman/2011-August/001361.html
Nothing was done at that time though, so there is still room for improvement
for WMMX too :-)

> +            save_128_aligned ((uint32_t*)(d), vmx0);
> +            save_128_aligned ((uint32_t*)(d + 16), vmx1);
> +            save_128_aligned ((uint32_t*)(d + 32), vmx2);
> +            save_128_aligned ((uint32_t*)(d + 48), vmx3);
> +
> +            s += 64;
> +            d += 64;
> +            w -= 64;
> +        }
> +
> +        while (w >= 16)
> +        {
> +            save_128_aligned ((uint32_t*) d, load_128_unaligned ((uint32_t*) s));
> +
> +            w -= 16;
> +            d += 16;
> +            s += 16;
> +        }
> +
> +        while (w >= 4)
> +        {
> +            *(uint32_t *)d = *(uint32_t *)s;
> +
> +            w -= 4;
> +            s += 4;
> +            d += 4;
> +        }
> +
> +        if (w >= 2)
> +        {
> +            *(uint16_t *)d = *(uint16_t *)s;
> +            w -= 2;
> +            s += 2;
> +            d += 2;
> +        }
> +    }
> +
> +    return TRUE;
> +}
> +
>  static void
>  vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
>                                pixman_composite_info_t *info)
> @@ -2812,6 +2934,7 @@ vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
>
>  static const pixman_fast_path_t vmx_fast_paths[] =
>  {
> +    /* PIXMAN_OP_OVER */
>      PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
>      PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
>      PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
> @@ -2865,6 +2988,7 @@ _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
>      imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
>      imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;
>
> +    imp->blt = vmx_blt;
>      imp->fill = vmx_fill;
>
>      return imp;

Still, because the "vmx_composite_copy_area" benchmarks show a performance
improvement in practice (even with the less-than-perfect handling of
unaligned loads on big endian systems), I see no reason to complain.
So Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com>
(if the commit message is updated to include benchmark results)

--
Best regards,
Siarhei Siamashka

_______________________________________________
Pixman mailing list
Pixman@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/pixman