Rebased ref, commits from common ancestor: commit ee790044b08e3b668e6aa5d9229f46ed7295ebf0 Author: Oded Gabbay <oded.gab...@gmail.com> Date: Sat Aug 1 22:34:53 2015 +0300
Pre-release version bump to 0.33.2 Signed-off-by: Oded Gabbay <oded.gab...@gmail.com> diff --git a/configure.ac b/configure.ac index 0339494..554b6d2 100644 --- a/configure.ac +++ b/configure.ac @@ -54,7 +54,7 @@ AC_PREREQ([2.57]) m4_define([pixman_major], 0) m4_define([pixman_minor], 33) -m4_define([pixman_micro], 1) +m4_define([pixman_micro], 2) m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro]) commit 8d9be3619a906855a3e3a1e052317833cb24cabe Author: Oded Gabbay <oded.gab...@gmail.com> Date: Wed Jul 1 14:34:07 2015 +0300 vmx: implement fast path iterator vmx_fetch_a8 no changes were observed when running cairo trimmed benchmarks. Running "lowlevel-blt-bench src_8_8888" on POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le gave the following results: reference memcpy speed = 25197.2MB/s (6299.3MP/s for 32bpp fills) Before After Change -------------------------------------------- L1 965.34 3936 +307.73% L2 942.99 3436.29 +264.40% M 902.24 2757.77 +205.66% HT 448.46 784.99 +75.04% VT 430.05 819.78 +90.62% R 412.9 717.04 +73.66% RT 168.93 220.63 +30.60% Kops/s 1025 1303 +27.12% It was benchmarked against commid id e2d211a from pixman/master Siarhei Siamashka reported that on playstation3, it shows the following results: == before == src_8_8888 = L1: 194.37 L2: 198.46 M:155.90 (148.35%) HT: 59.18 VT: 36.71 R: 38.93 RT: 12.79 ( 106Kops/s) == after == src_8_8888 = L1: 373.96 L2: 391.10 M:245.81 (233.88%) HT: 80.81 VT: 44.33 R: 48.10 RT: 14.79 ( 122Kops/s) Signed-off-by: Oded Gabbay <oded.gab...@gmail.com> Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com> diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c index 773ad76..a9bd024 100644 --- a/pixman/pixman-vmx.c +++ b/pixman/pixman-vmx.c @@ -3139,6 +3139,49 @@ vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) return iter->buffer; } +static uint32_t * +vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) +{ + int w = iter->width; + uint32_t *dst = iter->buffer; + uint8_t *src = 
iter->bits; + vector unsigned int vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6; + + iter->bits += iter->stride; + + while (w && (((uintptr_t)dst) & 15)) + { + *dst++ = *(src++) << 24; + w--; + } + + while (w >= 16) + { + vmx0 = load_128_unaligned((uint32_t *) src); + + unpack_128_2x128((vector unsigned int) AVV(0), vmx0, &vmx1, &vmx2); + unpack_128_2x128_16((vector unsigned int) AVV(0), vmx1, &vmx3, &vmx4); + unpack_128_2x128_16((vector unsigned int) AVV(0), vmx2, &vmx5, &vmx6); + + save_128_aligned(dst, vmx6); + save_128_aligned((dst + 4), vmx5); + save_128_aligned((dst + 8), vmx4); + save_128_aligned((dst + 12), vmx3); + + dst += 16; + src += 16; + w -= 16; + } + + while (w) + { + *dst++ = *(src++) << 24; + w--; + } + + return iter->buffer; +} + #define IMAGE_FLAGS \ (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) @@ -3148,6 +3191,9 @@ static const pixman_iter_info_t vmx_iters[] = { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW, _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL }, + { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW, + _pixman_iter_init_bits_stride, vmx_fetch_a8, NULL + }, { PIXMAN_null }, }; commit 47f74ca94637d79ee66c37a81eea0200e453fcc1 Author: Oded Gabbay <oded.gab...@gmail.com> Date: Mon Jun 29 15:31:02 2015 +0300 vmx: implement fast path iterator vmx_fetch_x8r8g8b8 It was benchmarked against commid id 2be523b from pixman/master POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le. 
cairo trimmed benchmarks : Speedups ======== t-firefox-asteroids 533.92 -> 489.94 : 1.09x Signed-off-by: Oded Gabbay <oded.gab...@gmail.com> Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com> diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c index 0950850..773ad76 100644 --- a/pixman/pixman-vmx.c +++ b/pixman/pixman-vmx.c @@ -3105,6 +3105,52 @@ static const pixman_fast_path_t vmx_fast_paths[] = { PIXMAN_OP_NONE }, }; +static uint32_t * +vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) +{ + int w = iter->width; + vector unsigned int ff000000 = mask_ff000000; + uint32_t *dst = iter->buffer; + uint32_t *src = (uint32_t *)iter->bits; + + iter->bits += iter->stride; + + while (w && ((uintptr_t)dst) & 0x0f) + { + *dst++ = (*src++) | 0xff000000; + w--; + } + + while (w >= 4) + { + save_128_aligned(dst, vec_or(load_128_unaligned(src), ff000000)); + + dst += 4; + src += 4; + w -= 4; + } + + while (w) + { + *dst++ = (*src++) | 0xff000000; + w--; + } + + return iter->buffer; +} + +#define IMAGE_FLAGS \ + (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ + FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) + +static const pixman_iter_info_t vmx_iters[] = +{ + { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW, + _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL + }, + { PIXMAN_null }, +}; + pixman_implementation_t * _pixman_implementation_create_vmx (pixman_implementation_t *fallback) { @@ -3147,5 +3193,7 @@ _pixman_implementation_create_vmx (pixman_implementation_t *fallback) imp->fill = vmx_fill; + imp->iter_info = vmx_iters; + return imp; } commit fcbb97d4458d717b9c15858aedcbee2d33c8ac5a Author: Oded Gabbay <oded.gab...@gmail.com> Date: Sun Jun 28 23:25:24 2015 +0300 vmx: implement fast path scaled nearest vmx_8888_8888_OVER It was benchmarked against commid id 2be523b from pixman/master POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le. 
reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills) Before After Change --------------------------------------------- L1 134.36 181.68 +35.22% L2 135.07 180.67 +33.76% M 134.6 180.51 +34.11% HT 121.77 128.79 +5.76% VT 120.49 145.07 +20.40% R 93.83 102.3 +9.03% RT 50.82 46.93 -7.65% Kops/s 448 422 -5.80% cairo trimmed benchmarks : Speedups ======== t-firefox-asteroids 533.92 -> 497.92 : 1.07x t-midori-zoomed 692.98 -> 651.24 : 1.06x Signed-off-by: Oded Gabbay <oded.gab...@gmail.com> Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com> diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c index 64e9125..0950850 100644 --- a/pixman/pixman-vmx.c +++ b/pixman/pixman-vmx.c @@ -2954,6 +2954,129 @@ vmx_composite_add_8888_8888 (pixman_implementation_t *imp, } } +static force_inline void +scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t* pd, + const uint32_t* ps, + int32_t w, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t src_width_fixed, + pixman_bool_t fully_transparent_src) +{ + uint32_t s, d; + const uint32_t* pm = NULL; + + vector unsigned int vmx_dst_lo, vmx_dst_hi; + vector unsigned int vmx_src_lo, vmx_src_hi; + vector unsigned int vmx_alpha_lo, vmx_alpha_hi; + + if (fully_transparent_src) + return; + + /* Align dst on a 16-byte boundary */ + while (w && ((uintptr_t)pd & 15)) + { + d = *pd; + s = combine1 (ps + pixman_fixed_to_int (vx), pm); + vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; + + *pd++ = core_combine_over_u_pixel_vmx (s, d); + if (pm) + pm++; + w--; + } + + while (w >= 4) + { + vector unsigned int tmp; + uint32_t tmp1, tmp2, tmp3, tmp4; + + tmp1 = *(ps + pixman_fixed_to_int (vx)); + vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; + tmp2 = *(ps + pixman_fixed_to_int (vx)); + vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; + tmp3 = *(ps + pixman_fixed_to_int (vx)); + vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; + tmp4 = *(ps + pixman_fixed_to_int (vx)); + vx += unit_x; + while 
(vx >= 0) + vx -= src_width_fixed; + + tmp[0] = tmp1; + tmp[1] = tmp2; + tmp[2] = tmp3; + tmp[3] = tmp4; + + vmx_src_hi = combine4 ((const uint32_t *) &tmp, pm); + + if (is_opaque (vmx_src_hi)) + { + save_128_aligned (pd, vmx_src_hi); + } + else if (!is_zero (vmx_src_hi)) + { + vmx_dst_hi = load_128_aligned (pd); + + unpack_128_2x128 (vmx_src_hi, (vector unsigned int) AVV(0), + &vmx_src_lo, &vmx_src_hi); + + unpack_128_2x128 (vmx_dst_hi, (vector unsigned int) AVV(0), + &vmx_dst_lo, &vmx_dst_hi); + + expand_alpha_2x128 ( + vmx_src_lo, vmx_src_hi, &vmx_alpha_lo, &vmx_alpha_hi); + + over_2x128 (&vmx_src_lo, &vmx_src_hi, + &vmx_alpha_lo, &vmx_alpha_hi, + &vmx_dst_lo, &vmx_dst_hi); + + /* rebuid the 4 pixel data and save*/ + save_128_aligned (pd, pack_2x128_128 (vmx_dst_lo, vmx_dst_hi)); + } + + w -= 4; + pd += 4; + if (pm) + pm += 4; + } + + while (w) + { + d = *pd; + s = combine1 (ps + pixman_fixed_to_int (vx), pm); + vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; + + *pd++ = core_combine_over_u_pixel_vmx (s, d); + if (pm) + pm++; + + w--; + } +} + +FAST_NEAREST_MAINLOOP (vmx_8888_8888_cover_OVER, + scaled_nearest_scanline_vmx_8888_8888_OVER, + uint32_t, uint32_t, COVER) +FAST_NEAREST_MAINLOOP (vmx_8888_8888_none_OVER, + scaled_nearest_scanline_vmx_8888_8888_OVER, + uint32_t, uint32_t, NONE) +FAST_NEAREST_MAINLOOP (vmx_8888_8888_pad_OVER, + scaled_nearest_scanline_vmx_8888_8888_OVER, + uint32_t, uint32_t, PAD) +FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER, + scaled_nearest_scanline_vmx_8888_8888_OVER, + uint32_t, uint32_t, NORMAL) + static const pixman_fast_path_t vmx_fast_paths[] = { PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888), @@ -2974,6 +3097,11 @@ static const pixman_fast_path_t vmx_fast_paths[] = PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888), PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888), + SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, 
x8r8g8b8, vmx_8888_8888), + SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, vmx_8888_8888), + SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, vmx_8888_8888), + SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, vmx_8888_8888), + { PIXMAN_OP_NONE }, }; commit ad612c4205f0ae46fc72a50e0c90ccd05487fcba Author: Oded Gabbay <oded.gab...@gmail.com> Date: Sun Jun 28 22:23:44 2015 +0300 vmx: implement fast path vmx_composite_src_x888_8888 It was benchmarked against commid id 2be523b from pixman/master POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le. reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills) Before After Change --------------------------------------------- L1 1115.4 5006.49 +348.85% L2 1112.26 4338.01 +290.02% M 1110.54 2524.15 +127.29% HT 745.41 1140.03 +52.94% VT 749.03 1287.13 +71.84% R 423.91 547.6 +29.18% RT 205.79 194.98 -5.25% Kops/s 1414 1361 -3.75% cairo trimmed benchmarks : Speedups ======== t-gnome-system-monitor 1402.62 -> 1212.75 : 1.16x t-firefox-asteroids 533.92 -> 474.50 : 1.13x Signed-off-by: Oded Gabbay <oded.gab...@gmail.com> Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com> diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c index 47393dc..64e9125 100644 --- a/pixman/pixman-vmx.c +++ b/pixman/pixman-vmx.c @@ -2689,6 +2689,62 @@ vmx_fill (pixman_implementation_t *imp, } static void +vmx_composite_src_x888_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int32_t w; + int dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (uintptr_t)dst & 15) + { + *dst++ = *src++ | 0xff000000; + w--; + } + + while (w >= 16) + { + 
vector unsigned int vmx_src1, vmx_src2, vmx_src3, vmx_src4; + + vmx_src1 = load_128_unaligned (src); + vmx_src2 = load_128_unaligned (src + 4); + vmx_src3 = load_128_unaligned (src + 8); + vmx_src4 = load_128_unaligned (src + 12); + + save_128_aligned (dst, vec_or (vmx_src1, mask_ff000000)); + save_128_aligned (dst + 4, vec_or (vmx_src2, mask_ff000000)); + save_128_aligned (dst + 8, vec_or (vmx_src3, mask_ff000000)); + save_128_aligned (dst + 12, vec_or (vmx_src4, mask_ff000000)); + + dst += 16; + src += 16; + w -= 16; + } + + while (w) + { + *dst++ = *src++ | 0xff000000; + w--; + } + } +} + +static void vmx_composite_over_8888_8888 (pixman_implementation_t *imp, pixman_composite_info_t *info) { @@ -2914,6 +2970,10 @@ static const pixman_fast_path_t vmx_fast_paths[] = PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888), PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888), + /* PIXMAN_OP_SRC */ + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888), + { PIXMAN_OP_NONE }, }; commit fafc1d403b8405727d3918bcb605cb98044af90a Author: Oded Gabbay <oded.gab...@gmail.com> Date: Sun Jun 28 10:14:20 2015 +0300 vmx: implement fast path vmx_composite_over_n_8888_8888_ca It was benchmarked against commid id 2be523b from pixman/master POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le. 
reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills) Before After Change --------------------------------------------- L1 61.92 244.91 +295.53% L2 62.74 243.3 +287.79% M 63.03 241.94 +283.85% HT 59.91 144.22 +140.73% VT 59.4 174.39 +193.59% R 53.6 111.37 +107.78% RT 37.99 46.38 +22.08% Kops/s 436 506 +16.06% cairo trimmed benchmarks : Speedups ======== t-xfce4-terminal-a1 1540.37 -> 1226.14 : 1.26x t-firefox-talos-gfx 1488.59 -> 1209.19 : 1.23x Slowdowns ========= t-evolution 553.88 -> 581.63 : 1.05x t-poppler 364.99 -> 383.79 : 1.05x t-firefox-scrolling 1223.65 -> 1304.34 : 1.07x The slowdowns can be explained by cases where the images are small and not aligned to a 16-byte boundary. In that case, the function first works on the unaligned area, even in operations as small as 1 byte. For small images, the overhead of such operations can exceed the savings we get from the vmx instructions that are used on the aligned part of the image. In the C fast-path implementation, there is no special treatment for the unaligned part, as it works in 4-byte quantities on the entire image. Because llbb (lowlevel-blt-bench) is a synthetic test, I would assume it has far fewer alignment issues than a "real-world" scenario such as the cairo benchmarks, which are basically recorded traces of real application activity. 
Signed-off-by: Oded Gabbay <oded.gab...@gmail.com> Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com> diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c index 641c487..47393dc 100644 --- a/pixman/pixman-vmx.c +++ b/pixman/pixman-vmx.c @@ -2715,6 +2715,114 @@ vmx_composite_over_8888_8888 (pixman_implementation_t *imp, } static void +vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; + uint32_t *dst_line, d; + uint32_t *mask_line, m; + uint32_t pack_cmp; + int dst_stride, mask_stride; + + vector unsigned int vsrc, valpha, vmask, vdest; + + vector unsigned int vmx_dst, vmx_dst_lo, vmx_dst_hi; + vector unsigned int vmx_mask, vmx_mask_lo, vmx_mask_hi; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + vsrc = unpacklo_128_16x8(create_mask_1x32_128 (&src), + (vector unsigned int) AVV(0)); + + valpha = expand_alpha_1x128(vsrc); + + while (height--) + { + int w = width; + const uint32_t *pm = (uint32_t *)mask_line; + uint32_t *pd = (uint32_t *)dst_line; + + dst_line += dst_stride; + mask_line += mask_stride; + + while (w && (uintptr_t)pd & 15) + { + m = *pm++; + + if (m) + { + d = *pd; + vmask = unpack_32_1x128(m); + vdest = unpack_32_1x128(d); + + *pd = pack_1x128_32(in_over (vsrc, valpha, vmask, vdest)); + } + + pd++; + w--; + } + + while (w >= 4) + { + /* pm is NOT necessarily 16-byte aligned */ + vmx_mask = load_128_unaligned (pm); + + pack_cmp = vec_all_eq(vmx_mask, (vector unsigned int) AVV(0)); + + /* if all bits in mask are zero, pack_cmp is not 0 */ + if (pack_cmp == 0) + { + /* pd is 16-byte aligned */ + vmx_dst = load_128_aligned (pd); + + unpack_128_2x128 (vmx_mask, (vector unsigned int) AVV(0), + &vmx_mask_lo, 
&vmx_mask_hi); + + unpack_128_2x128 (vmx_dst, (vector unsigned int) AVV(0), + &vmx_dst_lo, &vmx_dst_hi); + + in_over_2x128 (&vsrc, &vsrc, + &valpha, &valpha, + &vmx_mask_lo, &vmx_mask_hi, + &vmx_dst_lo, &vmx_dst_hi); + + save_128_aligned(pd, pack_2x128_128(vmx_dst_lo, vmx_dst_hi)); + } + + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + m = *pm++; + + if (m) + { + d = *pd; + vmask = unpack_32_1x128(m); + vdest = unpack_32_1x128(d); + + *pd = pack_1x128_32(in_over (vsrc, valpha, vmask, vdest)); + } + + pd++; + w--; + } + } +} + +static void vmx_composite_add_8_8 (pixman_implementation_t *imp, pixman_composite_info_t *info) { @@ -2796,6 +2904,10 @@ static const pixman_fast_path_t vmx_fast_paths[] = PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, vmx_composite_over_n_8888_8888_ca), /* PIXMAN_OP_ADD */ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8), commit a3e914407e354df70b9200e263608f1fc2e686cf Author: Oded Gabbay <oded.gab...@gmail.com> Date: Thu Jun 18 15:05:49 2015 +0300 vmx: implement fast path composite_add_8888_8888 Copied impl. 
from sse2 file and edited to use vmx functions It was benchmarked against commit id 2be523b from pixman/master POWER8, 16 cores, 3.4GHz, ppc64le : reference memcpy speed = 27036.4MB/s (6759.1MP/s for 32bpp fills) Before After Change --------------------------------------------- L1 248.76 3284.48 +1220.34% L2 264.09 2826.47 +970.27% M 261.24 2405.06 +820.63% HT 217.27 857.3 +294.58% VT 213.78 980.09 +358.46% R 176.61 442.95 +150.81% RT 107.54 150.08 +39.56% Kops/s 917 1125 +22.68% Signed-off-by: Oded Gabbay <oded.gab...@gmail.com> Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com> diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c index e49e8aa..641c487 100644 --- a/pixman/pixman-vmx.c +++ b/pixman/pixman-vmx.c @@ -2765,6 +2765,31 @@ vmx_composite_add_8_8 (pixman_implementation_t *imp, } } +static void +vmx_composite_add_8888_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + + vmx_combine_add_u (imp, op, dst, src, NULL, width); + } +} + static const pixman_fast_path_t vmx_fast_paths[] = { PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888), @@ -2774,6 +2799,8 @@ static const pixman_fast_path_t vmx_fast_paths[] = /* PIXMAN_OP_ADD */ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8), + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888), { PIXMAN_OP_NONE }, }; commit d5b5343c7df99082597e0c37aec937dcf5b6602d Author: Oded Gabbay <oded.gab...@gmail.com> Date: Thu Jun 18 
14:56:47 2015 +0300 vmx: implement fast path composite_add_8_8 Copied impl. from sse2 file and edited to use vmx functions It was benchmarked against commit id 2be523b from pixman/master POWER8, 16 cores, 3.4GHz, ppc64le : reference memcpy speed = 27036.4MB/s (6759.1MP/s for 32bpp fills) Before After Change --------------------------------------------- L1 687.63 9140.84 +1229.33% L2 715 7495.78 +948.36% M 717.39 8460.14 +1079.29% HT 569.56 1020.12 +79.11% VT 520.3 1215.56 +133.63% R 514.81 874.35 +69.84% RT 341.28 305.42 -10.51% Kops/s 1621 1579 -2.59% Signed-off-by: Oded Gabbay <oded.gab...@gmail.com> Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com> diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c index 9eae31c..e49e8aa 100644 --- a/pixman/pixman-vmx.c +++ b/pixman/pixman-vmx.c @@ -2714,12 +2714,67 @@ vmx_composite_over_8888_8888 (pixman_implementation_t *imp, } } +static void +vmx_composite_add_8_8 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + uint16_t t; + + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + src = src_line; + + dst_line += dst_stride; + src_line += src_stride; + w = width; + + /* Small head */ + while (w && (uintptr_t)dst & 3) + { + t = (*dst) + (*src++); + *dst++ = t | (0 - (t >> 8)); + w--; + } + + vmx_combine_add_u (imp, op, + (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); + + /* Small tail */ + dst += w & 0xfffc; + src += w & 0xfffc; + + w &= 3; + + while (w) + { + t = (*dst) + (*src++); + *dst++ = t | (0 - (t >> 8)); + w--; + } + } +} + static const pixman_fast_path_t vmx_fast_paths[] = { PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888), PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, 
null, x8r8g8b8, vmx_composite_over_8888_8888), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888), PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888), + + /* PIXMAN_OP_ADD */ + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8), + { PIXMAN_OP_NONE }, }; commit 339eeaf095f949694d7f79a45171ac03a3b06f90 Author: Oded Gabbay <oded.gab...@gmail.com> Date: Thu Jun 18 14:12:05 2015 +0300 vmx: implement fast path composite_over_8888_8888 Copied impl. from sse2 file and edited to use vmx functions It was benchmarked against commit id 2be523b from pixman/master POWER8, 16 cores, 3.4GHz, ppc64le : reference memcpy speed = 27036.4MB/s (6759.1MP/s for 32bpp fills) Before After Change --------------------------------------------- L1 129.47 1054.62 +714.57% L2 138.31 1011.02 +630.98% M 139.99 1008.65 +620.52% HT 122.11 468.45 +283.63% VT 121.06 532.21 +339.62% R 108.48 240.5 +121.70% RT 77.87 116.7 +49.87% Kops/s 758 981 +29.42% Signed-off-by: Oded Gabbay <oded.gab...@gmail.com> Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com> diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c index 61fdb80..9eae31c 100644 --- a/pixman/pixman-vmx.c +++ b/pixman/pixman-vmx.c @@ -2688,8 +2688,38 @@ vmx_fill (pixman_implementation_t *imp, return TRUE; } +static void +vmx_composite_over_8888_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + int dst_stride, src_stride; + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + dst = dst_line; + src = src_line; + + while (height--) + { + vmx_combine_over_u (imp, op, dst, src, NULL, width); + + dst += dst_stride; + src += src_stride; + } +} + static const pixman_fast_path_t vmx_fast_paths[] = { + PIXMAN_STD_FAST_PATH (OVER, 
a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888), { PIXMAN_OP_NONE }, }; commit 0cc8a2e9714efcb7cdd7e2a94c9cba49c3e29e00 Author: Oded Gabbay <oded.gab...@gmail.com> Date: Sun Jun 28 09:42:19 2015 +0300 vmx: implement fast path vmx_fill Based on sse2 impl. It was benchmarked against commid id e2d211a from pixman/master Tested cairo trimmed benchmarks on POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le : speedups ======== t-swfdec-giant-steps 1383.09 -> 718.63 : 1.92x speedup t-gnome-system-monitor 1403.53 -> 918.77 : 1.53x speedup t-evolution 552.34 -> 415.24 : 1.33x speedup t-xfce4-terminal-a1 1573.97 -> 1351.46 : 1.16x speedup t-firefox-paintball 847.87 -> 734.50 : 1.15x speedup t-firefox-asteroids 565.99 -> 492.77 : 1.15x speedup t-firefox-canvas-swscroll 1656.87 -> 1447.48 : 1.14x speedup t-midori-zoomed 724.73 -> 642.16 : 1.13x speedup t-firefox-planet-gnome 975.78 -> 911.92 : 1.07x speedup t-chromium-tabs 292.12 -> 274.74 : 1.06x speedup t-firefox-chalkboard 690.78 -> 653.93 : 1.06x speedup t-firefox-talos-gfx 1375.30 -> 1303.74 : 1.05x speedup t-firefox-canvas-alpha 1016.79 -> 967.24 : 1.05x speedup Signed-off-by: Oded Gabbay <oded.gab...@gmail.com> Acked-by: Siarhei Siamashka <siarhei.siamas...@gmail.com> diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c index 39d1a06..61fdb80 100644 --- a/pixman/pixman-vmx.c +++ b/pixman/pixman-vmx.c @@ -2537,6 +2537,157 @@ vmx_combine_add_ca (pixman_implementation_t *imp, } } +static pixman_bool_t +vmx_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t filler) +{ + uint32_t byte_width; + uint8_t *byte_line; + + vector unsigned int vfiller; + + if (bpp == 8) + { + uint8_t b; + 
uint16_t w; + + stride = stride * (int) sizeof (uint32_t) / 1; + byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); + byte_width = width;