This path is being exercised by compositing of trapezoids, for instance as used in the firefox-asteroids cairo-trace (and appears to be the last non-SSE2 kernel).
core2 @ 2.66GHz, reference memcpy speed = 4898.2MB/s (1224.6MP/s for 32bpp fills) before: add_n_8_8888 = L1: 4.53 L2: 4.39 M: 1.62 ( 0.20%) HT: 1.66 VT: 1.63 R: 1.62 RT: 1.55 ( 21Kops/s) after: add_n_8_8888 = L1: 531.03 L2: 531.91 M:446.21 ( 54.56%) HT: 298.44 VT:258.56 R:161.87 RT: 63.80 ( 441Kops/s) firefox-asteroids (xvfb): 6.108s -> 5.943s Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> --- pixman/pixman-sse2.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c index 8ef6afb..589a100 100644 --- a/pixman/pixman-sse2.c +++ b/pixman/pixman-sse2.c @@ -4586,6 +4586,101 @@ sse2_composite_add_n_8888 (pixman_implementation_t *imp, } } +static void +sse2_composite_add_n_8_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t src; + + __m128i xmm_src; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + if (src == 0) + return; + xmm_src = expand_pixel_32_1x128 (src); + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w && ((uintptr_t)dst & 15)) + { + uint8_t m = *mask++; + if (m) + { + *dst = pack_1x128_32 + (_mm_adds_epu16 + (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), + unpack_32_1x128 (*dst))); + } + dst++; + w--; + } + + while (w >= 4) + { + uint32_t m = *(uint32_t*)mask; + if (m) + { + __m128i xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + __m128i xmm_dst = load_128_aligned ((__m128i*)dst); + __m128i xmm_mask = + _mm_unpacklo_epi8 (unpack_32_1x128(m), + _mm_setzero_si128 ()); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_src, &xmm_src, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); + xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + w -= 4; + dst += 4; + mask += 4; + } + + while (w) + { + uint8_t m = *mask++; + if (m) + { + *dst = pack_1x128_32 + (_mm_adds_epu16 + (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), + unpack_32_1x128 (*dst))); + } + dst++; + w--; + } + } +} static pixman_bool_t sse2_blt (pixman_implementation_t *imp, @@ -5911,6 +6006,8 @@ static const pixman_fast_path_t sse2_fast_paths[] = PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8), PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888), PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888), /* PIXMAN_OP_SRC */ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888), -- 1.7.10.4 _______________________________________________ Pixman mailing list Pixman@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/pixman