The generic C over_u combiner can be a lot faster with the addition of special shortcuts for 0xFF and 0x00 alpha/mask values. This is already implemented in C and SSE2 fast paths.
Profiling the run of cairo-perf-trace benchmarks with PIXMAN_DISABLE environment variable set to "fast mmx sse2" on Intel Core i7: === before === 37.32% cairo-perf-trac libpixman-1.so.0.29.1 [.] combine_over_u 21.37% cairo-perf-trac libpixman-1.so.0.29.1 [.] bits_image_fetch_bilinear_no_repeat_8888 13.51% cairo-perf-trac libpixman-1.so.0.29.1 [.] bits_image_fetch_bilinear_affine_none_a8r8g8b8 2.96% cairo-perf-trac libpixman-1.so.0.29.1 [.] radial_compute_color 2.74% cairo-perf-trac libpixman-1.so.0.29.1 [.] fetch_scanline_a8 2.71% cairo-perf-trac libpixman-1.so.0.29.1 [.] fetch_scanline_x8r8g8b8 2.17% cairo-perf-trac libpixman-1.so.0.29.1 [.] _pixman_gradient_walker_pixel 1.86% cairo-perf-trac libcairo.so.2.11200.0 [.] _cairo_tor_scan_converter_generate 1.57% cairo-perf-trac libpixman-1.so.0.29.1 [.] bits_image_fetch_bilinear_affine_pad_a8r8g8b8 0.97% cairo-perf-trac libpixman-1.so.0.29.1 [.] combine_in_reverse_u 0.96% cairo-perf-trac libpixman-1.so.0.29.1 [.] combine_over_ca === after === 28.79% cairo-perf-trac libpixman-1.so.0.29.1 [.] bits_image_fetch_bilinear_no_repeat_8888 18.44% cairo-perf-trac libpixman-1.so.0.29.1 [.] bits_image_fetch_bilinear_affine_none_a8r8g8b8 15.54% cairo-perf-trac libpixman-1.so.0.29.1 [.] combine_over_u 3.94% cairo-perf-trac libpixman-1.so.0.29.1 [.] radial_compute_color 3.69% cairo-perf-trac libpixman-1.so.0.29.1 [.] fetch_scanline_a8 3.69% cairo-perf-trac libpixman-1.so.0.29.1 [.] fetch_scanline_x8r8g8b8 2.94% cairo-perf-trac libpixman-1.so.0.29.1 [.] _pixman_gradient_walker_pixel 2.52% cairo-perf-trac libcairo.so.2.11200.0 [.] _cairo_tor_scan_converter_generate 2.08% cairo-perf-trac libpixman-1.so.0.29.1 [.] bits_image_fetch_bilinear_affine_pad_a8r8g8b8 1.31% cairo-perf-trac libpixman-1.so.0.29.1 [.] combine_in_reverse_u 1.29% cairo-perf-trac libpixman-1.so.0.29.1 [.] combine_over_ca --- pixman/pixman-combine32.c | 58 +++++++++++++++++++++++++++++++++++++++----- 1 files changed, 51 insertions(+), 7 deletions(-) diff --git a/pixman/pixman-combine32.c b/pixman/pixman-combine32.c index 54cc877..3ac7576 100644 --- a/pixman/pixman-combine32.c +++ b/pixman/pixman-combine32.c @@ -196,14 +196,58 @@ combine_over_u (pixman_implementation_t *imp, { int i; - for (i = 0; i < width; ++i) + if (!mask) { - uint32_t s = combine_mask (src, mask, i); - uint32_t d = *(dest + i); - uint32_t ia = ALPHA_8 (~s); - - UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); - *(dest + i) = d; + for (i = 0; i < width; ++i) + { + uint32_t s = *(src + i); + uint32_t a = ALPHA_8 (s); + if (a == 0xFF) + { + *(dest + i) = s; + } + else if (s) + { + uint32_t d = *(dest + i); + uint32_t ia = a ^ 0xFF; + UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); + *(dest + i) = d; + } + } + } + else + { + for (i = 0; i < width; ++i) + { + uint32_t m = ALPHA_8 (*(mask + i)); + if (m == 0xFF) + { + uint32_t s = *(src + i); + uint32_t a = ALPHA_8 (s); + if (a == 0xFF) + { + *(dest + i) = s; + } + else if (s) + { + uint32_t d = *(dest + i); + uint32_t ia = a ^ 0xFF; + UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); + *(dest + i) = d; + } + } + else if (m) + { + uint32_t s = *(src + i); + if (s) + { + uint32_t d = *(dest + i); + UN8x4_MUL_UN8 (s, m); + UN8x4_MUL_UN8_ADD_UN8x4 (d, ALPHA_8 (~s), s); + *(dest + i) = d; + } + } + } } } -- 1.7.8.6 _______________________________________________ Pixman mailing list Pixman@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/pixman