Once upon a time this improved firefox-tron on an IVB i7-3720qm from 68.6s to 45.2s. However, we have since accomplished the same goal with earlier tuning...
Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
---
 pixman/pixman-sse2.c |  121 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 118 insertions(+), 3 deletions(-)

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 9558e9c..e680c07 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5679,6 +5679,68 @@ FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
 			       NORMAL, FLAG_NONE)
 
+/*
+ * Bilinear scanline for an x888 source stored as 8888: every interpolated
+ * pixel is written with its alpha byte forced to 0xff.  Arguments not named
+ * in the body are presumably consumed by the BILINEAR_* macros, or unused.
+ */
+static force_inline void
+scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t *       dst,
+					     const uint32_t * mask,
+					     const uint32_t * src_top,
+					     const uint32_t * src_bottom,
+					     int32_t          w,
+					     int              wt,
+					     int              wb,
+					     pixman_fixed_t   vx,
+					     pixman_fixed_t   unit_x,
+					     pixman_fixed_t   max_vx,
+					     pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2, pix3, pix4;
+
+    while ((w -= 4) >= 0)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+	*dst++ = pix1 | 0xff000000;
+	*dst++ = pix2 | 0xff000000;
+	*dst++ = pix3 | 0xff000000;
+	*dst++ = pix4 | 0xff000000;
+    }
+
+    /* w is now in [-4, -1]; its low two bits are the 0-3 leftover pixels. */
+    if (w & 2)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+	*dst++ = pix1 | 0xff000000;
+	*dst++ = pix2 | 0xff000000;
+    }
+
+    if (w & 1)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	*dst = pix1 | 0xff000000;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC,
+			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC,
+			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC,
+			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_NONE)
+
 static force_inline void
 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
 					      const uint32_t * mask,
 					      const uint32_t * src_top,
@@ -6186,6 +6248,13 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
 
+    SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
+
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
@@ -6323,12 +6392,18 @@ sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
 	w -= 16;
     }
 
-    while (w)
+    while (w >= 2)
     {
-	*dst++ = *(src++) << 24;
-	w--;
+	/* Two 32-bit stores per iteration; cast first so << 24 is unsigned. */
+	*dst++ = (uint32_t)src[0] << 24;
+	*dst++ = (uint32_t)src[1] << 24;
+	src += 2;
+	w -= 2;
     }
 
+    if (w)
+	*dst = (uint32_t)*src << 24;
+
     return iter->buffer;
 }
 
@@ -6605,6 +6680,26 @@ sse2_fetch_bilinear (pixman_iter_t *iter,
 	x_bottom = x_top = x += ux * width;
 	mask += mask_inc * width;
     }
+    else if (format == PIXMAN_x8r8g8b8 && ux_top == ux && ux_bottom == ux && x < w)
+    {
+	int width;
+
+	width = end - buffer;
+	if (width * ux + x > w)
+	    width = (w - x + ux - 1) / ux;
+
+	scaled_bilinear_scanline_sse2_x888_8888_SRC (buffer, NULL,
+						     (uint32_t *)top_row,
+						     (uint32_t *)bottom_row,
+						     width,
+						     BILINEAR_INTERPOLATION_RANGE - disty, disty,
+						     x, ux,
+						     0, 0);
+
+	buffer += width;
+	x_bottom = x_top = x += ux * width;
+	mask += mask_inc * width;
+    }
     else
     {
 	while (buffer < end && x < w)
@@ -6722,6 +6817,26 @@ sse2_fetch_bilinear (pixman_iter_t *iter,
 	x += ux * width;
 	mask += mask_inc * width;
     }
+    else if (format == PIXMAN_x8r8g8b8 && x < w)
+    {
+	int width;
+
+	width = end - buffer;
+	if (width * ux > w - x)
+	    width = (w - x + ux - 1) / ux;
+
+	scaled_bilinear_scanline_sse2_x888_8888_SRC (buffer, NULL,
+						     (uint32_t *)top_row,
+						     (uint32_t *)bottom_row,
+						     width,
+						     BILINEAR_INTERPOLATION_RANGE - disty, disty,
+						     x, ux,
+						     0, 0);
+
+	buffer += width;
+	x += ux * width;
+	mask += mask_inc * width;
+    }
     else
     {
 	while (buffer < end && x < w)
-- 
1.7.10.4

_______________________________________________
Pixman mailing list
Pixman@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/pixman