I started porting my src_8888_0565 MMX function to SSE2, and in the
process started thinking about using SSE3+. The useful instructions
added post-SSE2 that I see are:

SSE3:
    lddqu     - for unaligned loads across cache lines
SSSE3:
    palignr   - for unaligned loads (but requires software pipelining...)
    pmaddubsw - maybe?
SSE4.1:
    pextr*, pinsr*
    pcmpeqq, ptest
    packusdw  - for 888 -> 565 packing

A rough sketch of how the two unaligned-load instructions would be used
follows.
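For concreteness, here is a minimal sketch of the two unaligned-load
approaches as intrinsics. This is illustration only, not part of the
patch: the function names are made up, and the palignr variant assumes a
16-byte-aligned base pointer and a fixed 4-byte offset. A real loop
would need software pipelining (one aligned load per iteration, reusing
the previous chunk), since the palignr shift count must be a
compile-time constant.

#include <pmmintrin.h>  /* SSE3: _mm_lddqu_si128 (compile with -msse3) */
#include <tmmintrin.h>  /* SSSE3: _mm_alignr_epi8 (compile with -mssse3) */

/* lddqu: like movdqu, but designed to avoid the cache-line-split
 * penalty that movdqu pays on P4-era hardware. */
static __m128i
load16_unaligned_sse3 (const uint32_t *src)
{
    return _mm_lddqu_si128 ((const __m128i *)src);
}

/* palignr: do two aligned loads and extract the 16 unaligned bytes
 * spanning them; here bytes 4..19 of the 32-byte pair. */
static __m128i
load16_offset4_ssse3 (const __m128i *aligned_base)
{
    __m128i lo = _mm_load_si128 (aligned_base + 0);
    __m128i hi = _mm_load_si128 (aligned_base + 1);

    return _mm_alignr_epi8 (hi, lo, 4);
}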
I first wrote a basic src_8888_0565 for SSE2 and discovered that its
performance was worse than MMX's (and we've been saying MMX has no use
on modern systems -- oops!). I figured the cool pmadd algorithm of MMX
was the cause, but I also wondered whether 16-byte SSE chunks are
sometimes too large. I added an 8-byte MMX loop before and after the
main 16-byte SSE loop and got a nice improvement. Porting the pmadd
algorithm to SSE4.1 gave another (very large) improvement.

fast:    src_8888_0565 = L1:  655.18  L2:  675.94  M: 642.31 ( 23.44%)  HT:403.00  VT:286.45  R:307.61  RT:150.59 (1675Kops/s)
mmx:     src_8888_0565 = L1: 2050.45  L2: 1988.97  M:1586.16 ( 57.34%)  HT:529.12  VT:374.28  R:412.09  RT:177.35 (1913Kops/s)
sse2:    src_8888_0565 = L1: 1518.61  L2: 1493.10  M:1279.18 ( 46.24%)  HT:433.65  VT:314.48  R:349.14  RT:151.84 (1685Kops/s)
sse2mmx: src_8888_0565 = L1: 1544.91  L2: 1520.83  M:1307.79 ( 47.01%)  HT:447.82  VT:326.81  R:379.60  RT:174.07 (1878Kops/s)
sse4:    src_8888_0565 = L1: 4654.11  L2: 4202.98  M:1885.01 ( 69.35%)  HT:540.65  VT:421.04  R:427.73  RT:161.45 (1773Kops/s)
sse4mmx: src_8888_0565 = L1: 4786.27  L2: 4255.13  M:1920.18 ( 69.93%)  HT:581.42  VT:447.99  R:482.27  RT:193.15 (2049Kops/s)

Before declaring SSE4.1 a fantastic improvement, I'd like to isolate
exactly how much of the gain comes from the single SSE4.1 instruction
used (_mm_packus_epi32). If you can come up with a reasonable SSE2-only
way to pack the two xmm registers together in pack_565_2packedx128_128,
please tell me. The SSE2 word shuffles (pshuflw/pshufhw) only work
within the low/high 8 bytes, so it'd be a pain. (One candidate is
sketched below.)

This got me wondering how to proceed. I'd rather not duplicate a bunch
of code from pixman-mmx.c, and I'd rather not add #ifdef USE_SSE41 to
pixman-sse2.c, making it a compile-time option (or recompiling the whole
file just to pick up a few SSE4.1 improvements). It seems like we need a
generic solution that says, for each compositing function:

    - this is what you do for 1 byte;
    - this is what you do for 8 bytes if you have MMX;
    - this is what you do for 16 bytes if you have SSE2;
    - this is what you do for 16 bytes if you have SSE3;
    - this is what you do for 16 bytes if you have SSE4.1;

and then constructs the functions for generic/MMX/SSE2/SSE4 at build
time. Does this seem like a reasonable approach? *How* to do it --
suggestions welcome; a rough sketch of one possibility follows the pack
sketch below.
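On the pack_565_2packedx128_128 question, here is one untested SSE2-only
candidate, written against the masks the patch adds to pixman-sse2.c,
offered as a sketch rather than a known-good answer. It relies on each
32-bit lane holding its 565 result in bits 5..20 (which the same madd/or
steps used in the patch guarantee): shift those 16 bits to the top of
the lane, arithmetic-shift them back down so every lane becomes the sign
extension of its low 16 bits, and then the signed saturating pack can
never clamp, so _mm_packs_epi32 produces exactly the bits
_mm_packus_epi32 would.

static force_inline __m128i
pack_565_2packedx128_128_sse2 (__m128i lo, __m128i hi)
{
    /* Same head as the SSE4.1 version in the patch below. */
    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);

    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);

    __m128i g0 = _mm_and_si128 (lo, mask_green);
    __m128i g1 = _mm_and_si128 (hi, mask_green);

    t0 = _mm_or_si128 (t0, g0);
    t1 = _mm_or_si128 (t1, g1);

    /* Each 32-bit lane now holds a 565 pixel in bits 5..20.  Move it to
     * bits 16..31, then arithmetic-shift down so each lane is the sign
     * extension of its low 16 bits... */
    t0 = _mm_slli_epi32 (t0, 16 - 5);
    t1 = _mm_slli_epi32 (t1, 16 - 5);
    t0 = _mm_srai_epi32 (t0, 16);
    t1 = _mm_srai_epi32 (t1, 16);

    /* ...so the signed pack cannot saturate and matches packusdw. */
    return _mm_packs_epi32 (t0, t1);
}

This costs two extra shifts compared with the SSE4.1 tail, so packusdw
may still win, but it would let the SSE2 path use the same pmadd
algorithm for an apples-to-apples comparison.

On the build-time question, a rough sketch of one possible shape --
nothing here is existing pixman machinery; PIXMAN_ISA, ISA_FUNC, and the
file name are all hypothetical. Compile one template file once per ISA
level and paste an ISA suffix onto every symbol, so each object carries
its own copy of each fast path and the existing runtime CPU detection
chooses which set of function pointers to install:

/* Built as, e.g.:
 *   cc -mmmx    -DPIXMAN_ISA=mmx   -c pixman-isa-paths.c -o paths-mmx.o
 *   cc -msse2   -DPIXMAN_ISA=sse2  -c pixman-isa-paths.c -o paths-sse2.o
 *   cc -msse4.1 -DPIXMAN_ISA=sse41 -c pixman-isa-paths.c -o paths-sse41.o
 */
#include "pixman-private.h"

#define ISA_PASTE2(name, isa) name ## _ ## isa
#define ISA_PASTE(name, isa)  ISA_PASTE2 (name, isa)
#define ISA_FUNC(name)        ISA_PASTE (name, PIXMAN_ISA)

static void
ISA_FUNC (composite_src_x888_0565) (pixman_implementation_t *imp,
                                    pixman_composite_info_t *info)
{
    /* Schematic body only: 1-byte leading loop; 8-byte loop where the
     * ISA level has MMX; 16-byte main loop using the widest pack this
     * level offers; trailing 8-byte and 1-byte loops. */
}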
---
 pixman/pixman-sse2.c |  152 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 152 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index e217ca3..763c7b3 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -30,8 +30,12 @@
 #include <config.h>
 #endif
 
+#include <mmintrin.h>
 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
 #include <emmintrin.h> /* for SSE2 intrinsics */
+#if USE_SSE41
+#include <smmintrin.h>
+#endif
 #include "pixman-private.h"
 #include "pixman-combine32.h"
 #include "pixman-inlines.h"
@@ -53,6 +57,9 @@ static __m128i mask_blue;
 static __m128i mask_565_fix_rb;
 static __m128i mask_565_fix_g;
 
+static __m128i mask_565_rb;
+static __m128i mask_565_pack_multiplier;
+
 static force_inline __m128i
 unpack_32_1x128 (uint32_t data)
 {
@@ -120,7 +127,59 @@ pack_2x128_128 (__m128i lo, __m128i hi)
     return _mm_packus_epi16 (lo, hi);
 }
 
+#if USE_X86_MMX
+#define MC(x) ((__m64)mmx_ ## x)
+
+static force_inline __m64
+pack_4xpacked565 (__m64 a, __m64 b)
+{
+    static const uint64_t mmx_565_pack_multiplier = 0x2000000420000004ULL;
+    static const uint64_t mmx_packed_565_rb = 0x00f800f800f800f8ULL;
+    static const uint64_t mmx_packed_565_g = 0x0000fc000000fc00ULL;
+
+    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
+    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
+
+    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
+    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
+
+    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
+    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
+
+    t0 = _mm_or_si64 (t0, g0);
+    t1 = _mm_or_si64 (t1, g1);
+
+    t0 = _mm_srli_si64 (t0, 5);
+    t1 = _mm_slli_si64 (t1, 11);
+    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
+}
+#endif
+
+#ifdef USE_SSE41
 static force_inline __m128i
+pack_565_2packedx128_128 (__m128i lo, __m128i hi)
+{
+    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
+    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
+
+    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
+    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
+
+    __m128i g0 = _mm_and_si128 (lo, mask_green);
+    __m128i g1 = _mm_and_si128 (hi, mask_green);
+
+    t0 = _mm_or_si128 (t0, g0);
+    t1 = _mm_or_si128 (t1, g1);
+
+    t0 = _mm_srli_epi32 (t0, 5);
+    t1 = _mm_srli_epi32 (t1, 5);
+
+    /* XXX: maybe there's a way to do this relatively efficiently with SSE2? */
+    return _mm_packus_epi32 (t0, t1);
+}
+#endif
+
+static force_inline __m128i
 pack_565_2x128_128 (__m128i lo, __m128i hi)
 {
     __m128i data;
@@ -2832,6 +2891,93 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
 }
 
 static void
+sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t *dst_line, *dst;
+    uint32_t *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+        dst = dst_line;
+        dst_line += dst_stride;
+        src = src_line;
+        src_line += src_stride;
+        w = width;
+
+        while (w && (unsigned long)dst & 7)
+        {
+            s = *src++;
+            *dst = CONVERT_8888_TO_0565 (s);
+            dst++;
+            w--;
+        }
+
+#if USE_X86_MMX
+        while (w >= 4 && (unsigned long)dst & 15)
+        {
+            __m64 vsrc0 = *(__m64 *)(src + 0);
+            __m64 vsrc1 = *(__m64 *)(src + 2);
+
+            *(__m64 *)dst = pack_4xpacked565 (vsrc0, vsrc1);
+
+            w -= 4;
+            src += 4;
+            dst += 4;
+        }
+#endif
+
+        while (w >= 8)
+        {
+            __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
+            __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
+
+#if USE_SSE41
+            save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
+#else
+            __m128i xmm_src0_lo, xmm_src0_hi, xmm_src1_lo, xmm_src1_hi;
+            unpack_128_2x128 (xmm_src0, &xmm_src0_lo, &xmm_src0_hi);
+            unpack_128_2x128 (xmm_src1, &xmm_src1_lo, &xmm_src1_hi);
+
+            save_128_aligned ((__m128i*)dst, pack_565_4x128_128 (&xmm_src0_lo, &xmm_src0_hi, &xmm_src1_lo, &xmm_src1_hi));
+#endif
+
+            w -= 8;
+            src += 8;
+            dst += 8;
+        }
+
+#if USE_X86_MMX
+        while (w >= 4)
+        {
+            __m64 vsrc0 = *(__m64 *)(src + 0);
+            __m64 vsrc1 = *(__m64 *)(src + 2);
+
+            *(__m64 *)dst = pack_4xpacked565 (vsrc0, vsrc1);
+
+            w -= 4;
+            src += 4;
+            dst += 4;
+        }
+#endif
+
+        while (w)
+        {
+            s = *src++;
+            *dst = CONVERT_8888_TO_0565 (s);
+            dst++;
+            w--;
+        }
+    }
+}
+
+static void
 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
                               pixman_composite_info_t *info)
 {
@@ -5727,6 +5873,10 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
@@ -6035,6 +6185,8 @@ _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
     mask_ffff = create_mask_16_128 (0xffff);
     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
+    mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
+    mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
 
     /* Set up function pointers */
     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
-- 
1.7.3.4