lowlevel-blt-bench, over_reverse_n_8888, 100 iterations:

           Before            After
       Mean    StdDev    Mean    StdDev   Confidence   Change
L1    2372.6    2.50    4387.6    8.00     100.00%     +84.9%
L2    2490.3    5.29    4326.5   20.79     100.00%     +73.7%
M     2418.3   10.43    3718.0   38.55     100.00%     +53.7%
HT    1555.8   13.35    2112.9   23.85     100.00%     +35.8%
VT    1120.1    9.58    1403.7   15.43     100.00%     +25.3%
R      958.5   17.66    1176.9   20.87     100.00%     +22.8%
RT     407.3    6.79     450.1    7.22     100.00%     +10.5%
At most 18 outliers rejected per test per set.

cairo-perf-trace with trimmed traces, 30 iterations:

            Before            After
        Mean   StdDev    Mean   StdDev   Confidence   Change
poppler 0.516   0.003    0.478   0.002    100.000%     +8.1%

Cairo perf reports the running time, but the change is computed for
operations per second instead (inverse of running time).
---
 pixman/pixman-avx2.c | 94 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

diff --git a/pixman/pixman-avx2.c b/pixman/pixman-avx2.c
index faef552..6a67515 100644
--- a/pixman/pixman-avx2.c
+++ b/pixman/pixman-avx2.c
@@ -28,6 +28,18 @@ negate_2x256 (__m256i data_lo,
     *neg_hi = _mm256_xor_si256 (data_hi, MASK_00FF_AVX2);
 }
 
+static force_inline __m256i
+unpack_32_1x256 (uint32_t data)
+{
+    return _mm256_unpacklo_epi8 (_mm256_broadcastd_epi32 (_mm_cvtsi32_si128 (data)), _mm256_setzero_si256 ());
+}
+
+static force_inline __m256i
+expand_pixel_32_1x256 (uint32_t data)
+{
+    return _mm256_shuffle_epi32 (unpack_32_1x256 (data), _MM_SHUFFLE (1, 0, 1, 0));
+}
+
 static force_inline __m256i
 pack_2x256_256 (__m256i lo, __m256i hi)
 {
@@ -100,6 +112,13 @@ save_256_aligned (__m256i* dst,
     _mm256_store_si256 (dst, data);
 }
 
+static force_inline void
+save_256_unaligned (__m256i* dst,
+                    __m256i data)
+{
+    _mm256_storeu_si256 (dst, data);
+}
+
 static force_inline int
 is_opaque_256 (__m256i x)
 {
@@ -429,12 +448,87 @@ avx2_composite_over_8888_8888 (pixman_implementation_t *imp,
 	src += src_stride;
     }
 }
+
+static void
+avx2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t *dst_line, *dst;
+    __m256i ymm_src;
+    __m256i ymm_dst, ymm_dst_lo, ymm_dst_hi;
+    __m256i ymm_dsta_hi, ymm_dsta_lo;
+    int dst_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    ymm_src = expand_pixel_32_1x256 (src);
+
+    while (height--)
+    {
+	dst = dst_line;
+
+	dst_line += dst_stride;
+	w = width;
+
+	while (w >= 8)
+	{
+	    __m256i tmp_lo, tmp_hi;
+
+	    ymm_dst = load_256_unaligned ((__m256i*)dst);
+
+	    unpack_256_2x256 (ymm_dst, &ymm_dst_lo, &ymm_dst_hi);
+	    expand_alpha_2x256 (ymm_dst_lo, ymm_dst_hi, &ymm_dsta_lo, &ymm_dsta_hi);
+
+	    tmp_lo = ymm_src;
+	    tmp_hi = ymm_src;
+
+	    over_2x256 (&ymm_dst_lo, &ymm_dst_hi,
+			&ymm_dsta_lo, &ymm_dsta_hi,
+			&tmp_lo, &tmp_hi);
+
+	    save_256_unaligned (
+		(__m256i*)dst, pack_2x256_256 (tmp_lo, tmp_hi));
+
+	    w -= 8;
+	    dst += 8;
+	}
+
+	while (w)
+	{
+	    __m128i vd;
+
+	    vd = unpack_32_1x128 (*dst);
+
+	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+					      _mm256_castsi256_si128 (ymm_src)));
+	    w--;
+	    dst++;
+	}
+
+    }
+
+}
+
 static const pixman_fast_path_t avx2_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, avx2_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, avx2_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, avx2_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, avx2_composite_over_8888_8888),
+
+    /* PIXMAN_OP_OVER_REVERSE */
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, avx2_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, avx2_composite_over_reverse_n_8888),
+
     { PIXMAN_OP_NONE },
 };
-- 
2.19.2
_______________________________________________
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman