Hi Soren,

Attached is the regenerated patch that removes the cache prefetch code. Please review it. Thank you!
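(A note on how to read the numbers below: these look like cairo-perf trace
timings, so smaller is faster, and the speedup factor appears to be simply
the old figure divided by the new one. For example, for poppler-0 on the
Atom box:

    17125.68 / 14765.36 ≈ 1.16  (about 16% faster without prefetch)

Even the smallest entries, at 1.05x-1.06x, are above the 5% mark.)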
Here are some test results. They all show that without cache prefetch,
performance improves by more than 5%.

CPU: Intel Atom [email protected]
OS: MeeGo (32 bits)

old: 0.19.5-with-cache-prefetch
new: 0.19.5-without-cache-prefetch

Speedups
========
image-rgba             poppler-0  17125.68 (17279.58 0.92%) -> 14765.36 (15926.49 3.54%):  1.16x speedup
image-rgba           ocitysmap-0   9008.25 (9040.41 7.50%) -> 8277.94 (8343.09 5.44%):  1.09x speedup
image-rgba   xfce4-terminal-a1-0  18020.76 (18230.68 0.97%) -> 16703.77 (16712.42 1.22%):  1.08x speedup
image-rgba  gnome-terminal-vim-0  25081.38 (25133.38 0.24%) -> 23407.47 (23652.98 0.54%):  1.07x speedup
image-rgba   firefox-talos-gfx-0  57916.97 (57973.20 0.11%) -> 54556.64 (54624.55 0.39%):  1.06x speedup
image-rgba firefox-planet-gnome-0 102377.47 (103496.63 0.70%) -> 96816.65 (97075.54 0.15%):  1.06x speedup
image-rgba  swfdec-giant-steps-0  12376.24 (12616.84 1.02%) -> 11705.30 (11825.20 1.06%):  1.06x speedup

Test 1:
old: 0.18.4-origin
new: 0.18.4-disable-cache-prefetch

Speedups
========
image-rgba             poppler-0  17590.75 (17597.01 1.30%) -> 15414.85 (16345.44 2.89%):  1.14x speedup
image-rgba   xfce4-terminal-a1-0  18429.77 (18471.36 1.03%) -> 16648.99 (16725.68 1.18%):  1.11x speedup
image-rgba  gnome-terminal-vim-0  25272.76 (25366.13 0.18%) -> 22993.57 (23060.22 0.52%):  1.10x speedup
image-rgba           ocitysmap-0   9079.30 (9128.50 6.54%) -> 8410.53 (8452.60 8.02%):  1.08x speedup
image-rgba  swfdec-giant-steps-0  12436.80 (12473.41 0.39%) -> 11701.41 (11760.38 0.90%):  1.06x speedup
image-rgba firefox-planet-gnome-0 103749.15 (103983.13 0.19%) -> 98310.02 (98522.71 0.35%):  1.06x speedup

Test 2:
old: 0.18.4-origin
new: 0.18.4-disable-cache-prefetch

Speedups
========
image-rgba  gnome-terminal-vim-0  25068.39 (25095.11 0.06%) -> 23302.54 (23317.50 0.56%):  1.08x speedup
image-rgba           ocitysmap-0   8984.46 (8988.04 8.44%) -> 8416.64 (8489.40 6.91%):  1.07x speedup
image-rgba   firefox-talos-gfx-0  58330.34 (58361.84 0.23%) -> 54657.10 (55473.39 1.72%):  1.07x speedup
image-rgba             poppler-0  17265.62 (17437.80 0.84%) -> 16257.05 (16341.86 0.75%):  1.06x speedup
image-rgba firefox-planet-gnome-0 104102.19 (104150.67 0.12%) -> 98324.84 (98402.02 0.32%):  1.06x speedup

CPU: Intel Core(TM)2 Duo CPU [email protected]
OS: Ubuntu 10.04 (64bits)

old: 0.19.5-with-cache-prefetch
new: 0.19.5-without-cache-prefetch

Speedups
========
image-rgba           ocitysmap-0   2671.46 (2691.82 8.55%) -> 2296.20 (2307.26 5.77%):  1.16x speedup
image-rgba  swfdec-giant-steps-0   1614.55 (1615.18 1.68%) -> 1532.84 (1538.52 0.72%):  1.05x speedup

Regards,
Liu, Xinyun

> Liu Xinyun <[email protected]> writes:
>
> > OK. here is the work to clear all cache prefetch. Please review
> > it. 3x
>
> This patch looks good to me. If you can resend it with a better commit
> message, including the benchmark results, I'll apply it.
>
>
> Thanks,
> Soren
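For context before the patch itself: the helpers being removed are thin
wrappers around the SSE prefetch hint, and the bulk of the diff simply
deletes their call sites in the SSE2 fast paths. Copied from the patch
(comments are mine):

    static force_inline void
    cache_prefetch (__m128i* addr)
    {
        /* ask the CPU to pull the cache line at addr into all cache levels */
        _mm_prefetch ((void const *) addr, _MM_HINT_T0);
    }

    static force_inline void
    cache_prefetch_next (__m128i* addr)
    {
        /* same, but one cache line ahead: addr + 4 x 16 bytes = 64 bytes */
        _mm_prefetch ((void const *) (addr + 4), _MM_HINT_T0);
    }

Each fast path issued these hints for its src/dst/mask pointers before and
inside the main loop; presumably the extra instructions cost more than the
prefetch gains, at least on the in-order Atom core.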
From d2954034a0bf25e9819dd7c540c9a080cd029bc3 Mon Sep 17 00:00:00 2001
From: Liu Xinyun <[email protected]>
Date: Sat, 25 Sep 2010 14:56:38 +0800
Subject: [PATCH] Remove cache prefetch code.

Performance decreases with cache prefetch enabled, especially on Atom,
so remove this code. The benchmark results are below.

old: 0.19.5-with-cache-prefetch
new: 0.19.5-without-cache-prefetch

CPU: Intel Atom [email protected]
OS: MeeGo (32 bits)

Speedups
========
image-rgba             poppler-0  17125.68 (17279.58 0.92%) -> 14765.36 (15926.49 3.54%):  1.16x speedup
image-rgba           ocitysmap-0   9008.25 (9040.41 7.50%) -> 8277.94 (8343.09 5.44%):  1.09x speedup
image-rgba   xfce4-terminal-a1-0  18020.76 (18230.68 0.97%) -> 16703.77 (16712.42 1.22%):  1.08x speedup
image-rgba  gnome-terminal-vim-0  25081.38 (25133.38 0.24%) -> 23407.47 (23652.98 0.54%):  1.07x speedup
image-rgba   firefox-talos-gfx-0  57916.97 (57973.20 0.11%) -> 54556.64 (54624.55 0.39%):  1.06x speedup
image-rgba firefox-planet-gnome-0 102377.47 (103496.63 0.70%) -> 96816.65 (97075.54 0.15%):  1.06x speedup
image-rgba  swfdec-giant-steps-0  12376.24 (12616.84 1.02%) -> 11705.30 (11825.20 1.06%):  1.06x speedup

CPU: Intel Core(TM)2 Duo CPU [email protected]
OS: Ubuntu 10.04 (64bits)

Speedups
========
image-rgba           ocitysmap-0   2671.46 (2691.82 8.55%) -> 2296.20 (2307.26 5.77%):  1.16x speedup
image-rgba  swfdec-giant-steps-0   1614.55 (1615.18 1.68%) -> 1532.84 (1538.52 0.72%):  1.05x speedup

Signed-off-by: Liu Xinyun <[email protected]>
Signed-off-by: Chen Miaobo <[email protected]>
---
 pixman/pixman-sse2.c |  659 --------------------------------------------------
 1 files changed, 0 insertions(+), 659 deletions(-)

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 33d71ee..112a8c2 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -357,34 +357,6 @@ in_over_2x128 (__m128i* src_lo,
     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
 }
 
-static force_inline void
-cache_prefetch (__m128i* addr)
-{
-    _mm_prefetch ((void const*)addr, _MM_HINT_T0);
-}
-
-static force_inline void
-cache_prefetch_next (__m128i* addr)
-{
-    _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
-}
-
-/* prefetching NULL is very slow on some systems. don't do that. */
-
-static force_inline void
-maybe_prefetch (__m128i* addr)
-{
-    if (addr)
-        cache_prefetch (addr);
-}
-
-static force_inline void
-maybe_prefetch_next (__m128i* addr)
-{
-    if (addr)
-        cache_prefetch_next (addr);
-}
-
 /* load 4 pixels from a 16-byte boundary aligned address */
 static force_inline __m128i
 load_128_aligned (__m128i* src)
@@ -649,11 +621,6 @@ core_combine_over_u_sse2 (uint32_t* pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     /* Align dst on a 16-byte boundary */
     while (w && ((unsigned long)pd & 15))
     {
@@ -667,18 +634,8 @@ core_combine_over_u_sse2 (uint32_t* pd,
         w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        maybe_prefetch_next ((__m128i*)pm);
-
         /* I'm loading unaligned because I'm not sure about
          * the address alignment.
*/ @@ -740,11 +697,6 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd, __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_alpha_lo, xmm_alpha_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - /* Align dst on a 16-byte boundary */ while (w && ((unsigned long)pd & 15)) @@ -759,18 +711,8 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - /* I'm loading unaligned because I'm not sure * about the address alignment. */ @@ -842,11 +784,6 @@ core_combine_in_u_sse2 (uint32_t* pd, __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); @@ -859,18 +796,8 @@ core_combine_in_u_sse2 (uint32_t* pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*) pd); xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm); @@ -916,11 +843,6 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd, __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); @@ -933,18 +855,8 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*) pd); xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); @@ -985,11 +897,6 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd, const uint32_t* pm, int w) { - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { uint32_t s = combine1 (ps, pm); @@ -1006,21 +913,11 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); @@ -1067,11 +964,6 @@ core_combine_out_u_sse2 (uint32_t* pd, const uint32_t* pm, int w) { - /* call prefetch hint to optimize cache load*/ - 
cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { uint32_t s = combine1 (ps, pm); @@ -1087,21 +979,11 @@ core_combine_out_u_sse2 (uint32_t* pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); @@ -1167,11 +1049,6 @@ core_combine_atop_u_sse2 (uint32_t* pd, __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); @@ -1184,18 +1061,8 @@ core_combine_atop_u_sse2 (uint32_t* pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); @@ -1264,11 +1131,6 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd, __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); @@ -1281,18 +1143,8 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); @@ -1365,11 +1217,6 @@ core_combine_xor_u_sse2 (uint32_t* dst, __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); @@ -1382,18 +1229,8 @@ core_combine_xor_u_sse2 (uint32_t* dst, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm); xmm_dst = load_128_aligned ((__m128i*) pd); @@ -1450,11 +1287,6 @@ core_combine_add_u_sse2 (uint32_t* dst, const uint32_t* ps = src; const uint32_t* pm = mask; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - 
maybe_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = combine1 (ps, pm); @@ -1468,20 +1300,10 @@ core_combine_add_u_sse2 (uint32_t* dst, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { __m128i s; - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - s = combine4 ((__m128i*)ps, (__m128i*)pm); save_128_aligned ( @@ -1536,11 +1358,6 @@ core_combine_saturate_u_sse2 (uint32_t * pd, uint32_t pack_cmp; __m128i xmm_src, xmm_dst; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = combine1 (ps, pm); @@ -1553,18 +1370,8 @@ core_combine_saturate_u_sse2 (uint32_t * pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - maybe_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - maybe_prefetch_next ((__m128i*)pm); - xmm_dst = load_128_aligned ((__m128i*)pd); xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm); @@ -1637,11 +1444,6 @@ core_combine_src_ca_sse2 (uint32_t* pd, __m128i xmm_mask_lo, xmm_mask_hi; __m128i xmm_dst_lo, xmm_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -1651,18 +1453,8 @@ core_combine_src_ca_sse2 (uint32_t* pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -1718,11 +1510,6 @@ core_combine_over_ca_sse2 (uint32_t* pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -1733,18 +1520,8 @@ core_combine_over_ca_sse2 (uint32_t* pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -1807,11 +1584,6 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -1822,18 +1594,8 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch 
((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -1885,11 +1647,6 @@ core_combine_in_ca_sse2 (uint32_t * pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -1904,18 +1661,8 @@ core_combine_in_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -1973,11 +1720,6 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -1992,18 +1734,8 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2059,11 +1791,6 @@ core_combine_out_ca_sse2 (uint32_t * pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -2078,18 +1805,8 @@ core_combine_out_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2148,11 +1865,6 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -2168,18 +1880,8 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - 
cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2258,11 +1960,6 @@ core_combine_atop_ca_sse2 (uint32_t * pd, __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -2273,18 +1970,8 @@ core_combine_atop_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2364,11 +2051,6 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd, __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -2379,18 +2061,8 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2473,11 +2145,6 @@ core_combine_xor_ca_sse2 (uint32_t * pd, __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -2488,18 +2155,8 @@ core_combine_xor_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2562,11 +2219,6 @@ core_combine_add_ca_sse2 (uint32_t * pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -2580,18 +2232,8 @@ core_combine_add_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); 
- while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*)pd); @@ -2971,9 +2613,6 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp, { dst = dst_line; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - dst_line += dst_stride; w = width; @@ -2986,13 +2625,8 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp, w--; } - cache_prefetch ((__m128i*)dst); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)dst); - xmm_dst = load_128_aligned ((__m128i*)dst); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); @@ -3062,9 +2696,6 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp, { dst = dst_line; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - dst_line += dst_stride; w = width; @@ -3079,14 +2710,8 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - while (w >= 8) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)dst); - xmm_dst = load_128_aligned ((__m128i*)dst); unpack_565_128_4x128 (xmm_dst, @@ -3177,10 +2802,6 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, dst_line += dst_stride; mask_line += mask_stride; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { m = *pm++; @@ -3200,16 +2821,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_mask = load_128_unaligned ((__m128i*)pm); pack_cmp = @@ -3316,10 +2929,6 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, dst_line += dst_stride; mask_line += mask_stride; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { m = *pm++; @@ -3340,16 +2949,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_mask = load_128_unaligned ((__m128i*)pm); pack_cmp = @@ -3447,10 +3048,6 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, src_line += src_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)src); - while (w && (unsigned long)dst & 15) { uint32_t s = *src++; @@ -3467,16 +3064,8 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)src); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)dst); - cache_prefetch_next ((__m128i*)src); - xmm_src = load_128_unaligned ((__m128i*)src); xmm_dst = 
load_128_aligned ((__m128i*)dst); @@ -3556,25 +3145,16 @@ sse2_composite_src_x888_8888 (pixman_implementation_t *imp, src_line += src_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - while (w && (unsigned long)dst & 15) { *dst++ = *src++ | 0xff000000; w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - while (w >= 16) { __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4; - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)src); - xmm_src1 = load_128_unaligned ((__m128i*)src + 0); xmm_src2 = load_128_unaligned ((__m128i*)src + 1); xmm_src3 = load_128_unaligned ((__m128i*)src + 2); @@ -3646,10 +3226,6 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, src_line += src_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)src); - while (w && (unsigned long)dst & 15) { uint32_t s = (*src++) | 0xff000000; @@ -3666,16 +3242,8 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)src); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)dst); - cache_prefetch_next ((__m128i*)src); - xmm_src = _mm_or_si128 ( load_128_unaligned ((__m128i*)src), mask_ff000000); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -3815,10 +3383,6 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp, dst = dst_line; src = src_line; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - dst_line += dst_stride; src_line += src_stride; w = width; @@ -3834,17 +3398,9 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - /* It's a 8 pixel loop */ while (w >= 8) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)src); - cache_prefetch_next ((__m128i*)dst); - /* I'm loading unaligned because I'm not sure * about the address alignment. 
*/ @@ -3954,10 +3510,6 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, mask_line += mask_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && (unsigned long)dst & 15) { uint8_t m = *mask++; @@ -3978,16 +3530,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, dst++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - m = *((uint32_t*)mask); if (srca == 0xff && m == 0xffffffff) @@ -4099,7 +3643,6 @@ pixman_fill_sse2 (uint32_t *bits, return FALSE; } - cache_prefetch ((__m128i*)byte_line); xmm_def = create_mask_2x32_128 (data, data); while (height--) @@ -4109,8 +3652,6 @@ pixman_fill_sse2 (uint32_t *bits, byte_line += stride; w = byte_width; - cache_prefetch_next ((__m128i*)d); - while (w >= 1 && ((unsigned long)d & 1)) { *(uint8_t *)d = data; @@ -4133,12 +3674,8 @@ pixman_fill_sse2 (uint32_t *bits, d += 4; } - cache_prefetch_next ((__m128i*)d); - while (w >= 128) { - cache_prefetch (((__m128i*)d) + 12); - save_128_aligned ((__m128i*)(d), xmm_def); save_128_aligned ((__m128i*)(d + 16), xmm_def); save_128_aligned ((__m128i*)(d + 32), xmm_def); @@ -4154,8 +3691,6 @@ pixman_fill_sse2 (uint32_t *bits, if (w >= 64) { - cache_prefetch (((__m128i*)d) + 8); - save_128_aligned ((__m128i*)(d), xmm_def); save_128_aligned ((__m128i*)(d + 16), xmm_def); save_128_aligned ((__m128i*)(d + 32), xmm_def); @@ -4165,8 +3700,6 @@ pixman_fill_sse2 (uint32_t *bits, w -= 64; } - cache_prefetch_next ((__m128i*)d); - if (w >= 32) { save_128_aligned ((__m128i*)(d), xmm_def); @@ -4184,8 +3717,6 @@ pixman_fill_sse2 (uint32_t *bits, w -= 16; } - cache_prefetch_next ((__m128i*)d); - while (w >= 4) { *(uint32_t *)d = data; @@ -4265,10 +3796,6 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, mask_line += mask_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && (unsigned long)dst & 15) { uint8_t m = *mask++; @@ -4288,16 +3815,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, dst++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - m = *((uint32_t*)mask); if (srca == 0xff && m == 0xffffffff) @@ -4410,10 +3929,6 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, mask_line += mask_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && (unsigned long)dst & 15) { m = *mask++; @@ -4434,16 +3949,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, dst++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 8) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - xmm_dst = load_128_aligned ((__m128i*) dst); unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); @@ -4570,10 +4077,6 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, src_line += src_stride; w = width; - /* 
call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w && (unsigned long)dst & 15) { s = *src++; @@ -4587,16 +4090,8 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w >= 8) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)src); - cache_prefetch_next ((__m128i*)dst); - /* First round */ xmm_src = load_128_unaligned ((__m128i*)src); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -4715,10 +4210,6 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, src_line += src_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w && (unsigned long)dst & 15) { s = *src++; @@ -4731,16 +4222,8 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)src); - cache_prefetch_next ((__m128i*)dst); - xmm_src_hi = load_128_unaligned ((__m128i*)src); opaque = is_opaque (xmm_src_hi); @@ -4845,10 +4328,6 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, mask_line += mask_stride; dst_line += dst_stride; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && ((unsigned long)dst & 15)) { m = *(uint32_t *) mask; @@ -4870,16 +4349,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, mask++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 8) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - /* First round */ xmm_mask = load_128_unaligned ((__m128i*)mask); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -5001,10 +4472,6 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp, mask_line += mask_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && ((unsigned long)dst & 15)) { m = (uint32_t) *mask++; @@ -5018,16 +4485,8 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 16) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - xmm_mask = load_128_unaligned ((__m128i*)mask); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -5121,9 +4580,6 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp, dst_line += dst_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - while (w && ((unsigned long)dst & 15)) { d = (uint32_t) *dst; @@ -5135,14 +4591,8 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - while (w >= 16) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)dst); - xmm_dst = load_128_aligned ((__m128i*)dst); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); @@ -5214,10 +4664,6 @@ sse2_composite_in_8_8 
(pixman_implementation_t *imp, src_line += src_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w && ((unsigned long)dst & 15)) { s = (uint32_t) *src++; @@ -5229,16 +4675,8 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w >= 16) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)src); - cache_prefetch_next ((__m128i*)dst); - xmm_src = load_128_unaligned ((__m128i*)src); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -5321,10 +4759,6 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp, mask_line += mask_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && ((unsigned long)dst & 15)) { m = (uint32_t) *mask++; @@ -5338,16 +4772,8 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 16) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - xmm_mask = load_128_unaligned ((__m128i*)mask); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -5440,9 +4866,6 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp, dst_line += dst_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - while (w && ((unsigned long)dst & 15)) { *dst = (uint8_t)_mm_cvtsi64_si32 ( @@ -5454,14 +4877,8 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp, dst++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - while (w >= 16) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)dst); - save_128_aligned ( (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); @@ -5519,10 +4936,6 @@ sse2_composite_add_8000_8000 (pixman_implementation_t *imp, dst = dst_line; src = src_line; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - dst_line += dst_stride; src_line += src_stride; w = width; @@ -5644,9 +5057,6 @@ pixman_blt_sse2 (uint32_t *src_bits, return FALSE; } - cache_prefetch ((__m128i*)src_bytes); - cache_prefetch ((__m128i*)dst_bytes); - while (height--) { int w; @@ -5656,9 +5066,6 @@ pixman_blt_sse2 (uint32_t *src_bits, dst_bytes += dst_stride; w = byte_width; - cache_prefetch_next ((__m128i*)s); - cache_prefetch_next ((__m128i*)d); - while (w >= 2 && ((unsigned long)d & 3)) { *(uint16_t *)d = *(uint16_t *)s; @@ -5676,17 +5083,10 @@ pixman_blt_sse2 (uint32_t *src_bits, d += 4; } - cache_prefetch_next ((__m128i*)s); - cache_prefetch_next ((__m128i*)d); - while (w >= 64) { __m128i xmm0, xmm1, xmm2, xmm3; - /* 128 bytes ahead */ - cache_prefetch (((__m128i*)s) + 8); - cache_prefetch (((__m128i*)d) + 8); - xmm0 = load_128_unaligned ((__m128i*)(s)); xmm1 = load_128_unaligned ((__m128i*)(s + 16)); xmm2 = load_128_unaligned ((__m128i*)(s + 32)); @@ -5702,9 +5102,6 @@ pixman_blt_sse2 (uint32_t *src_bits, w -= 64; } - cache_prefetch_next ((__m128i*)s); - cache_prefetch_next ((__m128i*)d); - while (w >= 16) { save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) ); @@ -5714,9 +5111,6 @@ pixman_blt_sse2 (uint32_t *src_bits, s += 16; } - cache_prefetch_next 
((__m128i*)s); - cache_prefetch_next ((__m128i*)d); - while (w >= 4) { *(uint32_t *)d = *(uint32_t *)s; @@ -5809,11 +5203,6 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)mask); - while (w && (unsigned long)dst & 15) { s = 0xff000000 | *src++; @@ -5833,18 +5222,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)mask); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)src); - cache_prefetch_next ((__m128i*)dst); - cache_prefetch_next ((__m128i*)mask); - m = *(uint32_t*) mask; xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000); @@ -5955,11 +5334,6 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i *)src); - cache_prefetch ((__m128i *)dst); - cache_prefetch ((__m128i *)mask); - while (w && (unsigned long)dst & 15) { uint32_t sa; @@ -5994,18 +5368,8 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i *)src); - cache_prefetch ((__m128i *)dst); - cache_prefetch ((__m128i *)mask); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i *)src); - cache_prefetch_next ((__m128i *)dst); - cache_prefetch_next ((__m128i *)mask); - m = *(uint32_t *) mask; if (m) @@ -6117,9 +5481,6 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp, { dst = dst_line; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - dst_line += dst_stride; w = width; @@ -6135,15 +5496,10 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp, dst++; } - cache_prefetch ((__m128i*)dst); - while (w >= 4) { __m128i tmp_lo, tmp_hi; - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)(dst + 4)); - xmm_dst = load_128_aligned ((__m128i*)dst); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); @@ -6224,11 +5580,6 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i *)src); - cache_prefetch ((__m128i *)dst); - cache_prefetch ((__m128i *)mask); - while (w && (unsigned long)dst & 15) { uint32_t sa; @@ -6263,18 +5614,8 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i *)src); - cache_prefetch ((__m128i *)dst); - cache_prefetch ((__m128i *)mask); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i *)src); - cache_prefetch_next ((__m128i *)dst); - cache_prefetch_next ((__m128i *)mask); - xmm_mask = load_128_unaligned ((__m128i*)mask); if (!is_transparent (xmm_mask)) -- 1.7.0.4
