Hi Soren,

Attached is the regenerated patch that removes the cache prefetch code. Please 
review it. Thank you!

Here are some test results. They all show that without cache prefetch, 
performance increases by more than 5%.


CPU: Intel Atom [email protected]
OS: MeeGo (32 bits)

old: 0.19.5-with-cache-prefetch
new: 0.19.5-without-cache-prefetch
Speedups
========
image-rgba                    poppler-0    17125.68 (17279.58 0.92%) -> 
14765.36 (15926.49 3.54%):  1.16x speedup
image-rgba                  ocitysmap-0    9008.25 (9040.41 7.50%) -> 8277.94 
(8343.09 5.44%):  1.09x speedup
image-rgba          xfce4-terminal-a1-0    18020.76 (18230.68 0.97%) -> 
16703.77 (16712.42 1.22%):  1.08x speedup
image-rgba         gnome-terminal-vim-0    25081.38 (25133.38 0.24%) -> 
23407.47 (23652.98 0.54%):  1.07x speedup
image-rgba          firefox-talos-gfx-0    57916.97 (57973.20 0.11%) -> 
54556.64 (54624.55 0.39%):  1.06x speedup
image-rgba       firefox-planet-gnome-0    102377.47 (103496.63 0.70%) -> 
96816.65 (97075.54 0.15%):  1.06x speedup
image-rgba         swfdec-giant-steps-0    12376.24 (12616.84 1.02%) -> 
11705.30 (11825.20 1.06%):  1.06x speedup


Test 1:                                                                         
                          
old: 0.18.4-origin
new: 0.18.4-disable-cache-prefetch
Speedups
========
image-rgba                    poppler-0    17590.75 (17597.01 1.30%) -> 
15414.85 (16345.44 2.89%):  1.14x speedup
image-rgba          xfce4-terminal-a1-0    18429.77 (18471.36 1.03%) -> 
16648.99 (16725.68 1.18%):  1.11x speedup
image-rgba         gnome-terminal-vim-0    25272.76 (25366.13 0.18%) -> 
22993.57 (23060.22 0.52%):  1.10x speedup
image-rgba                  ocitysmap-0    9079.30 (9128.50 6.54%) -> 8410.53 
(8452.60 8.02%):  1.08x speedup
image-rgba         swfdec-giant-steps-0    12436.80 (12473.41 0.39%) -> 
11701.41 (11760.38 0.90%):  1.06x speedup
image-rgba       firefox-planet-gnome-0    103749.15 (103983.13 0.19%) -> 
98310.02 (98522.71 0.35%):  1.06x speedup

Test 2:
old: 0.18.4-origin
new: 0.18.4-disable-cache-prefetch
Speedups
========
image-rgba         gnome-terminal-vim-0    25068.39 (25095.11 0.06%) -> 
23302.54 (23317.50 0.56%):  1.08x speedup
image-rgba                  ocitysmap-0    8984.46 (8988.04 8.44%) -> 8416.64 
(8489.40 6.91%):  1.07x speedup
image-rgba          firefox-talos-gfx-0    58330.34 (58361.84 0.23%) -> 
54657.10 (55473.39 1.72%):  1.07x speedup
image-rgba                    poppler-0    17265.62 (17437.80 0.84%) -> 
16257.05 (16341.86 0.75%):  1.06x speedup
image-rgba       firefox-planet-gnome-0    104102.19 (104150.67 0.12%) -> 
98324.84 (98402.02 0.32%):  1.06x speedup

CPU: Intel Core(TM)2 Duo CPU [email protected]     
OS: Ubuntu 10.04 (64bits)

old: 0.19.5-with-cache-prefetch
new: 0.19.5-without-cache-prefetch
Speedups
========
image-rgba                  ocitysmap-0    2671.46 (2691.82 8.55%) -> 2296.20 
(2307.26 5.77%):  1.16x speedup
image-rgba         swfdec-giant-steps-0    1614.55 (1615.18 1.68%) -> 1532.84 
(1538.52 0.72%):  1.05x speedup


Regards,
Liu, Xinyun

> Liu Xinyun <[email protected]> writes:
> 
> > OK. here is the work to clear all cache prefetch. Please review
> > it. 3x
> 
> This patch looks good to me. If you can resend it with a better commit
> message, including the benchmark results, I'll apply it.
> 
> 
> Thanks,
> Soren
>From d2954034a0bf25e9819dd7c540c9a080cd029bc3 Mon Sep 17 00:00:00 2001
From: Liu Xinyun <[email protected]>
Date: Sat, 25 Sep 2010 14:56:38 +0800
Subject: [PATCH] Remove cache prefetch code.
 Performance decreases with cache prefetch, especially on Atom, so remove
 this code. The experiment results follow.

old: 0.19.5-with-cache-prefetch
new: 0.19.5-without-cache-prefetch

CPU: Intel Atom [email protected]
OS: MeeGo (32 bits)
Speedups
========
image-rgba                    poppler-0    17125.68 (17279.58 0.92%) -> 14765.36 (15926.49 3.54%):  1.16x speedup
image-rgba                  ocitysmap-0    9008.25 (9040.41 7.50%) -> 8277.94 (8343.09 5.44%):  1.09x speedup
image-rgba          xfce4-terminal-a1-0    18020.76 (18230.68 0.97%) -> 16703.77 (16712.42 1.22%):  1.08x speedup
image-rgba         gnome-terminal-vim-0    25081.38 (25133.38 0.24%) -> 23407.47 (23652.98 0.54%):  1.07x speedup
image-rgba          firefox-talos-gfx-0    57916.97 (57973.20 0.11%) -> 54556.64 (54624.55 0.39%):  1.06x speedup
image-rgba       firefox-planet-gnome-0    102377.47 (103496.63 0.70%) -> 96816.65 (97075.54 0.15%):  1.06x speedup
image-rgba         swfdec-giant-steps-0    12376.24 (12616.84 1.02%) -> 11705.30 (11825.20 1.06%):  1.06x speedup

CPU: Intel Core(TM)2 Duo CPU [email protected]
OS: Ubuntu 10.04 (64bits)
Speedups
========
image-rgba                  ocitysmap-0    2671.46 (2691.82 8.55%) -> 2296.20 (2307.26 5.77%):  1.16x speedup
image-rgba         swfdec-giant-steps-0    1614.55 (1615.18 1.68%) -> 1532.84 (1538.52 0.72%):  1.05x speedup

Signed-off-by: Liu Xinyun <[email protected]>
Signed-off-by: Chen Miaobo <[email protected]>
---
 pixman/pixman-sse2.c |  659 --------------------------------------------------
 1 files changed, 0 insertions(+), 659 deletions(-)

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 33d71ee..112a8c2 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -357,34 +357,6 @@ in_over_2x128 (__m128i* src_lo,
     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
 }
 
-static force_inline void
-cache_prefetch (__m128i* addr)
-{
-    _mm_prefetch ((void const*)addr, _MM_HINT_T0);
-}
-
-static force_inline void
-cache_prefetch_next (__m128i* addr)
-{
-    _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
-}
-
-/* prefetching NULL is very slow on some systems. don't do that. */
-
-static force_inline void
-maybe_prefetch (__m128i* addr)
-{
-    if (addr)
-	cache_prefetch (addr);
-}
-
-static force_inline void
-maybe_prefetch_next (__m128i* addr)
-{
-    if (addr)
-	cache_prefetch_next (addr);
-}
-
 /* load 4 pixels from a 16-byte boundary aligned address */
 static force_inline __m128i
 load_128_aligned (__m128i* src)
@@ -649,11 +621,6 @@ core_combine_over_u_sse2 (uint32_t*       pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     /* Align dst on a 16-byte boundary */
     while (w && ((unsigned long)pd & 15))
     {
@@ -667,18 +634,8 @@ core_combine_over_u_sse2 (uint32_t*       pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	/* I'm loading unaligned because I'm not sure about
 	 * the address alignment.
 	 */
@@ -740,11 +697,6 @@ core_combine_over_reverse_u_sse2 (uint32_t*       pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     /* Align dst on a 16-byte boundary */
     while (w &&
            ((unsigned long)pd & 15))
@@ -759,18 +711,8 @@ core_combine_over_reverse_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	/* I'm loading unaligned because I'm not sure
 	 * about the address alignment.
 	 */
@@ -842,11 +784,6 @@ core_combine_in_u_sse2 (uint32_t*       pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -859,18 +796,8 @@ core_combine_in_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
 
@@ -916,11 +843,6 @@ core_combine_reverse_in_u_sse2 (uint32_t*       pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -933,18 +855,8 @@ core_combine_reverse_in_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
 
@@ -985,11 +897,6 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
                                  const uint32_t* pm,
                                  int             w)
 {
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	uint32_t s = combine1 (ps, pm);
@@ -1006,21 +913,11 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
 	__m128i xmm_src_lo, xmm_src_hi;
 	__m128i xmm_dst_lo, xmm_dst_hi;
 
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1067,11 +964,6 @@ core_combine_out_u_sse2 (uint32_t*       pd,
                          const uint32_t* pm,
                          int             w)
 {
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	uint32_t s = combine1 (ps, pm);
@@ -1087,21 +979,11 @@ core_combine_out_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
 	__m128i xmm_src_lo, xmm_src_hi;
 	__m128i xmm_dst_lo, xmm_dst_hi;
 
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1167,11 +1049,6 @@ core_combine_atop_u_sse2 (uint32_t*       pd,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -1184,18 +1061,8 @@ core_combine_atop_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1264,11 +1131,6 @@ core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -1281,18 +1143,8 @@ core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1365,11 +1217,6 @@ core_combine_xor_u_sse2 (uint32_t*       dst,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -1382,18 +1229,8 @@ core_combine_xor_u_sse2 (uint32_t*       dst,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
 	xmm_dst = load_128_aligned ((__m128i*) pd);
 
@@ -1450,11 +1287,6 @@ core_combine_add_u_sse2 (uint32_t*       dst,
     const uint32_t* ps = src;
     const uint32_t* pm = mask;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = combine1 (ps, pm);
@@ -1468,20 +1300,10 @@ core_combine_add_u_sse2 (uint32_t*       dst,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
 	__m128i s;
 
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	s = combine4 ((__m128i*)ps, (__m128i*)pm);
 
 	save_128_aligned (
@@ -1536,11 +1358,6 @@ core_combine_saturate_u_sse2 (uint32_t *      pd,
     uint32_t pack_cmp;
     __m128i xmm_src, xmm_dst;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = combine1 (ps, pm);
@@ -1553,18 +1370,8 @@ core_combine_saturate_u_sse2 (uint32_t *      pd,
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_dst = load_128_aligned  ((__m128i*)pd);
 	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
 
@@ -1637,11 +1444,6 @@ core_combine_src_ca_sse2 (uint32_t*       pd,
     __m128i xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1651,18 +1453,8 @@ core_combine_src_ca_sse2 (uint32_t*       pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
 
@@ -1718,11 +1510,6 @@ core_combine_over_ca_sse2 (uint32_t*       pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1733,18 +1520,8 @@ core_combine_over_ca_sse2 (uint32_t*       pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1807,11 +1584,6 @@ core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1822,18 +1594,8 @@ core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1885,11 +1647,6 @@ core_combine_in_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1904,18 +1661,8 @@ core_combine_in_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1973,11 +1720,6 @@ core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1992,18 +1734,8 @@ core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2059,11 +1791,6 @@ core_combine_out_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2078,18 +1805,8 @@ core_combine_out_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2148,11 +1865,6 @@ core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2168,18 +1880,8 @@ core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2258,11 +1960,6 @@ core_combine_atop_ca_sse2 (uint32_t *      pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2273,18 +1970,8 @@ core_combine_atop_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2364,11 +2051,6 @@ core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2379,18 +2061,8 @@ core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2473,11 +2145,6 @@ core_combine_xor_ca_sse2 (uint32_t *      pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2488,18 +2155,8 @@ core_combine_xor_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2562,11 +2219,6 @@ core_combine_add_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2580,18 +2232,8 @@ core_combine_add_ca_sse2 (uint32_t *      pd,
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
@@ -2971,9 +2613,6 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
     {
 	dst = dst_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	w = width;
 
@@ -2986,13 +2625,8 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -3062,9 +2696,6 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
     {
 	dst = dst_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	w = width;
 
@@ -3079,14 +2710,8 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_565_128_4x128 (xmm_dst,
@@ -3177,10 +2802,6 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 	dst_line += dst_stride;
 	mask_line += mask_stride;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)pd);
-	cache_prefetch ((__m128i*)pm);
-
 	while (w && (unsigned long)pd & 15)
 	{
 	    m = *pm++;
@@ -3200,16 +2821,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)pd);
-	cache_prefetch ((__m128i*)pm);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)pd);
-	    cache_prefetch_next ((__m128i*)pm);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
 
 	    pack_cmp =
@@ -3316,10 +2929,6 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
 	dst_line += dst_stride;
 	mask_line += mask_stride;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)pd);
-	cache_prefetch ((__m128i*)pm);
-
 	while (w && (unsigned long)pd & 15)
 	{
 	    m = *pm++;
@@ -3340,16 +2949,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)pd);
-	cache_prefetch ((__m128i*)pm);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)pd);
-	    cache_prefetch_next ((__m128i*)pm);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
 
 	    pack_cmp =
@@ -3447,10 +3048,6 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)src);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    uint32_t s = *src++;
@@ -3467,16 +3064,8 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)src);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-	    cache_prefetch_next ((__m128i*)src);
-
 	    xmm_src = load_128_unaligned ((__m128i*)src);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -3556,25 +3145,16 @@ sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    *dst++ = *src++ | 0xff000000;
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-
 	while (w >= 16)
 	{
 	    __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
 	    
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-
 	    xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
 	    xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
 	    xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
@@ -3646,10 +3226,6 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)src);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    uint32_t s = (*src++) | 0xff000000;
@@ -3666,16 +3242,8 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)src);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-	    cache_prefetch_next ((__m128i*)src);
-
 	    xmm_src = _mm_or_si128 (
 		load_128_unaligned ((__m128i*)src), mask_ff000000);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -3815,10 +3383,6 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
 	dst = dst_line;
 	src = src_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	src_line += src_stride;
 	w = width;
@@ -3834,17 +3398,9 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	/* It's a 8 pixel loop */
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    /* I'm loading unaligned because I'm not sure
 	     * about the address alignment.
 	     */
@@ -3954,10 +3510,6 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    uint8_t m = *mask++;
@@ -3978,16 +3530,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
 	    dst++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    m = *((uint32_t*)mask);
 
 	    if (srca == 0xff && m == 0xffffffff)
@@ -4099,7 +3643,6 @@ pixman_fill_sse2 (uint32_t *bits,
 	return FALSE;
     }
 
-    cache_prefetch ((__m128i*)byte_line);
     xmm_def = create_mask_2x32_128 (data, data);
 
     while (height--)
@@ -4109,8 +3652,6 @@ pixman_fill_sse2 (uint32_t *bits,
 	byte_line += stride;
 	w = byte_width;
 
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 1 && ((unsigned long)d & 1))
 	{
 	    *(uint8_t *)d = data;
@@ -4133,12 +3674,8 @@ pixman_fill_sse2 (uint32_t *bits,
 	    d += 4;
 	}
 
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 128)
 	{
-	    cache_prefetch (((__m128i*)d) + 12);
-
 	    save_128_aligned ((__m128i*)(d),     xmm_def);
 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
@@ -4154,8 +3691,6 @@ pixman_fill_sse2 (uint32_t *bits,
 
 	if (w >= 64)
 	{
-	    cache_prefetch (((__m128i*)d) + 8);
-
 	    save_128_aligned ((__m128i*)(d),     xmm_def);
 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
@@ -4165,8 +3700,6 @@ pixman_fill_sse2 (uint32_t *bits,
 	    w -= 64;
 	}
 
-	cache_prefetch_next ((__m128i*)d);
-
 	if (w >= 32)
 	{
 	    save_128_aligned ((__m128i*)(d),     xmm_def);
@@ -4184,8 +3717,6 @@ pixman_fill_sse2 (uint32_t *bits,
 	    w -= 16;
 	}
 
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 4)
 	{
 	    *(uint32_t *)d = data;
@@ -4265,10 +3796,6 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    uint8_t m = *mask++;
@@ -4288,16 +3815,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
 	    dst++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    m = *((uint32_t*)mask);
 
 	    if (srca == 0xff && m == 0xffffffff)
@@ -4410,10 +3929,6 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    m = *mask++;
@@ -4434,16 +3949,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
 	    dst++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_dst = load_128_aligned ((__m128i*) dst);
 	    unpack_565_128_4x128 (xmm_dst,
 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
@@ -4570,10 +4077,6 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    s = *src++;
@@ -4587,16 +4090,8 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    /* First round */
 	    xmm_src = load_128_unaligned ((__m128i*)src);
 	    xmm_dst = load_128_aligned  ((__m128i*)dst);
@@ -4715,10 +4210,6 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    s = *src++;
@@ -4731,16 +4222,8 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_src_hi = load_128_unaligned ((__m128i*)src);
 
 	    opaque = is_opaque (xmm_src_hi);
@@ -4845,10 +4328,6 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	dst_line += dst_stride;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    m = *(uint32_t *) mask;
@@ -4870,16 +4349,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 	    mask++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    /* First round */
 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5001,10 +4472,6 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    m = (uint32_t) *mask++;
@@ -5018,16 +4485,8 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -5121,9 +4580,6 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
 	dst_line += dst_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    d = (uint32_t) *dst;
@@ -5135,14 +4591,8 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -5214,10 +4664,6 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    s = (uint32_t) *src++;
@@ -5229,16 +4675,8 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_src = load_128_unaligned ((__m128i*)src);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -5321,10 +4759,6 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    m = (uint32_t) *mask++;
@@ -5338,16 +4772,8 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -5440,9 +4866,6 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
 	dst_line += dst_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    *dst = (uint8_t)_mm_cvtsi64_si32 (
@@ -5454,14 +4877,8 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
 	    dst++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    save_128_aligned (
 		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
 
@@ -5519,10 +4936,6 @@ sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
 	dst = dst_line;
 	src = src_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	src_line += src_stride;
 	w = width;
@@ -5644,9 +5057,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
 	return FALSE;
     }
 
-    cache_prefetch ((__m128i*)src_bytes);
-    cache_prefetch ((__m128i*)dst_bytes);
-
     while (height--)
     {
 	int w;
@@ -5656,9 +5066,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
 	dst_bytes += dst_stride;
 	w = byte_width;
 
-	cache_prefetch_next ((__m128i*)s);
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 2 && ((unsigned long)d & 3))
 	{
 	    *(uint16_t *)d = *(uint16_t *)s;
@@ -5676,17 +5083,10 @@ pixman_blt_sse2 (uint32_t *src_bits,
 	    d += 4;
 	}
 
-	cache_prefetch_next ((__m128i*)s);
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 64)
 	{
 	    __m128i xmm0, xmm1, xmm2, xmm3;
 
-	    /* 128 bytes ahead */
-	    cache_prefetch (((__m128i*)s) + 8);
-	    cache_prefetch (((__m128i*)d) + 8);
-
 	    xmm0 = load_128_unaligned ((__m128i*)(s));
 	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
 	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
@@ -5702,9 +5102,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
 	    w -= 64;
 	}
 
-	cache_prefetch_next ((__m128i*)s);
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 16)
 	{
 	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
@@ -5714,9 +5111,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
 	    s += 16;
 	}
 
-	cache_prefetch_next ((__m128i*)s);
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 4)
 	{
 	    *(uint32_t *)d = *(uint32_t *)s;
@@ -5809,11 +5203,6 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
 
         w = width;
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
         while (w && (unsigned long)dst & 15)
         {
             s = 0xff000000 | *src++;
@@ -5833,18 +5222,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
             w--;
         }
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-            cache_prefetch_next ((__m128i*)mask);
-
             m = *(uint32_t*) mask;
             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
 
@@ -5955,11 +5334,6 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
 
         w = width;
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w && (unsigned long)dst & 15)
         {
 	    uint32_t sa;
@@ -5994,18 +5368,8 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
             w--;
         }
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i *)src);
-            cache_prefetch_next ((__m128i *)dst);
-            cache_prefetch_next ((__m128i *)mask);
-
             m = *(uint32_t *) mask;
 
 	    if (m)
@@ -6117,9 +5481,6 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
     {
 	dst = dst_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	w = width;
 
@@ -6135,15 +5496,10 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
 	    dst++;
 	}
 
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
 	    __m128i tmp_lo, tmp_hi;
 
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)(dst + 4));
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -6224,11 +5580,6 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
 
         w = width;
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w && (unsigned long)dst & 15)
         {
 	    uint32_t sa;
@@ -6263,18 +5614,8 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
             w--;
         }
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i *)src);
-            cache_prefetch_next ((__m128i *)dst);
-            cache_prefetch_next ((__m128i *)mask);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
 
 	    if (!is_transparent (xmm_mask))
-- 
1.7.0.4

_______________________________________________
Pixman mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/pixman

Reply via email to