Author: post
Date: 2010-06-29 20:41:39 +0200 (Tue, 29 Jun 2010)
New Revision: 3455
Modified:
trunk/plugins/denoise/floatplanarimage-x86.cpp
Log:
Use streaming stores/prefetches to lighten cache pressure.
Modified: trunk/plugins/denoise/floatplanarimage-x86.cpp
===================================================================
--- trunk/plugins/denoise/floatplanarimage-x86.cpp 2010-06-29 18:29:39 UTC (rev 3454)
+++ trunk/plugins/denoise/floatplanarimage-x86.cpp 2010-06-29 18:41:39 UTC (rev 3455)
@@ -64,6 +64,7 @@
"unpack_next_pixel:\n"
"movaps (%0), %%xmm0\n" // Load xx,b1,g1,r1,xx,b0,g0,r0
"movaps 16(%0), %%xmm2\n" // Load xx,b3,g3,r3,xx,b2,g2,r2
+ "prefetchnta 64(%0)\n" // Prefetch next
"pxor %%xmm5,%%xmm5\n"
"movaps %%xmm0, %%xmm1\n"
"movaps %%xmm2, %%xmm3\n"
@@ -135,9 +136,9 @@
"addps %%xmm7, %%xmm6\n" // Add Cr
"addps %%xmm8, %%xmm6\n" // Add Cr (finished)
- "movaps %%xmm3, (%1)\n" // Store Y
- "movaps %%xmm0, (%2)\n" // Store Cb
- "movaps %%xmm6, (%3)\n" // Store Cr
+ "movntdq %%xmm3, (%1)\n" // Store Y
+ "movntdq %%xmm0, (%2)\n" // Store Cb
+ "movntdq %%xmm6, (%3)\n" // Store Cr
"add $32, %0\n"
"add $16, %1\n"
@@ -151,7 +152,7 @@
: "%rax", "%rbx", "%rcx"
);
}
- asm volatile ( "emms\n" );
+ asm volatile ( "emms\nsfence\n" );
}
#endif // defined (__x86_64__)
@@ -232,20 +233,20 @@
"movdqa %%xmm4, %%xmm0\n" // Copy r&g
"punpckldq %%xmm3, %%xmm4\n" // Interleave lower blue into reg&green
in xmm4 Now 00b1 g1r1 00b0 g0r0
"punpckhdq %%xmm3, %%xmm0\n" // Interleave higher blue into reg&green
in xmm0 Now 00b3 g3r3 00b2 g2r2
- "movdqa %%xmm4, (%0)\n" // Store low pixels
- "movdqa %%xmm0, 16(%0)\n" // Store high pixels
+ "movntdq %%xmm4, (%0)\n" // Store low pixels
+ "movntdq %%xmm0, 16(%0)\n" // Store high pixels
"add $32, %0\n"
"add $16, %1\n"
"add $16, %2\n"
"add $16, %3\n"
"dec %4\n"
"jnz loopback_YUV_SSE2_64\n"
- "emms\n"
: // no output registers
: "r" (out), "r" (Y), "r" (Cb), "r" (Cr), "r"(n)
: // %0 %1 %2 %3 %4
);
}
+ asm volatile ( "emms\nsfence\n" );
}
void FloatPlanarImage::packInterleavedYUV_SSE4( const ImgConvertJob* j)
@@ -313,20 +314,20 @@
"punpckldq %%xmm3,%%xmm4\n" // interleave r+g and blue low
"punpckhdq %%xmm3,%%xmm1\n" // interleave r+g and blue high
- "movdqa %%xmm4, (%0)\n" // Store low pixels
- "movdqa %%xmm1, 16(%0)\n" // Store high pixels
+ "movntdq %%xmm4, (%0)\n" // Store low pixels
+ "movntdq %%xmm1, 16(%0)\n" // Store high pixels
"add $32, %0\n"
"add $16, %1\n"
"add $16, %2\n"
"add $16, %3\n"
"dec %4\n"
"jnz loopback_YUV_SSE4_64\n"
- "emms\n"
: // no output registers
: "r" (out), "r" (Y), "r" (Cb), "r" (Cr), "r"(n)
: // %0 %1 %2 %3 %4
);
}
+ asm volatile ( "emms\nsfence\n" );
}
#else // 32 bits
_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit