Author: post
Date: 2012-11-27 17:16:15 +0100 (Tue, 27 Nov 2012)
New Revision: 4316
Modified:
trunk/plugins/denoise/floatplanarimage-x86.cpp
trunk/plugins/denoise/floatplanarimage.cpp
trunk/plugins/denoise/floatplanarimage.h
Log:
Denoise, x64: Enable SSE4 code.
Modified: trunk/plugins/denoise/floatplanarimage-x86.cpp
===================================================================
--- trunk/plugins/denoise/floatplanarimage-x86.cpp 2012-11-27 12:50:59 UTC (rev 4315)
+++ trunk/plugins/denoise/floatplanarimage-x86.cpp 2012-11-27 16:16:15 UTC (rev 4316)
@@ -169,7 +169,146 @@
);
}
asm volatile ( "emms\nsfence\n" );
+}
+// Only 64 bits, and only if pixelsize is 4
+void FloatPlanarImage::unpackInterleavedYUV_SSE4( const ImgConvertJob* j )
+{
+ RS_IMAGE16* image = j->rs;
+ float* temp = p[0]->data;
+ temp[0] = redCorrection; temp[1] = 1.0f; temp[2] = blueCorrection; temp[3] = 0.0f;
+ for (int i = 0; i < 4; i++) {
+ temp[i+4] = (0.299); //r->Y
+ temp[i+8] = (0.587); //g->Y
+ temp[i+12] = (0.114); //b->Y
+
+ temp[i+16] = (-0.169); //r->Cb
+ temp[i+20] = (-0.331); //g->Cb
+ temp[i+24] = (0.499); //b->Cb
+
+ temp[i+28] = (0.499); //r->Cr
+ temp[i+32] = (-0.418); //g->Cr
+ temp[i+36] = (-0.0813); //b->Cr
+ temp[i+40] = 0.5f; // red/blue scale.
+ }
+
+ asm volatile
+ (
+ "movaps 0(%0), %%xmm15\n" // Red, green, bluecorrection
+ : // no output registers
+ : "r" (temp)
+ : // %0
+ );
+ for (int y = j->start_y; y < j->end_y; y++ ) {
+ const gushort* pix = GET_PIXEL(image,0,y);
+ gfloat *Y = p[0]->getAt(ox, y+oy);
+ gfloat *Cb = p[1]->getAt(ox, y+oy);
+ gfloat *Cr = p[2]->getAt(ox, y+oy);
+ gint w = (3+image->w) >>2;
+ asm volatile
+ (
+ "unpack_next_pixel_sse4:\n"
+ "movaps (%0), %%xmm0\n" // Load xx,b1,g1,r1,xx,b0,g0,r0
+ "movaps 16(%0), %%xmm2\n" // Load xx,b3,g3,r3,xx,b2,g2,r2
+ "prefetchnta 64(%0)\n" // Prefetch next
+ "pxor %%xmm5,%%xmm5\n"
+ "movaps %%xmm0, %%xmm1\n"
+ "movaps %%xmm2, %%xmm3\n"
+
+ "punpcklwd %%xmm5,%%xmm0\n" //00xx 00b0 00g0 00r0
+ "punpckhwd %%xmm5,%%xmm1\n" //00xx 00b1 00g1 00r1
+ "punpcklwd %%xmm5,%%xmm2\n" //00xx 00b2 00g2 00r2
+ "punpckhwd %%xmm5,%%xmm3\n" //00xx 00b3 00g3 00r3
+
+ "cvtdq2ps %%xmm0, %%xmm0\n" // doubleword to float
+ "cvtdq2ps %%xmm1, %%xmm1\n"
+ "cvtdq2ps %%xmm2, %%xmm2\n" // doubleword to float
+ "cvtdq2ps %%xmm3, %%xmm3\n"
+
+ "mulps %%xmm15, %%xmm0\n" // Multiply by redcorrection/bluecorrection
+ "mulps %%xmm15, %%xmm1\n" // Multiply by redcorrection/bluecorrection
+ "mulps %%xmm15, %%xmm2\n" // Multiply by redcorrection/bluecorrection
+ "mulps %%xmm15, %%xmm3\n" // Multiply by redcorrection/bluecorrection
+
+ "rsqrtps %%xmm0, %%xmm0\n" // 1 / sqrt()
+ "rsqrtps %%xmm1, %%xmm1\n"
+ "rsqrtps %%xmm2, %%xmm2\n"
+ "rsqrtps %%xmm3, %%xmm3\n"
+
+ "rcpps %%xmm0, %%xmm0\n" // sqrt
+ "rcpps %%xmm1, %%xmm1\n" // sqrt
+ "rcpps %%xmm2, %%xmm2\n" // sqrt
+ "rcpps %%xmm3, %%xmm3\n" // sqrt
+
+ "movaps %%xmm0, %%xmm5\n"
+ "movaps %%xmm2, %%xmm7\n"
+ "unpcklps %%xmm1, %%xmm0\n" //g1 g0 r1 r0
+ "unpcklps %%xmm3, %%xmm2\n" //g3 g2 r3 r2
+
+ "movaps %%xmm0, %%xmm4\n" //g1 g0 r1 r0
+ "movlhps %%xmm2, %%xmm0\n" //r3 r2 r1 r0
+ "movhlps %%xmm4, %%xmm2\n" //g3 g2 g1 g0
+
+ "unpckhps %%xmm1, %%xmm5\n" //xx xx b1 b0
+ "unpckhps %%xmm3, %%xmm7\n" //xx xx b3 b2
+ "movlhps %%xmm7, %%xmm5\n" //b3 b2 b1 b0
+
+ "movaps %%xmm2, %%xmm1\n" // Green in xmm1
+ "movaps %%xmm2, %%xmm4\n" // Green (copy) in xmm4
+ "movaps %%xmm5, %%xmm2\n" // Blue in xmm2
+ "movaps %%xmm0, %%xmm3\n" // Red (copy) in xmm3
+
+ "mulps 16(%5), %%xmm3\n" // R->Y
+ "mulps 32(%5), %%xmm4\n" // G->Y
+ "mulps 48(%5), %%xmm5\n" // B->Y
+
+ "movaps %%xmm0, %%xmm6\n" // Red (copy) in xmm6
+ "movaps %%xmm1, %%xmm7\n" // Green (copy) in xmm7
+ "movaps %%xmm2, %%xmm8\n" // Blue (copy) in xmm8
+
+ "mulps 64(%5), %%xmm0\n" // R->Cb
+ "mulps 80(%5), %%xmm1\n" // G->Cb
+ "mulps 96(%5), %%xmm2\n" // B->Cb
+
+ "addps %%xmm4, %%xmm3\n" // Add Y
+ "addps %%xmm1, %%xmm0\n" // Add Cb
+
+ "mulps 112(%5), %%xmm6\n" // R->Cr
+ "mulps 128(%5), %%xmm7\n" // G->Cr
+ "mulps 144(%5), %%xmm8\n" // B->Cr
+
+ "addps %%xmm5, %%xmm3\n" // Add Y (finished)
+ "addps %%xmm2, %%xmm0\n" // Add Cb (finished)
+ "addps %%xmm7, %%xmm6\n" // Add Cr
+ "addps %%xmm8, %%xmm6\n" // Add Cr (finished)
+
+ "movaps %%xmm0, %%xmm12\n" // Cb
+ "movaps %%xmm6, %%xmm13\n" // Cr
+ "mulps 160(%5), %%xmm12\n" // Cb * 0.5f
+ "mulps 160(%5), %%xmm13\n" // Cr * 0.5f
+
+ "blendvps %%xmm0, %%xmm12\n" // Cb(xmm0) < 0 ? Cb : Cb*0.5
+ "movaps %%xmm6, %%xmm0\n" // Cr
+ "blendvps %%xmm6, %%xmm13\n" // Cr(xmm0) < 0 ? Cr : Cr*0.5
+
+ "movntdq %%xmm3, (%1)\n" // Store Y
+ "movntdq %%xmm12, (%2)\n" // Store Cb
+ "movntdq %%xmm13, (%3)\n" // Store Cr
+
+ "add $32, %0\n"
+ "add $16, %1\n"
+ "add $16, %2\n"
+ "add $16, %3\n"
+ "dec %4\n"
+ "jnz unpack_next_pixel_sse4\n"
+ : // no output registers
+ : "r" (pix), "r" (Y), "r" (Cb), "r" (Cr), "r" (w), "r" (temp)
+ // %0 %1 %2 %3 %4 %5
+ : "%rax", "%rbx", "%rcx"
+ );
+ }
+ asm volatile ( "emms\nsfence\n" );
+
}
#endif // defined (__x86_64__)
@@ -279,7 +418,6 @@
asm volatile ( "emms\nsfence\n" );
}
-#if 0
void FloatPlanarImage::packInterleavedYUV_SSE4( const ImgConvertJob* j)
{
RS_IMAGE16* image = j->rs;
@@ -316,6 +454,16 @@
"loopback_YUV_SSE4_64:"
"movaps (%2), %%xmm1\n" // xmm1: Cb (4 pixels)
"movaps (%3), %%xmm2\n" // xmm2: Cr
+
+ "movaps %%xmm1, %%xmm3\n" // xmm3: Cb
+ "movaps %%xmm2, %%xmm4\n" // xmm4: Cr
+ "addps %%xmm1, %%xmm1\n" // Cb * 2f
+ "addps %%xmm2, %%xmm2\n" // Cr * 2f
+ "movaps %%xmm1, %%xmm0\n" // xmm0: sign for test
+ "blendvps %%xmm3, %%xmm1\n" // Cb < 0 ? Cb (src) : Cb * 2 (dest)
+ "movaps %%xmm2, %%xmm0\n" // xmm0: sign for test
+ "blendvps %%xmm4, %%xmm2\n" // Cr < 0 ? Cr (src) : Cr * 2 (dest)
+
"movaps (%1), %%xmm0\n" // xmm0: Y
"movaps %%xmm1, %%xmm3\n" // xmm3: Cb
"movaps %%xmm2, %%xmm4\n" // xmm4: Cr
@@ -360,7 +508,6 @@
}
asm volatile ( "emms\nsfence\n" );
}
-#endif
#else // 32 bits
Modified: trunk/plugins/denoise/floatplanarimage.cpp
===================================================================
--- trunk/plugins/denoise/floatplanarimage.cpp 2012-11-27 12:50:59 UTC (rev 4315)
+++ trunk/plugins/denoise/floatplanarimage.cpp 2012-11-27 16:16:15 UTC (rev 4316)
@@ -170,7 +170,9 @@
blueCorrection = MAX(0.0f, blueCorrection);
#if defined (__x86_64__)
- if (image->pixelsize == 4)
+ if (image->pixelsize == 4 && (rs_detect_cpu_features() & RS_CPU_FLAG_SSE4_1))
+ return unpackInterleavedYUV_SSE4(j);
+ else if (image->pixelsize == 4)
return unpackInterleavedYUV_SSE2(j);
#endif
@@ -233,9 +235,8 @@
guint cpu = rs_detect_cpu_features();
#if defined (__x86_64__)
if ((image->pixelsize == 4) && (cpu & RS_CPU_FLAG_SSE4_1)) {
- // TODO: Test on SSE4 capable machine before enabling.
-// packInterleavedYUV_SSE4(j);
-// return;
+ packInterleavedYUV_SSE4(j);
+ return;
}
#endif
#if defined (__i386__) || defined (__x86_64__)
Modified: trunk/plugins/denoise/floatplanarimage.h
===================================================================
--- trunk/plugins/denoise/floatplanarimage.h 2012-11-27 12:50:59 UTC (rev 4315)
+++ trunk/plugins/denoise/floatplanarimage.h 2012-11-27 16:16:15 UTC (rev 4316)
@@ -52,8 +52,9 @@
void packInterleavedYUV_SSE2( const ImgConvertJob* j);
#endif
#if defined (__x86_64__)
+ void unpackInterleavedYUV_SSE4( const ImgConvertJob* j );
void unpackInterleavedYUV_SSE2( const ImgConvertJob* j );
-// void packInterleavedYUV_SSE4( const ImgConvertJob* j);
+ void packInterleavedYUV_SSE4( const ImgConvertJob* j);
#endif
void packInterleavedYUV( const ImgConvertJob* j);
JobQueue* getUnpackInterleavedYUVJobs(RS_IMAGE16* image);
_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit