Author: post
Date: 2009-08-18 23:34:30 +0200 (Tue, 18 Aug 2009)
New Revision: 2623
Modified:
trunk/plugins/denoise/floatplanarimage-x86.cpp
trunk/plugins/denoise/floatplanarimage.cpp
trunk/plugins/denoise/floatplanarimage.h
Log:
Denoise: Rewrote 64 bit int rgb-> float yuv conversion. Much faster and not
using lookup table.
Modified: trunk/plugins/denoise/floatplanarimage-x86.cpp
===================================================================
--- trunk/plugins/denoise/floatplanarimage-x86.cpp 2009-08-17 21:16:07 UTC
(rev 2622)
+++ trunk/plugins/denoise/floatplanarimage-x86.cpp 2009-08-18 21:34:30 UTC
(rev 2623)
@@ -22,20 +22,28 @@
#if defined (__x86_64__)
// Only 64 bits, and only if pixelsize is 4
-// FIXME: Apply R/B correction prior to square root
-void FloatPlanarImage::unpackInterleavedYUV_SSE( const ImgConvertJob* j )
+void FloatPlanarImage::unpackInterleavedYUV_SSE2( const ImgConvertJob* j )
{
RS_IMAGE16* image = j->rs;
float* temp = p[0]->data;
- temp[0] = (0.299 * redCorrection); temp[4] = 0.587; temp[8] = (0.114 *
blueCorrection); temp[3] = 0.0f;
- temp[1] = (-0.169 * redCorrection); temp[5] = -0.331; temp[9] = (0.499 *
blueCorrection); temp[7] = 0.0f;
- temp[2] = (0.499 * redCorrection); temp[6] = -0.418; temp[10] =(-0.0813 *
blueCorrection); temp[11] = 0.0f;
+ temp[0] = redCorrection; temp[1] = 1.0f; temp[2] = blueCorrection; temp[3] =
0.0f;
+ for (int i = 0; i < 4; i++) {
+ temp[i+4] = (0.299); //r->Y
+ temp[i+8] = (0.587); //g->Y
+ temp[i+12] = (0.114); //b->Y
+ temp[i+16] = (-0.169); //r->Cb
+ temp[i+20] = (-0.331); //g->Cb
+ temp[i+24] = (0.499); //b->Cb
+
+ temp[i+28] = (0.499); //r->Cr
+ temp[i+32] = (-0.418); //g->Cr
+ temp[i+36] = (-0.0813); //b->Cr
+ }
+
asm volatile
(
- "movaps (%0), %%xmm5\n" // R values
- "movaps 16(%0), %%xmm6\n" // G values
- "movaps 32(%0), %%xmm7\n" // B values
+ "movaps 0(%0), %%xmm15\n" // Red, green, bluecorrection
: // no output registers
: "r" (temp)
: // %0
@@ -45,71 +53,96 @@
gfloat *Y = p[0]->getAt(ox, y+oy);
gfloat *Cb = p[1]->getAt(ox, y+oy);
gfloat *Cr = p[2]->getAt(ox, y+oy);
- gint w = (1+image->w) >>1;
+ gint w = (3+image->w) >>2;
asm volatile
(
"unpack_next_pixel:\n"
- "movq (%0), %%rax\n"
- "movq %%rax, %%rbx\n"
- "movq %%rax, %%rcx\n"
- "and $65535, %%rax\n"
- "shr $16, %%rbx\n"
- "shr $32, %%rcx\n"
- "and $65535, %%rbx\n"
- "and $65535, %%rcx\n"
- "movss (%4,%%rax,4), %%xmm0\n"
- "movss (%4,%%rbx,4), %%xmm1\n"
- "movss (%4,%%rcx,4), %%xmm2\n"
+ "movaps (%0), %%xmm0\n" // Load xx,b1,g1,r1,xx,b0,g0,r0
+ "movaps 16(%0), %%xmm2\n" // Load xx,b3,g3,r3,xx,b2,g2,r2
+ "pxor %%xmm5,%%xmm5\n"
+ "movaps %%xmm0, %%xmm1\n"
+ "movaps %%xmm2, %%xmm3\n"
- "movq 8(%0), %%rax\n"
- "movq %%rax, %%rbx\n"
- "movq %%rax, %%rcx\n"
- "and $65535, %%rax\n"
- "shr $16, %%rbx\n"
- "shr $32, %%rcx\n"
- "and $65535, %%rbx\n"
- "and $65535, %%rcx\n"
- "shufps $0, %%xmm0, %%xmm0\n" // Splat r (1)
- "movss (%4,%%rax,4), %%xmm10\n"
- "shufps $0, %%xmm1, %%xmm1\n" // Splat g (1)
- "movss (%4,%%rbx,4), %%xmm11\n"
- "movss (%4,%%rcx,4), %%xmm12\n"
+ "punpcklwd %%xmm5,%%xmm0\n" //00xx 00b0 00g0 00r0
+ "punpckhwd %%xmm5,%%xmm1\n" //00xx 00b1 00g1 00r1
+ "punpcklwd %%xmm5,%%xmm2\n" //00xx 00b2 00g2 00r2
+ "punpckhwd %%xmm5,%%xmm3\n" //00xx 00b3 00g3 00r3
- "shufps $0, %%xmm2, %%xmm2\n" // Splat b (1)
- "mulps %%xmm5, %%xmm0\n" // Multiply R (1)
- "shufps $0, %%xmm10, %%xmm10\n" // Splat r (2)
- "mulps %%xmm6, %%xmm1\n" // Multiply G (1)
- "shufps $0, %%xmm11, %%xmm11\n" // Splat g (2)
- "mulps %%xmm7, %%xmm2\n" // Multiply B (1)
- "shufps $0, %%xmm12, %%xmm12\n" // Splat b (2)
- "mulps %%xmm5, %%xmm10\n" // Multiply R (2)
- "addps %%xmm0, %%xmm1\n" // Add first (1)
- "mulps %%xmm6, %%xmm11\n" // Multiply G (2)
- "addps %%xmm1, %%xmm2\n" // Add second (1)
- "mulps %%xmm7, %%xmm12\n" // Multiply B (2)
+ "cvtdq2ps %%xmm0, %%xmm0\n" // doubleword to float
+ "cvtdq2ps %%xmm1, %%xmm1\n"
+ "cvtdq2ps %%xmm2, %%xmm2\n" // doubleword to float
+ "cvtdq2ps %%xmm3, %%xmm3\n"
- "addps %%xmm10, %%xmm11\n" // Add first (2)
- "addps %%xmm11, %%xmm12\n" // Add second (2)
+ "mulps %%xmm15, %%xmm0\n" // Multiply by
redcorrection/bluecorrection
+ "mulps %%xmm15, %%xmm1\n" // Multiply by
redcorrection/bluecorrection
+ "mulps %%xmm15, %%xmm2\n" // Multiply by
redcorrection/bluecorrection
+ "mulps %%xmm15, %%xmm3\n" // Multiply by
redcorrection/bluecorrection
- "movaps %%xmm2, %%xmm1\n" // Copy (1)
- "movaps %%xmm12, %%xmm11\n" // Copy (2)
+ "rsqrtps %%xmm0, %%xmm0\n" // 1 / sqrt()
+ "rsqrtps %%xmm1, %%xmm1\n"
+ "rsqrtps %%xmm2, %%xmm2\n"
+ "rsqrtps %%xmm3, %%xmm3\n"
- "unpcklps %%xmm12, %%xmm2\n" // Unpack (2) into (1) Y1,Y2, Cb1,
Cb2
- "unpckhps %%xmm11, %%xmm1\n" // Unpack (2) into (1) Cr1,Cr2, xx,
xx
+ "rcpps %%xmm0, %%xmm0\n" // sqrt
+ "rcpps %%xmm1, %%xmm1\n" // sqrt
+ "rcpps %%xmm2, %%xmm2\n" // sqrt
+ "rcpps %%xmm3, %%xmm3\n" // sqrt
- "movlps %%xmm2, (%1)\n" // Store Y
- "movlps %%xmm1, (%3)\n" // Store Cr
- "movhps %%xmm2, (%2)\n" // Store Cb
+ "movaps %%xmm0, %%xmm5\n"
+ "movaps %%xmm2, %%xmm7\n"
+ "unpcklps %%xmm1, %%xmm0\n" //g1 g0 r1 r0
+ "unpcklps %%xmm3, %%xmm2\n" //g3 g2 r3 r2
- "add $16, %0\n"
- "add $8, %1\n"
- "add $8, %2\n"
- "add $8, %3\n"
- "dec %5\n"
+ "movaps %%xmm0, %%xmm4\n" //g1 g0 r1 r0
+ "movlhps %%xmm2, %%xmm0\n" //r3 r2 r1 r0
+ "movhlps %%xmm4, %%xmm2\n" //g3 g2 g1 g0
+
+ "unpckhps %%xmm1, %%xmm5\n" //xx xx b1 b0
+ "unpckhps %%xmm3, %%xmm7\n" //xx xx b3 b2
+ "movlhps %%xmm7, %%xmm5\n" //b3 b2 b1 b0
+
+ "movaps %%xmm2, %%xmm1\n" // Green in xmm1
+ "movaps %%xmm2, %%xmm4\n" // Green (copy) in xmm4
+ "movaps %%xmm5, %%xmm2\n" // Blue in xmm2
+ "movaps %%xmm0, %%xmm3\n" // Red (copy) in xmm3
+
+ "mulps 16(%5), %%xmm3\n" // R->Y
+ "mulps 32(%5), %%xmm4\n" // G->Y
+ "mulps 48(%5), %%xmm5\n" // B->Y
+
+ "movaps %%xmm0, %%xmm6\n" // Red (copy) in xmm6
+ "movaps %%xmm1, %%xmm7\n" // Green (copy) in xmm7
+ "movaps %%xmm2, %%xmm8\n" // Blue (copy) in xmm8
+
+ "mulps 64(%5), %%xmm0\n" // R->Cb
+ "mulps 80(%5), %%xmm1\n" // G->Cb
+ "mulps 96(%5), %%xmm2\n" // B->Cb
+
+ "addps %%xmm4, %%xmm3\n" // Add Y
+ "addps %%xmm1, %%xmm0\n" // Add Cb
+
+ "mulps 112(%5), %%xmm6\n" // R->Cr
+ "mulps 128(%5), %%xmm7\n" // G->Cr
+ "mulps 144(%5), %%xmm8\n" // B->Cr
+
+ "addps %%xmm5, %%xmm3\n" // Add Y (finished)
+ "addps %%xmm2, %%xmm0\n" // Add Cb (finished)
+ "addps %%xmm7, %%xmm6\n" // Add Cr
+ "addps %%xmm8, %%xmm6\n" // Add Cr (finished)
+
+ "movaps %%xmm3, (%1)\n" // Store Y
+ "movaps %%xmm0, (%2)\n" // Store Cb
+ "movaps %%xmm6, (%3)\n" // Store Cr
+
+ "add $32, %0\n"
+ "add $16, %1\n"
+ "add $16, %2\n"
+ "add $16, %3\n"
+ "dec %4\n"
"jnz unpack_next_pixel\n"
: // no output registers
- : "r" (pix), "r" (Y), "r" (Cb), "r" (Cr), "r" (&shortToFloat[0]), "r"
(w)
- // %0 %1 %2 %3 %4
%5
+ : "r" (pix), "r" (Y), "r" (Cb), "r" (Cr), "r" (w), "r" (temp)
+ // %0 %1 %2 %3 %4 %5
: "%rax", "%rbx", "%rcx"
);
}
Modified: trunk/plugins/denoise/floatplanarimage.cpp
===================================================================
--- trunk/plugins/denoise/floatplanarimage.cpp 2009-08-17 21:16:07 UTC (rev
2622)
+++ trunk/plugins/denoise/floatplanarimage.cpp 2009-08-18 21:34:30 UTC (rev
2623)
@@ -161,8 +161,8 @@
RS_IMAGE16* image = j->rs;
#if defined (__x86_64__)
-// if (image->pixelsize == 4)
-// return unpackInterleavedYUV_SSE(j);
+ if (image->pixelsize == 4)
+ return unpackInterleavedYUV_SSE2(j);
#endif
// We cannot look up more than 65535*4
Modified: trunk/plugins/denoise/floatplanarimage.h
===================================================================
--- trunk/plugins/denoise/floatplanarimage.h 2009-08-17 21:16:07 UTC (rev
2622)
+++ trunk/plugins/denoise/floatplanarimage.h 2009-08-18 21:34:30 UTC (rev
2623)
@@ -48,7 +48,7 @@
void packInterleavedYUV_SSE2( const ImgConvertJob* j);
#endif
#if defined (__x86_64__)
- void unpackInterleavedYUV_SSE( const ImgConvertJob* j );
+ void unpackInterleavedYUV_SSE2( const ImgConvertJob* j );
void packInterleavedYUV_SSE4( const ImgConvertJob* j);
#endif
void packInterleavedYUV( const ImgConvertJob* j);
_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit