Author: post
Date: 2012-11-27 17:16:15 +0100 (Tue, 27 Nov 2012)
New Revision: 4316

Modified:
   trunk/plugins/denoise/floatplanarimage-x86.cpp
   trunk/plugins/denoise/floatplanarimage.cpp
   trunk/plugins/denoise/floatplanarimage.h
Log:
Denoise, x64: Enable SSE4 code.

Modified: trunk/plugins/denoise/floatplanarimage-x86.cpp
===================================================================
--- trunk/plugins/denoise/floatplanarimage-x86.cpp      2012-11-27 12:50:59 UTC (rev 4315)
+++ trunk/plugins/denoise/floatplanarimage-x86.cpp      2012-11-27 16:16:15 UTC (rev 4316)
@@ -169,7 +169,146 @@
      );
   }
   asm volatile ( "emms\nsfence\n" );
+}
 
+// Unpack rows [j->start_y, j->end_y) of the interleaved 16-bit image j->rs into
+// the float planes p[0] (Y), p[1] (Cb), p[2] (Cr), four pixels per iteration:
+// scale by (redCorrection, 1, blueCorrection), approximate square root
+// (rsqrtps + rcpps), RGB->YCbCr matrix, then Cb/Cr values whose sign bit is
+// clear are multiplied by 0.5. Only 64 bits, and only if pixelsize is 4.
+// Needs SSE4.1 (blendvps) and 16-byte aligned source and plane rows; up to
+// three pixels past image->w are read and written (width rounded up to 4).
+void FloatPlanarImage::unpackInterleavedYUV_SSE4( const ImgConvertJob* j )
+{
+  RS_IMAGE16* image = j->rs;
+  const gint groups = (3 + image->w) >> 2;  // 4-pixel groups per row
+  if (groups <= 0) return;  // dec/jnz below would otherwise wrap around
+  // Constants read by the asm at fixed byte offsets. A local 16-byte aligned
+  // table, so plane 0's pixel storage is not used as scratch space.
+  float coeff[44] __attribute__ ((aligned (16)));
+  coeff[0] = redCorrection; coeff[1] = 1.0f; coeff[2] = blueCorrection; coeff[3] = 0.0f;
+  for (int i = 0; i < 4; i++) {
+    coeff[i+4] = (0.299);    //r->Y   (byte offset 16)
+    coeff[i+8] = (0.587);    //g->Y   (byte offset 32)
+    coeff[i+12] = (0.114);   //b->Y   (byte offset 48)
+    coeff[i+16] = (-0.169);  //r->Cb  (byte offset 64)
+    coeff[i+20] = (-0.331);  //g->Cb  (byte offset 80)
+    coeff[i+24] = (0.499);   //b->Cb  (byte offset 96)
+    coeff[i+28] = (0.499);   //r->Cr  (byte offset 112)
+    coeff[i+32] = (-0.418);  //g->Cr  (byte offset 128)
+    coeff[i+36] = (-0.0813); //b->Cr  (byte offset 144)
+    coeff[i+40] = 0.5f;      // Cb/Cr scale (byte offset 160)
+  }
+
+  for (int y = j->start_y; y < j->end_y; y++ ) {
+    const gushort* pix = GET_PIXEL(image,0,y);
+    gfloat *Y = p[0]->getAt(ox, y+oy);
+    gfloat *Cb = p[1]->getAt(ox, y+oy);
+    gfloat *Cr = p[2]->getAt(ox, y+oy);
+    gint w = groups;  // per-row counter, decremented by the asm
+    asm volatile (
+        "movaps (%5), %%xmm15\n"        // Red, green, bluecorrection
+        "1:\n"                          // Numeric local label: safe if the asm gets duplicated
+        "movaps (%0), %%xmm0\n"         // Load xx,b1,g1,r1,xx,b0,g0,r0
+        "movaps 16(%0), %%xmm2\n"       // Load xx,b3,g3,r3,xx,b2,g2,r2
+        "prefetchnta 64(%0)\n"         // Prefetch next
+        "pxor %%xmm5,%%xmm5\n"
+        "movaps %%xmm0, %%xmm1\n"
+        "movaps %%xmm2, %%xmm3\n"
+
+        "punpcklwd %%xmm5,%%xmm0\n"     //00xx 00b0 00g0 00r0
+        "punpckhwd %%xmm5,%%xmm1\n"     //00xx 00b1 00g1 00r1
+        "punpcklwd %%xmm5,%%xmm2\n"     //00xx 00b2 00g2 00r2
+        "punpckhwd %%xmm5,%%xmm3\n"     //00xx 00b3 00g3 00r3
+
+        "cvtdq2ps %%xmm0, %%xmm0\n"     // doubleword to float
+        "cvtdq2ps %%xmm1, %%xmm1\n"
+        "cvtdq2ps %%xmm2, %%xmm2\n"     // doubleword to float
+        "cvtdq2ps %%xmm3, %%xmm3\n"
+
+        "mulps %%xmm15, %%xmm0\n"       // Multiply by redcorrection/bluecorrection
+        "mulps %%xmm15, %%xmm1\n"       // Multiply by redcorrection/bluecorrection
+        "mulps %%xmm15, %%xmm2\n"       // Multiply by redcorrection/bluecorrection
+        "mulps %%xmm15, %%xmm3\n"       // Multiply by redcorrection/bluecorrection
+
+        "rsqrtps %%xmm0, %%xmm0\n"      // 1 / sqrt()
+        "rsqrtps %%xmm1, %%xmm1\n"
+        "rsqrtps %%xmm2, %%xmm2\n"
+        "rsqrtps %%xmm3, %%xmm3\n"
+
+        "rcpps %%xmm0, %%xmm0\n"        // 1 / (1 / sqrt()) ~= sqrt
+        "rcpps %%xmm1, %%xmm1\n"        // sqrt
+        "rcpps %%xmm2, %%xmm2\n"        // sqrt
+        "rcpps %%xmm3, %%xmm3\n"        // sqrt
+
+        "movaps %%xmm0, %%xmm5\n"
+        "movaps %%xmm2, %%xmm7\n"
+        "unpcklps %%xmm1, %%xmm0\n"     //g1 g0 r1 r0
+        "unpcklps %%xmm3, %%xmm2\n"     //g3 g2 r3 r2
+
+        "movaps %%xmm0, %%xmm4\n"       //g1 g0 r1 r0
+        "movlhps %%xmm2, %%xmm0\n"      //r3 r2 r1 r0
+        "movhlps %%xmm4, %%xmm2\n"      //g3 g2 g1 g0
+
+        "unpckhps %%xmm1, %%xmm5\n"     //xx xx b1 b0
+        "unpckhps %%xmm3, %%xmm7\n"     //xx xx b3 b2
+        "movlhps %%xmm7, %%xmm5\n"      //b3 b2 b1 b0
+
+        "movaps %%xmm2, %%xmm1\n"     // Green in xmm1
+        "movaps %%xmm2, %%xmm4\n"     // Green (copy) in xmm4
+        "movaps %%xmm5, %%xmm2\n"     // Blue in xmm2
+        "movaps %%xmm0, %%xmm3\n"     // Red (copy) in xmm3
+
+        "mulps 16(%5), %%xmm3\n"     // R->Y
+        "mulps 32(%5), %%xmm4\n"     // G->Y
+        "mulps 48(%5), %%xmm5\n"     // B->Y
+
+        "movaps %%xmm0, %%xmm6\n"     // Red (copy) in xmm6
+        "movaps %%xmm1, %%xmm7\n"     // Green (copy) in xmm7
+        "movaps %%xmm2, %%xmm8\n"     // Blue (copy) in xmm8
+
+        "mulps 64(%5), %%xmm0\n"     // R->Cb
+        "mulps 80(%5), %%xmm1\n"     // G->Cb
+        "mulps 96(%5), %%xmm2\n"     // B->Cb
+
+        "addps %%xmm4, %%xmm3\n"     // Add Y
+        "addps %%xmm1, %%xmm0\n"     // Add Cb
+
+        "mulps 112(%5), %%xmm6\n"     // R->Cr
+        "mulps 128(%5), %%xmm7\n"     // G->Cr
+        "mulps 144(%5), %%xmm8\n"     // B->Cr
+
+        "addps %%xmm5, %%xmm3\n"     // Add Y (finished)
+        "addps %%xmm2, %%xmm0\n"     // Add Cb (finished)
+        "addps %%xmm7, %%xmm6\n"     // Add Cr
+        "addps %%xmm8, %%xmm6\n"     // Add Cr (finished)
+
+        "movaps %%xmm0, %%xmm12\n"     // Cb
+        "movaps %%xmm6, %%xmm13\n"     // Cr
+        "mulps 160(%5), %%xmm12\n"     // Cb * 0.5f
+        "mulps 160(%5), %%xmm13\n"     // Cr * 0.5f
+
+        "blendvps %%xmm0, %%xmm12\n"     // Cb(xmm0) < 0 ? Cb : Cb*0.5 (mask is implicit xmm0)
+        "movaps %%xmm6, %%xmm0\n"     // Cr becomes the blend mask
+        "blendvps %%xmm6, %%xmm13\n"     // Cr(xmm0) < 0 ? Cr : Cr*0.5
+
+        "movntps %%xmm3, (%1)\n"      // Store Y (non-temporal, float data)
+        "movntps %%xmm12, (%2)\n"      // Store Cb
+        "movntps %%xmm13, (%3)\n"      // Store Cr
+
+        "add $32, %0\n"
+        "add $16, %1\n"
+        "add $16, %2\n"
+        "add $16, %3\n"
+        "dec %4\n"
+        "jnz 1b\n"
+        : "+r" (pix), "+r" (Y), "+r" (Cb), "+r" (Cr), "+r" (w)  // %0..%4, modified by the loop
+        : "r" (coeff)                                            // %5
+        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8",
+          "xmm12", "xmm13", "xmm15", "memory", "cc"
+     );
+  }
+  asm volatile ( "emms\nsfence\n" );
 }
 #endif // defined (__x86_64__)
 
@@ -279,7 +418,6 @@
   asm volatile ( "emms\nsfence\n" );
 }
 
-#if 0
 void FloatPlanarImage::packInterleavedYUV_SSE4( const ImgConvertJob* j)
 {
   RS_IMAGE16* image = j->rs;
@@ -316,6 +454,16 @@
       "loopback_YUV_SSE4_64:"
       "movaps (%2), %%xmm1\n"         // xmm1: Cb (4 pixels)
       "movaps (%3), %%xmm2\n"         // xmm2: Cr
+
+      "movaps %%xmm1, %%xmm3\n"        // xmm3: Cb
+      "movaps %%xmm2, %%xmm4\n"        // xmm4: Cr
+      "addps %%xmm1, %%xmm1\n"         // Cb * 2f
+      "addps %%xmm2, %%xmm2\n"         // Cr * 2f
+      "movaps %%xmm1, %%xmm0\n"        // xmm0: sign for test
+      "blendvps %%xmm3, %%xmm1\n"     // Cb < 0 ? Cb (src) : Cb * 2 (dest)
+      "movaps %%xmm2, %%xmm0\n"        // xmm0: sign for test
+      "blendvps %%xmm4, %%xmm2\n"     // Cr < 0 ? Cr (src) : Cr * 2 (dest)
+
       "movaps (%1), %%xmm0\n"         // xmm0: Y
       "movaps %%xmm1, %%xmm3\n"       // xmm3: Cb
       "movaps %%xmm2, %%xmm4\n"       // xmm4: Cr
@@ -360,7 +508,6 @@
   }
   asm volatile ( "emms\nsfence\n" );
 }
-#endif
 
 #else  // 32 bits
 

Modified: trunk/plugins/denoise/floatplanarimage.cpp
===================================================================
--- trunk/plugins/denoise/floatplanarimage.cpp  2012-11-27 12:50:59 UTC (rev 4315)
+++ trunk/plugins/denoise/floatplanarimage.cpp  2012-11-27 16:16:15 UTC (rev 4316)
@@ -170,7 +170,9 @@
   blueCorrection = MAX(0.0f, blueCorrection);
   
 #if defined (__x86_64__)
-  if (image->pixelsize == 4)
+  if (image->pixelsize == 4 && (rs_detect_cpu_features() & RS_CPU_FLAG_SSE4_1))
+    return unpackInterleavedYUV_SSE4(j);
+  else if (image->pixelsize == 4)
     return unpackInterleavedYUV_SSE2(j);
 #endif
 
@@ -233,9 +235,8 @@
   guint cpu = rs_detect_cpu_features();
 #if defined (__x86_64__)
   if ((image->pixelsize == 4) && (cpu & RS_CPU_FLAG_SSE4_1))  {
-    // TODO: Test on SSE4 capable machine before enabling.
-//    packInterleavedYUV_SSE4(j);
-//    return;
+    packInterleavedYUV_SSE4(j);
+    return;
   }
 #endif
 #if defined (__i386__) || defined (__x86_64__)

Modified: trunk/plugins/denoise/floatplanarimage.h
===================================================================
--- trunk/plugins/denoise/floatplanarimage.h    2012-11-27 12:50:59 UTC (rev 4315)
+++ trunk/plugins/denoise/floatplanarimage.h    2012-11-27 16:16:15 UTC (rev 4316)
@@ -52,8 +52,9 @@
   void packInterleavedYUV_SSE2( const ImgConvertJob* j);
 #endif
 #if defined (__x86_64__)
+  void unpackInterleavedYUV_SSE4( const ImgConvertJob* j );
   void unpackInterleavedYUV_SSE2( const ImgConvertJob* j );
-//  void packInterleavedYUV_SSE4( const ImgConvertJob* j);
+  void packInterleavedYUV_SSE4( const ImgConvertJob* j);
 #endif
   void packInterleavedYUV( const ImgConvertJob* j);
   JobQueue* getUnpackInterleavedYUVJobs(RS_IMAGE16* image);


_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit

Reply via email to