Author: post
Date: 2009-06-27 16:44:19 +0200 (Sat, 27 Jun 2009)
New Revision: 2514

Modified:
   trunk/plugins/denoise/floatplanarimage-x86.cpp
   trunk/plugins/denoise/floatplanarimage.cpp
Log:
Denoise: Enabled the RGB-to-YUV SSE assembler path in x86-64 builds only (used
when pixelsize is 4). Slightly faster on Conroe; should favour Nehalem CPUs.

Modified: trunk/plugins/denoise/floatplanarimage-x86.cpp
===================================================================
--- trunk/plugins/denoise/floatplanarimage-x86.cpp      2009-06-27 12:00:24 UTC 
(rev 2513)
+++ trunk/plugins/denoise/floatplanarimage-x86.cpp      2009-06-27 14:44:19 UTC 
(rev 2514)
@@ -17,18 +17,17 @@
 */
 #include "floatplanarimage.h"
 
-#if defined (__i386__) || defined (__x86_64__)
+#if defined (__x86_64__)
 
+// Only 64 bits, and only if pixelsize is 4
 void FloatPlanarImage::unpackInterleavedYUV_SSE( const ImgConvertJob* j )
-{
+{  
   RS_IMAGE16* image = j->rs;
   float* temp = p[0]->data;
-  temp[0] = (0.299f* WB_R_CORR); temp[4] = 0.587f; temp[8] = (0.114f * 
WB_B_CORR); temp[3] = 0.0f;
-  temp[1] = (-0.169f* WB_R_CORR); temp[5] = -0.331f; temp[9] = (0.499 * 
WB_B_CORR); temp[7] = 0.0f;
-  temp[2] = (0.499f* WB_R_CORR); temp[6] = -0.418f; temp[10] =(-0.0813f * 
WB_B_CORR); temp[11] = 0.0f;
+  temp[0] = (0.299* WB_R_CORR); temp[4] = 0.587; temp[8] = (0.114 * 
WB_B_CORR); temp[3] = 0.0f;
+  temp[1] = (-0.169* WB_R_CORR); temp[5] = -0.331; temp[9] = (0.499 * 
WB_B_CORR); temp[7] = 0.0f;
+  temp[2] = (0.499* WB_R_CORR); temp[6] = -0.418; temp[10] =(-0.0813 * 
WB_B_CORR); temp[11] = 0.0f;
 
-  float* xfer =  (float*)fftwf_malloc(3*sizeof(float));
-
   asm volatile
   (
     "movaps (%0), %%xmm5\n"     // R values
@@ -43,39 +42,75 @@
     gfloat *Y = p[0]->getAt(ox, y+oy);
     gfloat *Cb = p[1]->getAt(ox, y+oy);
     gfloat *Cr = p[2]->getAt(ox, y+oy);
+    gint w = (1+image->w) >>1;
+    asm volatile
+    (
+        "unpack_next_pixel:\n"
+        "movq (%0), %%rax\n"
+        "movq %%rax, %%rbx\n"
+        "movq %%rax, %%rcx\n"
+        "and $65535, %%rax\n"
+        "shr $16, %%rbx\n"
+        "shr $32, %%rcx\n"
+        "and $65535, %%rbx\n"
+        "and $65535, %%rcx\n"
+        "movss (%4,%%rax,4), %%xmm0\n"
+        "movss (%4,%%rbx,4), %%xmm1\n"
+        "movss (%4,%%rcx,4), %%xmm2\n"
 
-    for (int x=0; x<image->w; x++) {
-      xfer[0] = shortToFloat[(*pix)];     // r
-      xfer[1] = shortToFloat[(*(pix+1))]; // g
-      xfer[2] = shortToFloat[(*(pix+2))]; // b
-      asm volatile
-      (
-        "movss (%0), %%xmm0\n"        // Move r  into xmm0 (load 1 to avoid 
StoreLoadForward pentalty)
-        "movss 4(%0), %%xmm1\n"       // Move g into xmm1
-        "movss 8(%0), %%xmm2\n"       // Move r into xmm2
-        "shufps $0, %%xmm0, %%xmm0\n" // Splat r
-        "shufps $0, %%xmm1, %%xmm1\n" // Splat g
-        "shufps $0, %%xmm2, %%xmm2\n" // Splat b
-        "mulps %%xmm5, %%xmm0\n"      // Multiply R
-        "mulps %%xmm6, %%xmm1\n"      // Multiply G
-        "mulps %%xmm7, %%xmm2\n"      // Multiply B
-        "addps %%xmm0, %%xmm1\n"      // Add first
-        "addps %%xmm1, %%xmm2\n"      // Add second
-        "shufps $85, %%xmm2, %%xmm0\n" // Move Cb into xmm0 lower  (85 = 
01010101)
-        "movss %%xmm2, (%1)\n"        // Store Y
-        "movhlps %%xmm2, %%xmm1\n"      // Move Cr into xmm1 low
-        "movss %%xmm0, (%2)\n"        // Store Cb
-        "movss %%xmm1, (%3)\n"        // Store Cr
+        "movq 8(%0), %%rax\n"
+        "movq %%rax, %%rbx\n"
+        "movq %%rax, %%rcx\n"
+        "and $65535, %%rax\n"
+        "shr $16, %%rbx\n"
+        "shr $32, %%rcx\n"
+        "and $65535, %%rbx\n"
+        "and $65535, %%rcx\n"
+        "shufps $0, %%xmm0, %%xmm0\n" // Splat r (1)
+        "movss (%4,%%rax,4), %%xmm10\n"
+        "shufps $0, %%xmm1, %%xmm1\n" // Splat g (1)
+        "movss (%4,%%rbx,4), %%xmm11\n"
+        "movss (%4,%%rcx,4), %%xmm12\n"
+
+        "shufps $0, %%xmm2, %%xmm2\n" // Splat b (1)
+        "mulps %%xmm5, %%xmm0\n"      // Multiply R (1)
+        "shufps $0, %%xmm10, %%xmm10\n" // Splat r (2)
+        "mulps %%xmm6, %%xmm1\n"      // Multiply G (1)
+        "shufps $0, %%xmm11, %%xmm11\n" // Splat g (2)
+        "mulps %%xmm7, %%xmm2\n"      // Multiply B (1)
+        "shufps $0, %%xmm12, %%xmm12\n" // Splat b (2)
+        "mulps %%xmm5, %%xmm10\n"      // Multiply R (2)
+        "addps %%xmm0, %%xmm1\n"      // Add first (1)
+        "mulps %%xmm6, %%xmm11\n"      // Multiply G (2)
+        "addps %%xmm1, %%xmm2\n"      // Add second (1)
+        "mulps %%xmm7, %%xmm12\n"      // Multiply B (2)
+
+        "addps %%xmm10, %%xmm11\n"      // Add first (2)
+        "addps %%xmm11, %%xmm12\n"      // Add second (2)
+
+        "movaps %%xmm2, %%xmm1\n"     // Copy (1)
+        "movaps %%xmm12, %%xmm11\n"     // Copy (2)
+
+        "unpcklps %%xmm12, %%xmm2\n"     // Unpack (2) into (1)  Y1,Y2, Cb1, 
Cb2
+        "unpckhps %%xmm11, %%xmm1\n"     // Unpack (2) into (1)  Cr1,Cr2, xx, 
xx
+
+        "movlps %%xmm2, (%1)\n"           // Store Y
+        "movlps %%xmm1, (%3)\n"           // Store Cr
+        "movhps %%xmm2, (%2)\n"           // Store Cb
+
+        "add $16, %0\n"
+        "add $8, %1\n"
+        "add $8, %2\n"
+        "add $8, %3\n"
+        "dec %5\n"
+        "jnz unpack_next_pixel\n"
         : // no output registers
-        : "r" (&xfer[0]), "r" (Y), "r" (Cb),  "r" (Cr)
-        : //  %0         %1       %2         %3
+        : "r" (pix), "r" (Y), "r" (Cb),  "r" (Cr), "r" (&shortToFloat[0]), "r" 
(w)
+         // %0         %1       %2         %3           %4                    
%5     
+        : "%rax", "%rbx", "%rcx"
      );
-      Y++; Cb++; Cr++;
-      pix += image->pixelsize;
-    }
   }
   asm volatile ( "emms\n" );
-  fftwf_free(xfer);
 
 }
 

Modified: trunk/plugins/denoise/floatplanarimage.cpp
===================================================================
--- trunk/plugins/denoise/floatplanarimage.cpp  2009-06-27 12:00:24 UTC (rev 
2513)
+++ trunk/plugins/denoise/floatplanarimage.cpp  2009-06-27 14:44:19 UTC (rev 
2514)
@@ -153,11 +153,9 @@
 {
   RS_IMAGE16* image = j->rs;
   
-#if defined (__i386__) || defined (__x86_64__)
-  // FIXME: Seems slower - and yellow cast - check??
-/*  guint cpu = rs_detect_cpu_features();
-  if (cpu & RS_CPU_FLAG_SSE)
-    return unpackInterleavedYUV_SSE(j);*/
+#if defined (__x86_64__)
+  if (image->pixelsize == 4)
+    return unpackInterleavedYUV_SSE(j);
 #endif
 
   for (int y = j->start_y; y < j->end_y; y++ ) {


_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit

Reply via email to