Author: post
Date: 2009-08-20 17:15:15 +0200 (Thu, 20 Aug 2009)
New Revision: 2626

Modified:
   trunk/plugins/denoise/floatplanarimage-x86.cpp
Log:
Denoise: Refactored float yuv -> rgb shorts to avoid slow clamping, thanks to 
akupenguin and Dark Shikari for assistance.

Modified: trunk/plugins/denoise/floatplanarimage-x86.cpp
===================================================================
--- trunk/plugins/denoise/floatplanarimage-x86.cpp      2009-08-19 17:53:17 UTC (rev 2625)
+++ trunk/plugins/denoise/floatplanarimage-x86.cpp      2009-08-20 15:15:15 UTC (rev 2626)
@@ -164,7 +164,8 @@
     temp[i+12] = 1.772f;    // Cb to b
     temp[i+16] = (1.0f/redCorrection);   // Red correction
     temp[i+20] = (1.0f/blueCorrection);    // Blue correction
-    temp[i+24] = 65535.0f;    // Saturation
+    *((gint*)&temp[i+24]) = 32768;        // Subtract
+    *((guint*)&temp[i+28]) = 0x80008000;    // xor sign shift
   }
 
   asm volatile
@@ -175,8 +176,9 @@
     "movaps 48(%0), %%xmm13\n"   // Cb to b
     "movaps 64(%0), %%xmm14\n"   // Red Correction
     "movaps 80(%0), %%xmm15\n"   // Blue Correction
-    "movaps 96(%0), %%xmm9\n"   // Saturation point
+    "movaps 96(%0), %%xmm9\n"   // 0x00008000
     "pxor %%xmm8, %%xmm8\n"     // Zero
+    "movaps 112(%0), %%xmm7\n"   // word 0x8000
     : // no output registers
     : "r" (temp)
     : //  %0
@@ -204,31 +206,28 @@
       "addps %%xmm0, %%xmm3\n"        // Add Y to blue
       "addps %%xmm0, %%xmm4\n"        // Add Y to red - xmm 0 free
       "mulps %%xmm1, %%xmm1\n"        // Square green
-      "minps %%xmm9, %%xmm1\n"        // Saturate green
+      "cvtps2dq %%xmm1, %%xmm1\n"     // Convert green to dwords
       "mulps %%xmm3, %%xmm3\n"        // Square blue
       "mulps %%xmm4, %%xmm4\n"        // Square red
-      "cvtps2dq %%xmm1, %%xmm1\n"     // Convert green to dwords
-      "mulps %%xmm15, %%xmm3\n"        // Multiply blue correction - maybe not needed later
-      "mulps %%xmm14, %%xmm4\n"        // Multiply red correction - maybe not needed later
-      "minps %%xmm9, %%xmm3\n"        // Saturate blue
-      "minps %%xmm9, %%xmm4\n"        // Saturate red
+      "mulps %%xmm15, %%xmm3\n"       // Multiply blue correction - maybe not needed later
+      "mulps %%xmm14, %%xmm4\n"       // Multiply red correction - maybe not needed later
+      "psubd %%xmm9, %%xmm1\n"        // g = g - 32768  ( to avoid saturation)
       "cvtps2dq %%xmm3, %%xmm3\n"     // Convert blue to dwords
+      "packssdw %%xmm1,%%xmm1\n"      // g3g2 g1g0 g3g2 g1g0
       "cvtps2dq %%xmm4, %%xmm4\n"     // Convert red to dwords
-      "movdqa %%xmm1, %%xmm0\n"       // Copy green into xmm0
-      "movdqa %%xmm3, %%xmm2\n"       // Copy blue into xmm2
-      "movdqa %%xmm4, %%xmm5\n"       // Copy red into xmm5
-      "pcmpgtd %%xmm8, %%xmm1\n"      // if (xmm1 > 0) xmm1 = ones - green
-      "pcmpgtd %%xmm8, %%xmm3\n"      // same for blue
-      "pcmpgtd %%xmm8, %%xmm4\n"      // same for red
-      "pand %%xmm0, %%xmm1\n"         // Green in xmm1
-      "pand %%xmm5, %%xmm4\n"         // Red in xmm4
-      "pslld $16, %%xmm1\n"           // Shift up green
-      "pand %%xmm2, %%xmm3\n"         // Blue in xmm3
-      "por %%xmm1, %%xmm4\n"          // Interleave red & green
-      "movdqa %%xmm4, %%xmm0\n"       // Copy red &green into xmm0
+      "pxor %%xmm7, %%xmm1\n"         // Shift sign
+      "psubd %%xmm9, %%xmm3\n"        // b = b - 32768  ( to avoid saturation)
+      "psubd %%xmm9, %%xmm4\n"        // r = r - 32768  ( to avoid saturation)
+      "packssdw %%xmm3,%%xmm3\n"      // b3b2 b1b0 b3b2 b1b0
+      "packssdw %%xmm4,%%xmm4\n"      // g3g2 g1g0 r3r2 r1r0
+      "pxor %%xmm7, %%xmm3\n"         // Shift sign (b)
+      "pxor %%xmm7, %%xmm4\n"         // Shift sign (r)
+      "punpcklwd %%xmm1, %%xmm4\n"    // g3r3 g2r2 g1r1 g0r0
+      "punpcklwd %%xmm8, %%xmm3\n"    // 00b3 00b2 00b1 00b0
+      "movdqa %%xmm4, %%xmm0\n"       // Copy r&g
       "punpckldq %%xmm3, %%xmm4\n"    // Interleave lower blue into reg&green 
in xmm4 Now 00b1 g1r1 00b0 g0r0
       "punpckhdq %%xmm3, %%xmm0\n"    // Interleave higher blue into reg&green 
in xmm0 Now 00b3 g3r3 00b2 g2r2
-      "movdqa %%xmm4, (%0)\n"       // Store low pixels
+      "movdqa %%xmm4, (%0)\n"         // Store low pixels
       "movdqa %%xmm0, 16(%0)\n"       // Store high pixels
       "add $32, %0\n"
       "add $16, %1\n"
@@ -255,7 +254,6 @@
     temp[i+12] = 1.772f;    // Cb to b
     temp[i+16] = (1.0f/redCorrection);   // Red correction
     temp[i+20] = (1.0f/blueCorrection);    // Blue correction
-    temp[i+24] = 65535.0f;    // Saturation
   }
 
   asm volatile
@@ -339,13 +337,15 @@
     temp[i+12] = 1.772f;    // Cb to b
     temp[i+16] = (1.0f/redCorrection);   // Red correction
     temp[i+20] = (1.0f/blueCorrection);    // Blue correction
-    temp[i+24] = 65535.0f;    // Saturation
+    *((gint*)&temp[i+24]) = 32768;        // Subtract
+    *((guint*)&temp[i+28]) = 0x80008000;    // xor sign shift
   }
   int* itemp = (int*)(&temp[28]);
 
   asm volatile
   (
-    "movaps 96(%0), %%xmm7\n"   // Saturation point
+    "movaps 96(%0), %%xmm7\n"   // Subtract
+    "movaps 112(%0), %%xmm5\n"   // Xor sign
     "pxor %%xmm6, %%xmm6\n"     // Zero
     : // no output registers
     : "r" (temp)
@@ -374,30 +374,28 @@
       "addps %%xmm0, %%xmm3\n"        // Add Y to blue
       "addps %%xmm0, %%xmm4\n"        // Add Y to red - xmm 0 free
       "mulps %%xmm1, %%xmm1\n"        // Square green
-      "minps %%xmm7, %%xmm1\n"        // Saturate green
       "mulps %%xmm3, %%xmm3\n"        // Square blue
       "mulps %%xmm4, %%xmm4\n"        // Square red
       "cvtps2dq %%xmm1, %%xmm1\n"     // Convert green to dwords
       "mulps 80(%4), %%xmm3\n"        // Multiply blue correction - maybe not 
needed later
       "mulps 64(%4), %%xmm4\n"        // Multiply red correction - maybe not 
needed later
-      "minps %%xmm7, %%xmm3\n"        // Saturate blue
-      "minps %%xmm7, %%xmm4\n"        // Saturate red
+      "psubd %%xmm7, %%xmm1\n"        // g = g - 32768  ( to avoid saturation)
       "cvtps2dq %%xmm3, %%xmm3\n"     // Convert blue to dwords
+      "packssdw %%xmm1,%%xmm1\n"      // g3g2 g1g0 g3g2 g1g0
       "cvtps2dq %%xmm4, %%xmm4\n"     // Convert red to dwords
-      "movdqa %%xmm1, %%xmm0\n"       // Copy green into xmm0
-      "movdqa %%xmm3, %%xmm2\n"       // Copy blue into xmm2
-      "movdqa %%xmm4, %%xmm5\n"       // Copy red into xmm5
-      "pcmpgtd %%xmm6, %%xmm1\n"      // if (xmm1 > 0) xmm1 = ones - green
-      "pcmpgtd %%xmm6, %%xmm3\n"      // same for blue
-      "pcmpgtd %%xmm6, %%xmm4\n"      // same for red
-      "pand %%xmm0, %%xmm1\n"         // Green in xmm1
-      "pand %%xmm5, %%xmm4\n"         // Red in xmm4
-      "pslld $16, %%xmm1\n"           // Shift up green
-      "pand %%xmm2, %%xmm3\n"         // Blue in xmm3
-      "por %%xmm1, %%xmm4\n"          // Interleave red & green
-      "movdqa %%xmm4, %%xmm0\n"       // Copy red &green into xmm0
+      "pxor %%xmm5, %%xmm1\n"         // Shift sign
+      "psubd %%xmm7, %%xmm3\n"        // b = b - 32768  ( to avoid saturation)
+      "psubd %%xmm7, %%xmm4\n"        // r = r - 32768  ( to avoid saturation)
+      "packssdw %%xmm3,%%xmm3\n"      // b3b2 b1b0 b3b2 b1b0
+      "packssdw %%xmm4,%%xmm4\n"      // g3g2 g1g0 r3r2 r1r0
+      "pxor %%xmm5, %%xmm3\n"         // Shift sign (b)
+      "pxor %%xmm5, %%xmm4\n"         // Shift sign (r)
+      "punpcklwd %%xmm1, %%xmm4\n"    // g3r3 g2r2 g1r1 g0r0
+      "punpcklwd %%xmm8, %%xmm3\n"    // 00b3 00b2 00b1 00b0
+      "movdqa %%xmm4, %%xmm0\n"       // Copy r&g
       "punpckldq %%xmm3, %%xmm4\n"    // Interleave lower blue into reg&green 
in xmm4 Now 00b1 g1r1 00b0 g0r0
       "punpckhdq %%xmm3, %%xmm0\n"    // Interleave higher blue into reg&green 
in xmm0 Now 00b3 g3r3 00b2 g2r2
+
       "movdqa %%xmm4, (%0)\n"       // Store low pixels
       "movdqa %%xmm0, 16(%0)\n"       // Store high pixels
       "add $32, %0\n"


_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit

Reply via email to