Author: post
Date: 2012-11-07 19:14:26 +0100 (Wed, 07 Nov 2012)
New Revision: 4308

Modified:
   trunk/plugins/denoise/complexfilter-x86.cpp
Log:
Utilize the fact that sqrt(x) = x * (1 / sqrt(x)) for x > 0: each rsqrtps + rcpps
pair is replaced by rsqrtps + mulps. Faster processing on older machines.

Modified: trunk/plugins/denoise/complexfilter-x86.cpp
===================================================================
--- trunk/plugins/denoise/complexfilter-x86.cpp 2012-11-07 17:59:24 UTC (rev 
4307)
+++ trunk/plugins/denoise/complexfilter-x86.cpp 2012-11-07 18:14:26 UTC (rev 
4308)
@@ -103,12 +103,12 @@
     "rcpps %%xmm13, %%xmm13\n"        // 1 / (psd + 
sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) (stall)
     "movaps 16(%4), %%xmm4\n"       // Load wsharpen
     "mulps %%xmm5, %%xmm7\n"        // psd*sigmaSquaredSharpenMax/((psd + 
sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
-    "mulps %%xmm13, %%xmm15\n"        // psd*sigmaSquaredSharpenMax/((psd + 
sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
+    "mulps %%xmm13, %%xmm15\n"        // psd*sigmaSquaredSharpenMax/((psd + 
sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm13 free
+    "rsqrtps %%xmm7, %%xmm5\n"       // 1.0 / sqrt( 
psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax))
+    "rsqrtps %%xmm15, %%xmm13\n"       // 1.0 / sqrt( 
psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax))
+    "mulps %%xmm5, %%xmm7\n"        // sqrt(x)
+    "mulps %%xmm13, %%xmm15\n"        //
     "movaps 64(%1), %%xmm5\n"       // Load "1.0"
-    "rsqrtps %%xmm7, %%xmm7\n"       // 1.0 / sqrt( 
psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax))
-    "rsqrtps %%xmm15, %%xmm15\n"       // 1.0 / sqrt( 
psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax))
-    "rcpps %%xmm7, %%xmm7\n"        // sqrt (..)
-    "rcpps %%xmm15, %%xmm15\n"        // sqrt (..)
     "mulps %%xmm6, %%xmm7\n"        // multiply wsharpen
     "mulps %%xmm4, %%xmm15\n"        // multiply wsharpen
     "addps %%xmm5, %%xmm7\n"        // + 1.0 xmm7 = sfact
@@ -178,11 +178,11 @@
     "addps %%xmm13, %%xmm5\n"        //xmm5 = psd + sigmaSquaredSharpenMin 
//xmm6 free
 
     "mulps %%xmm4, %%xmm5\n"        // (psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax) xmm4 free
-    "movaps (%4), %%xmm6\n"         // load wsharpen[0->4]
     "rcpps %%xmm5, %%xmm5\n"        // 1 / (psd + sigmaSquaredSharpenMin)*(psd 
+ sigmaSquaredSharpenMax) (stall)
     "mulps %%xmm5, %%xmm7\n"        // psd*sigmaSquaredSharpenMax/((psd + 
sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
-    "rsqrtps %%xmm7, %%xmm7\n"       // 1.0 / sqrt( 
psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax))
-    "rcpps %%xmm7, %%xmm7\n"        // sqrt (..)
+    "rsqrtps %%xmm7, %%xmm6\n"       // 1.0 / sqrt( 
psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax))
+    "mulps %%xmm6, %%xmm7\n"        // sqrt
+    "movaps (%4), %%xmm6\n"         // load wsharpen[0->4]
     "mulps %%xmm6, %%xmm7\n"        // multiply wsharpen
     "addps %%xmm15, %%xmm7\n"        // + 1.0 xmm7 = sfact
     "movaps %%xmm7, %%xmm5\n"
@@ -256,12 +256,12 @@
     "addps %%xmm6, %%xmm5\n"        //xmm5 = psd + sigmaSquaredSharpenMin 
//xmm6 free
     
     "mulps %%xmm4, %%xmm5\n"        // (psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax) xmm4 free
-    "movaps (%4), %%xmm6\n"         // load wsharpen[0->4]
     "rcpps %%xmm5, %%xmm5\n"        // 1 / (psd + sigmaSquaredSharpenMin)*(psd 
+ sigmaSquaredSharpenMax) (stall)
     "mulps %%xmm5, %%xmm7\n"        // psd*sigmaSquaredSharpenMax/((psd + 
sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
+    "rsqrtps %%xmm7, %%xmm6\n"       // 1.0 / sqrt( 
psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax))
+    "mulps %%xmm6, %%xmm7\n"        // sqrt(x)
+    "movaps (%4), %%xmm6\n"         // load wsharpen[0->4]
     "movaps 64(%1), %%xmm5\n"       // Load "1.0"
-    "rsqrtps %%xmm7, %%xmm7\n"       // 1.0 / sqrt( 
psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax))
-    "rcpps %%xmm7, %%xmm7\n"        // sqrt (..)
     "mulps %%xmm6, %%xmm7\n"        // multiply wsharpen 
     "addps %%xmm5, %%xmm7\n"        // + 1.0 xmm7 = sfact
     "movaps %%xmm7, %%xmm5\n"
@@ -340,9 +340,9 @@
     "movaps (%4), %%xmm6\n"         // load wsharpen[0->4]
     "rcpps %%xmm5, %%xmm5\n"        // 1 / (psd + sigmaSquaredSharpenMin)*(psd 
+ sigmaSquaredSharpenMax) (stall)
     "mulps %%xmm5, %%xmm7\n"        // psd*sigmaSquaredSharpenMax/((psd + 
sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
+    "rsqrtps %%xmm7, %%xmm5\n"       // 1.0 / sqrt( 
psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax))
+    "mulps %%xmm5, %%xmm7\n"        // sqrt(x)
     "movaps 64(%1), %%xmm5\n"       // Load "1.0"
-    "rsqrtps %%xmm7, %%xmm7\n"       // 1.0 / sqrt( 
psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax))
-    "rcpps %%xmm7, %%xmm7\n"        // sqrt (..)
     "mulps %%xmm6, %%xmm7\n"        // multiply wsharpen 
     "addps %%xmm5, %%xmm7\n"        // + 1.0 xmm7 = sfact
     "movaps %%xmm7, %%xmm5\n"
@@ -432,8 +432,8 @@
     "mulps %%xmm4, %%xmm5\n"        // (psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax) xmm4 free
     "rcpps %%xmm5, %%xmm5\n"        // 1 / (psd + sigmaSquaredSharpenMin)*(psd 
+ sigmaSquaredSharpenMax) (stall)
     "mulps %%xmm5, %%xmm7\n"        // psd*sigmaSquaredSharpenMax/((psd + 
sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
-    "rsqrtps %%xmm7, %%xmm7\n"       // 1 / sqrt( 
psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax))
-    "rcpps %%xmm7, %%xmm7\n"        // sqrt(...)
+    "rsqrtps %%xmm7, %%xmm5\n"       // 1 / sqrt( 
psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax))
+    "mulps %%xmm5, %%xmm7\n"        // sqrt(x)
     "mulps (%4), %%xmm7\n"        // multiply wsharpen
     "addps %%xmm9, %%xmm7\n"        // + 1.0 xmm7 = sfact
     "mulps %%xmm6, %%xmm7\n"        // *= Wienerfactor
@@ -522,9 +522,9 @@
     "mulps %%xmm4, %%xmm5\n"        // (psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax) xmm4 free
     "rcpps %%xmm5, %%xmm5\n"        // 1 / (psd + sigmaSquaredSharpenMin)*(psd 
+ sigmaSquaredSharpenMax) (stall)
     "mulps %%xmm5, %%xmm7\n"        // psd*sigmaSquaredSharpenMax/((psd + 
sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
+    "rsqrtps %%xmm7, %%xmm5\n"      // 1.0 / sqrt( 
psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax))
+    "mulps %%xmm5, %%xmm7\n"        // sqrt(x)
     "movaps 64(%1), %%xmm5\n"       // Load "1.0"
-    "rsqrtps %%xmm7, %%xmm7\n"      // 1.0 / sqrt( 
psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax))
-    "rcpps %%xmm7, %%xmm7\n"        // sqrt (..)
     "mulps (%4), %%xmm7\n"        // multiply wsharpen 
     "addps %%xmm5, %%xmm7\n"        // + 1.0 xmm7 = sfact
     "mulps %%xmm6, %%xmm7\n"        // *= Wienerfactor
@@ -617,9 +617,9 @@
     "mulps %%xmm4, %%xmm5\n"        // (psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax) xmm4 free
     "rcpps %%xmm5, %%xmm5\n"        // 1 / (psd + sigmaSquaredSharpenMin)*(psd 
+ sigmaSquaredSharpenMax) (stall)
     "mulps %%xmm5, %%xmm7\n"        // psd*sigmaSquaredSharpenMax/((psd + 
sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
+    "rsqrtps %%xmm7, %%xmm5\n"       // 1.0 / sqrt( 
psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax))
+    "mulps %%xmm5, %%xmm7\n"        // sqrt(x)
     "movaps 64(%1), %%xmm5\n"       // Load "1.0"
-    "rsqrtps %%xmm7, %%xmm7\n"       // 1.0 / sqrt( 
psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + 
sigmaSquaredSharpenMax))
-    "rcpps %%xmm7, %%xmm7\n"        // sqrt (..)
     "mulps (%4), %%xmm7\n"        // multiply wsharpen 
     "addps %%xmm5, %%xmm7\n"        // + 1.0 xmm7 = sfact
     "mulps %%xmm6, %%xmm7\n"        // *= Wienerfactor
@@ -878,6 +878,7 @@
 }
 
 #else // 32 bits
+
 void ComplexWienerFilterDeGrid::processNoSharpen_SSE3( ComplexBlock* block ) 
 {
   fftwf_complex* outcur = block->complex;


_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit

Reply via email to