Author: post
Date: 2009-12-31 00:14:23 +0100 (Thu, 31 Dec 2009)
New Revision: 2930

Added:
   branches/rawstudio-ng-color/plugins/lensfun/lensfun-sse2.c
Modified:
   branches/rawstudio-ng-color/plugins/lensfun/Makefile.am
   branches/rawstudio-ng-color/plugins/lensfun/lensfun.c
Log:
Lensfun: Put SSE2 into separate file, so we don't risk SSE2 in lensfun.c

Modified: branches/rawstudio-ng-color/plugins/lensfun/Makefile.am
===================================================================
--- branches/rawstudio-ng-color/plugins/lensfun/Makefile.am     2009-12-30 22:39:52 UTC (rev 2929)
+++ branches/rawstudio-ng-color/plugins/lensfun/Makefile.am     2009-12-30 23:14:23 UTC (rev 2930)
@@ -1,15 +1,9 @@
 plugindir = $(libdir)/rawstudio/plugins
 
-if CAN_COMPILE_SSE2
-SSE_FLAG=-msse2
-else
-SSE_FLAG=
-endif
-
 AM_CFLAGS =\
        -Wall\
        -O4\
-       $(SSE_FLAG)
+       -funroll-loops
 
 AM_CXXFLAGS = $(AM_CFLAGS)
 
@@ -23,6 +17,16 @@
 
 libdir = $(datadir)/rawstudio/plugins/
 
-lensfun_la_LIBADD = @PACKAGE_LIBS@
+lensfun_la_LIBADD = @PACKAGE_LIBS@ lensfun-sse2.lo
 lensfun_la_LDFLAGS = -module -avoid-version
-lensfun_la_SOURCES = lensfun.c
+lensfun.lo: lensfun.c
+       $(LTCOMPILE) -DEXIT_CODE=0 -c lensfun.c
+
+lensfun-sse2.lo: lensfun-sse2.c
+if CAN_COMPILE_SSE2
+SSE_FLAG=-msse2
+else
+SSE_FLAG=
+endif
+       $(LTCOMPILE) $(SSE_FLAG) -DEXIT_CODE=1 -c lensfun-sse2.c
+

Added: branches/rawstudio-ng-color/plugins/lensfun/lensfun-sse2.c
===================================================================
--- branches/rawstudio-ng-color/plugins/lensfun/lensfun-sse2.c  (rev 0)
+++ branches/rawstudio-ng-color/plugins/lensfun/lensfun-sse2.c  2009-12-30 23:14:23 UTC (rev 2930)
@@ -0,0 +1,157 @@
+/*
+ * Copyright (C) 2006-2009 Anders Brander <[email protected]> and
+ * Anders Kvist <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/* Plugin tmpl version 4 */
+
+#include <rawstudio.h>
+#include <lensfun.h>
+
+#if defined (__SSE2__)
+
+#include <emmintrin.h>
+
+static gfloat twofiftytwo_ps[4] __attribute__ ((aligned (16))) = {256.0f, 256.0f, 256.0f, 0.0f};
+
+gboolean is_sse2_compiled()
+{
+       return TRUE;
+}
+
+void
+rs_image16_bilinear_full_sse2(RS_IMAGE16 *in, gushort *out, gfloat *pos)
+{
+       const gint m_w = (in->w-1);
+       const gint m_h = (in->h-1);
+
+       __m128 p0, p1;
+       if ((uintptr_t)pos & 15)
+       {
+               p0 = _mm_loadu_ps(pos);         // y1x1 y0x0
+               p1 = _mm_loadu_ps(pos+4);       // ---- y2x2
+       } else 
+       {
+               p0 = _mm_load_ps(pos);          // y1x1 y0x0
+               p1 = _mm_load_ps(pos+4);        // ---- y2x2
+       }
+               
+       __m128 xf = _mm_shuffle_ps(p1, p0, _MM_SHUFFLE(0,2,2,0));
+       __m128 yf = _mm_shuffle_ps(p1, p0, _MM_SHUFFLE(1,3,1,1));
+                       
+       __m128 fl256 = _mm_load_ps(twofiftytwo_ps);
+       xf = _mm_mul_ps(xf, fl256);
+       yf = _mm_mul_ps(yf, fl256);
+       __m128i x = _mm_cvttps_epi32(xf);
+       __m128i y = _mm_cvttps_epi32(yf);
+
+       __m128i _m_w = _mm_slli_epi32(_mm_set1_epi32(m_w), 8);
+       __m128i _m_h = _mm_slli_epi32(_mm_set1_epi32(m_h), 8);
+       
+       __m128i x_gt, y_gt;
+       
+       /* If positions from lensfun is properly clamped this should not be needed */
+       /* Enable, if crashes begin occuring here here */
+#if 0
+       x_gt = _mm_cmpgt_epi32(x, _m_w);
+       y_gt = _mm_cmpgt_epi32(y, _m_h);
+       
+       x = _mm_or_si128(_mm_andnot_si128(x_gt, x), _mm_and_si128(_m_w, x_gt));
+       y = _mm_or_si128(_mm_andnot_si128(y_gt, y), _mm_and_si128(_m_h, y_gt));
+
+       __m128i zero = _mm_setzero_si128();
+       __m128i x_lt = _mm_cmplt_epi32(x, zero);
+       __m128i y_lt = _mm_cmplt_epi32(y, zero);
+       x = _mm_andnot_si128(x_lt, x);
+       y = _mm_andnot_si128(y_lt, y);
+#endif
+       __m128i one = _mm_set1_epi32(1);
+       __m128i nx = _mm_add_epi32(one, _mm_srai_epi32(x, 8));
+       __m128i ny = _mm_add_epi32(one, _mm_srai_epi32(y, 8));
+
+       _m_w = _mm_srai_epi32(_m_w, 8);
+       _m_h = _mm_srai_epi32(_m_h, 8);
+
+       x_gt = _mm_cmpgt_epi32(nx, _m_w);
+       y_gt = _mm_cmpgt_epi32(ny, _m_h);
+       
+       nx = _mm_or_si128(_mm_andnot_si128(x_gt, nx), _mm_and_si128(_m_w, x_gt));
+       ny = _mm_or_si128(_mm_andnot_si128(y_gt, ny), _mm_and_si128(_m_h, y_gt));
+
+       int xfer[16] __attribute__ ((aligned (16)));
+
+       _mm_store_si128((__m128i*)xfer, _mm_srai_epi32(x, 8));
+       _mm_store_si128((__m128i*)&xfer[4], _mm_srai_epi32(y, 8));
+       _mm_store_si128((__m128i*)&xfer[8], nx);
+       _mm_store_si128((__m128i*)&xfer[12], ny);
+       
+       gushort* pixels[12];
+       
+       /* Loop unrolled, allows agressive instruction reordering */
+       /* Red, then G & B */
+       pixels[0] = GET_PIXEL(in, xfer[0], xfer[4]);    // a
+       pixels[1] = GET_PIXEL(in, xfer[8], xfer[4]);    // b
+       pixels[2] = GET_PIXEL(in, xfer[0], xfer[12]);   // c
+       pixels[3] = GET_PIXEL(in, xfer[8], xfer[12]);   // d
+               
+       pixels[4] = GET_PIXEL(in, xfer[1], xfer[1+4]) + 1;              // a
+       pixels[4+1] = GET_PIXEL(in, xfer[1+8], xfer[1+4]) + 1;  // b
+       pixels[4+2] = GET_PIXEL(in, xfer[1], xfer[1+12]) + 1;   // c
+       pixels[4+3] = GET_PIXEL(in, xfer[1+8], xfer[1+12]) + 1; // d
+
+       pixels[2*4] = GET_PIXEL(in, xfer[2], xfer[2+4]) + 2;            // a
+       pixels[2*4+1] = GET_PIXEL(in, xfer[2+8], xfer[2+4]) + 2;        // b
+       pixels[2*4+2] = GET_PIXEL(in, xfer[2], xfer[2+12]) + 2;         // c
+       pixels[2*4+3] = GET_PIXEL(in, xfer[2+8], xfer[2+12]) + 2;       // d
+
+       /* Calculate distances */
+       __m128i twofiftyfive = _mm_set1_epi32(255);
+       __m128i diffx = _mm_and_si128(x, twofiftyfive); 
+       __m128i diffy = _mm_and_si128(y, twofiftyfive); 
+       __m128i inv_diffx = _mm_andnot_si128(diffx, twofiftyfive);
+       __m128i inv_diffy = _mm_andnot_si128(diffy, twofiftyfive);
+
+       /* Calculate weights */
+       __m128i aw = _mm_srai_epi32(_mm_mullo_epi16(inv_diffx, inv_diffy),1);
+       __m128i bw = _mm_srai_epi32(_mm_mullo_epi16(diffx, inv_diffy),1);
+       __m128i cw = _mm_srai_epi32(_mm_mullo_epi16(inv_diffx, diffy),1);
+       __m128i dw = _mm_srai_epi32(_mm_mullo_epi16(diffx, diffy),1);
+
+       _mm_store_si128((__m128i*)xfer, aw);
+       _mm_store_si128((__m128i*)&xfer[4], bw);
+       _mm_store_si128((__m128i*)&xfer[8], cw);
+       _mm_store_si128((__m128i*)&xfer[12], dw);
+       
+       gushort** p = pixels;
+       /* Loop unrolled */
+       out[0]  = (gushort) ((xfer[0] * *p[0] + xfer[4] * *p[1] + xfer[8] * *p[2] + xfer[12] * *p[3]  + 16384) >> 15 );
+       p+=4;
+       out[1]  = (gushort) ((xfer[1] * *p[0] + xfer[1+4] * *p[1] + xfer[1+8] * *p[2] + xfer[1+12] * *p[3]  + 16384) >> 15 );
+       p+=4;
+       out[2]  = (gushort) ((xfer[2] * *p[0] + xfer[2+4] * *p[1] + xfer[2+8] * *p[2] + xfer[2+12] * *p[3]  + 16384) >> 15 );
+}
+#else
+gboolean is_sse2_compiled()
+{
+       return FALSE;
+}
+
+void
+rs_image16_bilinear_full_sse2(RS_IMAGE16 *in, gushort *out, gfloat *pos)
+{
+}
+#endif // defined (__SSE2__)

Modified: branches/rawstudio-ng-color/plugins/lensfun/lensfun.c
===================================================================
--- branches/rawstudio-ng-color/plugins/lensfun/lensfun.c       2009-12-30 22:39:52 UTC (rev 2929)
+++ branches/rawstudio-ng-color/plugins/lensfun/lensfun.c       2009-12-30 23:14:23 UTC (rev 2930)
@@ -87,9 +87,8 @@
 static RSFilterResponse *get_image(RSFilter *filter, const RSFilterRequest *request);
 static void inline rs_image16_nearest_full(RS_IMAGE16 *in, gushort *out, gfloat *pos);
 static void inline rs_image16_bilinear_full(RS_IMAGE16 *in, gushort *out, gfloat *pos);
-#if defined (__SSE2__)
-static void inline rs_image16_bilinear_full_sse2(RS_IMAGE16 *in, gushort *out, gfloat *pos);
-#endif
+extern gboolean is_sse2_compiled();
+extern void rs_image16_bilinear_full_sse2(RS_IMAGE16 *in, gushort *out, gfloat *pos);
 static RSFilterClass *rs_lensfun_parent_class = NULL;
 
 G_MODULE_EXPORT void
@@ -343,7 +342,8 @@
                }
        }
 
-       gboolean sse2_available = !!(rs_detect_cpu_features() & RS_CPU_FLAG_SSE2);
+       gboolean sse2_available = !!(rs_detect_cpu_features() & RS_CPU_FLAG_SSE2) && is_sse2_compiled();
+
        if (t->stage == 3) 
        {
                /* Do TCA and distortion */
@@ -356,7 +356,7 @@
                        lf_modifier_apply_subpixel_geometry_distortion(t->mod, t->roi->x, (gfloat) y, t->roi->width, 1, pos);
                        target = GET_PIXEL(t->output, t->roi->x, y);
                        gfloat* l_pos = pos;
-#if defined (__SSE2__)
+
                        if (sse2_available)
                        {
                                for(x = 0; x < t->roi->width ; x++)
@@ -366,13 +366,14 @@
                                        l_pos += 6;
                                }
                        } else 
-#endif
-                       for(x = 0; x < t->roi->width ; x++)
                        {
-                               rs_image16_bilinear_full_sse2(t->input, target, l_pos);
-                               rs_image16_bilinear_full(t->input, target, l_pos);
-                               target += pixelsize;
-                               l_pos += 6;
+                               for(x = 0; x < t->roi->width ; x++)
+                               {
+                                       rs_image16_bilinear_full_sse2(t->input, target, l_pos);
+                                       rs_image16_bilinear_full(t->input, target, l_pos);
+                                       target += pixelsize;
+                                       l_pos += 6;
+                               }
                        }
                }
                g_free(pos);
@@ -711,121 +712,3 @@
                out[i]  = (gushort) ((a[i]*aw  + b[i]*bw  + c[i]*cw  + d[i]*dw + 16384) >> 15 );
        }
 }
-
-
-#if defined (__SSE2__)
-static gfloat twofiftytwo_ps[4] __attribute__ ((aligned (16))) = {256.0f, 256.0f, 256.0f, 0.0f};
-               
-static void inline
-rs_image16_bilinear_full_sse2(RS_IMAGE16 *in, gushort *out, gfloat *pos)
-{
-       const gint m_w = (in->w-1);
-       const gint m_h = (in->h-1);
-
-       __m128 p0, p1;
-       if ((uintptr_t)pos & 15)
-       {
-               p0 = _mm_loadu_ps(pos);         // y1x1 y0x0
-               p1 = _mm_loadu_ps(pos+4);       // ---- y2x2
-       } else 
-       {
-               p0 = _mm_load_ps(pos);          // y1x1 y0x0
-               p1 = _mm_load_ps(pos+4);        // ---- y2x2
-       }
-               
-       __m128 xf = _mm_shuffle_ps(p1, p0, _MM_SHUFFLE(0,2,2,0));
-       __m128 yf = _mm_shuffle_ps(p1, p0, _MM_SHUFFLE(1,3,1,1));
-                       
-       __m128 fl256 = _mm_load_ps(twofiftytwo_ps);
-       xf = _mm_mul_ps(xf, fl256);
-       yf = _mm_mul_ps(yf, fl256);
-       __m128i x = _mm_cvttps_epi32(xf);
-       __m128i y = _mm_cvttps_epi32(yf);
-
-       __m128i _m_w = _mm_slli_epi32(_mm_set1_epi32(m_w), 8);
-       __m128i _m_h = _mm_slli_epi32(_mm_set1_epi32(m_h), 8);
-       
-       __m128i x_gt, y_gt;
-       
-       /* If positions from lensfun is properly clamped this should not be needed */
-       /* Enable, if crashes begin occuring here here */
-#if 0
-       x_gt = _mm_cmpgt_epi32(x, _m_w);
-       y_gt = _mm_cmpgt_epi32(y, _m_h);
-       
-       x = _mm_or_si128(_mm_andnot_si128(x_gt, x), _mm_and_si128(_m_w, x_gt));
-       y = _mm_or_si128(_mm_andnot_si128(y_gt, y), _mm_and_si128(_m_h, y_gt));
-
-       __m128i zero = _mm_setzero_si128();
-       __m128i x_lt = _mm_cmplt_epi32(x, zero);
-       __m128i y_lt = _mm_cmplt_epi32(y, zero);
-       x = _mm_andnot_si128(x_lt, x);
-       y = _mm_andnot_si128(y_lt, y);
-#endif
-       __m128i one = _mm_set1_epi32(1);
-       __m128i nx = _mm_add_epi32(one, _mm_srai_epi32(x, 8));
-       __m128i ny = _mm_add_epi32(one, _mm_srai_epi32(y, 8));
-
-       _m_w = _mm_srai_epi32(_m_w, 8);
-       _m_h = _mm_srai_epi32(_m_h, 8);
-
-       x_gt = _mm_cmpgt_epi32(nx, _m_w);
-       y_gt = _mm_cmpgt_epi32(ny, _m_h);
-       
-       nx = _mm_or_si128(_mm_andnot_si128(x_gt, nx), _mm_and_si128(_m_w, x_gt));
-       ny = _mm_or_si128(_mm_andnot_si128(y_gt, ny), _mm_and_si128(_m_h, y_gt));
-
-       int xfer[16] __attribute__ ((aligned (16)));
-
-       _mm_store_si128((__m128i*)xfer, _mm_srai_epi32(x, 8));
-       _mm_store_si128((__m128i*)&xfer[4], _mm_srai_epi32(y, 8));
-       _mm_store_si128((__m128i*)&xfer[8], nx);
-       _mm_store_si128((__m128i*)&xfer[12], ny);
-       
-       gushort* pixels[12];
-       
-       /* Loop unrolled, allows agressive instruction reordering */
-       /* Red, then G & B */
-       pixels[0] = GET_PIXEL(in, xfer[0], xfer[4]);    // a
-       pixels[1] = GET_PIXEL(in, xfer[8], xfer[4]);    // b
-       pixels[2] = GET_PIXEL(in, xfer[0], xfer[12]);   // c
-       pixels[3] = GET_PIXEL(in, xfer[8], xfer[12]);   // d
-               
-       pixels[4] = GET_PIXEL(in, xfer[1], xfer[1+4]) + 1;              // a
-       pixels[4+1] = GET_PIXEL(in, xfer[1+8], xfer[1+4]) + 1;  // b
-       pixels[4+2] = GET_PIXEL(in, xfer[1], xfer[1+12]) + 1;   // c
-       pixels[4+3] = GET_PIXEL(in, xfer[1+8], xfer[1+12]) + 1; // d
-
-       pixels[2*4] = GET_PIXEL(in, xfer[2], xfer[2+4]) + 2;            // a
-       pixels[2*4+1] = GET_PIXEL(in, xfer[2+8], xfer[2+4]) + 2;        // b
-       pixels[2*4+2] = GET_PIXEL(in, xfer[2], xfer[2+12]) + 2;         // c
-       pixels[2*4+3] = GET_PIXEL(in, xfer[2+8], xfer[2+12]) + 2;       // d
-
-       /* Calculate distances */
-       __m128i twofiftyfive = _mm_set1_epi32(255);
-       __m128i diffx = _mm_and_si128(x, twofiftyfive); 
-       __m128i diffy = _mm_and_si128(y, twofiftyfive); 
-       __m128i inv_diffx = _mm_andnot_si128(diffx, twofiftyfive);
-       __m128i inv_diffy = _mm_andnot_si128(diffy, twofiftyfive);
-
-       /* Calculate weights */
-       __m128i aw = _mm_srai_epi32(_mm_mullo_epi16(inv_diffx, inv_diffy),1);
-       __m128i bw = _mm_srai_epi32(_mm_mullo_epi16(diffx, inv_diffy),1);
-       __m128i cw = _mm_srai_epi32(_mm_mullo_epi16(inv_diffx, diffy),1);
-       __m128i dw = _mm_srai_epi32(_mm_mullo_epi16(diffx, diffy),1);
-
-       _mm_store_si128((__m128i*)xfer, aw);
-       _mm_store_si128((__m128i*)&xfer[4], bw);
-       _mm_store_si128((__m128i*)&xfer[8], cw);
-       _mm_store_si128((__m128i*)&xfer[12], dw);
-       
-       gushort** p = pixels;
-       /* Loop unrolled */
-       out[0]  = (gushort) ((xfer[0] * *p[0] + xfer[4] * *p[1] + xfer[8] * *p[2] + xfer[12] * *p[3]  + 16384) >> 15 );
-       p+=4;
-       out[1]  = (gushort) ((xfer[1] * *p[0] + xfer[1+4] * *p[1] + xfer[1+8] * *p[2] + xfer[1+12] * *p[3]  + 16384) >> 15 );
-       p+=4;
-       out[2]  = (gushort) ((xfer[2] * *p[0] + xfer[2+4] * *p[1] + xfer[2+8] * *p[2] + xfer[2+12] * *p[3]  + 16384) >> 15 );
-}
-
-#endif // defined (__SSE2__)


_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit

Reply via email to