Author: post
Date: 2010-01-23 14:57:11 +0100 (Sat, 23 Jan 2010)
New Revision: 3057

Added:
   
branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform_sse2.c
Modified:
   branches/rawstudio-ng-color/plugins/colorspace-transform/Makefile.am
   
branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform.c
Log:
Colorspace transform. Add multithreading, ROI and SSE2 code to all internal 16 
bit -> 8 bit conversions. Note that the GdkPixbuffer now has an alpha channel, 
which might all to other code changes needed.

Modified: branches/rawstudio-ng-color/plugins/colorspace-transform/Makefile.am
===================================================================
--- branches/rawstudio-ng-color/plugins/colorspace-transform/Makefile.am        
2010-01-23 12:55:16 UTC (rev 3056)
+++ branches/rawstudio-ng-color/plugins/colorspace-transform/Makefile.am        
2010-01-23 13:57:11 UTC (rev 3057)
@@ -17,6 +17,19 @@
 
 libdir = $(datadir)/rawstudio/plugins/
 
-colorspace_transform_la_LIBADD = @PACKAGE_LIBS@
+colorspace_transform_la_LIBADD = @PACKAGE_LIBS@ colorspace_transform_sse2.lo 
rs-cmm.lo
 colorspace_transform_la_LDFLAGS = -module -avoid-version
-colorspace_transform_la_SOURCES = colorspace_transform.c rs-cmm.c rs-cmm.h
+
+colorspace_transform.lo: colorspace_transform.c
+       $(LTCOMPILE) -DEXIT_CODE=0 -c colorspace_transform.c
+
+rs-cmm.lo: rs-cmm.c rs-cmm.h
+       $(LTCOMPILE) -DEXIT_CODE=2 -c rs-cmm.c
+
+colorspace_transform_sse2.lo: colorspace_transform_sse2.c
+if CAN_COMPILE_SSE2
+SSE_FLAG=-msse2
+else
+SSE_FLAG=
+endif
+       $(LTCOMPILE) $(SSE_FLAG) -DEXIT_CODE=1 -c colorspace_transform_sse2.c

Modified: 
branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform.c
===================================================================
--- 
branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform.c 
    2010-01-23 12:55:16 UTC (rev 3056)
+++ 
branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform.c 
    2010-01-23 13:57:11 UTC (rev 3057)
@@ -31,6 +31,23 @@
 typedef struct _RSColorspaceTransform RSColorspaceTransform;
 typedef struct _RSColorspaceTransformClass RSColorspaceTransformClass;
 
+typedef struct {
+       RSColorspaceTransform *cst;
+       GThread *threadid;
+       gint start_x;
+       gint start_y;
+       gint end_x;
+       gint end_y;
+       RS_IMAGE16 *input;
+       void *output;
+       RSColorSpace *input_space;
+       RSColorSpace *output_space;
+       RS_MATRIX3 *matrix;
+       gboolean gamma_correct;
+       guchar* table8;
+       gfloat output_gamma;
+} ThreadInfo;
+
 struct _RSColorspaceTransform {
        RSFilter parent;
 
@@ -51,10 +68,15 @@
 static RSFilterResponse *get_image(RSFilter *filter, const RSFilterRequest 
*request);
 static RSFilterResponse *get_image8(RSFilter *filter, const RSFilterRequest 
*request);
 static gboolean convert_colorspace16(RSColorspaceTransform 
*colorspace_transform, RS_IMAGE16 *input_image, RS_IMAGE16 *output_image, 
RSColorSpace *input_space, RSColorSpace *output_space);
-static void convert_colorspace8(RSColorspaceTransform *colorspace_transform, 
RS_IMAGE16 *input_image, GdkPixbuf *output_image, RSColorSpace *input_space, 
RSColorSpace *output_space);
+static void convert_colorspace8(RSColorspaceTransform *colorspace_transform, 
RS_IMAGE16 *input_image, GdkPixbuf *output_image, RSColorSpace *input_space, 
RSColorSpace *output_space, GdkRectangle *roi);
 
 static RSFilterClass *rs_colorspace_transform_parent_class = NULL;
 
+/* SSE2 optimized functions */
+extern void transform8_srgb_sse2(ThreadInfo* t);
+extern void transform8_otherrgb_sse2(ThreadInfo* t);
+extern gboolean cst_has_sse2();
+
 G_MODULE_EXPORT void
 rs_plugin_load(RSPlugin *plugin)
 {
@@ -106,8 +128,6 @@
                if (convert_colorspace16(colorspace_transform, input, output, 
input_space, output_space))
                {
                        /* Image was converted */
-                       printf("\033[33m16 input_space: %s\033[0m\n", 
(input_space) ? G_OBJECT_TYPE_NAME(input_space) : "none");
-                       printf("\033[33m16 output_space: %s\n\033[0m", 
(output_space) ? G_OBJECT_TYPE_NAME(output_space) : "none");
                        response = rs_filter_response_clone(previous_response);
                        g_object_unref(previous_response);
                        rs_filter_response_set_image(response, output);
@@ -137,53 +157,60 @@
        RSFilterResponse *response;
        RS_IMAGE16 *input;
        GdkPixbuf *output = NULL;
+       GdkRectangle *roi;
 
        previous_response = rs_filter_get_image(filter->previous, request);
        input = rs_filter_response_get_image(previous_response);
        if (!RS_IS_IMAGE16(input))
                return previous_response;
 
+       roi = rs_filter_request_get_roi(request);
        RSColorSpace *input_space = 
rs_filter_param_get_object_with_type(RS_FILTER_PARAM(previous_response), 
"colorspace", RS_TYPE_COLOR_SPACE);
        RSColorSpace *output_space = 
rs_filter_param_get_object_with_type(RS_FILTER_PARAM(request), "colorspace", 
RS_TYPE_COLOR_SPACE);
 
+#if 0
        printf("\033[33m8 input_space: %s\033[0m\n", (input_space) ? 
G_OBJECT_TYPE_NAME(input_space) : "none");
        printf("\033[33m8 output_space: %s\n\033[0m", (output_space) ? 
G_OBJECT_TYPE_NAME(output_space) : "none");
+#endif
 
        response = rs_filter_response_clone(previous_response);
        g_object_unref(previous_response);
-       output = gdk_pixbuf_new(GDK_COLORSPACE_RGB, FALSE, 8, input->w, 
input->h);
+       output = gdk_pixbuf_new(GDK_COLORSPACE_RGB, TRUE, 8, input->w, 
input->h);
 
-       convert_colorspace8(colorspace_transform, input, output, input_space, 
output_space);
+       /* Process output */
+       convert_colorspace8(colorspace_transform, input, output, input_space, 
output_space, roi);
 
        rs_filter_response_set_image8(response, output);
        g_object_unref(output);
-
-       /* Process output */
-
        g_object_unref(input);
        return response;
 }
 
 static void
-transform8_c(RS_IMAGE16 *input, GdkPixbuf *output, RS_MATRIX3 *matrix, guchar 
*table8)
+transform8_c(ThreadInfo* t)
 {
        gint row;
        gint r,g,b;
        gint width;
        RS_MATRIX3Int mati;
-
+       RS_IMAGE16 *input = t->input;
+       GdkPixbuf *output = (GdkPixbuf *)t->output;
+       guchar *table8 = t->table8;
+       
+       g_assert(RS_IS_IMAGE16(input));
+       g_assert(GDK_IS_PIXBUF(output));
        gint o_channels = gdk_pixbuf_get_n_channels(output);
 
-       matrix3_to_matrix3int(matrix, &mati);
+       matrix3_to_matrix3int(t->matrix, &mati);
 
-       for(row=0 ; row<input->h ; row++)
+       for(row=t->start_y ; row<t->end_y ; row++)
        {
-               gushort *i = GET_PIXEL(input, 0, row);
-               guchar *o = GET_PIXBUF_PIXEL(output, 0, row);
+               gushort *i = GET_PIXEL(input, t->start_x, row);
+               guchar *o = GET_PIXBUF_PIXEL(output, t->start_x, row);
 
-               width = input->w;
+               width = t->end_x - t->start_x;
 
-               while(width--)
+               while(width-- > 0)
                {
                        r =
                                ( i[R] * mati.coeff[0][0]
@@ -208,12 +235,15 @@
                        o[R] = table8[r];
                        o[G] = table8[g];
                        o[B] = table8[b];
+                       o[3] = 255;
+
                        i += input->pixelsize;
                        o += o_channels;
                }
        }
 }
 
+
 static void
 transform16_c(gushort* __restrict input, gushort* __restrict output, gint 
num_pixels, const gint pixelsize, RS_MATRIX3 *matrix)
 {
@@ -301,8 +331,65 @@
        return TRUE;
 }
 
+gpointer
+start_single_cs8_transform_thread(gpointer _thread_info)
+{
+       ThreadInfo* t = _thread_info;
+       RS_IMAGE16 *input_image = t->input; 
+       GdkPixbuf *output = (GdkPixbuf*) t->output;
+       RSColorSpace *input_space = t->input_space;
+       RSColorSpace *output_space = t->output_space;
+
+       g_assert(RS_IS_IMAGE16(input_image));
+       g_assert(GDK_IS_PIXBUF(output));
+       g_assert(RS_IS_COLOR_SPACE(input_space));
+       g_assert(RS_IS_COLOR_SPACE(output_space));
+
+       gboolean sse2_available = (!!(rs_detect_cpu_features() & 
RS_CPU_FLAG_SSE2)) && cst_has_sse2();
+
+       if (sse2_available && rs_color_space_new_singleton("RSSrgb") == 
output_space)
+       {
+               transform8_srgb_sse2(t);
+               return (NULL);
+       }
+       if (sse2_available && rs_color_space_new_singleton("RSAdobeRGB") == 
output_space)
+       {
+               t->output_gamma = 1.0 / 2.19921875;
+               transform8_otherrgb_sse2(t);
+               return (NULL);
+       }
+       if (sse2_available && rs_color_space_new_singleton("RSProphoto") == 
output_space)
+       {
+               t->output_gamma = 1.0 / 1.8;
+               transform8_otherrgb_sse2(t);
+               return (NULL);
+       }
+       
+       /* Fall back to C-functions */
+       /* Calculate our gamma table */
+       const RS1dFunction *input_gamma = 
rs_color_space_get_gamma_function(input_space);
+       const RS1dFunction *output_gamma = 
rs_color_space_get_gamma_function(output_space);
+       guchar table8[65536];
+       gint i;
+       for(i=0;i<65536;i++)
+       {
+               gdouble nd = ((gdouble) i) * (1.0/65535.0);
+
+               nd = rs_1d_function_evaluate_inverse(input_gamma, nd);
+               nd = rs_1d_function_evaluate(output_gamma, nd);
+
+               /* 8 bit output */
+               gint res = (gint) (nd*255.0 + 0.5f);
+               _CLAMP255(res);
+               table8[i] = res;
+       }
+       t->table8 = table8;
+       transform8_c(t);
+       return (NULL);
+}
+
 static void
-convert_colorspace8(RSColorspaceTransform *colorspace_transform, RS_IMAGE16 
*input_image, GdkPixbuf *output_image, RSColorSpace *input_space, RSColorSpace 
*output_space)
+convert_colorspace8(RSColorspaceTransform *colorspace_transform, RS_IMAGE16 
*input_image, GdkPixbuf *output_image, RSColorSpace *input_space, RSColorSpace 
*output_space, GdkRectangle *_roi)
 {
        g_assert(RS_IS_IMAGE16(input_image));
        g_assert(GDK_IS_PIXBUF(output_image));
@@ -313,6 +400,16 @@
        g_assert(input_image->w == gdk_pixbuf_get_width(output_image));
        g_assert(input_image->h == gdk_pixbuf_get_height(output_image));
 
+       GdkRectangle *roi = _roi;
+       if (!roi) 
+       {
+               roi = g_new(GdkRectangle, 1);
+               roi->x = 0;
+               roi->y = 0;
+               roi->width = input_image->w;
+               roi->height = input_image->h;
+       }
+
        /* If a CMS is needed, do the transformation using LCMS */
        if (RS_COLOR_SPACE_REQUIRES_CMS(input_space) || 
RS_COLOR_SPACE_REQUIRES_CMS(output_space))
        {
@@ -332,34 +429,45 @@
        {
                const RS_MATRIX3 a = 
rs_color_space_get_matrix_from_pcs(input_space);
                const RS_MATRIX3 b = 
rs_color_space_get_matrix_to_pcs(output_space);
+               
                RS_MATRIX3 mat;
                matrix3_multiply(&b, &a, &mat);
 
-               /* Calculate our gamma table */
-               /* FIXME: Move this to someplace where we can cache it! */
-               guchar table8[65536];
                gint i;
+               guint y_offset, y_per_thread, threaded_h;
+               const guint threads = rs_get_number_of_processor_cores();
+               ThreadInfo *t = g_new(ThreadInfo, threads);
 
-               const RS1dFunction *input_gamma = 
rs_color_space_get_gamma_function(input_space);
-               const RS1dFunction *output_gamma = 
rs_color_space_get_gamma_function(output_space);
+               threaded_h = roi->height;
+               y_per_thread = (threaded_h + threads-1)/threads;
+               y_offset = roi->y;
 
-               for(i=0;i<65536;i++)
+               for (i = 0; i < threads; i++)
                {
-                       gdouble nd = ((gdouble) i) * (1.0/65535.0);
+                       t[i].input = input_image;
+                       t[i].output = output_image;
+                       t[i].start_y = y_offset;
+                       t[i].start_x = roi->x;
+                       t[i].end_x = roi->x + roi->width;
+                       t[i].cst = colorspace_transform;
+                       t[i].input_space = input_space;
+                       t[i].output_space = output_space;
+                       y_offset += y_per_thread;
+                       y_offset = MIN(input_image->h, y_offset);
+                       t[i].end_y = y_offset;
+                       t[i].matrix = &mat;
+                       t[i].table8 = NULL;
 
-                       nd = rs_1d_function_evaluate_inverse(input_gamma, nd);
-                       nd = rs_1d_function_evaluate(output_gamma, nd);
-
-                       /* 8 bit output */
-                       gint res = (gint) (nd*255.0 + 0.5f);
-                       _CLAMP255(res);
-                       table8[i] = res;
+                       t[i].threadid = 
g_thread_create(start_single_cs8_transform_thread, &t[i], TRUE, NULL);
                }
 
-               transform8_c(
-                       input_image,
-                       output_image,
-                       &mat,
-                       table8);
+               /* Wait for threads to finish */
+               for(i = 0; i < threads; i++)
+                       g_thread_join(t[i].threadid);
+
+               g_free(t);
        }
+       /* If we created the ROI here, free it */
+       if (!_roi) 
+               g_free(roi);
 }

Added: 
branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform_sse2.c
===================================================================
--- 
branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform_sse2.c
                                (rev 0)
+++ 
branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform_sse2.c
        2010-01-23 13:57:11 UTC (rev 3057)
@@ -0,0 +1,534 @@
+/*
+ * Copyright (C) 2006-2009 Anders Brander <[email protected]> and 
+ * Anders Kvist <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, 
USA.
+ */
+
+/* Plugin tmpl version 5 */
+
+#include <rawstudio.h>
+#include <lcms.h>
+#include "rs-cmm.h"
+
+typedef struct _RSColorspaceTransform RSColorspaceTransform;
+typedef struct _RSColorspaceTransformClass RSColorspaceTransformClass;
+
+typedef struct {
+       RSColorspaceTransform *cst;
+       GThread *threadid;
+       gint start_x;
+       gint start_y;
+       gint end_x;
+       gint end_y;
+       RS_IMAGE16 *input;
+       void *output;
+       RSColorSpace *input_space;
+       RSColorSpace *output_space;
+       RS_MATRIX3 *matrix;
+       gboolean gamma_correct;
+       guchar* table8;
+       gfloat output_gamma;
+} ThreadInfo;
+
+#if defined(__SSE2__)
+
+#include <emmintrin.h>
+
+
+/* SSE2 Polynomial pow function from Mesa3d (MIT License) */
+
+#define EXP_POLY_DEGREE 3
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), 
_mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), 
_mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), 
x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, 
c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, 
c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+static inline __m128 
+exp2f4(__m128 x)
+{
+       __m128i ipart;
+       __m128 fpart, expipart, expfpart;
+
+       x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
+       x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
+
+       /* ipart = int(x - 0.5) */
+       ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
+
+       /* fpart = x - ipart */
+       fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
+
+       /* expipart = (float) (1 << ipart) */
+       expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, 
_mm_set1_epi32(127)), 23));
+
+       /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
+#if EXP_POLY_DEGREE == 5
+       expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 
5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
+#elif EXP_POLY_DEGREE == 4
+       expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 
5.2011464e-2f, 1.3534167e-2f);
+#elif EXP_POLY_DEGREE == 3
+       expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 
7.8024521e-2f);
+#elif EXP_POLY_DEGREE == 2
+       expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
+#else
+#error
+#endif
+
+       return _mm_mul_ps(expipart, expfpart);
+}
+
+
+#define LOG_POLY_DEGREE 5
+
+static inline __m128 
+log2f4(__m128 x)
+{
+       __m128i exp = _mm_set1_epi32(0x7F800000);
+       __m128i mant = _mm_set1_epi32(0x007FFFFF);
+       __m128 one = _mm_set1_ps( 1.0f);
+       __m128i i = _mm_castps_si128(x);
+       __m128 e = 
_mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, exp), 23), 
_mm_set1_epi32(127)));
+       __m128 m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mant)), one);
+       __m128 p;
+
+       /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ */
+#if LOG_POLY_DEGREE == 6
+       p = POLY5( m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  
3.1821337e-1f, -3.4436006e-2f);
+#elif LOG_POLY_DEGREE == 5
+       p = POLY4(m, 2.8882704548164776201f, -2.52074962577807006663f, 
1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+       p = POLY3(m, 2.61761038894603480148f, -1.75647175389045657003f, 
0.688243882994381274313f, -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+       p = POLY2(m, 2.28330284476918490682f, -1.04913055217340124191f, 
0.204446009836232697516f);
+#else
+#error
+#endif
+
+       /* This effectively increases the polynomial degree by one, but ensures 
that log2(1) == 0*/
+       p = _mm_mul_ps(p, _mm_sub_ps(m, one));
+
+       return _mm_add_ps(p, e);
+}
+
+static inline __m128
+_mm_fastpow_ps(__m128 x, __m128 y)
+{
+       return exp2f4(_mm_mul_ps(log2f4(x), y));
+}
+
+/* END: SSE2 Polynomial pow function from Mesa3d (MIT License) */
+
+
+static inline __m128
+sse_matrix3_mul(float* mul, __m128 a, __m128 b, __m128 c)
+{
+       __m128 v = _mm_load_ps(mul);
+       __m128 acc = _mm_mul_ps(a, v);
+
+       v = _mm_load_ps(mul+4);
+       acc = _mm_add_ps(acc, _mm_mul_ps(b, v));
+
+       v = _mm_load_ps(mul+8);
+       acc = _mm_add_ps(acc, _mm_mul_ps(c, v));
+
+       return acc;
+}
+
+
+static gfloat _junction_ps[4] __attribute__ ((aligned (16))) = {0.0031308, 
0.0031308, 0.0031308, 0.0031308};
+static gfloat _normalize[4] __attribute__ ((aligned (16))) = {1.0f/65535.0f, 
1.0f/65535.0f, 1.0f/65535.0f, 1.0f/65535.0f};
+static gfloat _8bit[4] __attribute__ ((aligned (16))) = {255.5f, 255.5f, 
255.5f, 255.5f};
+static gfloat _ones_ps[4] __attribute__ ((aligned (16))) = {1.0f, 1.0f, 1.0f, 
1.0f};
+static gfloat _srb_mul_under[4] __attribute__ ((aligned (16))) = {12.92f, 
12.92f, 12.92f, 12.92f};
+static gfloat _srb_mul_over[4] __attribute__ ((aligned (16))) = {1.055f, 
1.055f, 1.055f, 1.055f};
+static gfloat _srb_sub_over[4] __attribute__ ((aligned (16))) = {0.055f, 
0.055f, 0.055f, 0.055f};
+static gfloat _srb_pow_over[4] __attribute__ ((aligned (16))) = {1.0/2.4, 
1.0/2.4, 1.0/2.4, 1.0/2.4};
+static guint _alpha_mask[4] __attribute__ ((aligned (16))) = 
{0xff000000,0xff000000,0xff000000,0xff000000};
+
+void
+transform8_srgb_sse2(ThreadInfo* t)
+{
+       RS_IMAGE16 *input = t->input;
+       GdkPixbuf *output = t->output;
+       RS_MATRIX3 *matrix = t->matrix;
+       gint x,y;
+       gint width;
+
+       float mat_ps[4*4*3] __attribute__ ((aligned (16)));
+       for (x = 0; x < 4; x++ ) {
+               mat_ps[x] = matrix->coeff[0][0];
+               mat_ps[x+4] = matrix->coeff[0][1];
+               mat_ps[x+8] = matrix->coeff[0][2];
+               mat_ps[12+x] = matrix->coeff[1][0];
+               mat_ps[12+x+4] = matrix->coeff[1][1];
+               mat_ps[12+x+8] = matrix->coeff[1][2];
+               mat_ps[24+x] = matrix->coeff[2][0];
+               mat_ps[24+x+4] = matrix->coeff[2][1];
+               mat_ps[24+x+8] = matrix->coeff[2][2];
+       }
+       
+       int start_x = t->start_x;
+       /* Always have aligned input and output adress */
+       if (start_x & 3)
+               start_x = ((start_x) / 4) * 4;
+       
+       int complete_w = t->end_x - start_x;
+       /* If width is not multiple of 4, check if we can extend it a bit */
+       if (complete_w & 3)
+       {
+               if ((t->end_x+4) < input->w)
+                       complete_w = ((complete_w+3) / 4 *4);
+       }
+       
+       for(y=t->start_y ; y<t->end_y ; y++)
+       {
+               gushort *i = GET_PIXEL(input, start_x, y);
+               guchar *o = GET_PIXBUF_PIXEL(output, start_x, y);
+               gboolean aligned_write = !((guintptr)(o)&0xf);
+
+               width = complete_w >> 2;
+
+               while(width--)
+               {
+                       /* Load and convert to float */
+                       __m128i zero = _mm_setzero_si128();
+                       __m128i in = _mm_load_si128((__m128i*)i); // Load two 
pixels
+                       __m128i in2 = _mm_load_si128((__m128i*)i+1); // Load 
two pixels
+                       _mm_prefetch(i + 64, _MM_HINT_NTA);
+                       __m128i p1 =_mm_unpacklo_epi16(in, zero);
+                       __m128i p2 =_mm_unpackhi_epi16(in, zero);
+                       __m128i p3 =_mm_unpacklo_epi16(in2, zero);
+                       __m128i p4 =_mm_unpackhi_epi16(in2, zero);
+                       __m128 p1f  = _mm_cvtepi32_ps(p1);
+                       __m128 p2f  = _mm_cvtepi32_ps(p2);
+                       __m128 p3f  = _mm_cvtepi32_ps(p3);
+                       __m128 p4f  = _mm_cvtepi32_ps(p4);
+                       
+                       /* Convert to planar */
+                       __m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f);
+                       __m128 b1b0 = _mm_unpackhi_ps(p1f, p2f);
+                       __m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f);
+                       __m128 b3b2 = _mm_unpackhi_ps(p3f, p4f);
+                       __m128 r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2);
+                       __m128 g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0);
+                       __m128 b = _mm_movelh_ps(b1b0, b3b2);
+
+                       /* Apply matrix to convert to sRGB */
+                       __m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
+                       __m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
+                       __m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);
+
+                       /* Normalize to 0->1 and clamp */
+                       __m128 normalize = _mm_load_ps(_normalize);
+                       __m128 max_val = _mm_load_ps(_ones_ps);
+                       __m128 min_val = _mm_setzero_ps();
+                       r = _mm_min_ps(max_val, _mm_max_ps(min_val, 
_mm_mul_ps(normalize, r2)));
+                       g = _mm_min_ps(max_val, _mm_max_ps(min_val, 
_mm_mul_ps(normalize, g2)));
+                       b = _mm_min_ps(max_val, _mm_max_ps(min_val, 
_mm_mul_ps(normalize, b2)));
+
+                       /* Apply Gamma */
+                       /* Calculate values to be used if larger than junction 
point */
+                       __m128 mul_over = _mm_load_ps(_srb_mul_over);
+                       __m128 sub_over = _mm_load_ps(_srb_sub_over);
+                       __m128 pow_over = _mm_load_ps(_srb_pow_over);
+                       __m128 r_gam = _mm_sub_ps(_mm_mul_ps( mul_over, 
_mm_fastpow_ps(r, pow_over)), sub_over);
+                       __m128 g_gam = _mm_sub_ps(_mm_mul_ps( mul_over, 
_mm_fastpow_ps(g, pow_over)), sub_over);
+                       __m128 b_gam = _mm_sub_ps(_mm_mul_ps( mul_over, 
_mm_fastpow_ps(b, pow_over)), sub_over);
+
+                       /* Create mask for values smaller than junction point */
+                       __m128 junction = _mm_load_ps(_junction_ps);
+                       __m128 mask_r = _mm_cmplt_ps(r, junction);
+                       __m128 mask_g = _mm_cmplt_ps(g, junction);
+                       __m128 mask_b = _mm_cmplt_ps(b, junction);
+
+                       /* Calculate value to be used if under junction */
+                       __m128 mul_under = _mm_load_ps(_srb_mul_under);
+                       __m128 r_mul = _mm_and_ps(mask_r, _mm_mul_ps(mul_under, 
r));
+                       __m128 g_mul = _mm_and_ps(mask_g, _mm_mul_ps(mul_under, 
g));
+                       __m128 b_mul = _mm_and_ps(mask_b, _mm_mul_ps(mul_under, 
b));
+
+                       /* Select the value to be used based on the junction 
mask and scale to 8 bit */
+                       __m128 upscale = _mm_load_ps(_8bit);
+                       r = _mm_mul_ps(upscale, _mm_or_ps(r_mul, 
_mm_andnot_ps(mask_r, r_gam)));
+                       g = _mm_mul_ps(upscale, _mm_or_ps(g_mul, 
_mm_andnot_ps(mask_g, g_gam)));
+                       b = _mm_mul_ps(upscale, _mm_or_ps(b_mul, 
_mm_andnot_ps(mask_b, b_gam)));
+                       
+                       /* Convert to 8 bit unsigned  and interleave*/
+                       __m128i r_i = _mm_cvtps_epi32(r);
+                       __m128i g_i = _mm_cvtps_epi32(g);
+                       __m128i b_i = _mm_cvtps_epi32(b);
+                       
+                       r_i = _mm_packs_epi32(r_i, r_i);
+                       g_i = _mm_packs_epi32(g_i, g_i);
+                       b_i = _mm_packs_epi32(b_i, b_i);
+
+                       /* Set alpha value to 255 and store */
+                       __m128i alpha_mask = 
_mm_load_si128((__m128i*)_alpha_mask);
+                       __m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
+                       __m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
+                       p1 = _mm_unpacklo_epi32(rg_i, bb_i);
+                       p2 = _mm_unpackhi_epi32(rg_i, bb_i);
+       
+                       p1 = _mm_or_si128(alpha_mask, _mm_packus_epi16(p1, p2));
+
+                       if (aligned_write)
+                               _mm_store_si128((__m128i*)o, p1);
+                       else
+                               _mm_storeu_si128((__m128i*)o, p1);
+
+                       i += 16;
+                       o += 16;
+               }
+               /* Process remaining pixels */
+               width = complete_w & 3;
+               while(width--)
+               {
+                       __m128i zero = _mm_setzero_si128();
+                       __m128i in = _mm_loadl_epi64((__m128i*)i); // Load two 
pixels
+                       __m128i p1 =_mm_unpacklo_epi16(in, zero);
+                       __m128 p1f  = _mm_cvtepi32_ps(p1);
+
+                       /* Splat r,g,b */
+                       __m128 r =  _mm_shuffle_ps(p1f, p1f, 
_MM_SHUFFLE(4,4,4,4));
+                       __m128 g =  _mm_shuffle_ps(p1f, p1f, 
_MM_SHUFFLE(3,3,3,3));
+                       __m128 b =  _mm_shuffle_ps(p1f, p1f, 
_MM_SHUFFLE(2,2,2,2));
+
+                       __m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
+                       __m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
+                       __m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);
+
+                       r = _mm_unpacklo_ps(r2, g2);    // GG RR GG RR
+                       r = _mm_movelh_ps(r, b2);               // BB BB GG RR
+
+                       __m128 normalize = _mm_load_ps(_normalize);
+                       __m128 max_val = _mm_load_ps(_ones_ps);
+                       __m128 min_val = _mm_setzero_ps();
+                       r = _mm_min_ps(max_val, _mm_max_ps(min_val, 
_mm_mul_ps(normalize, r2)));
+                       __m128 mul_over = _mm_load_ps(_srb_mul_over);
+                       __m128 sub_over = _mm_load_ps(_srb_sub_over);
+                       __m128 pow_over = _mm_load_ps(_srb_pow_over);
+                       __m128 r_gam = _mm_sub_ps(_mm_mul_ps( mul_over, 
_mm_fastpow_ps(r, pow_over)), sub_over);
+                       __m128 junction = _mm_load_ps(_junction_ps);
+                       __m128 mask_r = _mm_cmplt_ps(r, junction);
+                       __m128 mul_under = _mm_load_ps(_srb_mul_under);
+                       __m128 r_mul = _mm_and_ps(mask_r, _mm_mul_ps(mul_under, 
r));
+                       __m128 upscale = _mm_load_ps(_8bit);
+                       r = _mm_mul_ps(upscale, _mm_or_ps(r_mul, 
_mm_andnot_ps(mask_r, r_gam)));
+                       
+                       /* Convert to 8 bit unsigned */
+                       zero = _mm_setzero_si128();
+                       __m128i r_i = _mm_cvtps_epi32(r);
+                       /* To 16 bit signed */
+                       r_i = _mm_packs_epi32(r_i, zero);
+                       /* To 8 bit unsigned - set alpha channel*/
+                       __m128i alpha_mask = 
_mm_load_si128((__m128i*)_alpha_mask);
+                       r_i = _mm_or_si128(alpha_mask, _mm_packus_epi16(r_i, 
zero));
+                       *(int*)o = _mm_cvtsi128_si32(r_i);
+                       i+=4;
+                       o+=4;
+               }
+       }
+}
+
+
+void
+transform8_otherrgb_sse2(ThreadInfo* t)
+{
+       RS_IMAGE16 *input = t->input;
+       GdkPixbuf *output = t->output;
+       RS_MATRIX3 *matrix = t->matrix;
+       gint x,y;
+       gint width;
+
+       float mat_ps[4*4*3] __attribute__ ((aligned (16)));
+       for (x = 0; x < 4; x++ ) {
+               mat_ps[x] = matrix->coeff[0][0];
+               mat_ps[x+4] = matrix->coeff[0][1];
+               mat_ps[x+8] = matrix->coeff[0][2];
+               mat_ps[12+x] = matrix->coeff[1][0];
+               mat_ps[12+x+4] = matrix->coeff[1][1];
+               mat_ps[12+x+8] = matrix->coeff[1][2];
+               mat_ps[24+x] = matrix->coeff[2][0];
+               mat_ps[24+x+4] = matrix->coeff[2][1];
+               mat_ps[24+x+8] = matrix->coeff[2][2];
+       }
+       
+       int start_x = t->start_x;
+       /* Always have aligned input and output adress */
+       if (start_x & 3)
+               start_x = ((start_x) / 4) * 4;
+       
+       int complete_w = t->end_x - start_x;
+       /* If width is not multiple of 4, check if we can extend it a bit */
+       if (complete_w & 3)
+       {
+               if ((t->end_x+4) < input->w)
+                       complete_w = ((complete_w+3) / 4 *4);
+       }
+       __m128 gamma = _mm_set1_ps(t->output_gamma);
+
+       for(y=t->start_y ; y<t->end_y ; y++)
+       {
+               gushort *i = GET_PIXEL(input, start_x, y);
+               guchar *o = GET_PIXBUF_PIXEL(output, start_x, y);
+               gboolean aligned_write = !((guintptr)(o)&0xf);
+
+               width = complete_w >> 2;
+
+               while(width--)
+               {
+                       /* Load and convert to float */
+                       __m128i zero = _mm_setzero_si128();
+                       __m128i in = _mm_load_si128((__m128i*)i); // Load two 
pixels
+                       __m128i in2 = _mm_load_si128((__m128i*)i+1); // Load 
two pixels
+                       _mm_prefetch(i + 64, _MM_HINT_NTA);
+                       __m128i p1 =_mm_unpacklo_epi16(in, zero);
+                       __m128i p2 =_mm_unpackhi_epi16(in, zero);
+                       __m128i p3 =_mm_unpacklo_epi16(in2, zero);
+                       __m128i p4 =_mm_unpackhi_epi16(in2, zero);
+                       __m128 p1f  = _mm_cvtepi32_ps(p1);
+                       __m128 p2f  = _mm_cvtepi32_ps(p2);
+                       __m128 p3f  = _mm_cvtepi32_ps(p3);
+                       __m128 p4f  = _mm_cvtepi32_ps(p4);
+                       
+                       /* Convert to planar */
+                       __m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f);
+                       __m128 b1b0 = _mm_unpackhi_ps(p1f, p2f);
+                       __m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f);
+                       __m128 b3b2 = _mm_unpackhi_ps(p3f, p4f);
+                       __m128 r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2);
+                       __m128 g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0);
+                       __m128 b = _mm_movelh_ps(b1b0, b3b2);
+
+                       /* Apply matrix to convert to sRGB */
+                       __m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
+                       __m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
+                       __m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);
+
+                       /* Normalize to 0->1 and clamp */
+                       __m128 normalize = _mm_load_ps(_normalize);
+                       __m128 max_val = _mm_load_ps(_ones_ps);
+                       __m128 min_val = _mm_setzero_ps();
+                       r = _mm_min_ps(max_val, _mm_max_ps(min_val, 
_mm_mul_ps(normalize, r2)));
+                       g = _mm_min_ps(max_val, _mm_max_ps(min_val, 
_mm_mul_ps(normalize, g2)));
+                       b = _mm_min_ps(max_val, _mm_max_ps(min_val, 
_mm_mul_ps(normalize, b2)));
+
+                       /* Apply Gamma */
+                       __m128 upscale = _mm_load_ps(_8bit);
+                       r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
+                       g = _mm_mul_ps(upscale, _mm_fastpow_ps(g, gamma));
+                       b = _mm_mul_ps(upscale, _mm_fastpow_ps(b, gamma));
+
+                       /* Convert to 8 bit unsigned  and interleave*/
+                       __m128i r_i = _mm_cvtps_epi32(r);
+                       __m128i g_i = _mm_cvtps_epi32(g);
+                       __m128i b_i = _mm_cvtps_epi32(b);
+                       
+                       r_i = _mm_packs_epi32(r_i, r_i);
+                       g_i = _mm_packs_epi32(g_i, g_i);
+                       b_i = _mm_packs_epi32(b_i, b_i);
+
+                       /* Set alpha value to 255 and store */
+                       __m128i alpha_mask = 
_mm_load_si128((__m128i*)_alpha_mask);
+                       __m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
+                       __m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
+                       p1 = _mm_unpacklo_epi32(rg_i, bb_i);
+                       p2 = _mm_unpackhi_epi32(rg_i, bb_i);
+       
+                       p1 = _mm_or_si128(alpha_mask, _mm_packus_epi16(p1, p2));
+
+                       if (aligned_write)
+                               _mm_store_si128((__m128i*)o, p1);
+                       else
+                               _mm_storeu_si128((__m128i*)o, p1);
+
+                       i += 16;
+                       o += 16;
+               }
+               /* Process remaining pixels */
+               width = complete_w & 3;
+               while(width--)
+               {
+                       __m128i zero = _mm_setzero_si128();
+                       __m128i in = _mm_loadl_epi64((__m128i*)i); // Load two 
pixels
+                       __m128i p1 =_mm_unpacklo_epi16(in, zero);
+                       __m128 p1f  = _mm_cvtepi32_ps(p1);
+
+                       /* Splat r,g,b */
+                       __m128 r =  _mm_shuffle_ps(p1f, p1f, 
_MM_SHUFFLE(4,4,4,4));
+                       __m128 g =  _mm_shuffle_ps(p1f, p1f, 
_MM_SHUFFLE(3,3,3,3));
+                       __m128 b =  _mm_shuffle_ps(p1f, p1f, 
_MM_SHUFFLE(2,2,2,2));
+
+                       __m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
+                       __m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
+                       __m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);
+
+                       r = _mm_unpacklo_ps(r2, g2);    // GG RR GG RR
+                       r = _mm_movelh_ps(r, b2);               // BB BB GG RR
+
+                       __m128 normalize = _mm_load_ps(_normalize);
+                       __m128 max_val = _mm_load_ps(_ones_ps);
+                       __m128 min_val = _mm_setzero_ps();
+                       r = _mm_min_ps(max_val, _mm_max_ps(min_val, 
_mm_mul_ps(normalize, r2)));
+                       __m128 upscale = _mm_load_ps(_8bit);
+                       r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
+                       
+                       /* Convert to 8 bit unsigned */
+                       zero = _mm_setzero_si128();
+                       __m128i r_i = _mm_cvtps_epi32(r);
+                       /* To 16 bit signed */
+                       r_i = _mm_packs_epi32(r_i, zero);
+                       /* To 8 bit unsigned - set alpha channel*/
+                       __m128i alpha_mask = 
_mm_load_si128((__m128i*)_alpha_mask);
+                       r_i = _mm_or_si128(alpha_mask, _mm_packus_epi16(r_i, 
zero));
+                       *(int*)o = _mm_cvtsi128_si32(r_i);
+                       i+=4;
+                       o+=4;
+               }
+       }
+}
+
+gboolean cst_has_sse2() 
+{
+       return TRUE;
+}
+
+#else // !defined __SSE2__
+
+/* Provide empty functions if not SSE2 compiled to avoid linker errors */
+
+void transform8_sse2(ThreadInfo* t)
+{
+       /* We should never even get here */
+       g_assert(FALSE);
+}
+
+void
+transform8_otherrgb_sse2(ThreadInfo* t)
+{
+       /* We should never even get here */
+       g_assert(FALSE);
+}
+
+gboolean cst_has_sse2() 
+{
+       return FALSE;
+}
+
+#endif
\ No newline at end of file


_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit

Reply via email to