Author: post
Date: 2010-01-23 14:57:11 +0100 (Sat, 23 Jan 2010)
New Revision: 3057
Added:
branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform_sse2.c
Modified:
branches/rawstudio-ng-color/plugins/colorspace-transform/Makefile.am
branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform.c
Log:
Colorspace transform. Add multithreading, ROI and SSE2 code to all internal 16
bit -> 8 bit conversions. Note that the GdkPixbuffer now has an alpha channel,
which might all to other code changes needed.
Modified: branches/rawstudio-ng-color/plugins/colorspace-transform/Makefile.am
===================================================================
--- branches/rawstudio-ng-color/plugins/colorspace-transform/Makefile.am
2010-01-23 12:55:16 UTC (rev 3056)
+++ branches/rawstudio-ng-color/plugins/colorspace-transform/Makefile.am
2010-01-23 13:57:11 UTC (rev 3057)
@@ -17,6 +17,19 @@
libdir = $(datadir)/rawstudio/plugins/
-colorspace_transform_la_LIBADD = @PACKAGE_LIBS@
+colorspace_transform_la_LIBADD = @PACKAGE_LIBS@ colorspace_transform_sse2.lo
rs-cmm.lo
colorspace_transform_la_LDFLAGS = -module -avoid-version
-colorspace_transform_la_SOURCES = colorspace_transform.c rs-cmm.c rs-cmm.h
+
+colorspace_transform.lo: colorspace_transform.c
+ $(LTCOMPILE) -DEXIT_CODE=0 -c colorspace_transform.c
+
+rs-cmm.lo: rs-cmm.c rs-cmm.h
+ $(LTCOMPILE) -DEXIT_CODE=2 -c rs-cmm.c
+
+colorspace_transform_sse2.lo: colorspace_transform_sse2.c
+if CAN_COMPILE_SSE2
+SSE_FLAG=-msse2
+else
+SSE_FLAG=
+endif
+ $(LTCOMPILE) $(SSE_FLAG) -DEXIT_CODE=1 -c colorspace_transform_sse2.c
Modified:
branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform.c
===================================================================
---
branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform.c
2010-01-23 12:55:16 UTC (rev 3056)
+++
branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform.c
2010-01-23 13:57:11 UTC (rev 3057)
@@ -31,6 +31,23 @@
typedef struct _RSColorspaceTransform RSColorspaceTransform;
typedef struct _RSColorspaceTransformClass RSColorspaceTransformClass;
+typedef struct {
+ RSColorspaceTransform *cst;
+ GThread *threadid;
+ gint start_x;
+ gint start_y;
+ gint end_x;
+ gint end_y;
+ RS_IMAGE16 *input;
+ void *output;
+ RSColorSpace *input_space;
+ RSColorSpace *output_space;
+ RS_MATRIX3 *matrix;
+ gboolean gamma_correct;
+ guchar* table8;
+ gfloat output_gamma;
+} ThreadInfo;
+
struct _RSColorspaceTransform {
RSFilter parent;
@@ -51,10 +68,15 @@
static RSFilterResponse *get_image(RSFilter *filter, const RSFilterRequest
*request);
static RSFilterResponse *get_image8(RSFilter *filter, const RSFilterRequest
*request);
static gboolean convert_colorspace16(RSColorspaceTransform
*colorspace_transform, RS_IMAGE16 *input_image, RS_IMAGE16 *output_image,
RSColorSpace *input_space, RSColorSpace *output_space);
-static void convert_colorspace8(RSColorspaceTransform *colorspace_transform,
RS_IMAGE16 *input_image, GdkPixbuf *output_image, RSColorSpace *input_space,
RSColorSpace *output_space);
+static void convert_colorspace8(RSColorspaceTransform *colorspace_transform,
RS_IMAGE16 *input_image, GdkPixbuf *output_image, RSColorSpace *input_space,
RSColorSpace *output_space, GdkRectangle *roi);
static RSFilterClass *rs_colorspace_transform_parent_class = NULL;
+/* SSE2 optimized functions */
+extern void transform8_srgb_sse2(ThreadInfo* t);
+extern void transform8_otherrgb_sse2(ThreadInfo* t);
+extern gboolean cst_has_sse2();
+
G_MODULE_EXPORT void
rs_plugin_load(RSPlugin *plugin)
{
@@ -106,8 +128,6 @@
if (convert_colorspace16(colorspace_transform, input, output,
input_space, output_space))
{
/* Image was converted */
- printf("\033[33m16 input_space: %s\033[0m\n",
(input_space) ? G_OBJECT_TYPE_NAME(input_space) : "none");
- printf("\033[33m16 output_space: %s\n\033[0m",
(output_space) ? G_OBJECT_TYPE_NAME(output_space) : "none");
response = rs_filter_response_clone(previous_response);
g_object_unref(previous_response);
rs_filter_response_set_image(response, output);
@@ -137,53 +157,60 @@
RSFilterResponse *response;
RS_IMAGE16 *input;
GdkPixbuf *output = NULL;
+ GdkRectangle *roi;
previous_response = rs_filter_get_image(filter->previous, request);
input = rs_filter_response_get_image(previous_response);
if (!RS_IS_IMAGE16(input))
return previous_response;
+ roi = rs_filter_request_get_roi(request);
RSColorSpace *input_space =
rs_filter_param_get_object_with_type(RS_FILTER_PARAM(previous_response),
"colorspace", RS_TYPE_COLOR_SPACE);
RSColorSpace *output_space =
rs_filter_param_get_object_with_type(RS_FILTER_PARAM(request), "colorspace",
RS_TYPE_COLOR_SPACE);
+#if 0
printf("\033[33m8 input_space: %s\033[0m\n", (input_space) ?
G_OBJECT_TYPE_NAME(input_space) : "none");
printf("\033[33m8 output_space: %s\n\033[0m", (output_space) ?
G_OBJECT_TYPE_NAME(output_space) : "none");
+#endif
response = rs_filter_response_clone(previous_response);
g_object_unref(previous_response);
- output = gdk_pixbuf_new(GDK_COLORSPACE_RGB, FALSE, 8, input->w,
input->h);
+ output = gdk_pixbuf_new(GDK_COLORSPACE_RGB, TRUE, 8, input->w,
input->h);
- convert_colorspace8(colorspace_transform, input, output, input_space,
output_space);
+ /* Process output */
+ convert_colorspace8(colorspace_transform, input, output, input_space,
output_space, roi);
rs_filter_response_set_image8(response, output);
g_object_unref(output);
-
- /* Process output */
-
g_object_unref(input);
return response;
}
static void
-transform8_c(RS_IMAGE16 *input, GdkPixbuf *output, RS_MATRIX3 *matrix, guchar
*table8)
+transform8_c(ThreadInfo* t)
{
gint row;
gint r,g,b;
gint width;
RS_MATRIX3Int mati;
-
+ RS_IMAGE16 *input = t->input;
+ GdkPixbuf *output = (GdkPixbuf *)t->output;
+ guchar *table8 = t->table8;
+
+ g_assert(RS_IS_IMAGE16(input));
+ g_assert(GDK_IS_PIXBUF(output));
gint o_channels = gdk_pixbuf_get_n_channels(output);
- matrix3_to_matrix3int(matrix, &mati);
+ matrix3_to_matrix3int(t->matrix, &mati);
- for(row=0 ; row<input->h ; row++)
+ for(row=t->start_y ; row<t->end_y ; row++)
{
- gushort *i = GET_PIXEL(input, 0, row);
- guchar *o = GET_PIXBUF_PIXEL(output, 0, row);
+ gushort *i = GET_PIXEL(input, t->start_x, row);
+ guchar *o = GET_PIXBUF_PIXEL(output, t->start_x, row);
- width = input->w;
+ width = t->end_x - t->start_x;
- while(width--)
+ while(width-- > 0)
{
r =
( i[R] * mati.coeff[0][0]
@@ -208,12 +235,15 @@
o[R] = table8[r];
o[G] = table8[g];
o[B] = table8[b];
+ o[3] = 255;
+
i += input->pixelsize;
o += o_channels;
}
}
}
+
static void
transform16_c(gushort* __restrict input, gushort* __restrict output, gint
num_pixels, const gint pixelsize, RS_MATRIX3 *matrix)
{
@@ -301,8 +331,65 @@
return TRUE;
}
+gpointer
+start_single_cs8_transform_thread(gpointer _thread_info)
+{
+ ThreadInfo* t = _thread_info;
+ RS_IMAGE16 *input_image = t->input;
+ GdkPixbuf *output = (GdkPixbuf*) t->output;
+ RSColorSpace *input_space = t->input_space;
+ RSColorSpace *output_space = t->output_space;
+
+ g_assert(RS_IS_IMAGE16(input_image));
+ g_assert(GDK_IS_PIXBUF(output));
+ g_assert(RS_IS_COLOR_SPACE(input_space));
+ g_assert(RS_IS_COLOR_SPACE(output_space));
+
+ gboolean sse2_available = (!!(rs_detect_cpu_features() &
RS_CPU_FLAG_SSE2)) && cst_has_sse2();
+
+ if (sse2_available && rs_color_space_new_singleton("RSSrgb") ==
output_space)
+ {
+ transform8_srgb_sse2(t);
+ return (NULL);
+ }
+ if (sse2_available && rs_color_space_new_singleton("RSAdobeRGB") ==
output_space)
+ {
+ t->output_gamma = 1.0 / 2.19921875;
+ transform8_otherrgb_sse2(t);
+ return (NULL);
+ }
+ if (sse2_available && rs_color_space_new_singleton("RSProphoto") ==
output_space)
+ {
+ t->output_gamma = 1.0 / 1.8;
+ transform8_otherrgb_sse2(t);
+ return (NULL);
+ }
+
+ /* Fall back to C-functions */
+ /* Calculate our gamma table */
+ const RS1dFunction *input_gamma =
rs_color_space_get_gamma_function(input_space);
+ const RS1dFunction *output_gamma =
rs_color_space_get_gamma_function(output_space);
+ guchar table8[65536];
+ gint i;
+ for(i=0;i<65536;i++)
+ {
+ gdouble nd = ((gdouble) i) * (1.0/65535.0);
+
+ nd = rs_1d_function_evaluate_inverse(input_gamma, nd);
+ nd = rs_1d_function_evaluate(output_gamma, nd);
+
+ /* 8 bit output */
+ gint res = (gint) (nd*255.0 + 0.5f);
+ _CLAMP255(res);
+ table8[i] = res;
+ }
+ t->table8 = table8;
+ transform8_c(t);
+ return (NULL);
+}
+
static void
-convert_colorspace8(RSColorspaceTransform *colorspace_transform, RS_IMAGE16
*input_image, GdkPixbuf *output_image, RSColorSpace *input_space, RSColorSpace
*output_space)
+convert_colorspace8(RSColorspaceTransform *colorspace_transform, RS_IMAGE16
*input_image, GdkPixbuf *output_image, RSColorSpace *input_space, RSColorSpace
*output_space, GdkRectangle *_roi)
{
g_assert(RS_IS_IMAGE16(input_image));
g_assert(GDK_IS_PIXBUF(output_image));
@@ -313,6 +400,16 @@
g_assert(input_image->w == gdk_pixbuf_get_width(output_image));
g_assert(input_image->h == gdk_pixbuf_get_height(output_image));
+ GdkRectangle *roi = _roi;
+ if (!roi)
+ {
+ roi = g_new(GdkRectangle, 1);
+ roi->x = 0;
+ roi->y = 0;
+ roi->width = input_image->w;
+ roi->height = input_image->h;
+ }
+
/* If a CMS is needed, do the transformation using LCMS */
if (RS_COLOR_SPACE_REQUIRES_CMS(input_space) ||
RS_COLOR_SPACE_REQUIRES_CMS(output_space))
{
@@ -332,34 +429,45 @@
{
const RS_MATRIX3 a =
rs_color_space_get_matrix_from_pcs(input_space);
const RS_MATRIX3 b =
rs_color_space_get_matrix_to_pcs(output_space);
+
RS_MATRIX3 mat;
matrix3_multiply(&b, &a, &mat);
- /* Calculate our gamma table */
- /* FIXME: Move this to someplace where we can cache it! */
- guchar table8[65536];
gint i;
+ guint y_offset, y_per_thread, threaded_h;
+ const guint threads = rs_get_number_of_processor_cores();
+ ThreadInfo *t = g_new(ThreadInfo, threads);
- const RS1dFunction *input_gamma =
rs_color_space_get_gamma_function(input_space);
- const RS1dFunction *output_gamma =
rs_color_space_get_gamma_function(output_space);
+ threaded_h = roi->height;
+ y_per_thread = (threaded_h + threads-1)/threads;
+ y_offset = roi->y;
- for(i=0;i<65536;i++)
+ for (i = 0; i < threads; i++)
{
- gdouble nd = ((gdouble) i) * (1.0/65535.0);
+ t[i].input = input_image;
+ t[i].output = output_image;
+ t[i].start_y = y_offset;
+ t[i].start_x = roi->x;
+ t[i].end_x = roi->x + roi->width;
+ t[i].cst = colorspace_transform;
+ t[i].input_space = input_space;
+ t[i].output_space = output_space;
+ y_offset += y_per_thread;
+ y_offset = MIN(input_image->h, y_offset);
+ t[i].end_y = y_offset;
+ t[i].matrix = &mat;
+ t[i].table8 = NULL;
- nd = rs_1d_function_evaluate_inverse(input_gamma, nd);
- nd = rs_1d_function_evaluate(output_gamma, nd);
-
- /* 8 bit output */
- gint res = (gint) (nd*255.0 + 0.5f);
- _CLAMP255(res);
- table8[i] = res;
+ t[i].threadid =
g_thread_create(start_single_cs8_transform_thread, &t[i], TRUE, NULL);
}
- transform8_c(
- input_image,
- output_image,
- &mat,
- table8);
+ /* Wait for threads to finish */
+ for(i = 0; i < threads; i++)
+ g_thread_join(t[i].threadid);
+
+ g_free(t);
}
+ /* If we created the ROI here, free it */
+ if (!_roi)
+ g_free(roi);
}
Added:
branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform_sse2.c
===================================================================
---
branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform_sse2.c
(rev 0)
+++
branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform_sse2.c
2010-01-23 13:57:11 UTC (rev 3057)
@@ -0,0 +1,534 @@
+/*
+ * Copyright (C) 2006-2009 Anders Brander <[email protected]> and
+ * Anders Kvist <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
+ */
+
+/* Plugin tmpl version 5 */
+
+#include <rawstudio.h>
+#include <lcms.h>
+#include "rs-cmm.h"
+
+typedef struct _RSColorspaceTransform RSColorspaceTransform;
+typedef struct _RSColorspaceTransformClass RSColorspaceTransformClass;
+
+typedef struct {
+ RSColorspaceTransform *cst;
+ GThread *threadid;
+ gint start_x;
+ gint start_y;
+ gint end_x;
+ gint end_y;
+ RS_IMAGE16 *input;
+ void *output;
+ RSColorSpace *input_space;
+ RSColorSpace *output_space;
+ RS_MATRIX3 *matrix;
+ gboolean gamma_correct;
+ guchar* table8;
+ gfloat output_gamma;
+} ThreadInfo;
+
+#if defined(__SSE2__)
+
+#include <emmintrin.h>
+
+
+/* SSE2 Polynomial pow function from Mesa3d (MIT License) */
+
+#define EXP_POLY_DEGREE 3
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x),
_mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x),
_mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3),
x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2,
c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1,
c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+static inline __m128
+exp2f4(__m128 x)
+{
+ __m128i ipart;
+ __m128 fpart, expipart, expfpart;
+
+ x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
+ x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
+
+ /* ipart = int(x - 0.5) */
+ ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
+
+ /* fpart = x - ipart */
+ fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
+
+ /* expipart = (float) (1 << ipart) */
+ expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart,
_mm_set1_epi32(127)), 23));
+
+ /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
+#if EXP_POLY_DEGREE == 5
+ expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f,
5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
+#elif EXP_POLY_DEGREE == 4
+ expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f,
5.2011464e-2f, 1.3534167e-2f);
+#elif EXP_POLY_DEGREE == 3
+ expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f,
7.8024521e-2f);
+#elif EXP_POLY_DEGREE == 2
+ expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
+#else
+#error
+#endif
+
+ return _mm_mul_ps(expipart, expfpart);
+}
+
+
+#define LOG_POLY_DEGREE 5
+
+static inline __m128
+log2f4(__m128 x)
+{
+ __m128i exp = _mm_set1_epi32(0x7F800000);
+ __m128i mant = _mm_set1_epi32(0x007FFFFF);
+ __m128 one = _mm_set1_ps( 1.0f);
+ __m128i i = _mm_castps_si128(x);
+ __m128 e =
_mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, exp), 23),
_mm_set1_epi32(127)));
+ __m128 m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mant)), one);
+ __m128 p;
+
+ /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ */
+#if LOG_POLY_DEGREE == 6
+ p = POLY5( m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,
3.1821337e-1f, -3.4436006e-2f);
+#elif LOG_POLY_DEGREE == 5
+ p = POLY4(m, 2.8882704548164776201f, -2.52074962577807006663f,
1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+ p = POLY3(m, 2.61761038894603480148f, -1.75647175389045657003f,
0.688243882994381274313f, -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+ p = POLY2(m, 2.28330284476918490682f, -1.04913055217340124191f,
0.204446009836232697516f);
+#else
+#error
+#endif
+
+ /* This effectively increases the polynomial degree by one, but ensures
that log2(1) == 0*/
+ p = _mm_mul_ps(p, _mm_sub_ps(m, one));
+
+ return _mm_add_ps(p, e);
+}
+
+static inline __m128
+_mm_fastpow_ps(__m128 x, __m128 y)
+{
+ return exp2f4(_mm_mul_ps(log2f4(x), y));
+}
+
+/* END: SSE2 Polynomial pow function from Mesa3d (MIT License) */
+
+
+static inline __m128
+sse_matrix3_mul(float* mul, __m128 a, __m128 b, __m128 c)
+{
+ __m128 v = _mm_load_ps(mul);
+ __m128 acc = _mm_mul_ps(a, v);
+
+ v = _mm_load_ps(mul+4);
+ acc = _mm_add_ps(acc, _mm_mul_ps(b, v));
+
+ v = _mm_load_ps(mul+8);
+ acc = _mm_add_ps(acc, _mm_mul_ps(c, v));
+
+ return acc;
+}
+
+
+static gfloat _junction_ps[4] __attribute__ ((aligned (16))) = {0.0031308,
0.0031308, 0.0031308, 0.0031308};
+static gfloat _normalize[4] __attribute__ ((aligned (16))) = {1.0f/65535.0f,
1.0f/65535.0f, 1.0f/65535.0f, 1.0f/65535.0f};
+static gfloat _8bit[4] __attribute__ ((aligned (16))) = {255.5f, 255.5f,
255.5f, 255.5f};
+static gfloat _ones_ps[4] __attribute__ ((aligned (16))) = {1.0f, 1.0f, 1.0f,
1.0f};
+static gfloat _srb_mul_under[4] __attribute__ ((aligned (16))) = {12.92f,
12.92f, 12.92f, 12.92f};
+static gfloat _srb_mul_over[4] __attribute__ ((aligned (16))) = {1.055f,
1.055f, 1.055f, 1.055f};
+static gfloat _srb_sub_over[4] __attribute__ ((aligned (16))) = {0.055f,
0.055f, 0.055f, 0.055f};
+static gfloat _srb_pow_over[4] __attribute__ ((aligned (16))) = {1.0/2.4,
1.0/2.4, 1.0/2.4, 1.0/2.4};
+static guint _alpha_mask[4] __attribute__ ((aligned (16))) =
{0xff000000,0xff000000,0xff000000,0xff000000};
+
+void
+transform8_srgb_sse2(ThreadInfo* t)
+{
+ RS_IMAGE16 *input = t->input;
+ GdkPixbuf *output = t->output;
+ RS_MATRIX3 *matrix = t->matrix;
+ gint x,y;
+ gint width;
+
+ float mat_ps[4*4*3] __attribute__ ((aligned (16)));
+ for (x = 0; x < 4; x++ ) {
+ mat_ps[x] = matrix->coeff[0][0];
+ mat_ps[x+4] = matrix->coeff[0][1];
+ mat_ps[x+8] = matrix->coeff[0][2];
+ mat_ps[12+x] = matrix->coeff[1][0];
+ mat_ps[12+x+4] = matrix->coeff[1][1];
+ mat_ps[12+x+8] = matrix->coeff[1][2];
+ mat_ps[24+x] = matrix->coeff[2][0];
+ mat_ps[24+x+4] = matrix->coeff[2][1];
+ mat_ps[24+x+8] = matrix->coeff[2][2];
+ }
+
+ int start_x = t->start_x;
+ /* Always have aligned input and output adress */
+ if (start_x & 3)
+ start_x = ((start_x) / 4) * 4;
+
+ int complete_w = t->end_x - start_x;
+ /* If width is not multiple of 4, check if we can extend it a bit */
+ if (complete_w & 3)
+ {
+ if ((t->end_x+4) < input->w)
+ complete_w = ((complete_w+3) / 4 *4);
+ }
+
+ for(y=t->start_y ; y<t->end_y ; y++)
+ {
+ gushort *i = GET_PIXEL(input, start_x, y);
+ guchar *o = GET_PIXBUF_PIXEL(output, start_x, y);
+ gboolean aligned_write = !((guintptr)(o)&0xf);
+
+ width = complete_w >> 2;
+
+ while(width--)
+ {
+ /* Load and convert to float */
+ __m128i zero = _mm_setzero_si128();
+ __m128i in = _mm_load_si128((__m128i*)i); // Load two
pixels
+ __m128i in2 = _mm_load_si128((__m128i*)i+1); // Load
two pixels
+ _mm_prefetch(i + 64, _MM_HINT_NTA);
+ __m128i p1 =_mm_unpacklo_epi16(in, zero);
+ __m128i p2 =_mm_unpackhi_epi16(in, zero);
+ __m128i p3 =_mm_unpacklo_epi16(in2, zero);
+ __m128i p4 =_mm_unpackhi_epi16(in2, zero);
+ __m128 p1f = _mm_cvtepi32_ps(p1);
+ __m128 p2f = _mm_cvtepi32_ps(p2);
+ __m128 p3f = _mm_cvtepi32_ps(p3);
+ __m128 p4f = _mm_cvtepi32_ps(p4);
+
+ /* Convert to planar */
+ __m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f);
+ __m128 b1b0 = _mm_unpackhi_ps(p1f, p2f);
+ __m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f);
+ __m128 b3b2 = _mm_unpackhi_ps(p3f, p4f);
+ __m128 r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2);
+ __m128 g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0);
+ __m128 b = _mm_movelh_ps(b1b0, b3b2);
+
+ /* Apply matrix to convert to sRGB */
+ __m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
+ __m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
+ __m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);
+
+ /* Normalize to 0->1 and clamp */
+ __m128 normalize = _mm_load_ps(_normalize);
+ __m128 max_val = _mm_load_ps(_ones_ps);
+ __m128 min_val = _mm_setzero_ps();
+ r = _mm_min_ps(max_val, _mm_max_ps(min_val,
_mm_mul_ps(normalize, r2)));
+ g = _mm_min_ps(max_val, _mm_max_ps(min_val,
_mm_mul_ps(normalize, g2)));
+ b = _mm_min_ps(max_val, _mm_max_ps(min_val,
_mm_mul_ps(normalize, b2)));
+
+ /* Apply Gamma */
+ /* Calculate values to be used if larger than junction
point */
+ __m128 mul_over = _mm_load_ps(_srb_mul_over);
+ __m128 sub_over = _mm_load_ps(_srb_sub_over);
+ __m128 pow_over = _mm_load_ps(_srb_pow_over);
+ __m128 r_gam = _mm_sub_ps(_mm_mul_ps( mul_over,
_mm_fastpow_ps(r, pow_over)), sub_over);
+ __m128 g_gam = _mm_sub_ps(_mm_mul_ps( mul_over,
_mm_fastpow_ps(g, pow_over)), sub_over);
+ __m128 b_gam = _mm_sub_ps(_mm_mul_ps( mul_over,
_mm_fastpow_ps(b, pow_over)), sub_over);
+
+ /* Create mask for values smaller than junction point */
+ __m128 junction = _mm_load_ps(_junction_ps);
+ __m128 mask_r = _mm_cmplt_ps(r, junction);
+ __m128 mask_g = _mm_cmplt_ps(g, junction);
+ __m128 mask_b = _mm_cmplt_ps(b, junction);
+
+ /* Calculate value to be used if under junction */
+ __m128 mul_under = _mm_load_ps(_srb_mul_under);
+ __m128 r_mul = _mm_and_ps(mask_r, _mm_mul_ps(mul_under,
r));
+ __m128 g_mul = _mm_and_ps(mask_g, _mm_mul_ps(mul_under,
g));
+ __m128 b_mul = _mm_and_ps(mask_b, _mm_mul_ps(mul_under,
b));
+
+ /* Select the value to be used based on the junction
mask and scale to 8 bit */
+ __m128 upscale = _mm_load_ps(_8bit);
+ r = _mm_mul_ps(upscale, _mm_or_ps(r_mul,
_mm_andnot_ps(mask_r, r_gam)));
+ g = _mm_mul_ps(upscale, _mm_or_ps(g_mul,
_mm_andnot_ps(mask_g, g_gam)));
+ b = _mm_mul_ps(upscale, _mm_or_ps(b_mul,
_mm_andnot_ps(mask_b, b_gam)));
+
+ /* Convert to 8 bit unsigned and interleave*/
+ __m128i r_i = _mm_cvtps_epi32(r);
+ __m128i g_i = _mm_cvtps_epi32(g);
+ __m128i b_i = _mm_cvtps_epi32(b);
+
+ r_i = _mm_packs_epi32(r_i, r_i);
+ g_i = _mm_packs_epi32(g_i, g_i);
+ b_i = _mm_packs_epi32(b_i, b_i);
+
+ /* Set alpha value to 255 and store */
+ __m128i alpha_mask =
_mm_load_si128((__m128i*)_alpha_mask);
+ __m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
+ __m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
+ p1 = _mm_unpacklo_epi32(rg_i, bb_i);
+ p2 = _mm_unpackhi_epi32(rg_i, bb_i);
+
+ p1 = _mm_or_si128(alpha_mask, _mm_packus_epi16(p1, p2));
+
+ if (aligned_write)
+ _mm_store_si128((__m128i*)o, p1);
+ else
+ _mm_storeu_si128((__m128i*)o, p1);
+
+ i += 16;
+ o += 16;
+ }
+ /* Process remaining pixels */
+ width = complete_w & 3;
+ while(width--)
+ {
+ __m128i zero = _mm_setzero_si128();
+ __m128i in = _mm_loadl_epi64((__m128i*)i); // Load two
pixels
+ __m128i p1 =_mm_unpacklo_epi16(in, zero);
+ __m128 p1f = _mm_cvtepi32_ps(p1);
+
+ /* Splat r,g,b */
+ __m128 r = _mm_shuffle_ps(p1f, p1f,
_MM_SHUFFLE(4,4,4,4));
+ __m128 g = _mm_shuffle_ps(p1f, p1f,
_MM_SHUFFLE(3,3,3,3));
+ __m128 b = _mm_shuffle_ps(p1f, p1f,
_MM_SHUFFLE(2,2,2,2));
+
+ __m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
+ __m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
+ __m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);
+
+ r = _mm_unpacklo_ps(r2, g2); // GG RR GG RR
+ r = _mm_movelh_ps(r, b2); // BB BB GG RR
+
+ __m128 normalize = _mm_load_ps(_normalize);
+ __m128 max_val = _mm_load_ps(_ones_ps);
+ __m128 min_val = _mm_setzero_ps();
+ r = _mm_min_ps(max_val, _mm_max_ps(min_val,
_mm_mul_ps(normalize, r2)));
+ __m128 mul_over = _mm_load_ps(_srb_mul_over);
+ __m128 sub_over = _mm_load_ps(_srb_sub_over);
+ __m128 pow_over = _mm_load_ps(_srb_pow_over);
+ __m128 r_gam = _mm_sub_ps(_mm_mul_ps( mul_over,
_mm_fastpow_ps(r, pow_over)), sub_over);
+ __m128 junction = _mm_load_ps(_junction_ps);
+ __m128 mask_r = _mm_cmplt_ps(r, junction);
+ __m128 mul_under = _mm_load_ps(_srb_mul_under);
+ __m128 r_mul = _mm_and_ps(mask_r, _mm_mul_ps(mul_under,
r));
+ __m128 upscale = _mm_load_ps(_8bit);
+ r = _mm_mul_ps(upscale, _mm_or_ps(r_mul,
_mm_andnot_ps(mask_r, r_gam)));
+
+ /* Convert to 8 bit unsigned */
+ zero = _mm_setzero_si128();
+ __m128i r_i = _mm_cvtps_epi32(r);
+ /* To 16 bit signed */
+ r_i = _mm_packs_epi32(r_i, zero);
+ /* To 8 bit unsigned - set alpha channel*/
+ __m128i alpha_mask =
_mm_load_si128((__m128i*)_alpha_mask);
+ r_i = _mm_or_si128(alpha_mask, _mm_packus_epi16(r_i,
zero));
+ *(int*)o = _mm_cvtsi128_si32(r_i);
+ i+=4;
+ o+=4;
+ }
+ }
+}
+
+
+void
+transform8_otherrgb_sse2(ThreadInfo* t)
+{
+ RS_IMAGE16 *input = t->input;
+ GdkPixbuf *output = t->output;
+ RS_MATRIX3 *matrix = t->matrix;
+ gint x,y;
+ gint width;
+
+ float mat_ps[4*4*3] __attribute__ ((aligned (16)));
+ for (x = 0; x < 4; x++ ) {
+ mat_ps[x] = matrix->coeff[0][0];
+ mat_ps[x+4] = matrix->coeff[0][1];
+ mat_ps[x+8] = matrix->coeff[0][2];
+ mat_ps[12+x] = matrix->coeff[1][0];
+ mat_ps[12+x+4] = matrix->coeff[1][1];
+ mat_ps[12+x+8] = matrix->coeff[1][2];
+ mat_ps[24+x] = matrix->coeff[2][0];
+ mat_ps[24+x+4] = matrix->coeff[2][1];
+ mat_ps[24+x+8] = matrix->coeff[2][2];
+ }
+
+ int start_x = t->start_x;
+ /* Always have aligned input and output adress */
+ if (start_x & 3)
+ start_x = ((start_x) / 4) * 4;
+
+ int complete_w = t->end_x - start_x;
+ /* If width is not multiple of 4, check if we can extend it a bit */
+ if (complete_w & 3)
+ {
+ if ((t->end_x+4) < input->w)
+ complete_w = ((complete_w+3) / 4 *4);
+ }
+ __m128 gamma = _mm_set1_ps(t->output_gamma);
+
+ for(y=t->start_y ; y<t->end_y ; y++)
+ {
+ gushort *i = GET_PIXEL(input, start_x, y);
+ guchar *o = GET_PIXBUF_PIXEL(output, start_x, y);
+ gboolean aligned_write = !((guintptr)(o)&0xf);
+
+ width = complete_w >> 2;
+
+ while(width--)
+ {
+ /* Load and convert to float */
+ __m128i zero = _mm_setzero_si128();
+ __m128i in = _mm_load_si128((__m128i*)i); // Load two
pixels
+ __m128i in2 = _mm_load_si128((__m128i*)i+1); // Load
two pixels
+ _mm_prefetch(i + 64, _MM_HINT_NTA);
+ __m128i p1 =_mm_unpacklo_epi16(in, zero);
+ __m128i p2 =_mm_unpackhi_epi16(in, zero);
+ __m128i p3 =_mm_unpacklo_epi16(in2, zero);
+ __m128i p4 =_mm_unpackhi_epi16(in2, zero);
+ __m128 p1f = _mm_cvtepi32_ps(p1);
+ __m128 p2f = _mm_cvtepi32_ps(p2);
+ __m128 p3f = _mm_cvtepi32_ps(p3);
+ __m128 p4f = _mm_cvtepi32_ps(p4);
+
+ /* Convert to planar */
+ __m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f);
+ __m128 b1b0 = _mm_unpackhi_ps(p1f, p2f);
+ __m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f);
+ __m128 b3b2 = _mm_unpackhi_ps(p3f, p4f);
+ __m128 r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2);
+ __m128 g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0);
+ __m128 b = _mm_movelh_ps(b1b0, b3b2);
+
+ /* Apply matrix to convert to sRGB */
+ __m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
+ __m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
+ __m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);
+
+ /* Normalize to 0->1 and clamp */
+ __m128 normalize = _mm_load_ps(_normalize);
+ __m128 max_val = _mm_load_ps(_ones_ps);
+ __m128 min_val = _mm_setzero_ps();
+ r = _mm_min_ps(max_val, _mm_max_ps(min_val,
_mm_mul_ps(normalize, r2)));
+ g = _mm_min_ps(max_val, _mm_max_ps(min_val,
_mm_mul_ps(normalize, g2)));
+ b = _mm_min_ps(max_val, _mm_max_ps(min_val,
_mm_mul_ps(normalize, b2)));
+
+ /* Apply Gamma */
+ __m128 upscale = _mm_load_ps(_8bit);
+ r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
+ g = _mm_mul_ps(upscale, _mm_fastpow_ps(g, gamma));
+ b = _mm_mul_ps(upscale, _mm_fastpow_ps(b, gamma));
+
+ /* Convert to 8 bit unsigned and interleave*/
+ __m128i r_i = _mm_cvtps_epi32(r);
+ __m128i g_i = _mm_cvtps_epi32(g);
+ __m128i b_i = _mm_cvtps_epi32(b);
+
+ r_i = _mm_packs_epi32(r_i, r_i);
+ g_i = _mm_packs_epi32(g_i, g_i);
+ b_i = _mm_packs_epi32(b_i, b_i);
+
+ /* Set alpha value to 255 and store */
+ __m128i alpha_mask =
_mm_load_si128((__m128i*)_alpha_mask);
+ __m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
+ __m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
+ p1 = _mm_unpacklo_epi32(rg_i, bb_i);
+ p2 = _mm_unpackhi_epi32(rg_i, bb_i);
+
+ p1 = _mm_or_si128(alpha_mask, _mm_packus_epi16(p1, p2));
+
+ if (aligned_write)
+ _mm_store_si128((__m128i*)o, p1);
+ else
+ _mm_storeu_si128((__m128i*)o, p1);
+
+ i += 16;
+ o += 16;
+ }
+ /* Process remaining pixels */
+ width = complete_w & 3;
+ while(width--)
+ {
+ __m128i zero = _mm_setzero_si128();
+ __m128i in = _mm_loadl_epi64((__m128i*)i); // Load two
pixels
+ __m128i p1 =_mm_unpacklo_epi16(in, zero);
+ __m128 p1f = _mm_cvtepi32_ps(p1);
+
+ /* Splat r,g,b */
+ __m128 r = _mm_shuffle_ps(p1f, p1f,
_MM_SHUFFLE(4,4,4,4));
+ __m128 g = _mm_shuffle_ps(p1f, p1f,
_MM_SHUFFLE(3,3,3,3));
+ __m128 b = _mm_shuffle_ps(p1f, p1f,
_MM_SHUFFLE(2,2,2,2));
+
+ __m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
+ __m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
+ __m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);
+
+ r = _mm_unpacklo_ps(r2, g2); // GG RR GG RR
+ r = _mm_movelh_ps(r, b2); // BB BB GG RR
+
+ __m128 normalize = _mm_load_ps(_normalize);
+ __m128 max_val = _mm_load_ps(_ones_ps);
+ __m128 min_val = _mm_setzero_ps();
+ r = _mm_min_ps(max_val, _mm_max_ps(min_val,
_mm_mul_ps(normalize, r2)));
+ __m128 upscale = _mm_load_ps(_8bit);
+ r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
+
+ /* Convert to 8 bit unsigned */
+ zero = _mm_setzero_si128();
+ __m128i r_i = _mm_cvtps_epi32(r);
+ /* To 16 bit signed */
+ r_i = _mm_packs_epi32(r_i, zero);
+ /* To 8 bit unsigned - set alpha channel*/
+ __m128i alpha_mask =
_mm_load_si128((__m128i*)_alpha_mask);
+ r_i = _mm_or_si128(alpha_mask, _mm_packus_epi16(r_i,
zero));
+ *(int*)o = _mm_cvtsi128_si32(r_i);
+ i+=4;
+ o+=4;
+ }
+ }
+}
+
+gboolean cst_has_sse2()
+{
+ return TRUE;
+}
+
+#else // !defined __SSE2__
+
+/* Provide empty functions if not SSE2 compiled to avoid linker errors */
+
+void transform8_sse2(ThreadInfo* t)
+{
+ /* We should never even get here */
+ g_assert(FALSE);
+}
+
+void
+transform8_otherrgb_sse2(ThreadInfo* t)
+{
+ /* We should never even get here */
+ g_assert(FALSE);
+}
+
+gboolean cst_has_sse2()
+{
+ return FALSE;
+}
+
+#endif
\ No newline at end of file
_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit