Author: post
Date: 2011-04-13 00:32:17 +0200 (Wed, 13 Apr 2011)
New Revision: 3984

Added:
   trunk/plugins/colorspace-transform/colorspace_transform_avx.c
Modified:
   trunk/plugins/colorspace-transform/Makefile.am
   trunk/plugins/colorspace-transform/colorspace_transform.c
   trunk/plugins/colorspace-transform/colorspace_transform.h
Log:
Add AVX compiled version of SSE2 code.

Modified: trunk/plugins/colorspace-transform/Makefile.am
===================================================================
--- trunk/plugins/colorspace-transform/Makefile.am      2011-04-12 22:14:10 UTC 
(rev 3983)
+++ trunk/plugins/colorspace-transform/Makefile.am      2011-04-12 22:32:17 UTC 
(rev 3984)
@@ -17,11 +17,11 @@
 
 libdir = $(datadir)/rawstudio/plugins/
 
-colorspace_transform_la_LIBADD = @PACKAGE_LIBS@ colorspace_transform_sse2.lo 
rs-cmm.lo colorspace_transform-c.lo
+colorspace_transform_la_LIBADD = @PACKAGE_LIBS@ colorspace_transform_avx.lo 
colorspace_transform_sse2.lo rs-cmm.lo colorspace_transform-c.lo
 colorspace_transform_la_LDFLAGS = -module -avoid-version
 colorspace_transform_la_SOURCES = 
 
-EXTRA_DIST = colorspace_transform.c rs-cmm.c rs-cmm.h 
colorspace_transform_sse2.c colorspace_transform.h
+EXTRA_DIST = colorspace_transform.c rs-cmm.c rs-cmm.h 
colorspace_transform_avx.lo colorspace_transform_sse2.c colorspace_transform.h
 
 colorspace_transform-c.lo: colorspace_transform.c colorspace_transform.h
        $(LTCOMPILE) -o colorspace_transform-c.o -c 
$(top_srcdir)/plugins/colorspace-transform/colorspace_transform.c
@@ -36,3 +36,11 @@
 SSE_FLAG=
 endif
        $(LTCOMPILE) $(SSE_FLAG) -c 
$(top_srcdir)/plugins/colorspace-transform/colorspace_transform_sse2.c
+
+colorspace_transform_avx.lo: colorspace_transform_avx.c colorspace_transform.h
+if CAN_COMPILE_AVX
+AVX_FLAG=-mavx
+else
+AVX_FLAG=
+endif
+       $(LTCOMPILE) $(AVX_FLAG) -c 
$(top_srcdir)/plugins/colorspace-transform/colorspace_transform_avx.c

Modified: trunk/plugins/colorspace-transform/colorspace_transform.c
===================================================================
--- trunk/plugins/colorspace-transform/colorspace_transform.c   2011-04-12 
22:14:10 UTC (rev 3983)
+++ trunk/plugins/colorspace-transform/colorspace_transform.c   2011-04-12 
22:32:17 UTC (rev 3984)
@@ -56,6 +56,11 @@
 extern void transform8_otherrgb_sse2(ThreadInfo* t);
 extern gboolean cst_has_sse2(void);
 
+/* AVX optimized functions */
+extern void transform8_srgb_avx(ThreadInfo* t);
+extern void transform8_otherrgb_avx(ThreadInfo* t);
+extern gboolean cst_has_avx(void);
+
 G_MODULE_EXPORT void
 rs_plugin_load(RSPlugin *plugin)
 {
@@ -358,8 +363,27 @@
        g_assert(RS_IS_COLOR_SPACE(input_space));
        g_assert(RS_IS_COLOR_SPACE(output_space));
 
+       gboolean avx_available = (!!(rs_detect_cpu_features() & 
RS_CPU_FLAG_AVX)) && cst_has_avx();
        gboolean sse2_available = (!!(rs_detect_cpu_features() & 
RS_CPU_FLAG_SSE2)) && cst_has_sse2();
 
+       if (avx_available && rs_color_space_new_singleton("RSSrgb") == 
output_space)
+       {
+               transform8_srgb_avx(t);
+               return (NULL);
+       }
+       if (avx_available && rs_color_space_new_singleton("RSAdobeRGB") == 
output_space)
+       {
+               t->output_gamma = 1.0 / 2.19921875;
+               transform8_otherrgb_avx(t);
+               return (NULL);
+       }
+       if (avx_available && rs_color_space_new_singleton("RSProphoto") == 
output_space)
+       {
+               t->output_gamma = 1.0 / 1.8;
+               transform8_otherrgb_avx(t);
+               return (NULL);
+       }
+
        if (sse2_available && rs_color_space_new_singleton("RSSrgb") == 
output_space)
        {
                transform8_srgb_sse2(t);

Modified: trunk/plugins/colorspace-transform/colorspace_transform.h
===================================================================
--- trunk/plugins/colorspace-transform/colorspace_transform.h   2011-04-12 
22:14:10 UTC (rev 3983)
+++ trunk/plugins/colorspace-transform/colorspace_transform.h   2011-04-12 
22:32:17 UTC (rev 3984)
@@ -57,3 +57,8 @@
 void transform8_srgb_sse2(ThreadInfo* t);
 void transform8_otherrgb_sse2(ThreadInfo* t);
 gboolean cst_has_sse2(void);
+
+/* AVX optimized functions */
+void transform8_srgb_avx(ThreadInfo* t);
+void transform8_otherrgb_avx(ThreadInfo* t);
+gboolean cst_has_avx(void);

Added: trunk/plugins/colorspace-transform/colorspace_transform_avx.c
===================================================================
--- trunk/plugins/colorspace-transform/colorspace_transform_avx.c               
                (rev 0)
+++ trunk/plugins/colorspace-transform/colorspace_transform_avx.c       
2011-04-12 22:32:17 UTC (rev 3984)
@@ -0,0 +1,570 @@
+/*
+ * * Copyright (C) 2006-2011 Anders Brander <[email protected]>, 
+ * * Anders Kvist <[email protected]> and Klaus Post <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, 
USA.
+ */
+
+/* Plugin tmpl version 5 */
+
+#include <rawstudio.h>
+#include <lcms.h>
+#include "rs-cmm.h"
+#include "colorspace_transform.h"
+
+#if defined(__AVX__)
+
+#include <emmintrin.h>
+
+
+/* AVX Polynomial pow function from Mesa3d (MIT License) */
+
+#define EXP_POLY_DEGREE 2
+
+#define POLY0(x, c0) _mm_load_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), 
_mm_load_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), 
_mm_load_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), 
x), _mm_load_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, 
c3, c4), x), _mm_load_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, 
c2, c3, c4, c5), x), _mm_load_ps(c0))
+
+static const gfloat exp_p5_0[4] __attribute__ ((aligned (16))) = 
{9.9999994e-1f, 9.9999994e-1f, 9.9999994e-1f, 9.9999994e-1f};
+static const gfloat exp_p5_1[4] __attribute__ ((aligned (16))) = 
{6.9315308e-1f, 6.9315308e-1f, 6.9315308e-1f, 6.9315308e-1f};
+static const gfloat exp_p5_2[4] __attribute__ ((aligned (16))) = 
{2.4015361e-1f,  2.4015361e-1f,  2.4015361e-1f,  2.4015361e-1f};
+static const gfloat exp_p5_3[4] __attribute__ ((aligned (16))) = 
{5.5826318e-2f, 5.5826318e-2f, 5.5826318e-2f, 5.5826318e-2f};
+static const gfloat exp_p5_4[4] __attribute__ ((aligned (16))) = 
{8.9893397e-3f, 8.9893397e-3f, 8.9893397e-3f, 8.9893397e-3f};
+static const gfloat exp_p5_5[4] __attribute__ ((aligned (16))) = 
{1.8775767e-3f, 1.8775767e-3f, 1.8775767e-3f, 1.8775767e-3f};
+
+static const gfloat exp_p4_0[4] __attribute__ ((aligned (16))) = {1.0000026f, 
1.0000026f, 1.0000026f, 1.0000026f};
+static const gfloat exp_p4_1[4] __attribute__ ((aligned (16))) = 
{6.9300383e-1f,  6.9300383e-1f,  6.9300383e-1f,  6.9300383e-1f};
+static const gfloat exp_p4_2[4] __attribute__ ((aligned (16))) = 
{2.4144275e-1f, 2.4144275e-1f, 2.4144275e-1f, 2.4144275e-1f};
+static const gfloat exp_p4_3[4] __attribute__ ((aligned (16))) = 
{5.2011464e-2f, 5.2011464e-2f, 5.2011464e-2f, 5.2011464e-2f};
+static const gfloat exp_p4_4[4] __attribute__ ((aligned (16))) = 
{1.3534167e-2f, 1.3534167e-2f, 1.3534167e-2f, 1.3534167e-2f};
+
+static const gfloat exp_p3_0[4] __attribute__ ((aligned (16))) = 
{9.9992520e-1f, 9.9992520e-1f, 9.9992520e-1f, 9.9992520e-1f};
+static const gfloat exp_p3_1[4] __attribute__ ((aligned (16))) = 
{6.9583356e-1f, 6.9583356e-1f, 6.9583356e-1f, 6.9583356e-1f};
+static const gfloat exp_p3_2[4] __attribute__ ((aligned (16))) = 
{2.2606716e-1f, 2.2606716e-1f, 2.2606716e-1f, 2.2606716e-1f};
+static const gfloat exp_p3_3[4] __attribute__ ((aligned (16))) = 
{7.8024521e-2f, 7.8024521e-2f, 7.8024521e-2f, 7.8024521e-2f};
+
+static const gfloat exp_p2_0[4] __attribute__ ((aligned (16))) = {1.0017247f, 
1.0017247f, 1.0017247f, 1.0017247f};
+static const gfloat exp_p2_1[4] __attribute__ ((aligned (16))) = 
{6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
+static const gfloat exp_p2_2[4] __attribute__ ((aligned (16))) = 
{3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
+
+static const gfloat _ones_ps[4] __attribute__ ((aligned (16))) = {1.0f, 1.0f, 
1.0f, 1.0f};
+static const gfloat _one29_ps[4] __attribute__ ((aligned (16))) = {129.00000f, 
129.00000f, 129.00000f, 129.00000f};
+static const gfloat _minusone27_ps[4] __attribute__ ((aligned (16))) = 
{-126.99999f, -126.99999f, -126.99999f, -126.99999f};
+static const gfloat _half_ps[4] __attribute__ ((aligned (16))) = {0.5f, 0.5f, 
0.5f, 0.5f};
+static const guint _one27[4] __attribute__ ((aligned (16))) = 
{127,127,127,127};
+       
+static inline __m128 
+exp2f4(__m128 x)
+{
+       __m128i ipart;
+       __m128 fpart, expipart, expfpart;
+
+       x = _mm_min_ps(x, _mm_load_ps(_one29_ps));
+       x = _mm_max_ps(x, _mm_load_ps(_minusone27_ps));
+
+       /* ipart = int(x - 0.5) */
+       ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_load_ps(_half_ps)));
+
+       /* fpart = x - ipart */
+       fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
+
+       /* expipart = (float) (1 << ipart) */
+       expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, 
_mm_load_si128((__m128i*)_one27)), 23));
+
+       /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
+#if EXP_POLY_DEGREE == 5
+       expfpart = POLY5(fpart, exp_p5_0, exp_p5_1, exp_p5_2, exp_p5_3, 
exp_p5_4, exp_p5_5);
+#elif EXP_POLY_DEGREE == 4
+       expfpart = POLY4(fpart, exp_p4_0, exp_p4_1, exp_p4_2, exp_p4_3, 
exp_p4_4);
+#elif EXP_POLY_DEGREE == 3
+       expfpart = POLY3(fpart, exp_p3_0, exp_p3_1, exp_p3_2, exp_p3_3);
+#elif EXP_POLY_DEGREE == 2
+       expfpart = POLY2(fpart, exp_p2_0, exp_p2_1, exp_p2_2);
+#else
+#error
+#endif
+
+       return _mm_mul_ps(expipart, expfpart);
+}
+
+
+#define LOG_POLY_DEGREE 4
+
+static const gfloat log_p5_0[4] __attribute__ ((aligned (16))) = {3.1157899f, 
3.1157899f, 3.1157899f, 3.1157899f};
+static const gfloat log_p5_1[4] __attribute__ ((aligned (16))) = {-3.3241990f, 
-3.3241990f, -3.3241990f, -3.3241990f};
+static const gfloat log_p5_2[4] __attribute__ ((aligned (16))) = {2.5988452f, 
2.5988452f, 2.5988452f, 2.5988452f};
+static const gfloat log_p5_3[4] __attribute__ ((aligned (16))) = {-1.2315303f, 
-1.2315303f, -1.2315303f, -1.2315303f};
+static const gfloat log_p5_4[4] __attribute__ ((aligned (16))) = 
{3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
+static const gfloat log_p5_5[4] __attribute__ ((aligned (16))) = 
{-3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
+
+static const gfloat log_p4_0[4] __attribute__ ((aligned (16))) = 
{2.8882704548164776201f, 2.8882704548164776201f, 2.8882704548164776201f, 
2.8882704548164776201f};
+static const gfloat log_p4_1[4] __attribute__ ((aligned (16))) = 
{-2.52074962577807006663f, -2.52074962577807006663f, -2.52074962577807006663f, 
-2.52074962577807006663f};
+static const gfloat log_p4_2[4] __attribute__ ((aligned (16))) = 
{1.48116647521213171641f, 1.48116647521213171641f, 1.48116647521213171641f, 
1.48116647521213171641f};
+static const gfloat log_p4_3[4] __attribute__ ((aligned (16))) = 
{-0.465725644288844778798f, 
-0.465725644288844778798f,-0.465725644288844778798f, -0.465725644288844778798f};
+static const gfloat log_p4_4[4] __attribute__ ((aligned (16))) = 
{0.0596515482674574969533f, 0.0596515482674574969533f, 
0.0596515482674574969533f, 0.0596515482674574969533f};
+
+static const gfloat log_p3_0[4] __attribute__ ((aligned (16))) = 
{2.61761038894603480148f, 2.61761038894603480148f, 2.61761038894603480148f, 
2.61761038894603480148f};
+static const gfloat log_p3_1[4] __attribute__ ((aligned (16))) = 
{-1.75647175389045657003f, -1.75647175389045657003f, -1.75647175389045657003f, 
-1.75647175389045657003f};
+static const gfloat log_p3_2[4] __attribute__ ((aligned (16))) = 
{0.688243882994381274313f, 0.688243882994381274313f, 0.688243882994381274313f, 
0.688243882994381274313f};
+static const gfloat log_p3_3[4] __attribute__ ((aligned (16))) = 
{-0.107254423828329604454f, -0.107254423828329604454f, 
-0.107254423828329604454f, -0.107254423828329604454f};
+
+static const gfloat log_p2_0[4] __attribute__ ((aligned (16))) = 
{2.28330284476918490682f, 2.28330284476918490682f, 2.28330284476918490682f, 
2.28330284476918490682f};
+static const gfloat log_p2_1[4] __attribute__ ((aligned (16))) = 
{-1.04913055217340124191f, -1.04913055217340124191f, -1.04913055217340124191f, 
-1.04913055217340124191f};
+static const gfloat log_p2_2[4] __attribute__ ((aligned (16))) = 
{0.204446009836232697516f, 0.204446009836232697516f, 0.204446009836232697516f, 
0.204446009836232697516f};
+
+static const guint _exp_mask[4] __attribute__ ((aligned (16))) = 
{0x7F800000,0x7F800000,0x7F800000,0x7F800000};
+static const guint _mantissa_mask[4] __attribute__ ((aligned (16))) = 
{0x007FFFFF,0x007FFFFF,0x007FFFFF,0x007FFFFF};
+
+static inline __m128 
+log2f4(__m128 x)
+{
+       __m128i exp = _mm_load_si128((__m128i*)_exp_mask);
+       __m128i mant = _mm_load_si128((__m128i*)_mantissa_mask);
+       __m128 one = _mm_load_ps(_ones_ps);
+       __m128i i = _mm_castps_si128(x);
+       __m128 e = 
_mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, exp), 23), 
_mm_load_si128((__m128i*)_one27)));
+       __m128 m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mant)), one);
+       __m128 p;
+
+       /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ */
+#if LOG_POLY_DEGREE == 6
+       p = POLY5( m, log_p5_0, log_p5_1, log_p5_2, log_p5_3, log_p5_4, 
log_p5_5);
+#elif LOG_POLY_DEGREE == 5
+       p = POLY4(m, log_p4_0, log_p4_1, log_p4_2, log_p4_3, log_p4_4);
+#elif LOG_POLY_DEGREE == 4
+       p = POLY3(m, log_p3_0, log_p3_1, log_p3_2, log_p3_3);
+#elif LOG_POLY_DEGREE == 3
+       p = POLY2(m, log_p2_0, log_p2_1, log_p2_2);
+#else
+#error
+#endif
+
+       /* This effectively increases the polynomial degree by one, but ensures 
that log2(1) == 0*/
+       p = _mm_mul_ps(p, _mm_sub_ps(m, one));
+
+       return _mm_add_ps(p, e);
+}
+
+static inline __m128
+_mm_fastpow_ps(__m128 x, __m128 y)
+{
+       return exp2f4(_mm_mul_ps(log2f4(x), y));
+}
+
+/* END: AVX Polynomial pow function from Mesa3d (MIT License) */
+
+
+static inline __m128
+sse_matrix3_mul(float* mul, __m128 a, __m128 b, __m128 c)
+{
+       __m128 v = _mm_load_ps(mul);
+       __m128 acc = _mm_mul_ps(a, v);
+
+       v = _mm_load_ps(mul+4);
+       acc = _mm_add_ps(acc, _mm_mul_ps(b, v));
+
+       v = _mm_load_ps(mul+8);
+       acc = _mm_add_ps(acc, _mm_mul_ps(c, v));
+
+       return acc;
+}
+
+
+static const gfloat _junction_ps[4] __attribute__ ((aligned (16))) = 
{0.0031308, 0.0031308, 0.0031308, 0.0031308};
+static const gfloat _normalize[4] __attribute__ ((aligned (16))) = 
{1.0f/65535.0f, 1.0f/65535.0f, 1.0f/65535.0f, 1.0f/65535.0f};
+static const gfloat _8bit[4] __attribute__ ((aligned (16))) = {255.5f, 255.5f, 
255.5f, 255.5f};
+static const gfloat _srb_mul_under[4] __attribute__ ((aligned (16))) = 
{12.92f, 12.92f, 12.92f, 12.92f};
+static const gfloat _srb_mul_over[4] __attribute__ ((aligned (16))) = {1.055f, 
1.055f, 1.055f, 1.055f};
+static const gfloat _srb_sub_over[4] __attribute__ ((aligned (16))) = {0.055f, 
0.055f, 0.055f, 0.055f};
+static const gfloat _srb_pow_over[4] __attribute__ ((aligned (16))) = 
{1.0/2.4, 1.0/2.4, 1.0/2.4, 1.0/2.4};
+static const guint _alpha_mask[4] __attribute__ ((aligned (16))) = 
{0xff000000,0xff000000,0xff000000,0xff000000};
+
+void
+transform8_srgb_avx(ThreadInfo* t)
+{
+       RS_IMAGE16 *input = t->input;
+       GdkPixbuf *output = t->output;
+       RS_MATRIX3 *matrix = t->matrix;
+       gint x,y;
+       gint width;
+
+       float mat_ps[4*4*3] __attribute__ ((aligned (16)));
+       for (x = 0; x < 4; x++ ) {
+               mat_ps[x] = matrix->coeff[0][0];
+               mat_ps[x+4] = matrix->coeff[0][1];
+               mat_ps[x+8] = matrix->coeff[0][2];
+               mat_ps[12+x] = matrix->coeff[1][0];
+               mat_ps[12+x+4] = matrix->coeff[1][1];
+               mat_ps[12+x+8] = matrix->coeff[1][2];
+               mat_ps[24+x] = matrix->coeff[2][0];
+               mat_ps[24+x+4] = matrix->coeff[2][1];
+               mat_ps[24+x+8] = matrix->coeff[2][2];
+       }
+       
+       int start_x = t->start_x;
+       /* Always have aligned input and output adress */
+       if (start_x & 3)
+               start_x = ((start_x) / 4) * 4;
+       
+       int complete_w = t->end_x - start_x;
+       /* If width is not multiple of 4, check if we can extend it a bit */
+       if (complete_w & 3)
+       {
+               if ((t->end_x+4) < input->w)
+                       complete_w = (((complete_w + 3) / 4) * 4);
+       }
+       
+       for(y=t->start_y ; y<t->end_y ; y++)
+       {
+               gushort *i = GET_PIXEL(input, start_x, y);
+               guchar *o = GET_PIXBUF_PIXEL(output, start_x, y);
+               gboolean aligned_write = !((guintptr)(o)&0xf);
+
+               width = complete_w >> 2;
+
+               while(width--)
+               {
+                       /* Load and convert to float */
+                       __m128i zero = _mm_setzero_si128();
+                       __m128i in = _mm_load_si128((__m128i*)i); // Load two 
pixels
+                       __m128i in2 = _mm_load_si128((__m128i*)i+1); // Load 
two pixels
+                       _mm_prefetch(i + 64, _MM_HINT_NTA);
+                       __m128i p1 =_mm_unpacklo_epi16(in, zero);
+                       __m128i p2 =_mm_unpackhi_epi16(in, zero);
+                       __m128i p3 =_mm_unpacklo_epi16(in2, zero);
+                       __m128i p4 =_mm_unpackhi_epi16(in2, zero);
+                       __m128 p1f  = _mm_cvtepi32_ps(p1);
+                       __m128 p2f  = _mm_cvtepi32_ps(p2);
+                       __m128 p3f  = _mm_cvtepi32_ps(p3);
+                       __m128 p4f  = _mm_cvtepi32_ps(p4);
+                       
+                       /* Convert to planar */
+                       __m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f);
+                       __m128 b1b0 = _mm_unpackhi_ps(p1f, p2f);
+                       __m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f);
+                       __m128 b3b2 = _mm_unpackhi_ps(p3f, p4f);
+                       __m128 r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2);
+                       __m128 g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0);
+                       __m128 b = _mm_movelh_ps(b1b0, b3b2);
+
+                       /* Apply matrix to convert to sRGB */
+                       __m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
+                       __m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
+                       __m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);
+
+                       /* Normalize to 0->1 and clamp */
+                       __m128 normalize = _mm_load_ps(_normalize);
+                       __m128 max_val = _mm_load_ps(_ones_ps);
+                       __m128 min_val = _mm_setzero_ps();
+                       r = _mm_min_ps(max_val, _mm_max_ps(min_val, 
_mm_mul_ps(normalize, r2)));
+                       g = _mm_min_ps(max_val, _mm_max_ps(min_val, 
_mm_mul_ps(normalize, g2)));
+                       b = _mm_min_ps(max_val, _mm_max_ps(min_val, 
_mm_mul_ps(normalize, b2)));
+
+                       /* Apply Gamma */
+                       /* Calculate values to be used if larger than junction 
point */
+                       __m128 mul_over = _mm_load_ps(_srb_mul_over);
+                       __m128 sub_over = _mm_load_ps(_srb_sub_over);
+                       __m128 pow_over = _mm_load_ps(_srb_pow_over);
+                       __m128 r_gam = _mm_sub_ps(_mm_mul_ps( mul_over, 
_mm_fastpow_ps(r, pow_over)), sub_over);
+                       __m128 g_gam = _mm_sub_ps(_mm_mul_ps( mul_over, 
_mm_fastpow_ps(g, pow_over)), sub_over);
+                       __m128 b_gam = _mm_sub_ps(_mm_mul_ps( mul_over, 
_mm_fastpow_ps(b, pow_over)), sub_over);
+
+                       /* Create mask for values smaller than junction point */
+                       __m128 junction = _mm_load_ps(_junction_ps);
+                       __m128 mask_r = _mm_cmplt_ps(r, junction);
+                       __m128 mask_g = _mm_cmplt_ps(g, junction);
+                       __m128 mask_b = _mm_cmplt_ps(b, junction);
+
+                       /* Calculate value to be used if under junction */
+                       __m128 mul_under = _mm_load_ps(_srb_mul_under);
+                       __m128 r_mul = _mm_and_ps(mask_r, _mm_mul_ps(mul_under, 
r));
+                       __m128 g_mul = _mm_and_ps(mask_g, _mm_mul_ps(mul_under, 
g));
+                       __m128 b_mul = _mm_and_ps(mask_b, _mm_mul_ps(mul_under, 
b));
+
+                       /* Select the value to be used based on the junction 
mask and scale to 8 bit */
+                       __m128 upscale = _mm_load_ps(_8bit);
+                       r = _mm_mul_ps(upscale, _mm_or_ps(r_mul, 
_mm_andnot_ps(mask_r, r_gam)));
+                       g = _mm_mul_ps(upscale, _mm_or_ps(g_mul, 
_mm_andnot_ps(mask_g, g_gam)));
+                       b = _mm_mul_ps(upscale, _mm_or_ps(b_mul, 
_mm_andnot_ps(mask_b, b_gam)));
+                       
+                       /* Convert to 8 bit unsigned  and interleave*/
+                       __m128i r_i = _mm_cvtps_epi32(r);
+                       __m128i g_i = _mm_cvtps_epi32(g);
+                       __m128i b_i = _mm_cvtps_epi32(b);
+                       
+                       r_i = _mm_packs_epi32(r_i, r_i);
+                       g_i = _mm_packs_epi32(g_i, g_i);
+                       b_i = _mm_packs_epi32(b_i, b_i);
+
+                       /* Set alpha value to 255 and store */
+                       __m128i alpha_mask = 
_mm_load_si128((__m128i*)_alpha_mask);
+                       __m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
+                       __m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
+                       p1 = _mm_unpacklo_epi32(rg_i, bb_i);
+                       p2 = _mm_unpackhi_epi32(rg_i, bb_i);
+       
+                       p1 = _mm_or_si128(alpha_mask, _mm_packus_epi16(p1, p2));
+
+                       if (aligned_write)
+                               _mm_store_si128((__m128i*)o, p1);
+                       else
+                               _mm_storeu_si128((__m128i*)o, p1);
+
+                       i += 16;
+                       o += 16;
+               }
+
+               /* Process remaining pixels */
+               width = complete_w & 3;
+
+               while(width--)
+               {
+                       __m128i zero = _mm_setzero_si128();
+                       __m128i in = _mm_loadl_epi64((__m128i*)i); // Load one 
pixel
+                       __m128i p1 =_mm_unpacklo_epi16(in, zero);
+                       __m128 p1f  = _mm_cvtepi32_ps(p1);
+
+                       /* Splat r,g,b */
+                       __m128 r =  _mm_shuffle_ps(p1f, p1f, 
_MM_SHUFFLE(0,0,0,0));
+                       __m128 g =  _mm_shuffle_ps(p1f, p1f, 
_MM_SHUFFLE(1,1,1,1));
+                       __m128 b =  _mm_shuffle_ps(p1f, p1f, 
_MM_SHUFFLE(2,2,2,2));
+
+                       __m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
+                       __m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
+                       __m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);
+
+                       r = _mm_unpacklo_ps(r2, g2);    // RR GG RR GG
+                       r = _mm_movelh_ps(r, b2);               // RR GG BB BB
+
+                       __m128 normalize = _mm_load_ps(_normalize);
+                       __m128 max_val = _mm_load_ps(_ones_ps);
+                       __m128 min_val = _mm_setzero_ps();
+                       r = _mm_min_ps(max_val, _mm_max_ps(min_val, 
_mm_mul_ps(normalize, r)));
+                       __m128 mul_over = _mm_load_ps(_srb_mul_over);
+                       __m128 sub_over = _mm_load_ps(_srb_sub_over);
+                       __m128 pow_over = _mm_load_ps(_srb_pow_over);
+                       __m128 r_gam = _mm_sub_ps(_mm_mul_ps( mul_over, 
_mm_fastpow_ps(r, pow_over)), sub_over);
+                       __m128 junction = _mm_load_ps(_junction_ps);
+                       __m128 mask_r = _mm_cmplt_ps(r, junction);
+                       __m128 mul_under = _mm_load_ps(_srb_mul_under);
+                       __m128 r_mul = _mm_and_ps(mask_r, _mm_mul_ps(mul_under, 
r));
+                       __m128 upscale = _mm_load_ps(_8bit);
+                       r = _mm_mul_ps(upscale, _mm_or_ps(r_mul, 
_mm_andnot_ps(mask_r, r_gam)));
+                       
+                       /* Convert to 8 bit unsigned */
+                       zero = _mm_setzero_si128();
+                       __m128i r_i = _mm_cvtps_epi32(r);
+                       /* To 16 bit signed */
+                       r_i = _mm_packs_epi32(r_i, zero);
+                       /* To 8 bit unsigned - set alpha channel*/
+                       __m128i alpha_mask = 
_mm_load_si128((__m128i*)_alpha_mask);
+                       r_i = _mm_or_si128(alpha_mask, _mm_packus_epi16(r_i, 
zero));
+                       *(int*)o = _mm_cvtsi128_si32(r_i);
+                       i+=4;
+                       o+=4;
+               }
+       }
+}
+
+
+void
+transform8_otherrgb_avx(ThreadInfo* t)
+{
+       RS_IMAGE16 *input = t->input;
+       GdkPixbuf *output = t->output;
+       RS_MATRIX3 *matrix = t->matrix;
+       gint x,y;
+       gint width;
+
+       float mat_ps[4*4*3] __attribute__ ((aligned (16)));
+       for (x = 0; x < 4; x++ ) {
+               mat_ps[x] = matrix->coeff[0][0];
+               mat_ps[x+4] = matrix->coeff[0][1];
+               mat_ps[x+8] = matrix->coeff[0][2];
+               mat_ps[12+x] = matrix->coeff[1][0];
+               mat_ps[12+x+4] = matrix->coeff[1][1];
+               mat_ps[12+x+8] = matrix->coeff[1][2];
+               mat_ps[24+x] = matrix->coeff[2][0];
+               mat_ps[24+x+4] = matrix->coeff[2][1];
+               mat_ps[24+x+8] = matrix->coeff[2][2];
+       }
+       
+       int start_x = t->start_x;
+       /* Always have aligned input and output adress */
+       if (start_x & 3)
+               start_x = ((start_x) / 4) * 4;
+       
+       int complete_w = t->end_x - start_x;
+       /* If width is not multiple of 4, check if we can extend it a bit */
+       if (complete_w & 3)
+       {
+               if ((t->end_x+4) < input->w)
+                       complete_w = ((complete_w+3) / 4 * 4);
+       }
+       __m128 gamma = _mm_set1_ps(t->output_gamma);
+
+       for(y=t->start_y ; y<t->end_y ; y++)
+       {
+               gushort *i = GET_PIXEL(input, start_x, y);
+               guchar *o = GET_PIXBUF_PIXEL(output, start_x, y);
+               gboolean aligned_write = !((guintptr)(o)&0xf);
+
+               width = complete_w >> 2;
+
+               while(width--)
+               {
+                       /* Load and convert to float */
+                       __m128i zero = _mm_setzero_si128();
+                       __m128i in = _mm_load_si128((__m128i*)i); // Load two 
pixels
+                       __m128i in2 = _mm_load_si128((__m128i*)i+1); // Load 
two pixels
+                       _mm_prefetch(i + 64, _MM_HINT_NTA);
+                       __m128i p1 =_mm_unpacklo_epi16(in, zero);
+                       __m128i p2 =_mm_unpackhi_epi16(in, zero);
+                       __m128i p3 =_mm_unpacklo_epi16(in2, zero);
+                       __m128i p4 =_mm_unpackhi_epi16(in2, zero);
+                       __m128 p1f  = _mm_cvtepi32_ps(p1);
+                       __m128 p2f  = _mm_cvtepi32_ps(p2);
+                       __m128 p3f  = _mm_cvtepi32_ps(p3);
+                       __m128 p4f  = _mm_cvtepi32_ps(p4);
+                       
+                       /* Convert to planar */
+                       __m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f);
+                       __m128 b1b0 = _mm_unpackhi_ps(p1f, p2f);
+                       __m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f);
+                       __m128 b3b2 = _mm_unpackhi_ps(p3f, p4f);
+                       __m128 r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2);
+                       __m128 g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0);
+                       __m128 b = _mm_movelh_ps(b1b0, b3b2);
+
+                       /* Apply matrix to convert to sRGB */
+                       __m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
+                       __m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
+                       __m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);
+
+                       /* Normalize to 0->1 and clamp */
+                       __m128 normalize = _mm_load_ps(_normalize);
+                       __m128 max_val = _mm_load_ps(_ones_ps);
+                       __m128 min_val = _mm_setzero_ps();
+                       r = _mm_min_ps(max_val, _mm_max_ps(min_val, 
_mm_mul_ps(normalize, r2)));
+                       g = _mm_min_ps(max_val, _mm_max_ps(min_val, 
_mm_mul_ps(normalize, g2)));
+                       b = _mm_min_ps(max_val, _mm_max_ps(min_val, 
_mm_mul_ps(normalize, b2)));
+
+                       /* Apply Gamma */
+                       __m128 upscale = _mm_load_ps(_8bit);
+                       r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
+                       g = _mm_mul_ps(upscale, _mm_fastpow_ps(g, gamma));
+                       b = _mm_mul_ps(upscale, _mm_fastpow_ps(b, gamma));
+
+                       /* Convert to 8 bit unsigned  and interleave*/
+                       __m128i r_i = _mm_cvtps_epi32(r);
+                       __m128i g_i = _mm_cvtps_epi32(g);
+                       __m128i b_i = _mm_cvtps_epi32(b);
+                       
+                       r_i = _mm_packs_epi32(r_i, r_i);
+                       g_i = _mm_packs_epi32(g_i, g_i);
+                       b_i = _mm_packs_epi32(b_i, b_i);
+
+                       /* Set alpha value to 255 and store */
+                       __m128i alpha_mask = 
_mm_load_si128((__m128i*)_alpha_mask);
+                       __m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
+                       __m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
+                       p1 = _mm_unpacklo_epi32(rg_i, bb_i);
+                       p2 = _mm_unpackhi_epi32(rg_i, bb_i);
+       
+                       p1 = _mm_or_si128(alpha_mask, _mm_packus_epi16(p1, p2));
+
+                       if (aligned_write)
+                               _mm_store_si128((__m128i*)o, p1);
+                       else
+                               _mm_storeu_si128((__m128i*)o, p1);
+
+                       i += 16;
+                       o += 16;
+               }
+               /* Process remaining pixels */
+               width = complete_w & 3;
+               while(width--)
+               {
+                       __m128i zero = _mm_setzero_si128();
+                       __m128i in = _mm_loadl_epi64((__m128i*)i); // Load two 
pixels
+                       __m128i p1 =_mm_unpacklo_epi16(in, zero);
+                       __m128 p1f  = _mm_cvtepi32_ps(p1);
+
+                       /* Splat r,g,b */
+                       __m128 r =  _mm_shuffle_ps(p1f, p1f, 
_MM_SHUFFLE(0,0,0,0));
+                       __m128 g =  _mm_shuffle_ps(p1f, p1f, 
_MM_SHUFFLE(1,1,1,1));
+                       __m128 b =  _mm_shuffle_ps(p1f, p1f, 
_MM_SHUFFLE(2,2,2,2));
+
+                       __m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
+                       __m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
+                       __m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);
+
+                       r = _mm_unpacklo_ps(r2, g2);    // GG RR GG RR
+                       r = _mm_movelh_ps(r, b2);               // BB BB GG RR
+
+                       __m128 normalize = _mm_load_ps(_normalize);
+                       __m128 max_val = _mm_load_ps(_ones_ps);
+                       __m128 min_val = _mm_setzero_ps();
+                       r = _mm_min_ps(max_val, _mm_max_ps(min_val, 
_mm_mul_ps(normalize, r)));
+                       __m128 upscale = _mm_load_ps(_8bit);
+                       r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
+                       
+                       /* Convert to 8 bit unsigned */
+                       zero = _mm_setzero_si128();
+                       __m128i r_i = _mm_cvtps_epi32(r);
+                       /* To 16 bit signed */
+                       r_i = _mm_packs_epi32(r_i, zero);
+                       /* To 8 bit unsigned - set alpha channel*/
+                       __m128i alpha_mask = 
_mm_load_si128((__m128i*)_alpha_mask);
+                       r_i = _mm_or_si128(alpha_mask, _mm_packus_epi16(r_i, 
zero));
+                       *(int*)o = _mm_cvtsi128_si32(r_i);
+                       i+=4;
+                       o+=4;
+               }
+       }
+}
+
+gboolean cst_has_avx(void) 
+{
+       return TRUE;
+}
+
+#else // !defined __AVX__
+
+/* Provide empty functions if not AVX compiled to avoid linker errors */
+
+void
+transform8_srgb_avx(ThreadInfo* t)
+{
+       /* We should never even get here */
+       g_assert(FALSE);
+}
+
+void
+transform8_otherrgb_avx(ThreadInfo* t)
+{
+       /* We should never even get here */
+       g_assert(FALSE);
+}
+
+gboolean cst_has_avx() 
+{
+       return FALSE;
+}
+
+#endif
\ No newline at end of file


_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit

Reply via email to