Author: post
Date: 2011-04-13 00:14:10 +0200 (Wed, 13 Apr 2011)
New Revision: 3983

Added:
   trunk/plugins/dcp/dcp-avx.c
Modified:
   trunk/plugins/dcp/Makefile.am
   trunk/plugins/dcp/dcp.c
   trunk/plugins/dcp/dcp.h
Log:
Add AVX compiled version of the SSE2 code (SSE4 version should be added when it 
has been tested).

Modified: trunk/plugins/dcp/Makefile.am
===================================================================
--- trunk/plugins/dcp/Makefile.am       2011-04-12 21:19:00 UTC (rev 3982)
+++ trunk/plugins/dcp/Makefile.am       2011-04-12 22:14:10 UTC (rev 3983)
@@ -14,10 +14,10 @@
 
 libdir = $(datadir)/rawstudio/plugins/
 
-dcp_la_LIBADD = @PACKAGE_LIBS@ adobe-camera-raw-tone.lo dcp-sse2.lo 
dcp-sse4.lo dcp-c.lo
+dcp_la_LIBADD = @PACKAGE_LIBS@ adobe-camera-raw-tone.lo dcp-sse2.lo 
dcp-sse4.lo dcp-avx.lo dcp-c.lo
 dcp_la_LDFLAGS = -module -avoid-version
 dcp_la_SOURCES = 
-EXTRA_DIST = dcp.c dcp.h dcp-sse2.c dcp-sse4.c adobe-camera-raw-tone.c 
adobe-camera-raw-tone.h
+EXTRA_DIST = dcp.c dcp.h dcp-sse2.c dcp-sse4.c dcp-avx.c 
adobe-camera-raw-tone.c adobe-camera-raw-tone.h
 
 adobe-camera-raw-tone.lo: adobe-camera-raw-tone.c adobe-camera-raw-tone.h
        $(LTCOMPILE) -c $(top_srcdir)/plugins/dcp/adobe-camera-raw-tone.c
@@ -37,8 +37,17 @@
 SSE2_FLAG=
 endif
 
+if CAN_COMPILE_AVX
+AVX_FLAG=-mavx
+else
+AVX_FLAG=
+endif
+
 dcp-sse2.lo: dcp-sse2.c dcp.h
        $(LTCOMPILE) $(SSE2_FLAG) -c $(top_srcdir)/plugins/dcp/dcp-sse2.c
 
 dcp-sse4.lo: dcp-sse4.c dcp.h
        $(LTCOMPILE) $(SSE4_FLAG) -c $(top_srcdir)/plugins/dcp/dcp-sse4.c
+
+dcp-avx.lo: dcp-avx.c dcp.h
+       $(LTCOMPILE) $(AVX_FLAG) -c $(top_srcdir)/plugins/dcp/dcp-avx.c

Added: trunk/plugins/dcp/dcp-avx.c
===================================================================
--- trunk/plugins/dcp/dcp-avx.c                         (rev 0)
+++ trunk/plugins/dcp/dcp-avx.c 2011-04-12 22:14:10 UTC (rev 3983)
@@ -0,0 +1,904 @@
+/*
+ * * Copyright (C) 2006-2011 Anders Brander <[email protected]>,
+ * * Anders Kvist <[email protected]> and Klaus Post <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, 
USA.
+ */
+ 
+#include "dcp.h"
+
+#ifdef __AVX__
+
+#include <emmintrin.h>
+#include <math.h> /* powf() */
+
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+/* We ignore this pragma, because we are casting a pointer from float to int 
to pass a float using */
+/* _mm_insert_epi32, since no-one was kind enough to include "insertps xmm, 
mem32, imm8" */
+/* as a valid intrinsic. So we use the integer equivalent instead */
+
+/* Regarding table lookups: */
+/* We are using double sized tables to avoid cache-splits, */
+/* when looking up curve and rgb_tone */
+
+static gfloat _ones_ps[4] __attribute__ ((aligned (16))) = {1.0f, 1.0f, 1.0f, 
1.0f};
+static gfloat _two_ps[4] __attribute__ ((aligned (16))) = {2.0f, 2.0f, 2.0f, 
2.0f};
+static gfloat _six_ps[4] __attribute__ ((aligned (16))) = {6.0f-1e-15, 
6.0f-1e-15, 6.0f-1e-15, 6.0f-1e-15};
+static gfloat _very_small_ps[4] __attribute__ ((aligned (16))) = {1e-15, 
1e-15, 1e-15, 1e-15};
+static const gfloat _two_to_23_ps[4] __attribute__ ((aligned (16))) = { 
0x1.0p23f, 0x1.0p23f, 0x1.0p23f, 0x1.0p23f };
+static guint _ps_mask_sign[4] __attribute__ ((aligned (16))) = 
{0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff};
+
+#define DW(A) _mm_castps_si128(A)
+#define PS(A) _mm_castsi128_ps(A)
+
+/* Floor for positive numbers */
+static inline __m128 _mm_floor_positive_ps( __m128 v )
+{
+       __m128 two_to_23_ps = _mm_load_ps(_two_to_23_ps);
+       return _mm_sub_ps( _mm_add_ps( v, two_to_23_ps ), two_to_23_ps );
+}
+
+static inline void
+RGBtoHSV_AVX(__m128 *c0, __m128 *c1, __m128 *c2)
+{
+
+       __m128i zero_i = _mm_setzero_si128();
+       __m128 small_ps = _mm_load_ps(_very_small_ps);
+       __m128 ones_ps = _mm_load_ps(_ones_ps);
+       __m128i ps_mask_sign = _mm_load_si128((__m128i*)_ps_mask_sign);
+       
+       // Any number > 1
+       __m128 add_v = _mm_load_ps(_two_ps);
+
+       __m128 r = *c0;
+       __m128 g = *c1;
+       __m128 b = *c2;
+       
+       /* Clamp */
+       r = _mm_min_ps(_mm_max_ps(r, small_ps),ones_ps);
+       g =  _mm_min_ps(_mm_max_ps(g, small_ps),ones_ps);
+       b =  _mm_min_ps(_mm_max_ps(b, small_ps),ones_ps);
+
+       __m128 v = _mm_max_ps(b,_mm_max_ps(r,g));
+       __m128 m = _mm_min_ps(b,_mm_min_ps(r,g));
+       __m128 gap = _mm_sub_ps(v,m);
+       __m128 v_mask = PS(_mm_cmpeq_epi32(_mm_and_si128(DW(gap), 
ps_mask_sign), zero_i));
+       v = _mm_add_ps(v, _mm_and_ps(add_v, v_mask));
+
+       /* Set gap to one where sat = 0, this will avoid divisions by zero, 
these values will not be used */
+       ones_ps = _mm_and_ps(ones_ps, v_mask);
+       gap = _mm_or_ps(gap, ones_ps);
+       /*  gap_inv = 1.0 / gap */
+       __m128 gap_inv = _mm_rcp_ps(gap);
+
+       /* if r == v */
+       /* h = (g - b) / gap; */
+       __m128i mask = _mm_cmpeq_epi32(DW(r), DW(v));
+       __m128 val = _mm_mul_ps(gap_inv, _mm_sub_ps(g, b));
+
+       /* fill h */
+       v = _mm_add_ps(v, _mm_and_ps(add_v, PS(mask)));
+       __m128i h = _mm_and_si128(DW(val), mask);
+
+       /* if g == v */
+       /* h = 2.0f + (b - r) / gap; */
+       __m128 two_ps = _mm_load_ps(_two_ps);
+       mask = _mm_cmpeq_epi32(DW(g), DW(v));
+       val = _mm_sub_ps(b, r);
+       val = _mm_mul_ps(val, gap_inv);
+       val = _mm_add_ps(val, two_ps);
+
+       v = _mm_add_ps(v, _mm_and_ps(add_v, PS(mask)));
+       h = _mm_or_si128(h, _mm_and_si128(DW(val), mask));
+
+       /* If (b == v) */
+       /* h = 4.0f + (r - g) / gap; */
+       __m128 four_ps = _mm_add_ps(two_ps, two_ps);
+       mask = _mm_cmpeq_epi32(DW(b), DW(v));
+       val = _mm_add_ps(four_ps, _mm_mul_ps(gap_inv, _mm_sub_ps(r, g)));
+
+       h = _mm_or_si128(h, _mm_and_si128(DW(val), mask));
+       v = _mm_add_ps(v, _mm_and_ps(add_v, PS(mask)));
+
+       __m128 s;
+       /* Fill s, if gap > 0 */
+       v = _mm_sub_ps(v, add_v);
+       val = _mm_mul_ps(gap,_mm_rcp_ps(v));
+       s = _mm_andnot_ps(v_mask, val );
+
+       /* Check if h < 0 */
+       zero_i = _mm_setzero_si128();
+       __m128i six_ps_i = _mm_load_si128((__m128i*)_six_ps);
+       /* We can use integer comparision, since we are checking if h < 0, 
since the sign bit is same in integer */
+       mask = _mm_cmplt_epi32(h, zero_i);
+       __m128 h2 = _mm_add_ps(PS(h), PS(_mm_and_si128(mask, six_ps_i)));
+
+       *c0 = h2;
+       *c1 = s;
+       *c2 = v;
+}
+
+
+static inline void
+HSVtoRGB_SSE(__m128 *c0, __m128 *c1, __m128 *c2)
+{
+       __m128 h = *c0;
+       __m128 s = *c1;
+       __m128 v = *c2;
+       __m128 r, g, b;
+       
+       /* Convert get the fraction of h
+       * h_fraction = h - floor(h) */
+       __m128 ones_ps = _mm_load_ps(_ones_ps);
+       __m128 h_fraction = _mm_sub_ps(h,_mm_floor_positive_ps(h));
+
+       /* p = v * (1.0f - s)  */
+       __m128 p = _mm_mul_ps(v,  _mm_sub_ps(ones_ps, s));
+       /* q = (v * (1.0f - s * f)) */
+       __m128 q = _mm_mul_ps(v, _mm_sub_ps(ones_ps, _mm_mul_ps(s, 
h_fraction)));
+       /* t = (v * (1.0f - s * (1.0f - f))) */
+       __m128 t = _mm_mul_ps(v, _mm_sub_ps(ones_ps, _mm_mul_ps(s, 
_mm_sub_ps(ones_ps, h_fraction))));
+
+       /* h < 1  (case 0)*/
+       /* case 0: *r = v; *g = t; *b = p; break; */
+       __m128 h_threshold = _mm_add_ps(ones_ps, ones_ps);
+       __m128 out_mask = _mm_cmplt_ps(h, ones_ps);
+       r = _mm_and_ps(v, out_mask);
+       g = _mm_and_ps(t, out_mask);
+       b = _mm_and_ps(p, out_mask);
+
+       /* h < 2 (case 1) */
+       /* case 1: *r = q; *g = v; *b = p; break; */
+       __m128 m = _mm_cmplt_ps(h, h_threshold);
+       h_threshold = _mm_add_ps(h_threshold, ones_ps);
+       m = _mm_andnot_ps(out_mask, m);
+       r = _mm_or_ps(r, _mm_and_ps(q, m));
+       g = _mm_or_ps(g, _mm_and_ps(v, m));
+       b = _mm_or_ps(b, _mm_and_ps(p, m));
+       out_mask = _mm_or_ps(out_mask, m);
+
+       /* h < 3 (case 2)*/
+       /* case 2: *r = p; *g = v; *b = t; break; */
+       m = _mm_cmplt_ps(h, h_threshold);
+       h_threshold = _mm_add_ps(h_threshold, ones_ps);
+       m = _mm_andnot_ps(out_mask, m);
+       r = _mm_or_ps(r, _mm_and_ps(p, m));
+       g = _mm_or_ps(g, _mm_and_ps(v, m));
+       b = _mm_or_ps(b, _mm_and_ps(t, m));
+       out_mask = _mm_or_ps(out_mask, m);
+
+       /* h < 4 (case 3)*/
+       /* case 3: *r = p; *g = q; *b = v; break; */
+       m = _mm_cmplt_ps(h, h_threshold);
+       h_threshold = _mm_add_ps(h_threshold, ones_ps);
+       m = _mm_andnot_ps(out_mask, m);
+       r = _mm_or_ps(r, _mm_and_ps(p, m));
+       g = _mm_or_ps(g, _mm_and_ps(q, m));
+       b = _mm_or_ps(b, _mm_and_ps(v, m));
+       out_mask = _mm_or_ps(out_mask, m);
+
+       /* h < 5 (case 4)*/
+       /* case 4: *r = t; *g = p; *b = v; break; */
+       m = _mm_cmplt_ps(h, h_threshold);
+       m = _mm_andnot_ps(out_mask, m);
+       r = _mm_or_ps(r, _mm_and_ps(t, m));
+       g = _mm_or_ps(g, _mm_and_ps(p, m));
+       b = _mm_or_ps(b, _mm_and_ps(v, m));
+       out_mask = _mm_or_ps(out_mask, m);
+
+
+       /* Remainder (case 5) */
+       /* case 5: *r = v; *g = p; *b = q; break; */
+       r = _mm_or_ps(r, _mm_andnot_ps(out_mask,v));
+       g = _mm_or_ps(g, _mm_andnot_ps(out_mask,p));
+       b = _mm_or_ps(b, _mm_andnot_ps(out_mask,q));
+       
+       *c0 = r;
+       *c1 = g;
+       *c2 = b;
+}
+
+
+static gfloat _mul_hue_ps[4] __attribute__ ((aligned (16))) = {6.0f / 360.0f, 
6.0f / 360.0f, 6.0f / 360.0f, 6.0f / 360.0f};
+static gint _ones_epi32[4] __attribute__ ((aligned (16))) = {1,1,1,1};
+
+static void
+huesat_map_AVX(RSHuesatMap *map, const PrecalcHSM* precalc, __m128 *_h, __m128 
*_s, __m128 *_v)
+{
+       __m128 zero_ps = _mm_setzero_ps();
+       __m128 ones_ps = _mm_load_ps(_ones_ps);
+       
+       __m128 h = *_h;
+       __m128 s = *_s;
+       __m128 v = *_v;
+       
+       /* Clamp - H must be pre-clamped*/
+       s =  _mm_min_ps(_mm_max_ps(s, zero_ps),ones_ps);
+       v =  _mm_min_ps(_mm_max_ps(v, zero_ps),ones_ps);
+       
+       gint xfer_0[4] __attribute__ ((aligned (16)));
+       gint xfer_1[4] __attribute__ ((aligned (16)));
+
+       const RS_VECTOR3 *tableBase = map->deltas;
+
+       __m128 hueShift;
+       __m128 satScale;
+       __m128 valScale;
+
+       if (map->val_divisions < 2)
+       {
+               __m128 hScaled = _mm_mul_ps(h, _mm_load_ps(precalc->hScale));
+               __m128 sScaled = _mm_mul_ps(s,  _mm_load_ps(precalc->sScale));
+
+               __m128i maxHueIndex0 = 
_mm_load_si128((__m128i*)precalc->maxHueIndex0);
+               __m128i maxSatIndex0 = 
_mm_load_si128((__m128i*)precalc->maxSatIndex0);
+               __m128i hIndex0 = _mm_cvttps_epi32( hScaled );
+               __m128i sIndex0 = _mm_cvttps_epi32( sScaled );
+
+               sIndex0 = _mm_min_epi16(sIndex0, maxSatIndex0);
+               __m128i ones_epi32 = _mm_load_si128((__m128i*)_ones_epi32);
+               __m128i hIndex1 = _mm_add_epi32(hIndex0, ones_epi32);
+
+               /* if (hIndex0 >= maxHueIndex0) */
+               __m128i hIndexMask = _mm_cmpgt_epi32( hIndex0, 
_mm_sub_epi32(maxHueIndex0, ones_epi32));
+               hIndex0 = _mm_andnot_si128(hIndexMask, hIndex0);
+               /* hIndex1 = 0; */
+               hIndex1 = _mm_andnot_si128(hIndexMask, hIndex1);
+               /* hIndex0 = maxHueIndex0 */
+               hIndex0 = _mm_or_si128(hIndex0, _mm_and_si128(hIndexMask, 
maxHueIndex0));
+
+               __m128 hFract1 = _mm_sub_ps( hScaled, _mm_cvtepi32_ps(hIndex0));
+               __m128 sFract1 = _mm_sub_ps( sScaled, _mm_cvtepi32_ps(sIndex0));
+               __m128 ones_ps = _mm_load_ps(_ones_ps);
+
+               __m128 hFract0 = _mm_sub_ps(ones_ps, hFract1);
+               __m128 sFract0 = _mm_sub_ps(ones_ps, sFract1);
+               __m128i hueStep = _mm_load_si128((__m128i*)precalc->hueStep);
+               __m128i table_offsets = _mm_add_epi32(sIndex0, 
_mm_mullo_epi16(hIndex0, hueStep));
+               __m128i next_offsets = _mm_add_epi32(sIndex0, 
_mm_mullo_epi16(hIndex1, hueStep));
+
+               _mm_store_si128((__m128i*)xfer_0, table_offsets);
+               _mm_store_si128((__m128i*)xfer_1, next_offsets);
+
+               const RS_VECTOR3 *entry00[4] = { tableBase + xfer_0[0], 
tableBase + xfer_0[1], tableBase + xfer_0[2], tableBase + xfer_0[3]};
+               const RS_VECTOR3 *entry01[4] = { tableBase + xfer_1[0], 
tableBase + xfer_1[1], tableBase + xfer_1[2], tableBase + xfer_1[3]};
+
+               __m128 hs0 = _mm_set_ps(entry00[3]->fHueShift, 
entry00[2]->fHueShift, entry00[1]->fHueShift, entry00[0]->fHueShift);
+               __m128 hs1 = _mm_set_ps(entry01[3]->fHueShift, 
entry01[2]->fHueShift, entry01[1]->fHueShift, entry01[0]->fHueShift);
+               __m128 hueShift0 = _mm_add_ps(_mm_mul_ps(hs0, hFract0), 
_mm_mul_ps(hs1, hFract1));
+               hueShift0 = _mm_mul_ps(hueShift0, sFract0);
+               hs0 = _mm_set_ps(entry00[3][1].fHueShift, 
entry00[2][1].fHueShift, entry00[1][1].fHueShift, entry00[0][1].fHueShift);
+               hs1 = _mm_set_ps(entry01[3][1].fHueShift, 
entry01[2][1].fHueShift, entry01[1][1].fHueShift, entry01[0][1].fHueShift);
+               __m128 hueShift1 = _mm_add_ps(_mm_mul_ps(hs0, hFract0), 
_mm_mul_ps(hs1, hFract1));
+               hueShift = _mm_add_ps(hueShift0, _mm_mul_ps(hueShift1, 
sFract1));
+               
+               __m128 ss0 = _mm_set_ps(entry00[3]->fSatScale, 
entry00[2]->fSatScale, entry00[1]->fSatScale, entry00[0]->fSatScale);
+               __m128 ss1 = _mm_set_ps(entry01[3]->fSatScale, 
entry01[2]->fSatScale, entry01[1]->fSatScale, entry01[0]->fSatScale);
+               __m128 satScale0 = _mm_add_ps(_mm_mul_ps(ss0, hFract0), 
_mm_mul_ps(ss1, hFract1));
+               satScale0 = _mm_mul_ps(satScale0, sFract0);
+               ss0 = _mm_set_ps(entry00[3][1].fSatScale, 
entry00[2][1].fSatScale, entry00[1][1].fSatScale, entry00[0][1].fSatScale);
+               ss1 = _mm_set_ps(entry01[3][1].fSatScale, 
entry01[2][1].fSatScale, entry01[1][1].fSatScale, entry01[0][1].fSatScale);
+               __m128 satScale1 = _mm_add_ps(_mm_mul_ps(ss0, hFract0), 
_mm_mul_ps(ss1, hFract1));
+               satScale = _mm_add_ps(satScale0, _mm_mul_ps(satScale1, 
sFract1));
+
+               __m128 vs0 = _mm_set_ps(entry00[3]->fValScale, 
entry00[2]->fValScale, entry00[1]->fValScale, entry00[0]->fValScale);
+               __m128 vs1 = _mm_set_ps(entry01[3]->fValScale, 
entry01[2]->fValScale, entry01[1]->fValScale, entry01[0]->fValScale);
+               __m128 valScale0 = _mm_add_ps(_mm_mul_ps(vs0, hFract0), 
_mm_mul_ps(vs1, hFract1));
+               valScale0 = _mm_mul_ps(valScale0, sFract0);
+               vs0 = _mm_set_ps(entry00[3][1].fValScale, 
entry00[2][1].fValScale, entry00[1][1].fValScale, entry00[0][1].fValScale);
+               vs1 = _mm_set_ps(entry01[3][1].fValScale, 
entry01[2][1].fValScale, entry01[1][1].fValScale, entry01[0][1].fValScale);
+               __m128 valScale1 = _mm_add_ps(_mm_mul_ps(vs0, hFract0), 
_mm_mul_ps(vs1, hFract1));
+               valScale = _mm_add_ps(valScale0, _mm_mul_ps(valScale1, 
sFract1));
+
+       }
+       else
+       {
+               __m128 hScaled = _mm_mul_ps(h, _mm_load_ps(precalc->hScale));
+               __m128 sScaled = _mm_mul_ps(s,  _mm_load_ps(precalc->sScale));
+               __m128 vScaled = _mm_mul_ps(v,  _mm_load_ps(precalc->vScale));
+
+               __m128i maxHueIndex0 = 
_mm_load_si128((__m128i*)precalc->maxHueIndex0);
+               __m128i maxSatIndex0 = 
_mm_load_si128((__m128i*)precalc->maxSatIndex0);
+               __m128i maxValIndex0 = 
_mm_load_si128((__m128i*)precalc->maxValIndex0);
+               
+               __m128i hIndex0 = _mm_cvttps_epi32(hScaled);
+               __m128i sIndex0 = _mm_cvttps_epi32(sScaled);
+               __m128i vIndex0 = _mm_cvttps_epi32(vScaled);
+
+               // Requires that maxSatIndex0 and sIndex0 can be contained 
within a 16 bit signed word.
+               sIndex0 = _mm_min_epi16(sIndex0, maxSatIndex0);
+               vIndex0 = _mm_min_epi16(vIndex0, maxValIndex0);
+               __m128i ones_epi32 = _mm_load_si128((__m128i*)_ones_epi32);
+               __m128i hIndex1 = _mm_add_epi32(hIndex0, ones_epi32);
+
+               /* if (hIndex0 > (maxHueIndex0 - 1)) */
+               __m128i hIndexMask = _mm_cmpgt_epi32( hIndex0, 
_mm_sub_epi32(maxHueIndex0, ones_epi32));
+               /* Make room in hIndex0 */
+               hIndex0 = _mm_andnot_si128(hIndexMask, hIndex0);
+               /* hIndex1 = 0; */
+               hIndex1 = _mm_andnot_si128(hIndexMask, hIndex1);
+               /* hIndex0 = maxHueIndex0, where hIndex0 >= (maxHueIndex0) */
+               hIndex0 = _mm_or_si128(hIndex0, _mm_and_si128(hIndexMask, 
maxHueIndex0));
+
+               __m128 hFract1 = _mm_sub_ps( hScaled, _mm_cvtepi32_ps(hIndex0));
+               __m128 sFract1 = _mm_sub_ps( sScaled, _mm_cvtepi32_ps(sIndex0));
+               __m128 vFract1 = _mm_sub_ps( vScaled, _mm_cvtepi32_ps(vIndex0));
+               __m128 ones_ps = _mm_load_ps(_ones_ps);
+
+               __m128 hFract0 = _mm_sub_ps(ones_ps, hFract1);
+               __m128 sFract0 = _mm_sub_ps(ones_ps, sFract1);
+               __m128 vFract0 = _mm_sub_ps(ones_ps, vFract1);
+
+               __m128i hueStep = _mm_load_si128((__m128i*)precalc->hueStep);
+               __m128i valStep = _mm_load_si128((__m128i*)precalc->valStep);
+
+               // This requires that hueStep and valStep can be contained in a 
16 bit signed integer.
+               __m128i table_offsets = _mm_add_epi32(sIndex0, 
_mm_mullo_epi16(vIndex0, valStep));
+               __m128i next_offsets = _mm_mullo_epi16(hIndex1, hueStep);
+               next_offsets = _mm_add_epi32(next_offsets, table_offsets);
+               table_offsets = _mm_add_epi32(table_offsets, 
_mm_mullo_epi16(hIndex0, hueStep));
+
+               _mm_store_si128((__m128i*)xfer_0, table_offsets);
+               _mm_store_si128((__m128i*)xfer_1, next_offsets);
+               gint _valStep = precalc->valStep[0];
+               
+               const RS_VECTOR3 *entry00[4] = { tableBase + xfer_0[0], 
tableBase + xfer_0[1], tableBase + xfer_0[2], tableBase + xfer_0[3]};
+               const RS_VECTOR3 *entry01[4] = { tableBase + xfer_1[0], 
tableBase + xfer_1[1], tableBase + xfer_1[2], tableBase + xfer_1[3]};
+               const RS_VECTOR3 *entry10[4] = { entry00[0] + _valStep, 
entry00[1] + _valStep, entry00[2] + _valStep, entry00[3] + _valStep};
+               const RS_VECTOR3 *entry11[4] = { entry01[0] + _valStep, 
entry01[1] + _valStep, entry01[2] + _valStep, entry01[3] + _valStep};
+
+               /* Hue first element */
+               __m128 hs00 = _mm_set_ps(entry00[3]->fHueShift, 
entry00[2]->fHueShift, entry00[1]->fHueShift, entry00[0]->fHueShift);
+               __m128 hs01 = _mm_set_ps(entry01[3]->fHueShift, 
entry01[2]->fHueShift, entry01[1]->fHueShift, entry01[0]->fHueShift);
+               __m128 hs10 = _mm_set_ps(entry10[3]->fHueShift, 
entry10[2]->fHueShift, entry10[1]->fHueShift, entry10[0]->fHueShift);
+               __m128 hs11 = _mm_set_ps(entry11[3]->fHueShift, 
entry11[2]->fHueShift, entry11[1]->fHueShift, entry11[0]->fHueShift);
+               __m128 hueShift0 = _mm_mul_ps(vFract0, 
_mm_add_ps(_mm_mul_ps(hs00, hFract0), _mm_mul_ps(hs01, hFract1)));
+               __m128 hueShift1 = _mm_mul_ps(vFract1, 
_mm_add_ps(_mm_mul_ps(hs10, hFract0), _mm_mul_ps(hs11, hFract1)));
+               hueShift = _mm_mul_ps(sFract0, _mm_add_ps(hueShift0, 
hueShift1));
+
+               /* Hue second element */
+               hs00 = _mm_set_ps(entry00[3][1].fHueShift, 
entry00[2][1].fHueShift, entry00[1][1].fHueShift, entry00[0][1].fHueShift);
+               hs01 = _mm_set_ps(entry01[3][1].fHueShift, 
entry01[2][1].fHueShift, entry01[1][1].fHueShift, entry01[0][1].fHueShift);
+               hs10 = _mm_set_ps(entry10[3][1].fHueShift, 
entry10[2][1].fHueShift, entry10[1][1].fHueShift, entry10[0][1].fHueShift);
+               hs11 = _mm_set_ps(entry11[3][1].fHueShift, 
entry11[2][1].fHueShift, entry11[1][1].fHueShift, entry11[0][1].fHueShift);
+               hueShift0 = _mm_mul_ps(vFract0, _mm_add_ps(_mm_mul_ps(hs00, 
hFract0), _mm_mul_ps(hs01, hFract1)));
+               hueShift1 = _mm_mul_ps(vFract1, _mm_add_ps(_mm_mul_ps(hs10, 
hFract0), _mm_mul_ps(hs11, hFract1)));
+               hueShift = _mm_add_ps(hueShift, _mm_mul_ps(sFract1, 
_mm_add_ps(hueShift0, hueShift1)));
+
+               /* Sat first element */
+               __m128 ss00 = _mm_set_ps(entry00[3]->fSatScale, 
entry00[2]->fSatScale, entry00[1]->fSatScale, entry00[0]->fSatScale);
+               __m128 ss01 = _mm_set_ps(entry01[3]->fSatScale, 
entry01[2]->fSatScale, entry01[1]->fSatScale, entry01[0]->fSatScale);
+               __m128 ss10 = _mm_set_ps(entry10[3]->fSatScale, 
entry10[2]->fSatScale, entry10[1]->fSatScale, entry10[0]->fSatScale);
+               __m128 ss11 = _mm_set_ps(entry11[3]->fSatScale, 
entry11[2]->fSatScale, entry11[1]->fSatScale, entry11[0]->fSatScale);
+               __m128 satScale0 = _mm_mul_ps(vFract0, 
_mm_add_ps(_mm_mul_ps(ss00, hFract0), _mm_mul_ps(ss01, hFract1)));
+               __m128 satScale1 = _mm_mul_ps(vFract1, 
_mm_add_ps(_mm_mul_ps(ss10, hFract0), _mm_mul_ps(ss11, hFract1)));
+               satScale = _mm_mul_ps(sFract0, _mm_add_ps(satScale0, 
satScale1));
+
+               /* Sat second element */
+               ss00 = _mm_set_ps(entry00[3][1].fSatScale, 
entry00[2][1].fSatScale, entry00[1][1].fSatScale, entry00[0][1].fSatScale);
+               ss01 = _mm_set_ps(entry01[3][1].fSatScale, 
entry01[2][1].fSatScale, entry01[1][1].fSatScale, entry01[0][1].fSatScale);
+               ss10 = _mm_set_ps(entry10[3][1].fSatScale, 
entry10[2][1].fSatScale, entry10[1][1].fSatScale, entry10[0][1].fSatScale);
+               ss11 = _mm_set_ps(entry11[3][1].fSatScale, 
entry11[2][1].fSatScale, entry11[1][1].fSatScale, entry11[0][1].fSatScale);
+               satScale0 = _mm_mul_ps(vFract0, _mm_add_ps(_mm_mul_ps(ss00, 
hFract0), _mm_mul_ps(ss01, hFract1)));
+               satScale1 = _mm_mul_ps(vFract1, _mm_add_ps(_mm_mul_ps(ss10, 
hFract0), _mm_mul_ps(ss11, hFract1)));
+               satScale = _mm_add_ps(satScale, _mm_mul_ps(sFract1, 
_mm_add_ps(satScale0, satScale1)));
+
+               /* Val first element */
+               __m128 vs00 = _mm_set_ps(entry00[3]->fValScale, 
entry00[2]->fValScale, entry00[1]->fValScale, entry00[0]->fValScale);
+               __m128 vs01 = _mm_set_ps(entry01[3]->fValScale, 
entry01[2]->fValScale, entry01[1]->fValScale, entry01[0]->fValScale);
+               __m128 vs10 = _mm_set_ps(entry10[3]->fValScale, 
entry10[2]->fValScale, entry10[1]->fValScale, entry10[0]->fValScale);
+               __m128 vs11 = _mm_set_ps(entry11[3]->fValScale, 
entry11[2]->fValScale, entry11[1]->fValScale, entry11[0]->fValScale);
+               __m128 valScale0 = _mm_mul_ps(vFract0, 
_mm_add_ps(_mm_mul_ps(vs00, hFract0), _mm_mul_ps(vs01, hFract1)));
+               __m128 valScale1 = _mm_mul_ps(vFract1, 
_mm_add_ps(_mm_mul_ps(vs10, hFract0), _mm_mul_ps(vs11, hFract1)));
+               valScale = _mm_mul_ps(sFract0, _mm_add_ps(valScale0, 
valScale1));
+
+               /* Val second element */
+               vs00 = _mm_set_ps(entry00[3][1].fValScale, 
entry00[2][1].fValScale, entry00[1][1].fValScale, entry00[0][1].fValScale);
+               vs01 = _mm_set_ps(entry01[3][1].fValScale, 
entry01[2][1].fValScale, entry01[1][1].fValScale, entry01[0][1].fValScale);
+               vs10 = _mm_set_ps(entry10[3][1].fValScale, 
entry10[2][1].fValScale, entry10[1][1].fValScale, entry10[0][1].fValScale);
+               vs11 = _mm_set_ps(entry11[3][1].fValScale, 
entry11[2][1].fValScale, entry11[1][1].fValScale, entry11[0][1].fValScale);
+               valScale0 = _mm_mul_ps(vFract0, _mm_add_ps(_mm_mul_ps(vs00, 
hFract0), _mm_mul_ps(vs01, hFract1)));
+               valScale1 = _mm_mul_ps(vFract1, _mm_add_ps(_mm_mul_ps(vs10, 
hFract0), _mm_mul_ps(vs11, hFract1)));
+               valScale = _mm_add_ps(valScale, _mm_mul_ps(sFract1, 
_mm_add_ps(valScale0, valScale1)));
+       }
+
+       __m128 mul_hue = _mm_load_ps(_mul_hue_ps);
+       ones_ps = _mm_load_ps(_ones_ps);
+       hueShift = _mm_mul_ps(hueShift, mul_hue);
+       s = _mm_min_ps(ones_ps, _mm_mul_ps(s, satScale));
+       v = _mm_min_ps(ones_ps, _mm_mul_ps(v, valScale));
+       h = _mm_add_ps(h, hueShift);
+       *_h = h;
+       *_s = s;
+       *_v = v;
+}
+
+static gfloat _16_bit_ps[4] __attribute__ ((aligned (16))) = {65535.0, 
65535.0, 65535.0, 65535.0};
+static gfloat _thousand_24_ps[4] __attribute__ ((aligned (16))) = 
{1023.99999f, 1023.99999f, 1023.99999f, 1023.99999f};
+
+static inline __m128 
+curve_interpolate_lookup(__m128 value, const gfloat * const tone_lut)
+{
+       int xfer[8] __attribute__ ((aligned (16)));
+       /* Convert v to lookup values and interpolate */
+       __m128 mul = _mm_mul_ps(value, _mm_load_ps(_thousand_24_ps));
+       __m128i lookup = _mm_cvtps_epi32(mul);
+       _mm_store_si128((__m128i*)&xfer[0], lookup);
+
+       /* Calculate fractions */
+       __m128 frac = _mm_sub_ps(mul, _mm_floor_positive_ps(mul));
+       __m128 inv_frac = _mm_sub_ps(_mm_load_ps(_ones_ps), frac);
+
+       /* Load two adjacent curve values and interpolate between them */
+       __m128 p0p1 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&tone_lut[xfer[0]*2]));
+       __m128 p2p3 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&tone_lut[xfer[2]*2]));
+       p0p1 = _mm_loadh_pi(p0p1, (__m64*)&tone_lut[xfer[1]*2]);
+       p2p3 = _mm_loadh_pi(p2p3, (__m64*)&tone_lut[xfer[3]*2]);
+
+       /* Pack all lower values in v0, high in v1 and interpolate */
+       __m128 v0 = _mm_shuffle_ps(p0p1, p2p3, _MM_SHUFFLE(2,0,2,0));
+       __m128 v1 = _mm_shuffle_ps(p0p1, p2p3, _MM_SHUFFLE(3,1,3,1));
+       return _mm_add_ps(_mm_mul_ps(inv_frac, v0), _mm_mul_ps(frac, v1));
+}
+
+static inline void 
+rgb_tone_sse2(__m128* _r, __m128* _g, __m128* _b, const gfloat * const 
tone_lut)
+{
+       __m128 r = *_r;
+       __m128 g = *_g;
+       __m128 b = *_b;
+       __m128 small_ps = _mm_load_ps(_very_small_ps);
+       __m128 ones_ps = _mm_load_ps(_ones_ps);
+       
+       /* Clamp  to avoid lookups out of table */
+       r = _mm_min_ps(_mm_max_ps(r, small_ps),ones_ps);
+       g =  _mm_min_ps(_mm_max_ps(g, small_ps),ones_ps);
+       b =  _mm_min_ps(_mm_max_ps(b, small_ps),ones_ps);
+       
+       /* Find largest and smallest values */
+       __m128 lg = _mm_max_ps(b, _mm_max_ps(r, g));
+       __m128 sm = _mm_min_ps(b, _mm_min_ps(r, g));
+
+       /* Lookup */
+       __m128 LG = curve_interpolate_lookup(lg, tone_lut);
+       __m128 SM = curve_interpolate_lookup(sm, tone_lut);
+
+       /* Create masks for largest, smallest and medium values */
+       /* This is done in integer AVX, since they have double the throughput */
+       __m128i ones = _mm_cmpeq_epi32(DW(r), DW(r));
+       __m128i is_r_lg = _mm_cmpeq_epi32(DW(r), DW(lg));
+       __m128i is_g_lg = _mm_cmpeq_epi32(DW(g), DW(lg));
+       __m128i is_b_lg = _mm_cmpeq_epi32(DW(b), DW(lg));
+       
+       __m128i is_r_sm = _mm_andnot_si128(is_r_lg, _mm_cmpeq_epi32(DW(r), 
DW(sm)));
+       __m128i is_g_sm = _mm_andnot_si128(is_g_lg, _mm_cmpeq_epi32(DW(g), 
DW(sm)));
+       __m128i is_b_sm = _mm_andnot_si128(is_b_lg, _mm_cmpeq_epi32(DW(b), 
DW(sm)));
+       
+       __m128i is_r_md = _mm_xor_si128(ones, _mm_or_si128(is_r_lg, is_r_sm));
+       __m128i is_g_md = _mm_xor_si128(ones, _mm_or_si128(is_g_lg, is_g_sm));
+       __m128i is_b_md = _mm_xor_si128(ones, _mm_or_si128(is_b_lg, is_b_sm));
+
+       /* Find all medium values based on masks */
+       __m128 md = PS(_mm_or_si128(_mm_or_si128(
+                                  _mm_and_si128(DW(r), is_r_md), 
+                                                                
_mm_and_si128(DW(g), is_g_md)),
+                                                                               
           _mm_and_si128(DW(b), is_b_md)));
+
+       /* Calculate tone corrected medium value */
+       __m128 p = _mm_rcp_ps(_mm_sub_ps(lg, sm));
+       __m128 q = _mm_sub_ps(md, sm);
+       __m128 o = _mm_sub_ps(LG, SM);
+       __m128 MD = _mm_add_ps(SM, _mm_mul_ps(o, _mm_mul_ps(p, q)));
+
+       /* Inserted here again, to lighten register presssure */
+       is_r_lg = _mm_cmpeq_epi32(DW(r), DW(lg));
+       is_g_lg = _mm_cmpeq_epi32(DW(g), DW(lg));
+       is_b_lg = _mm_cmpeq_epi32(DW(b), DW(lg));
+
+       /* Combine corrected values to output RGB */
+       r = PS(_mm_or_si128( _mm_or_si128(
+                  _mm_and_si128(DW(LG), is_r_lg),
+                                                _mm_and_si128(DW(SM), 
is_r_sm)), 
+                                                                          
_mm_and_si128(DW(MD), is_r_md)));
+       
+       g = PS(_mm_or_si128( _mm_or_si128(
+                  _mm_and_si128(DW(LG), is_g_lg),
+                                                _mm_and_si128(DW(SM), 
is_g_sm)), 
+                                                                          
_mm_and_si128(DW(MD), is_g_md)));
+       
+       b = PS(_mm_or_si128( _mm_or_si128(
+                  _mm_and_si128(DW(LG), is_b_lg),
+                                                _mm_and_si128(DW(SM), 
is_b_sm)), 
+                                                                          
_mm_and_si128(DW(MD), is_b_md)));
+
+       *_r = r;
+       *_g = g;
+       *_b = b;
+}
+
+#undef DW
+#undef PS
+
+static inline __m128
+sse_matrix3_mul(float* mul, __m128 a, __m128 b, __m128 c)
+{
+
+       __m128 v = _mm_load_ps(mul);
+       __m128 acc = _mm_mul_ps(a, v);
+
+       v = _mm_load_ps(mul+4);
+       acc = _mm_add_ps(acc, _mm_mul_ps(b, v));
+
+       v = _mm_load_ps(mul+8);
+       acc = _mm_add_ps(acc, _mm_mul_ps(c, v));
+
+       return acc;
+}
+
+static gfloat _rgb_div_ps[4] __attribute__ ((aligned (16))) = {1.0/65535.0, 
1.0/65535.0, 1.0/65535.0, 1.0/65535.0};
+static gint _15_bit_epi32[4] __attribute__ ((aligned (16))) = { 32768, 32768, 
32768, 32768};
+static guint _16_bit_sign[4] __attribute__ ((aligned (16))) = 
{0x80008000,0x80008000,0x80008000,0x80008000};
+static gfloat _twofiftysix_ps[4] __attribute__ ((aligned (16))) = 
{255.9999f,255.9999f,255.9999f,255.9999f};
+
+#define SETFLOAT4(N, A, B, C, D) float N[4] __attribute__ ((aligned (16))); \
+N[0] = D; N[1] = C; N[2] = B; N[3] = A;
+
+#define SETFLOAT4_SAME(N, A) float N[4] __attribute__ ((aligned (16))); \
+N[0] = A; N[1] = A; N[2] = A; N[3] = A;
+
+gboolean
+render_AVX(ThreadInfo* t)
+{
+       RS_IMAGE16 *image = t->tmp;
+       RSDcp *dcp = t->dcp;
+       gint x, y;
+       __m128 h, s, v;
+       __m128i p1,p2;
+       __m128 p1f, p2f, p3f, p4f;
+       __m128 r, g, b, r2, g2, b2;
+       __m128i zero;
+       int _mm_rounding = _MM_GET_ROUNDING_MODE();
+       _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
+       __m128 hue_add = _mm_set_ps(dcp->hue, dcp->hue, dcp->hue, dcp->hue);
+       __m128 sat;
+       if (dcp->saturation > 1.0)
+               sat = _mm_set_ps(dcp->saturation-1.0f, dcp->saturation-1.0f, 
dcp->saturation-1.0f, dcp->saturation-1.0f);
+       else 
+               sat = _mm_set_ps(dcp->saturation, dcp->saturation, 
dcp->saturation, dcp->saturation);
+       gboolean do_contrast = (dcp->contrast > 1.001f);
+       gboolean do_highrec = (dcp->contrast < 0.999f);
+       float exposure_simple = MAX(1.0, powf(2.0f, dcp->exposure));
+       float __recover_radius = 0.5 * exposure_simple;
+       SETFLOAT4_SAME(_inv_recover_radius, 1.0f / __recover_radius);
+       SETFLOAT4_SAME(_recover_radius, 1.0 - __recover_radius);
+
+       int xfer[4] __attribute__ ((aligned (16)));
+       SETFLOAT4_SAME(_min_cam, 1.0);
+       SETFLOAT4_SAME(_black_minus_radius, dcp->exposure_black - 
dcp->exposure_radius);
+       SETFLOAT4_SAME(_black_plus_radius, dcp->exposure_black + 
dcp->exposure_radius);
+       SETFLOAT4_SAME(_exposure_black, dcp->exposure_black);
+       SETFLOAT4_SAME(_exposure_slope, dcp->exposure_slope);
+       SETFLOAT4_SAME(_exposure_qscale, dcp->exposure_qscale);
+       SETFLOAT4_SAME(_contrast, dcp->contrast);
+       SETFLOAT4_SAME(_inv_contrast, 1.0f - dcp->contrast);
+       SETFLOAT4_SAME(_cm_r, dcp->channelmixer_red);
+       SETFLOAT4_SAME(_cm_g, dcp->channelmixer_green);
+       SETFLOAT4_SAME(_cm_b, dcp->channelmixer_blue);
+       SETFLOAT4_SAME(_contr_base, 0.5f);
+
+       if (dcp->use_profile)
+       {
+               _min_cam[0] = dcp->camera_white.x;
+               _min_cam[1] = dcp->camera_white.y;
+               _min_cam[2] = dcp->camera_white.z;
+               _min_cam[3] = 0.0;
+       }
+
+       float cam_prof[4*4*3] __attribute__ ((aligned (16)));
+       for (x = 0; x < 4; x++ ) {
+               cam_prof[x] = dcp->camera_to_prophoto.coeff[0][0] * 
dcp->channelmixer_red;
+               cam_prof[x+4] = dcp->camera_to_prophoto.coeff[0][1] * 
dcp->channelmixer_red;
+               cam_prof[x+8] = dcp->camera_to_prophoto.coeff[0][2] * 
dcp->channelmixer_red;
+               cam_prof[12+x] = dcp->camera_to_prophoto.coeff[1][0] * 
dcp->channelmixer_green;
+               cam_prof[12+x+4] = dcp->camera_to_prophoto.coeff[1][1] * 
dcp->channelmixer_green;
+               cam_prof[12+x+8] = dcp->camera_to_prophoto.coeff[1][2] * 
dcp->channelmixer_green;
+               cam_prof[24+x] = dcp->camera_to_prophoto.coeff[2][0] * 
dcp->channelmixer_blue;
+               cam_prof[24+x+4] = dcp->camera_to_prophoto.coeff[2][1] * 
dcp->channelmixer_blue;
+               cam_prof[24+x+8] = dcp->camera_to_prophoto.coeff[2][2]* 
dcp->channelmixer_blue;
+       }
+       
+       gint end_x = image->w - (image->w & 3);
+
+       for(y = t->start_y ; y < t->end_y; y++)
+       {
+               __m128i* pixel = (__m128i*)GET_PIXEL(image, 0, y);
+
+               /* Prefetch next line */
+               _mm_prefetch((char*)(pixel)+image->rowstride*2, _MM_HINT_NTA);
+
+               for(x=0; x < end_x; x+=4)
+               {
+
+                       zero = _mm_setzero_si128();
+
+                       /* Convert to float */
+                       p1 = _mm_load_si128(pixel);
+                       p2 = _mm_load_si128(pixel + 1);
+                       _mm_prefetch((char*)(pixel+4), _MM_HINT_NTA);
+
+                       /* Unpack to R G B x */
+                       p2f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(p1, zero));
+                       p4f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(p2, zero));
+                       p1f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(p1, zero));
+                       p3f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(p2, zero));
+
+                       /* Normalize to 0 to 1 range */
+                       __m128 rgb_div = _mm_load_ps(_rgb_div_ps);
+                       p1f = _mm_mul_ps(p1f, rgb_div);
+                       p2f = _mm_mul_ps(p2f, rgb_div);
+                       p3f = _mm_mul_ps(p3f, rgb_div);
+                       p4f = _mm_mul_ps(p4f, rgb_div);
+
+                       if (dcp->use_profile)
+                       {
+                               /* Restric to camera white */
+                               __m128 min_cam = _mm_load_ps(_min_cam);
+                               p1f = _mm_min_ps(p1f, min_cam);
+                               p2f = _mm_min_ps(p2f, min_cam);
+                               p3f = _mm_min_ps(p3f, min_cam);
+                               p4f = _mm_min_ps(p4f, min_cam);
+                       }
+
+                       /* Convert to planar */
+                       __m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f);
+                       __m128 b1b0 = _mm_unpackhi_ps(p1f, p2f);
+                       __m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f);
+                       __m128 b3b2 = _mm_unpackhi_ps(p3f, p4f);
+                       r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2);
+                       g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0);
+                       b = _mm_movelh_ps(b1b0, b3b2);
+
+                       /* Convert to Prophoto */
+                       r2 = sse_matrix3_mul(cam_prof, r, g, b);
+                       g2 = sse_matrix3_mul(&cam_prof[12], r, g, b);
+                       b2 = sse_matrix3_mul(&cam_prof[24], r, g, b);
+
+                       RGBtoHSV_AVX(&r2, &g2, &b2);
+                       h = r2; s = g2; v = b2;
+
+                       if (dcp->huesatmap)
+                       {
+                               huesat_map_AVX(dcp->huesatmap, 
dcp->huesatmap_precalc, &h, &s, &v);
+                       }
+
+                       /* Saturation */
+                       __m128 max_val = _mm_load_ps(_ones_ps);
+                       __m128 min_val = _mm_load_ps(_very_small_ps);
+                       if (dcp->saturation > 1.0)
+                       {
+                               __m128 two_ps = _mm_load_ps(_two_ps);
+                               __m128 ones_ps = _mm_load_ps(_ones_ps);
+                               /*  out = (sat) * (x*2-x^2.0) + ((1.0-sat)*x) */
+                               __m128 s_curved = _mm_mul_ps(sat, 
_mm_sub_ps(_mm_mul_ps(s, two_ps), _mm_mul_ps(s,s)));
+                               s = _mm_min_ps(max_val, _mm_add_ps(s_curved, 
_mm_mul_ps(s, _mm_sub_ps(ones_ps, sat))));
+                       } 
+                       else
+                       {
+                               s = _mm_max_ps(min_val, _mm_min_ps(max_val, 
_mm_mul_ps(s, sat)));
+                       }
+
+                       /* Hue */
+                       __m128 six_ps = _mm_load_ps(_six_ps);
+                       __m128 zero_ps = _mm_setzero_ps();
+                       h = _mm_add_ps(h, hue_add);
+
+                       /* Check if hue >= 6 or < 0*/
+                       __m128 h_mask_gt = _mm_cmpge_ps(h, six_ps);
+                       __m128 h_mask_lt = _mm_cmplt_ps(h, zero_ps);
+                       __m128 six_masked_gt = _mm_and_ps(six_ps, h_mask_gt);
+                       __m128 six_masked_lt = _mm_and_ps(six_ps, h_mask_lt);
+                       h = _mm_sub_ps(h, six_masked_gt);
+                       h = _mm_add_ps(h, six_masked_lt);
+                       __m128 v_stored = v;
+
+                       HSVtoRGB_SSE(&h, &s, &v);
+                       r = h; g = s; b = v;
+                       
+                       /* Exposure */
+                       /* y = x - (dcp->exposure_black - 
dcp->exposure_radius);        */
+                       /* x = dcp->exposure_qscale * y * y;                    
                        */
+                       __m128 black_minus_radius = 
_mm_load_ps(_black_minus_radius);
+                       __m128 y_r = _mm_sub_ps(r, black_minus_radius);
+                       __m128 y_g = _mm_sub_ps(g, black_minus_radius);
+                       __m128 y_b = _mm_sub_ps(b, black_minus_radius);
+
+                       __m128 exposure_qscale = _mm_load_ps(_exposure_qscale);
+                       y_r = _mm_mul_ps(exposure_qscale,_mm_mul_ps(y_r, y_r));
+                       y_g = _mm_mul_ps(exposure_qscale,_mm_mul_ps(y_g, y_g));
+                       y_b = _mm_mul_ps(exposure_qscale,_mm_mul_ps(y_b, y_b));
+
+                       /* if (x >= dcp->exposure_black + dcp->exposure_radius) 
                */
+                       /*              x =  (x - dcp->exposure_black) * 
dcp->exposure_slope;   */                      
+                       __m128 exposure_slope = _mm_load_ps(_exposure_slope);
+                       __m128 exposure_black = _mm_load_ps(_exposure_black);
+                       __m128 y2_r = _mm_mul_ps(exposure_slope, _mm_sub_ps(r, 
exposure_black));
+                       __m128 y2_g = _mm_mul_ps(exposure_slope, _mm_sub_ps(g, 
exposure_black));
+                       __m128 y2_b = _mm_mul_ps(exposure_slope, _mm_sub_ps(b, 
exposure_black));
+                                               
+                       __m128 black_plus_radius = 
_mm_load_ps(_black_plus_radius);
+                       __m128 r_mask = _mm_cmpgt_ps(r, black_plus_radius);
+                       __m128 g_mask = _mm_cmpgt_ps(g, black_plus_radius);
+                       __m128 b_mask = _mm_cmpgt_ps(b, black_plus_radius);
+                       y_r = _mm_andnot_ps(r_mask, y_r);
+                       y_g = _mm_andnot_ps(g_mask, y_g);
+                       y_b = _mm_andnot_ps(b_mask, y_b);
+                       y_r = _mm_or_ps(y_r, _mm_and_ps(r_mask, y2_r));
+                       y_g = _mm_or_ps(y_g, _mm_and_ps(g_mask, y2_g));
+                       y_b = _mm_or_ps(y_b, _mm_and_ps(b_mask, y2_b));
+
+                       /* if (x <= dcp->exposure_black - dcp->exposure_radius) 
x = 0; */
+                       black_minus_radius = _mm_load_ps(_black_minus_radius);
+                       r_mask = _mm_cmple_ps(r, black_minus_radius);
+                       g_mask = _mm_cmple_ps(g, black_minus_radius);
+                       b_mask = _mm_cmple_ps(b, black_minus_radius);
+                       r = _mm_andnot_ps(r_mask, y_r);
+                       g = _mm_andnot_ps(g_mask, y_g);
+                       b = _mm_andnot_ps(b_mask, y_b);
+
+                       /* Contrast in gamma 2.0 */
+                       if (do_contrast)
+                       {
+                               __m128 contr_base = _mm_load_ps(_contr_base);
+                               __m128 contrast = _mm_load_ps(_contrast);
+                               min_val = _mm_load_ps(_very_small_ps);
+                               r = _mm_max_ps(r, min_val);
+                               g = _mm_max_ps(g, min_val);
+                               b = _mm_max_ps(b, min_val);
+                               r = _mm_add_ps(_mm_mul_ps(contrast, 
_mm_sub_ps(_mm_rcp_ps(_mm_rsqrt_ps(r)), contr_base)), contr_base);
+                               g = _mm_add_ps(_mm_mul_ps(contrast, 
_mm_sub_ps(_mm_rcp_ps(_mm_rsqrt_ps(g)), contr_base)), contr_base);
+                               b = _mm_add_ps(_mm_mul_ps(contrast, 
_mm_sub_ps(_mm_rcp_ps(_mm_rsqrt_ps(b)), contr_base)), contr_base);
+                               r = _mm_max_ps(r, min_val);
+                               g = _mm_max_ps(g, min_val);
+                               b = _mm_max_ps(b, min_val);
+                               r = _mm_mul_ps(r,r);
+                               g = _mm_mul_ps(g,g);
+                               b = _mm_mul_ps(b,b);
+                       }
+                       else if (do_highrec)
+                       {
+                               max_val = _mm_load_ps(_ones_ps);
+                               __m128 inv_contrast = 
_mm_load_ps(_inv_contrast);
+                               __m128 recover_radius = 
_mm_load_ps(_recover_radius);
+                               __m128 inv_recover_radius = 
_mm_load_ps(_inv_recover_radius);
+
+                               /* Distance from 1.0 - radius */
+                               __m128 dist = _mm_sub_ps(v_stored, 
recover_radius);
+                               /* Scale so distance is normalized, clamp */
+                               __m128 dist_scaled = _mm_min_ps(max_val, 
_mm_mul_ps(dist, inv_recover_radius));
+
+                               __m128 mul_val = _mm_sub_ps(max_val, 
_mm_mul_ps(dist_scaled, inv_contrast));
+
+                               r = _mm_mul_ps(r, mul_val);
+                               g = _mm_mul_ps(g, mul_val);
+                               b = _mm_mul_ps(b, mul_val);
+                       }
+
+                       /* Convert to HSV */
+                       RGBtoHSV_AVX(&r, &g, &b);
+                       h = r; s = g; v = b;
+
+                       if (!dcp->curve_is_flat)                        
+                       {
+                               /* Convert v to lookup values and interpolate */
+                               __m128 v_mul = _mm_mul_ps(v, 
_mm_load_ps(_twofiftysix_ps));
+                               __m128i lookup = _mm_cvtps_epi32(v_mul);
+                               _mm_store_si128((__m128i*)&xfer[0], lookup);
+
+                               /* Calculate fractions */
+                               __m128 frac = _mm_sub_ps(v_mul, 
_mm_floor_positive_ps(v_mul));
+                               __m128 inv_frac = 
_mm_sub_ps(_mm_load_ps(_ones_ps), frac);
+                               
+                               /* Load two adjacent curve values and 
interpolate between them */
+                               __m128 p0p1 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&dcp->curve_samples[xfer[0]*2]));
+                               __m128 p2p3 = 
_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&dcp->curve_samples[xfer[2]*2]));
+                               p0p1 = _mm_loadh_pi(p0p1, 
(__m64*)&dcp->curve_samples[xfer[1]*2]);
+                               p2p3 = _mm_loadh_pi(p2p3, 
(__m64*)&dcp->curve_samples[xfer[3]*2]);
+                               
+                               /* Pack all lower values in v0, high in v1 and 
interpolate */
+                               __m128 v0 = _mm_shuffle_ps(p0p1, p2p3, 
_MM_SHUFFLE(2,0,2,0));
+                               __m128 v1 = _mm_shuffle_ps(p0p1, p2p3, 
_MM_SHUFFLE(3,1,3,1));
+                               v = _mm_add_ps(_mm_mul_ps(inv_frac, v0), 
_mm_mul_ps(frac, v1));
+                       }
+
+                       /* Apply looktable */
+                       if (dcp->looktable) {
+                               huesat_map_AVX(dcp->looktable, 
dcp->looktable_precalc, &h, &s, &v);
+                       }
+                       
+                       /* Ensure that hue is within range */
+                       zero_ps = _mm_setzero_ps();
+                       h_mask_gt = _mm_cmpge_ps(h, six_ps);
+                       h_mask_lt = _mm_cmplt_ps(h, zero_ps);
+                       six_masked_gt = _mm_and_ps(six_ps, h_mask_gt);
+                       six_masked_lt = _mm_and_ps(six_ps, h_mask_lt);
+                       h = _mm_sub_ps(h, six_masked_gt);
+                       h = _mm_add_ps(h, six_masked_lt);
+                       
+                       /* s always slightly > 0 when converting to RGB */
+                       s = _mm_max_ps(s, min_val);
+
+                       HSVtoRGB_SSE(&h, &s, &v);
+                       r = h; g = s; b = v;
+
+                       /* Apply Tone Curve  in RGB space*/
+                       if (dcp->tone_curve_lut) 
+                       {
+                               rgb_tone_sse2( &r, &g, &b, dcp->tone_curve_lut);
+                       }
+
+                       /* Convert to 16 bit */
+                       __m128 rgb_mul = _mm_load_ps(_16_bit_ps);
+                       r = _mm_mul_ps(r, rgb_mul);
+                       g = _mm_mul_ps(g, rgb_mul);
+                       b = _mm_mul_ps(b, rgb_mul);
+                       
+                       __m128i r_i = _mm_cvtps_epi32(r);
+                       __m128i g_i = _mm_cvtps_epi32(g);
+                       __m128i b_i = _mm_cvtps_epi32(b);
+                       __m128i sub_32 = 
_mm_load_si128((__m128i*)_15_bit_epi32);
+                       __m128i signxor = 
_mm_load_si128((__m128i*)_16_bit_sign);
+
+                       /* Subtract 32768 to avoid saturation */
+                       r_i = _mm_sub_epi32(r_i, sub_32);
+                       g_i = _mm_sub_epi32(g_i, sub_32);
+                       b_i = _mm_sub_epi32(b_i, sub_32);
+
+                       /* 32 bit signed -> 16 bit signed conversion, all in 
lower 64 bit */
+                       r_i = _mm_packs_epi32(r_i, r_i);
+                       g_i = _mm_packs_epi32(g_i, g_i);
+                       b_i = _mm_packs_epi32(b_i, b_i);
+
+                       /* Interleave*/
+                       __m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
+                       __m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
+                       p1 = _mm_unpacklo_epi32(rg_i, bb_i);
+                       p2 = _mm_unpackhi_epi32(rg_i, bb_i);
+
+                       /* Convert sign back */
+                       p1 = _mm_xor_si128(p1, signxor);
+                       p2 = _mm_xor_si128(p2, signxor);
+
+                       /* Store processed pixel */
+                       _mm_store_si128(pixel, p1);
+                       _mm_store_si128(pixel + 1, p2);
+                       pixel += 2;
+               }
+       }
+       _MM_SET_ROUNDING_MODE(_mm_rounding);
+       return TRUE;
+}
+
+#undef SETFLOAT4
+#undef SETFLOAT4_SAME
+
+#else // if not __AVX__
+
+gboolean
+render_AVX(ThreadInfo* t)
+{
+       return FALSE;
+}
+
+#endif

Modified: trunk/plugins/dcp/dcp.c
===================================================================
--- trunk/plugins/dcp/dcp.c     2011-04-12 21:19:00 UTC (rev 3982)
+++ trunk/plugins/dcp/dcp.c     2011-04-12 22:14:10 UTC (rev 3983)
@@ -459,8 +459,18 @@
        pre_cache_tables(t->dcp);
        if (tmp->pixelsize == 4  && (rs_detect_cpu_features() & 
RS_CPU_FLAG_SSE2) && !t->dcp->read_out_curve)
        {
-               if (render_SSE2(t))
+               if ((rs_detect_cpu_features() & RS_CPU_FLAG_AVX) && 
render_AVX(t))
                {
+                       /* AVX routine renders 4 pixels in parallel, but any 
remaining must be */
+                       /* calculated using C routines */
+                       if (tmp->w & 3)
+                       {
+                               t->start_x = tmp->w - (tmp->w & 3);
+                               render(t);
+                       }
+               } 
+               else if (render_SSE2(t))
+               {
                        /* SSE2 routine renders 4 pixels in parallel, but any 
remaining must be */
                        /* calculated using C routines */
                        if (tmp->w & 3)

Modified: trunk/plugins/dcp/dcp.h
===================================================================
--- trunk/plugins/dcp/dcp.h     2011-04-12 21:19:00 UTC (rev 3982)
+++ trunk/plugins/dcp/dcp.h     2011-04-12 22:14:10 UTC (rev 3983)
@@ -127,6 +127,7 @@
 
 gboolean render_SSE2(ThreadInfo* t);
 gboolean render_SSE4(ThreadInfo* t);
+gboolean render_AVX(ThreadInfo* t);
 void calc_hsm_constants(const RSHuesatMap *map, PrecalcHSM* table); 
 
 #endif /* DCP_H */


_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit

Reply via email to