Author: post
Date: 2009-12-28 02:09:59 +0100 (Mon, 28 Dec 2009)
New Revision: 2868

Modified:
   branches/rawstudio-ng-color/plugins/dcp/dcp-sse.c
Log:
Added Disabled and Untested SSE4.1 code for future experiments.

Modified: branches/rawstudio-ng-color/plugins/dcp/dcp-sse.c
===================================================================
--- branches/rawstudio-ng-color/plugins/dcp/dcp-sse.c   2009-12-28 01:07:58 UTC 
(rev 2867)
+++ branches/rawstudio-ng-color/plugins/dcp/dcp-sse.c   2009-12-28 01:09:59 UTC 
(rev 2868)
@@ -20,7 +20,12 @@
 #include "dcp.h"
 
 #if defined (__i386__) || defined (__x86_64__)
+
+#ifdef __SSE4_1__
+#include <smmintrin.h>
+#else
 #include <emmintrin.h>
+#endif
 
 static gfloat _zero_ps[4] __attribute__ ((aligned (16))) = {0.0f, 0.0f, 0.0f, 
0.0f};
 static gfloat _ones_ps[4] __attribute__ ((aligned (16))) = {1.0f, 1.0f, 1.0f, 
1.0f};
@@ -523,7 +528,7 @@
 #undef PS
 
 static inline __m128
-               sse_matrix3_mul(float* mul, __m128 a, __m128 b, __m128 c)
+sse_matrix3_mul(float* mul, __m128 a, __m128 b, __m128 c)
 {
 
        __m128 v = _mm_load_ps(mul);
@@ -799,8 +804,689 @@
 #undef SETFLOAT4
 #undef SETFLOAT4_SAME
 
+
+#ifdef __SSE4_1__
+
+static inline void
+RGBtoHSV_SSE4(__m128 *c0, __m128 *c1, __m128 *c2)
+{
+
+       __m128 zero_ps = _mm_load_ps(_zero_ps);
+       __m128 small_ps = _mm_load_ps(_very_small_ps);
+       __m128 ones_ps = _mm_load_ps(_ones_ps);
+       
+       // Any number > 1
+       __m128 add_v = _mm_load_ps(_two_ps);
+
+       __m128 r = *c0;
+       __m128 g = *c1;
+       __m128 b = *c2;
+       
+       /* Clamp */
+       r = _mm_min_ps(_mm_max_ps(r, small_ps),ones_ps);
+       g =  _mm_min_ps(_mm_max_ps(g, small_ps),ones_ps);
+       b =  _mm_min_ps(_mm_max_ps(b, small_ps),ones_ps);
+
+       __m128 h, v;
+       v = _mm_max_ps(b,_mm_max_ps(r,g));
+
+       __m128 m = _mm_min_ps(b,_mm_min_ps(r,g));
+       __m128 gap = _mm_sub_ps(v,m);
+       __m128 v_mask = _mm_cmpeq_ps(gap, zero_ps);
+       v = _mm_add_ps(v, _mm_and_ps(add_v, v_mask));
+
+       h = _mm_xor_ps(r,r);
+
+       /* Set gap to one where sat = 0, this will avoid divisions by zero, 
these values will not be used */
+       ones_ps = _mm_and_ps(ones_ps, v_mask);
+       gap = _mm_or_ps(gap, ones_ps);
+       /*  gap_inv = 1.0 / gap */
+       __m128 gap_inv = _mm_rcp_ps(gap);
+
+       /* if r == v */
+       /* h = (g - b) / gap; */
+       __m128 mask = _mm_cmpeq_ps(r, v);
+       __m128 val = _mm_mul_ps(gap_inv, _mm_sub_ps(g, b));
+
+       /* fill h */
+       v = _mm_add_ps(v, _mm_and_ps(add_v, mask));
+       h = _mm_blendv_ps(h, val, mask);
+
+       /* if g == v */
+       /* h = 2.0f + (b - r) / gap; */
+       __m128 two_ps = _mm_load_ps(_two_ps);
+       mask = _mm_cmpeq_ps(g, v);
+       val = _mm_sub_ps(b, r);
+       val = _mm_mul_ps(val, gap_inv);
+       val = _mm_add_ps(val, two_ps);
+
+       v = _mm_add_ps(v, _mm_and_ps(add_v, mask));
+       h = _mm_blendv_ps(h, val, mask);
+
+       /* If (b == v) */
+       /* h = 4.0f + (r - g) / gap; */
+       __m128 four_ps = _mm_add_ps(two_ps, two_ps);
+       mask = _mm_cmpeq_ps(b, v);
+       val = _mm_add_ps(four_ps, _mm_mul_ps(gap_inv, _mm_sub_ps(r, g)));
+
+       v = _mm_add_ps(v, _mm_and_ps(add_v, mask));
+       h = _mm_blendv_ps(h, val, mask);
+
+       __m128 s;
+       /* Fill s, if gap > 0 */
+       v = _mm_sub_ps(v, add_v);
+       val = _mm_mul_ps(gap,_mm_rcp_ps(v));
+       s = _mm_andnot_ps(v_mask, val );
+
+       /* Check if h < 0 */
+       __m128 six_ps = _mm_load_ps(_six_ps);
+       mask = _mm_cmplt_ps(h, zero_ps);
+       h = _mm_add_ps(h, _mm_and_ps(mask, six_ps));
+
+       *c0 = h;
+       *c1 = s;
+       *c2 = v;
+}
+
+
+static inline void
+HSVtoRGB_SSE4(__m128 *c0, __m128 *c1, __m128 *c2)
+{
+       __m128 h = *c0;
+       __m128 s = *c1;
+       __m128 v = *c2;
+       __m128 r, g, b;
+       
+       /* Get the fractional part of h
+       * h_fraction = h - (float)(int)h */
+       __m128 ones_ps = _mm_load_ps(_ones_ps);
+       __m128 h_fraction = _mm_sub_ps(h, _mm_floor_ps(h));
+
+       /* p = v * (1.0f - s)  */
+       __m128 p = _mm_mul_ps(v,  _mm_sub_ps(ones_ps, s));
+       /* q = (v * (1.0f - s * f)) */
+       __m128 q = _mm_mul_ps(v, _mm_sub_ps(ones_ps, _mm_mul_ps(s, 
h_fraction)));
+       /* t = (v * (1.0f - s * (1.0f - f))) */
+       __m128 t = _mm_mul_ps(v, _mm_sub_ps(ones_ps, _mm_mul_ps(s, 
_mm_sub_ps(ones_ps, h_fraction))));
+
+       /* h < 1  (case 0)*/
+       /* case 0: *r = v; *g = t; *b = p; break; */
+       __m128 h_threshold = _mm_add_ps(ones_ps, ones_ps);
+       __m128 out_mask = _mm_cmplt_ps(h, ones_ps);
+       r = _mm_and_ps(v, out_mask);
+       g = _mm_and_ps(t, out_mask);
+       b = _mm_and_ps(p, out_mask);
+
+       /* h < 2 (case 1) */
+       /* case 1: *r = q; *g = v; *b = p; break; */
+       __m128 m = _mm_cmplt_ps(h, h_threshold);
+       h_threshold = _mm_add_ps(h_threshold, ones_ps);
+       m = _mm_andnot_ps(out_mask, m);
+       r = _mm_blendv_ps(r, q, m);
+       g = _mm_blendv_ps(g, v, m);
+       b = _mm_blendv_ps(b, p, m);
+       out_mask = _mm_or_ps(out_mask, m);
+
+       /* h < 3 (case 2)*/
+       /* case 2: *r = p; *g = v; *b = t; break; */
+       m = _mm_cmplt_ps(h, h_threshold);
+       h_threshold = _mm_add_ps(h_threshold, ones_ps);
+       m = _mm_andnot_ps(out_mask, m);
+       r = _mm_blendv_ps(r, p, m);
+       g = _mm_blendv_ps(g, v, m);
+       b = _mm_blendv_ps(b, t, m);
+       out_mask = _mm_or_ps(out_mask, m);
+
+       /* h < 4 (case 3)*/
+       /* case 3: *r = p; *g = q; *b = v; break; */
+       m = _mm_cmplt_ps(h, h_threshold);
+       h_threshold = _mm_add_ps(h_threshold, ones_ps);
+       m = _mm_andnot_ps(out_mask, m);
+       r = _mm_blendv_ps(r, p, m);
+       g = _mm_blendv_ps(g, q, m);
+       b = _mm_blendv_ps(b, v, m);
+       out_mask = _mm_or_ps(out_mask, m);
+
+       /* h < 5 (case 4)*/
+       /* case 4: *r = t; *g = p; *b = v; break; */
+       m = _mm_cmplt_ps(h, h_threshold);
+       m = _mm_andnot_ps(out_mask, m);
+       r = _mm_or_ps(r, _mm_and_ps(t, m));
+       g = _mm_or_ps(g, _mm_and_ps(p, m));
+       b = _mm_or_ps(b, _mm_and_ps(v, m));
+       out_mask = _mm_or_ps(out_mask, m);
+
+
+       /* Remainder (case 5) */
+       /* case 5: *r = v; *g = p; *b = q; break; */
+       __m128 all_ones = _mm_cmpeq_ps(h,h);
+       m = _mm_xor_ps(out_mask, all_ones);
+       r = _mm_blendv_ps(r, v, m);
+       g = _mm_blendv_ps(g, p, m);
+       b = _mm_blendv_ps(b, q, m);
+       
+       *c0 = r;
+       *c1 = g;
+       *c2 = b;
+}
+
+#define DW(A) _mm_castps_si128(A)
+#define PS(A) _mm_castsi128_ps(A)
+
+static void
+huesat_map_SSE4(RSHuesatMap *map, const PrecalcHSM* precalc, __m128 *_h, 
__m128 *_s, __m128 *_v)
+{
+       g_assert(RS_IS_HUESAT_MAP(map));
+
+       __m128 h = *_h;
+       __m128 s = *_s;
+       __m128 v = *_v;
+       gint i;
+
+       const RS_VECTOR3 *tableBase = map->deltas;
+
+       __m128 hueShift;
+       __m128 satScale;
+       __m128 valScale;
+
+       if (map->val_divisions < 2)
+       {
+               __m128 hScaled = _mm_mul_ps(h, _mm_load_ps(precalc->hScale));
+               __m128 sScaled = _mm_mul_ps(s,  _mm_load_ps(precalc->sScale));
+
+               __m128i maxHueIndex0 = 
_mm_load_si128((__m128i*)precalc->maxHueIndex0);
+               __m128i maxSatIndex0 = 
_mm_load_si128((__m128i*)precalc->maxSatIndex0);
+               __m128i hIndex0 = _mm_cvttps_epi32( hScaled );
+               __m128i sIndex0 = _mm_cvttps_epi32( sScaled );
+
+               sIndex0 = _mm_min_epi16(sIndex0, maxSatIndex0);
+               __m128i ones_epi32 = _mm_load_si128((__m128i*)_ones_epi32);
+               __m128i hIndex1 = _mm_add_epi32(hIndex0, ones_epi32);
+
+               /* if (hIndex0 >= maxHueIndex0) */
+               __m128i hIndexMask = _mm_cmpgt_epi32( hIndex0, 
_mm_sub_epi32(maxHueIndex0, ones_epi32));
+               hIndex0 = _mm_andnot_si128(hIndexMask, hIndex0);
+               /* hIndex1 = 0; */
+               hIndex1 = _mm_andnot_si128(hIndexMask, hIndex1);
+               /* hIndex0 = maxHueIndex0 */
+               hIndex0 = _mm_or_si128(hIndex0, _mm_and_si128(hIndexMask, 
maxHueIndex0));
+
+               __m128 hFract1 = _mm_sub_ps( hScaled, _mm_cvtepi32_ps(hIndex0));
+               __m128 sFract1 = _mm_sub_ps( sScaled, _mm_cvtepi32_ps(sIndex0));
+               __m128 ones_ps = _mm_load_ps(_ones_ps);
+
+               __m128 hFract0 = _mm_sub_ps(ones_ps, hFract1);
+               __m128 sFract0 = _mm_sub_ps(ones_ps, sFract1);
+               __m128i hueStep = _mm_load_si128((__m128i*)precalc->hueStep);
+               __m128i table_offsets = _mm_add_epi32(sIndex0, 
_mm_mullo_epi16(hIndex0, hueStep));
+               __m128i next_offsets = _mm_add_epi32(sIndex0, 
_mm_mullo_epi16(hIndex1, hueStep));
+
+               const RS_VECTOR3 *entry00[4] = { tableBase + 
_mm_extract_epi32(table_offsets,0), tableBase + 
_mm_extract_epi32(table_offsets,1),
+                       tableBase + _mm_extract_epi32(table_offsets,2), 
tableBase + _mm_extract_epi32(table_offsets,3) };
+
+               const RS_VECTOR3 *entry01[4] = { tableBase + 
_mm_extract_epi32(next_offsets,0), tableBase + 
_mm_extract_epi32(next_offsets,1),
+                       tableBase + _mm_extract_epi32(next_offsets,2), 
tableBase + _mm_extract_epi32(next_offsets,3)};
+
+#define LOOK_SINGLE(A,B,C,D) A = _mm_insert_epi32( A, *(gint32*)&C[D]->B, D)
+       
+/*
+//#define LOOKUP_FOUR(A, B, C) A = _mm_cvtsi32_si128(*(gint32*)&C[0]->B);\
+               LOOK_SINGLE(A, B, C, 1);\
+               LOOK_SINGLE(A, B, C, 2);\
+               LOOK_SINGLE(A, B, C, 3);
+*/
+
+#define LOOKUP_FOUR(A, B, C) LOOK_SINGLE(A, B, C, 0);\
+                       LOOK_SINGLE(A, B, C, 1);\
+                       LOOK_SINGLE(A, B, C, 2);\
+                       LOOK_SINGLE(A, B, C, 3);
+
+               /* Initialize to something (will be overwritten) */
+               __m128i h00 = next_offsets;
+               __m128i h01 = next_offsets;
+               
+               LOOKUP_FOUR(h00, fHueShift, entry00);
+               LOOKUP_FOUR(h01, fHueShift, entry01);
+               __m128 hueShift0 = _mm_add_ps(_mm_mul_ps(PS(h00), hFract0), 
_mm_mul_ps(PS(h01), hFract1));
+               hueShift0 = _mm_mul_ps(hueShift0, sFract0);
+
+               __m128i s00 = h00;
+               __m128i s01 = h00;
+               LOOKUP_FOUR(s00, fSatScale, entry00);
+               LOOKUP_FOUR(s01, fSatScale, entry01);
+               __m128 satScale0 = _mm_add_ps(_mm_mul_ps(PS(s00), hFract0), 
_mm_mul_ps(PS(s01), hFract1));
+               satScale0 = _mm_mul_ps(satScale0, sFract0);
+
+               __m128i v00 = h00;
+               __m128i v01 = h00;
+               LOOKUP_FOUR(v00, fValScale, entry00);
+               LOOKUP_FOUR(v01, fValScale, entry01);
+               __m128 valScale0 = _mm_add_ps(_mm_mul_ps(PS(v00), hFract0), 
_mm_mul_ps(PS(v01), hFract1));
+               valScale0 = _mm_mul_ps(valScale0, sFract0);
+
+               for (i = 0; i < 4; i++) {
+                       entry00[i]++;
+                       entry01[i]++;
+               }
+
+               h00 = v00;
+               h01 = v00;
+
+               LOOKUP_FOUR(h00, fHueShift, entry00);
+               LOOKUP_FOUR(h01, fHueShift, entry01);
+               __m128 hueShift1 = _mm_add_ps(_mm_mul_ps(PS(h00), hFract0), 
_mm_mul_ps(PS(h01), hFract1));
+               hueShift = _mm_add_ps(hueShift0, _mm_mul_ps(hueShift1, 
sFract1));
+
+               s00 = v00;
+               s01 = v00;
+               LOOKUP_FOUR(s00, fSatScale, entry00);
+               LOOKUP_FOUR(s01, fSatScale, entry01);
+               __m128 satScale1 = _mm_add_ps(_mm_mul_ps(PS(s00), hFract0), 
_mm_mul_ps(PS(s01), hFract1));
+               satScale = _mm_add_ps(satScale0, _mm_mul_ps(satScale1, 
sFract1));
+
+               v00 = s00;
+               v01 = s00;
+               LOOKUP_FOUR(v00, fValScale, entry00);
+               LOOKUP_FOUR(v01, fValScale, entry01);
+               __m128 valScale1 = _mm_add_ps(_mm_mul_ps(PS(v00), hFract0), 
_mm_mul_ps(PS(v01), hFract1));
+               valScale = _mm_add_ps(valScale0, _mm_mul_ps(valScale1, 
sFract1));
+
+       }
+       else
+       {
+               __m128 hScaled = _mm_mul_ps(h, _mm_load_ps(precalc->hScale));
+               __m128 sScaled = _mm_mul_ps(s,  _mm_load_ps(precalc->sScale));
+               __m128 vScaled = _mm_mul_ps(v,  _mm_load_ps(precalc->vScale));
+
+               __m128i maxHueIndex0 = 
_mm_load_si128((__m128i*)precalc->maxHueIndex0);
+               __m128i maxSatIndex0 = 
_mm_load_si128((__m128i*)precalc->maxSatIndex0);
+               __m128i maxValIndex0 = 
_mm_load_si128((__m128i*)precalc->maxValIndex0);
+               
+               __m128i hIndex0 = _mm_cvttps_epi32(hScaled);
+               __m128i sIndex0 = _mm_cvttps_epi32(sScaled);
+               __m128i vIndex0 = _mm_cvttps_epi32(vScaled);
+
+               // Requires that maxSatIndex0 and sIndex0 can be contained 
within a 16 bit signed word.
+               sIndex0 = _mm_min_epi16(sIndex0, maxSatIndex0);
+               vIndex0 = _mm_min_epi16(vIndex0, maxValIndex0);
+               __m128i ones_epi32 = _mm_load_si128((__m128i*)_ones_epi32);
+               __m128i hIndex1 = _mm_add_epi32(hIndex0, ones_epi32);
+
+               /* if (hIndex0 > (maxHueIndex0 - 1)) */
+               __m128i hIndexMask = _mm_cmpgt_epi32( hIndex0, 
_mm_sub_epi32(maxHueIndex0, ones_epi32));
+               /* Make room in hIndex0 */
+               hIndex0 = _mm_andnot_si128(hIndexMask, hIndex0);
+               /* hIndex1 = 0; */
+               hIndex1 = _mm_andnot_si128(hIndexMask, hIndex1);
+               /* hIndex0 = maxHueIndex0, where hIndex0 >= (maxHueIndex0) */
+               hIndex0 = _mm_or_si128(hIndex0, _mm_and_si128(hIndexMask, 
maxHueIndex0));
+
+               __m128 hFract1 = _mm_sub_ps( hScaled, _mm_cvtepi32_ps(hIndex0));
+               __m128 sFract1 = _mm_sub_ps( sScaled, _mm_cvtepi32_ps(sIndex0));
+               __m128 vFract1 = _mm_sub_ps( vScaled, _mm_cvtepi32_ps(vIndex0));
+               __m128 ones_ps = _mm_load_ps(_ones_ps);
+
+               __m128 hFract0 = _mm_sub_ps(ones_ps, hFract1);
+               __m128 sFract0 = _mm_sub_ps(ones_ps, sFract1);
+               __m128 vFract0 = _mm_sub_ps(ones_ps, vFract1);
+
+               __m128i hueStep = _mm_load_si128((__m128i*)precalc->hueStep);
+               __m128i valStep = _mm_load_si128((__m128i*)precalc->valStep);
+
+               // This requires that hueStep and valStep can be contained in a 
16 bit signed integer.
+               __m128i table_offsets = _mm_add_epi32(sIndex0, 
_mm_mullo_epi16(vIndex0, valStep));
+               __m128i next_offsets = _mm_mullo_epi16(hIndex1, hueStep);
+               next_offsets = _mm_add_epi32(next_offsets, table_offsets);
+               table_offsets = _mm_add_epi32(table_offsets, 
_mm_mullo_epi16(hIndex0, hueStep));
+
+               gint _valStep = precalc->valStep[0];
+
+               const RS_VECTOR3 *entry00[4] = { tableBase + 
_mm_extract_epi32(table_offsets,0), tableBase + 
_mm_extract_epi32(table_offsets,1),
+                       tableBase + _mm_extract_epi32(table_offsets,2), 
tableBase + _mm_extract_epi32(table_offsets,3) };
+                       
+               const RS_VECTOR3 *entry10[4] = { entry00[0] + _valStep, 
entry00[1] + _valStep, entry00[2] + _valStep, entry00[3] + _valStep};
+
+               const RS_VECTOR3 *entry01[4] = { tableBase + 
_mm_extract_epi32(next_offsets,0), tableBase + 
_mm_extract_epi32(next_offsets,1),
+                       tableBase + _mm_extract_epi32(next_offsets,2), 
tableBase + _mm_extract_epi32(next_offsets,3)};
+
+               const RS_VECTOR3 *entry11[4] = { entry01[0] + _valStep, 
entry01[1] + _valStep, entry01[2] + _valStep, entry01[3] + _valStep};
+               
+               /* Initialize to something (will be overwritten) */
+               __m128i temp_00 = next_offsets;
+               __m128i temp_01 = next_offsets;
+               __m128i temp_10 = next_offsets;
+               __m128i temp_11 = next_offsets;
+
+               LOOKUP_FOUR(temp_00, fHueShift, entry00);
+               LOOKUP_FOUR(temp_01, fHueShift, entry01);
+               LOOKUP_FOUR(temp_10, fHueShift, entry10);
+               LOOKUP_FOUR(temp_11, fHueShift, entry11);
+
+               __m128 hueShift0 = _mm_mul_ps(vFract0, 
_mm_add_ps(_mm_mul_ps(PS(temp_00), hFract0), _mm_mul_ps(PS(temp_01), hFract1)));
+               __m128 hueShift1 = _mm_mul_ps(vFract1, 
_mm_add_ps(_mm_mul_ps(PS(temp_10), hFract0), _mm_mul_ps(PS(temp_11), hFract1)));
+               hueShift = _mm_mul_ps(sFract0, _mm_add_ps(hueShift0, 
hueShift1));
+
+               LOOKUP_FOUR(temp_00, fSatScale, entry00);
+               LOOKUP_FOUR(temp_01, fSatScale, entry01);
+               LOOKUP_FOUR(temp_10, fSatScale, entry10);
+               LOOKUP_FOUR(temp_11, fSatScale, entry11);
+               __m128 satScale0 = _mm_mul_ps(vFract0, 
_mm_add_ps(_mm_mul_ps(PS(temp_00), hFract0), _mm_mul_ps(PS(temp_01), hFract1)));
+               __m128 satScale1 = _mm_mul_ps(vFract1, 
_mm_add_ps(_mm_mul_ps(PS(temp_10), hFract0), _mm_mul_ps(PS(temp_11), hFract1)));
+               satScale = _mm_mul_ps(sFract0, _mm_add_ps(satScale0, 
satScale1));
+
+               LOOKUP_FOUR(temp_00, fValScale, entry00);
+               LOOKUP_FOUR(temp_01, fValScale, entry01);
+               LOOKUP_FOUR(temp_10, fValScale, entry10);
+               LOOKUP_FOUR(temp_11, fValScale, entry11);
+               __m128 valScale0 = _mm_mul_ps(vFract0, 
_mm_add_ps(_mm_mul_ps(PS(temp_00), hFract0), _mm_mul_ps(PS(temp_01), hFract1)));
+               __m128 valScale1 = _mm_mul_ps(vFract1, 
_mm_add_ps(_mm_mul_ps(PS(temp_10), hFract0), _mm_mul_ps(PS(temp_11), hFract1)));
+               valScale = _mm_mul_ps(sFract0, _mm_add_ps(valScale0, 
valScale1));
+
+               for (i = 0; i < 4; i++) {
+                       entry00[i]++;
+                       entry01[i]++;
+                       entry10[i]++;
+                       entry11[i]++;
+               }
+
+               LOOKUP_FOUR(temp_00, fHueShift, entry00);
+               LOOKUP_FOUR(temp_01, fHueShift, entry01);
+               LOOKUP_FOUR(temp_10, fHueShift, entry10);
+               LOOKUP_FOUR(temp_11, fHueShift, entry11);
+               hueShift0 = _mm_mul_ps(vFract0, 
_mm_add_ps(_mm_mul_ps(PS(temp_00), hFract0), _mm_mul_ps(PS(temp_01), hFract1)));
+               hueShift1 = _mm_mul_ps(vFract1, 
_mm_add_ps(_mm_mul_ps(PS(temp_10), hFract0), _mm_mul_ps(PS(temp_11), hFract1)));
+               hueShift = _mm_add_ps(hueShift, _mm_mul_ps(sFract1, 
_mm_add_ps(hueShift0, hueShift1)));
+
+               LOOKUP_FOUR(temp_00, fSatScale, entry00);
+               LOOKUP_FOUR(temp_01, fSatScale, entry01);
+               LOOKUP_FOUR(temp_10, fSatScale, entry10);
+               LOOKUP_FOUR(temp_11, fSatScale, entry11);
+               satScale0 = _mm_mul_ps(vFract0, 
_mm_add_ps(_mm_mul_ps(PS(temp_00), hFract0), _mm_mul_ps(PS(temp_01), hFract1)));
+               satScale1 = _mm_mul_ps(vFract1, 
_mm_add_ps(_mm_mul_ps(PS(temp_10), hFract0), _mm_mul_ps(PS(temp_11), hFract1)));
+               satScale = _mm_add_ps(satScale, _mm_mul_ps(sFract1, 
_mm_add_ps(satScale0, satScale1)));
+
+               LOOKUP_FOUR(temp_00, fValScale, entry00);
+               LOOKUP_FOUR(temp_01, fValScale, entry01);
+               LOOKUP_FOUR(temp_10, fValScale, entry10);
+               LOOKUP_FOUR(temp_11, fValScale, entry11);
+               valScale0 = _mm_mul_ps(vFract0, 
_mm_add_ps(_mm_mul_ps(PS(temp_00), hFract0), _mm_mul_ps(PS(temp_01), hFract1)));
+               valScale1 = _mm_mul_ps(vFract1, 
_mm_add_ps(_mm_mul_ps(PS(temp_10), hFract0), _mm_mul_ps(PS(temp_11), hFract1)));
+               valScale = _mm_add_ps(valScale, _mm_mul_ps(sFract1, 
_mm_add_ps(valScale0, valScale1)));
+       }
+
+       __m128 mul_hue = _mm_load_ps(_mul_hue_ps);
+       __m128 ones_ps = _mm_load_ps(_ones_ps);
+       hueShift = _mm_mul_ps(hueShift, mul_hue);
+       s = _mm_min_ps(ones_ps, _mm_mul_ps(s, satScale));
+       v = _mm_min_ps(ones_ps, _mm_mul_ps(v, valScale));
+       h = _mm_add_ps(h, hueShift);
+       *_h = h;
+       *_s = s;
+       *_v = v;
+}
+
+
+
+#define SETFLOAT4(N, A, B, C, D) float N[4] __attribute__ ((aligned (16))); \
+N[0] = D; N[1] = C; N[2] = B; N[3] = A;
+
+#define SETFLOAT4_SAME(N, A) float N[4] __attribute__ ((aligned (16))); \
+N[0] = A; N[1] = A; N[2] = A; N[3] = A;
+
+
+
+gboolean
+render_SSE4(ThreadInfo* t)
+{
+       RS_IMAGE16 *image = t->tmp;
+       RSDcp *dcp = t->dcp;
+       gint x, y;
+       __m128 h, s, v;
+       __m128i p1,p2;
+       __m128 p1f, p2f, p3f, p4f;
+       __m128 r, g, b, r2, g2, b2;
+       __m128i zero = _mm_load_si128((__m128i*)_15_bit_epi32);
+
+       __m128 hue_add = _mm_set_ps(dcp->hue, dcp->hue, dcp->hue, dcp->hue);
+       __m128 sat = _mm_set_ps(dcp->saturation, dcp->saturation, 
dcp->saturation, dcp->saturation);
+
+       SETFLOAT4(_min_cam, 0.0f, dcp->camera_white.z, dcp->camera_white.y, 
dcp->camera_white.x);
+       SETFLOAT4_SAME(_black_minus_radius, dcp->exposure_black - 
dcp->exposure_radius);
+       SETFLOAT4_SAME(_black_plus_radius, dcp->exposure_black + 
dcp->exposure_radius);
+       SETFLOAT4_SAME(_exposure_black, dcp->exposure_black);
+       SETFLOAT4_SAME(_exposure_slope, dcp->exposure_slope);
+       SETFLOAT4_SAME(_exposure_qscale, dcp->exposure_qscale);
+       SETFLOAT4_SAME(_contrast, dcp->contrast);
+       
+       float cam_prof[4*4*3] __attribute__ ((aligned (16)));
+       for (x = 0; x < 4; x++ ) {
+               cam_prof[x] = dcp->camera_to_prophoto.coeff[0][0];
+               cam_prof[x+4] = dcp->camera_to_prophoto.coeff[0][1];
+               cam_prof[x+8] = dcp->camera_to_prophoto.coeff[0][2];
+               cam_prof[12+x] = dcp->camera_to_prophoto.coeff[1][0];
+               cam_prof[12+x+4] = dcp->camera_to_prophoto.coeff[1][1];
+               cam_prof[12+x+8] = dcp->camera_to_prophoto.coeff[1][2];
+               cam_prof[24+x] = dcp->camera_to_prophoto.coeff[2][0];
+               cam_prof[24+x+4] = dcp->camera_to_prophoto.coeff[2][1];
+               cam_prof[24+x+8] = dcp->camera_to_prophoto.coeff[2][2];
+       }
+
+
+       gint end_x = image->w - (image->w & 3);
+
+       for(y = t->start_y ; y < t->end_y; y++)
+       {
+               for(x=0; x < end_x; x+=4)
+               {
+                       __m128i* pixel = (__m128i*)GET_PIXEL(image, x, y);
+
+                       zero = _mm_xor_si128(zero,zero);
+
+                       /* Convert to float */
+                       p1 = _mm_load_si128(pixel);
+                       p2 = _mm_load_si128(pixel + 1);
+
+                       /* Unpack to R G B x */
+                       p2f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(p1, zero));
+                       p4f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(p2, zero));
+                       p1f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(p1, zero));
+                       p3f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(p2, zero));
+
+                       /* Normalize to 0 to 1 range */
+                       __m128 rgb_div = _mm_load_ps(_rgb_div_ps);
+                       p1f = _mm_mul_ps(p1f, rgb_div);
+                       p2f = _mm_mul_ps(p2f, rgb_div);
+                       p3f = _mm_mul_ps(p3f, rgb_div);
+                       p4f = _mm_mul_ps(p4f, rgb_div);
+
+                       /* Restrict to camera white */
+                       __m128 min_cam = _mm_load_ps(_min_cam);
+                       p1f = _mm_min_ps(p1f, min_cam);
+                       p2f = _mm_min_ps(p2f, min_cam);
+                       p3f = _mm_min_ps(p3f, min_cam);
+                       p4f = _mm_min_ps(p4f, min_cam);
+
+                       /* Convert to planar */
+                       __m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f);
+                       __m128 b1b0 = _mm_unpackhi_ps(p1f, p2f);
+                       __m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f);
+                       __m128 b3b2 = _mm_unpackhi_ps(p3f, p4f);
+                       r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2);
+                       g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0);
+                       b = _mm_movelh_ps(b1b0, b3b2);
+
+                       /* Convert to Prophoto */
+                       r2 = sse_matrix3_mul(cam_prof, r, g, b);
+                       g2 = sse_matrix3_mul(&cam_prof[12], r, g, b);
+                       b2 = sse_matrix3_mul(&cam_prof[24], r, g, b);
+
+                       RGBtoHSV_SSE4(&r, &g, &b);
+                       h = r; s = g; v = b;
+
+                       if (dcp->huesatmap)
+                       {
+                               huesat_map_SSE4(dcp->huesatmap, 
&dcp->huesatmap_precalc, &h, &s, &v);
+                       }
+
+                       /* Saturation */
+                       __m128 max_val = _mm_load_ps(_ones_ps);
+                       __m128 min_val = _mm_load_ps(_very_small_ps);
+                       s = _mm_max_ps(min_val, _mm_min_ps(max_val, 
_mm_mul_ps(s, sat)));
+
+                       /* Hue */
+                       __m128 six_ps = _mm_load_ps(_six_ps);
+                       __m128 zero_ps = _mm_load_ps(_zero_ps);
+                       h = _mm_add_ps(h, hue_add);
+
+                       /* Check if hue > 6 or < 0 */
+                       __m128 h_mask_gt = _mm_cmpgt_ps(h, six_ps);
+                       __m128 h_mask_lt = _mm_cmplt_ps(h, zero_ps);
+                       __m128 six_masked_gt = _mm_and_ps(six_ps, h_mask_gt);
+                       __m128 six_masked_lt = _mm_and_ps(six_ps, h_mask_lt);
+                       h = _mm_sub_ps(h, six_masked_gt);
+                       h = _mm_add_ps(h, six_masked_lt);
+
+                       HSVtoRGB_SSE4(&h, &s, &v);
+                       r = h; g = s; b = v;
+                       
+                       /* Exposure */
+                       __m128 black_minus_radius = 
_mm_load_ps(_black_minus_radius);
+                       __m128 y_r = _mm_sub_ps(r, black_minus_radius);
+                       __m128 y_g = _mm_sub_ps(g, black_minus_radius);
+                       __m128 y_b = _mm_sub_ps(b, black_minus_radius);
+
+                       __m128 exposure_qscale = _mm_load_ps(_exposure_qscale);
+                       y_r = _mm_mul_ps(exposure_qscale,_mm_mul_ps(y_r, y_r));
+                       y_g = _mm_mul_ps(exposure_qscale,_mm_mul_ps(y_g, y_g));
+                       y_b = _mm_mul_ps(exposure_qscale,_mm_mul_ps(y_b, y_b));
+
+                       __m128 exposure_slope = _mm_load_ps(_exposure_slope);
+                       __m128 exposure_black = _mm_load_ps(_exposure_black);
+                       __m128 y2_r = _mm_mul_ps(exposure_slope, _mm_sub_ps(r, 
exposure_black));
+                       __m128 y2_g = _mm_mul_ps(exposure_slope, _mm_sub_ps(g, 
exposure_black));
+                       __m128 y2_b = _mm_mul_ps(exposure_slope, _mm_sub_ps(b, 
exposure_black));
+
+                       __m128 black_plus_radius = 
_mm_load_ps(_black_plus_radius);
+                       __m128 r_mask = _mm_cmpgt_ps(r, black_plus_radius);
+                       __m128 g_mask = _mm_cmpgt_ps(g, black_plus_radius);
+                       __m128 b_mask = _mm_cmpgt_ps(b, black_plus_radius);
+                       y_r = _mm_andnot_ps(r_mask, y_r);
+                       y_g = _mm_andnot_ps(g_mask, y_g);
+                       y_b = _mm_andnot_ps(b_mask, y_b);
+                       y_r = _mm_or_ps(y_r, _mm_and_ps(r_mask, y2_r));
+                       y_g = _mm_or_ps(y_g, _mm_and_ps(g_mask, y2_g));
+                       y_b = _mm_or_ps(y_b, _mm_and_ps(b_mask, y2_b));
+                       black_minus_radius = _mm_load_ps(_black_minus_radius);
+                       r_mask = _mm_cmple_ps(r, black_minus_radius);
+                       g_mask = _mm_cmple_ps(g, black_minus_radius);
+                       b_mask = _mm_cmple_ps(b, black_minus_radius);
+                       r = _mm_andnot_ps(r_mask, y_r);
+                       g = _mm_andnot_ps(g_mask, y_g);
+                       b = _mm_andnot_ps(b_mask, y_b);
+
+                       /* Contrast in gamma 2.0 */
+                       __m128 half_ps = _mm_load_ps(_half_ps);
+                       __m128 contrast = _mm_load_ps(_contrast);
+                       min_val = _mm_load_ps(_very_small_ps);
+                       r = _mm_add_ps(_mm_mul_ps(contrast, 
_mm_sub_ps(_mm_sqrt_ps(r), half_ps)), half_ps);
+                       g = _mm_add_ps(_mm_mul_ps(contrast, 
_mm_sub_ps(_mm_sqrt_ps(g), half_ps)), half_ps);
+                       b = _mm_add_ps(_mm_mul_ps(contrast, 
_mm_sub_ps(_mm_sqrt_ps(b), half_ps)), half_ps);
+                       r = _mm_max_ps(r, min_val);
+                       g = _mm_max_ps(g, min_val);
+                       b = _mm_max_ps(b, min_val);
+                       r = _mm_mul_ps(r,r);
+                       g = _mm_mul_ps(g,g);
+                       b = _mm_mul_ps(b,b);
+
+                       /* Convert to HSV */
+                       RGBtoHSV_SSE4(&r, &g, &b);
+                       h = r; s = g; v = b;
+
+                       /* Convert v to lookup values */
+                       /* TODO: Use 8 bit fraction as interpolation, for 
interpolating
+                       * a more precise lookup using linear interpolation. 
Maybe use less than
+                       * 16 bits for lookup for speed, 10 bits with 
interpolation should be enough */
+                       __m128 v_mul = _mm_load_ps(_16_bit_ps);
+                       v = _mm_mul_ps(v, v_mul);
+                       __m128i lookup = _mm_cvtps_epi32(v);
+
+                       __m128i v_curved = lookup;
+                       v_curved = _mm_insert_epi32(v_curved, 
((gint32*)dcp->curve_samples)[_mm_extract_epi32(lookup,0)], 0);
+                       v_curved = _mm_insert_epi32(v_curved, 
((gint32*)dcp->curve_samples)[_mm_extract_epi32(lookup,1)], 1);
+                       v_curved = _mm_insert_epi32(v_curved, 
((gint32*)dcp->curve_samples)[_mm_extract_epi32(lookup,2)], 2);
+                       v_curved = _mm_insert_epi32(v_curved, 
((gint32*)dcp->curve_samples)[_mm_extract_epi32(lookup,3)], 3);
+
+                       v = PS(v_curved);
+
+                       /* Apply looktable */
+                       if (dcp->looktable) {
+                               huesat_map_SSE4(dcp->looktable, 
&dcp->looktable_precalc, &h, &s, &v);
+                       }
+                       
+                       /* Ensure that hue is within range */   
+                       h_mask_gt = _mm_cmpgt_ps(h, six_ps);
+                       h_mask_lt = _mm_cmplt_ps(h, zero_ps);
+                       six_masked_gt = _mm_and_ps(six_ps, h_mask_gt);
+                       six_masked_lt = _mm_and_ps(six_ps, h_mask_lt);
+                       h = _mm_sub_ps(h, six_masked_gt);
+                       h = _mm_add_ps(h, six_masked_lt);
+                       
+                       /* s always slightly > 0 when converting to RGB */
+                       s = _mm_max_ps(s, min_val);
+
+                       HSVtoRGB_SSE4(&h, &s, &v);
+                       r = h; g = s; b = v;
+
+                       /* Apply Tone Curve  in RGB space*/
+                       if (dcp->tone_curve_lut) 
+                       {
+                               rgb_tone_sse2( &r, &g, &b, dcp->tone_curve_lut);
+                       }
+
+                       /* Convert to 16 bit */
+                       __m128 rgb_mul = _mm_load_ps(_16_bit_ps);
+                       r = _mm_mul_ps(r, rgb_mul);
+                       g = _mm_mul_ps(g, rgb_mul);
+                       b = _mm_mul_ps(b, rgb_mul);
+                       
+                       __m128i r_i = _mm_cvtps_epi32(r);
+                       __m128i g_i = _mm_cvtps_epi32(g);
+                       __m128i b_i = _mm_cvtps_epi32(b);
+
+                       /* Convert to 16 bit unsigned values */
+                       r_i = _mm_packus_epi32(r_i, r_i);
+                       b_i = _mm_packus_epi32(b_i, b_i);
+                       g_i = _mm_packus_epi32(g_i, g_i);
+
+                       /* Interleave*/
+                       __m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
+                       __m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
+                       p1 = _mm_unpacklo_epi32(rg_i, bb_i);
+                       p2 = _mm_unpackhi_epi32(rg_i, bb_i);
+
+                       /* Store processed pixel */
+                       _mm_store_si128(pixel, p1);
+                       _mm_store_si128(pixel + 1, p2);
+               }
+       }
+       return TRUE;
+}
+#undef DW
+#undef PS
+#undef SETFLOAT4
+#undef SETFLOAT4_SAME
+
+#else  // not defined __SSE4_1__
+
+gboolean render_SSE4(ThreadInfo* t)
+{
+       return FALSE;
+}
+
+#endif // defined __SSE4_1__
+
 #else // if not x86 or x86-64
-static void render_SSE2(ThreadInfo* t)
+gboolean render_SSE2(ThreadInfo* t)
 {
        return FALSE;
 }
@@ -808,4 +1494,8 @@
 {
        return;
 }
+gboolean render_SSE4(ThreadInfo* t)
+{
+       return FALSE;
+}
 #endif


_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit

Reply via email to