Author: post
Date: 2009-10-12 20:23:24 +0200 (Mon, 12 Oct 2009)
New Revision: 2703

Modified:
   trunk/plugins/dcp/dcp.c
Log:
DCP: Load constants from static variable to avoid generation of load+shuffle 
from intrinsics. Approx 10% faster overall.

Modified: trunk/plugins/dcp/dcp.c
===================================================================
--- trunk/plugins/dcp/dcp.c     2009-10-12 17:17:39 UTC (rev 2702)
+++ trunk/plugins/dcp/dcp.c     2009-10-12 18:23:24 UTC (rev 2703)
@@ -455,14 +455,19 @@
 
 #if defined (__SSE2__)
 
+static gfloat _zero_ps[4] __attribute__ ((aligned (16))) = {0.0f, 0.0f, 0.0f, 
0.0f};
+static gfloat _ones_ps[4] __attribute__ ((aligned (16))) = {1.0f, 1.0f, 1.0f, 
1.0f};
+static gfloat _two_ps[4] __attribute__ ((aligned (16))) = {2.0f, 2.0f, 2.0f, 
2.0f};
+static gfloat _six_ps[4] __attribute__ ((aligned (16))) = {6.0f-1e-15, 
6.0f-1e-15, 6.0f-1e-15, 6.0f-1e-15};
+
 inline void
 RGBtoHSV_SSE(__m128 *c0, __m128 *c1, __m128 *c2)
 {
 
-       __m128 zero_ps = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
-       __m128 ones_ps = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
+       __m128 zero_ps = _mm_load_ps(_zero_ps);
+       __m128 ones_ps = _mm_load_ps(_ones_ps);
        // Any number > 1
-       __m128 add_v = _mm_set_ps(10.0f, 10.0f, 10.0f, 10.0f);
+       __m128 add_v = _mm_load_ps(_two_ps);
 
        __m128 r = *c0;
        __m128 g = *c1;
@@ -495,7 +500,7 @@
 
        /* if g == v */
        /* h = 2.0f + (b - r) / gap; */
-       __m128 two_ps = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
+       __m128 two_ps = _mm_load_ps(_two_ps);
        mask = _mm_cmpeq_ps(g, v);
        val = _mm_sub_ps(b, r);
        val = _mm_mul_ps(val, gap_inv);
@@ -520,7 +525,7 @@
        s = _mm_andnot_ps(v_mask, val );
 
        /* Check if h < 0 */
-       __m128 six_ps = _mm_set_ps(6.0f, 6.0f, 6.0f, 6.0f);
+       __m128 six_ps = _mm_load_ps(_six_ps);
        mask = _mm_cmplt_ps(h, zero_ps);
        h = _mm_add_ps(h, _mm_and_ps(mask, six_ps));
 
@@ -736,6 +741,9 @@
 
 /* SSE2 implementation, matches the reference implementation pretty closely */
 
+static gfloat _mul_hue_ps[4] __attribute__ ((aligned (16))) = {6.0f / 360.0f, 
6.0f / 360.0f, 6.0f / 360.0f, 6.0f / 360.0f};
+static gint _ones_epi32[4] __attribute__ ((aligned (16))) = {1,1,1,1};
+
 static void
 huesat_map_SSE2(RSHuesatMap *map, __m128 *_h, __m128 *_s, __m128 *_v)
 {
@@ -777,7 +785,7 @@
                __m128i sIndex0 = _mm_cvttps_epi32( sScaled );
 
                sIndex0 = _mm_min_epi16(sIndex0, maxSatIndex0);
-               __m128i ones_epi32 = _mm_set_epi32(1,1,1,1);
+               __m128i ones_epi32 = _mm_load_si128((__m128i*)_ones_epi32);
                __m128i hIndex1 = _mm_add_epi32(hIndex0, ones_epi32);
 
                /* if (hIndex0 >= maxHueIndex0) */
@@ -790,7 +798,7 @@
 
                __m128 hFract1 = _mm_sub_ps( hScaled, _mm_cvtepi32_ps(hIndex0));
                __m128 sFract1 = _mm_sub_ps( sScaled, _mm_cvtepi32_ps(sIndex0));
-               __m128 ones_ps = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
+               __m128 ones_ps = _mm_load_ps(_ones_ps);
 
                __m128 hFract0 = _mm_sub_ps(ones_ps, hFract1);
                __m128 sFract0 = _mm_sub_ps(ones_ps, sFract1);
@@ -872,7 +880,7 @@
                __m128 hFract1 = _mm_sub_ps( hScaled, _mm_cvtepi32_ps(hIndex0));
                __m128 sFract1 = _mm_sub_ps( sScaled, _mm_cvtepi32_ps(sIndex0));
                __m128 vFract1 = _mm_sub_ps( vScaled, _mm_cvtepi32_ps(vIndex0));
-               __m128 ones_ps = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
+               __m128 ones_ps = _mm_load_ps(_ones_ps);
 
                __m128 hFract0 = _mm_sub_ps(ones_ps, hFract1);
                __m128 sFract0 = _mm_sub_ps(ones_ps, sFract1);
@@ -952,8 +960,8 @@
                valScale = _mm_add_ps(valScale, _mm_mul_ps(sFract1, 
_mm_add_ps(valScale0, valScale1)));
        }
 
-       __m128 mul_hue = _mm_set_ps(6.0f / 360.0f, 6.0f / 360.0f, 6.0f / 
360.0f, 6.0f / 360.0f);
-       __m128 ones_ps = _mm_set_ps(1.0, 1.0, 1.0, 1.0);
+       __m128 mul_hue = _mm_load_ps(_mul_hue_ps);
+       __m128 ones_ps = _mm_load_ps(_ones_ps);
        hueShift = _mm_mul_ps(hueShift, mul_hue);
        s = _mm_min_ps(ones_ps, _mm_mul_ps(s, satScale));
        v = _mm_min_ps(ones_ps, _mm_mul_ps(v, valScale));
@@ -1081,18 +1089,24 @@
 sse_matrix3_mul(float* mul, __m128 a, __m128 b, __m128 c)
 {
 
-       __m128 v = _mm_set_ps(mul[0], mul[0], mul[0], mul[0]);
+       __m128 v = _mm_load_ps(mul);
        __m128 acc = _mm_mul_ps(a, v);
 
-       v = _mm_set_ps(mul[1], mul[1], mul[1], mul[1]);
+       v = _mm_load_ps(mul+4);
        acc = _mm_add_ps(acc, _mm_mul_ps(b, v));
 
-       v = _mm_set_ps(mul[2], mul[2], mul[2], mul[2]);
+       v = _mm_load_ps(mul+8);
        acc = _mm_add_ps(acc, _mm_mul_ps(c, v));
 
        return acc;
 }
 
+static gfloat _rgb_div_ps[4] __attribute__ ((aligned (16))) = {1.0/65535.0, 
1.0/65535.0, 1.0/65535.0, 1.0/65535.0};
+static gfloat _very_small_ps[4] __attribute__ ((aligned (16))) = {1e-15, 
1e-15, 1e-15, 1e-15};
+static gfloat _16_bit_ps[4] __attribute__ ((aligned (16))) = {65535.0, 
65535.0, 65535.0, 65535.0};
+static gint _15_bit_epi32[4] __attribute__ ((aligned (16))) = { 32768, 32768, 
32768, 32768};
+static guint _16_bit_sign[4] __attribute__ ((aligned (16))) = 
{0x80008000,0x80008000,0x80008000,0x80008000};
+
 static void
 render_SSE2(ThreadInfo* t)
 {
@@ -1103,16 +1117,27 @@
        __m128i p1,p2;
        __m128 p1f, p2f, p3f, p4f;
        __m128 r, g, b, r2, g2, b2;
-       __m128i zero;
+       __m128i zero = _mm_load_si128((__m128i*)_15_bit_epi32);
 
        int xfer[4] __attribute__ ((aligned (16)));
 
        const gfloat exposure_comp = pow(2.0, dcp->exposure);
        const gfloat saturation = dcp->saturation;
        const gfloat hue = dcp->hue;
-       gfloat r_coeffs[3] = {dcp->camera_to_prophoto.coeff[0][0], 
dcp->camera_to_prophoto.coeff[0][1], dcp->camera_to_prophoto.coeff[0][2]};
-       gfloat g_coeffs[3] = {dcp->camera_to_prophoto.coeff[1][0], 
dcp->camera_to_prophoto.coeff[1][1], dcp->camera_to_prophoto.coeff[1][2]};
-       gfloat b_coeffs[3] = {dcp->camera_to_prophoto.coeff[2][0], 
dcp->camera_to_prophoto.coeff[2][1], dcp->camera_to_prophoto.coeff[2][2]};
+       
+       float cam_prof[4*4*3] __attribute__ ((aligned (16)));
+       for (x = 0; x < 4; x++ ) {
+               cam_prof[x] = dcp->camera_to_prophoto.coeff[0][0];
+               cam_prof[x+4] = dcp->camera_to_prophoto.coeff[0][1];
+               cam_prof[x+8] = dcp->camera_to_prophoto.coeff[0][2];
+               cam_prof[12+x] = dcp->camera_to_prophoto.coeff[1][0];
+               cam_prof[12+x+4] = dcp->camera_to_prophoto.coeff[1][1];
+               cam_prof[12+x+8] = dcp->camera_to_prophoto.coeff[1][2];
+               cam_prof[24+x] = dcp->camera_to_prophoto.coeff[2][0];
+               cam_prof[24+x+4] = dcp->camera_to_prophoto.coeff[2][1];
+               cam_prof[24+x+8] = dcp->camera_to_prophoto.coeff[2][2];
+       }
+       
        gint end_x = image->w - (image->w & 3);
 
        for(y = t->start_y ; y < t->end_y; y++)
@@ -1134,7 +1159,7 @@
                        p3f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(p2, zero));
 
                        /* Normalize to 0 to 1 range */
-                       __m128 rgb_div = _mm_set_ps(1.0/65535.0, 1.0/65535.0, 
1.0/65535.0, 1.0/65535.0);
+                       __m128 rgb_div = _mm_load_ps(_rgb_div_ps);
                        p1f = _mm_mul_ps(p1f, rgb_div);
                        p2f = _mm_mul_ps(p2f, rgb_div);
                        p3f = _mm_mul_ps(p3f, rgb_div);
@@ -1157,13 +1182,13 @@
                        b = _mm_movelh_ps(b1b0, b3b2);
 
                        /* Convert to Prophoto */
-                       r2 = sse_matrix3_mul(r_coeffs, r, g, b);
-                       g2 = sse_matrix3_mul(g_coeffs, r, g, b);
-                       b2 = sse_matrix3_mul(b_coeffs, r, g, b);
+                       r2 = sse_matrix3_mul(cam_prof, r, g, b);
+                       g2 = sse_matrix3_mul(&cam_prof[12], r, g, b);
+                       b2 = sse_matrix3_mul(&cam_prof[24], r, g, b);
 
                        /* Set min/max before HSV conversion */
-                       __m128 min_val = _mm_set_ps(1e-15, 1e-15, 1e-15, 1e-15);
-                       __m128 max_val = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
+                       __m128 min_val = _mm_load_ps(_very_small_ps);
+                       __m128 max_val = _mm_load_ps(_ones_ps);
                        r = _mm_max_ps(_mm_min_ps(r2, max_val), min_val);
                        g = _mm_max_ps(_mm_min_ps(g2, max_val), min_val);
                        b = _mm_max_ps(_mm_min_ps(b2, max_val), min_val);
@@ -1187,8 +1212,8 @@
 
                        /* Hue */
                        __m128 hue_add = _mm_set_ps(hue, hue, hue, hue);
-                       __m128 six_ps = _mm_set_ps(6.0f-1e-15, 6.0f-1e-15, 
6.0f-1e-15, 6.0f-1e-15);
-                       __m128 zero_ps = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
+                       __m128 six_ps = _mm_load_ps(_six_ps);
+                       __m128 zero_ps = _mm_load_ps(_zero_ps);
                        h = _mm_add_ps(h, hue_add);
 
                        /* Check if hue > 6 or < 0*/
@@ -1204,7 +1229,7 @@
                        /* TODO: Use 8 bit fraction as interpolation, for 
interpolating
                         * a more precise lookup using linear interpolation. 
Maybe use less than
                         * 16 bits for lookup for speed, 10 bits with 
interpolation should be enough */
-                       __m128 v_mul = _mm_set_ps(65535.0, 65535.0, 65535.0, 
65535.0);
+                       __m128 v_mul = _mm_load_ps(_16_bit_ps);
                        v = _mm_mul_ps(v, v_mul);
                        __m128i lookup = _mm_cvtps_epi32(v);
                        gfloat* v_p = (gfloat*)&v;
@@ -1237,7 +1262,7 @@
 
                        /* Convert get the fraction of h
                         * h_fraction = h - (float)(int)h */
-                       __m128 ones_ps = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
+                       __m128 ones_ps = _mm_load_ps(_ones_ps);
                        __m128 h_fraction = 
_mm_sub_ps(h,_mm_cvtepi32_ps(_mm_cvttps_epi32(h)));
 
                        /* p = v * (1.0f - s)  */
@@ -1304,7 +1329,7 @@
                        b = _mm_or_ps(b, _mm_and_ps(q, m));
 
 
-                       __m128 rgb_mul = _mm_set_ps(65535.0, 65535.0, 65535.0, 
65535.0);
+                       __m128 rgb_mul = _mm_load_ps(_16_bit_ps);
                        r = _mm_mul_ps(r, rgb_mul);
                        g = _mm_mul_ps(g, rgb_mul);
                        b = _mm_mul_ps(b, rgb_mul);
@@ -1313,8 +1338,8 @@
                        __m128i g_i = _mm_cvtps_epi32(g);
                        __m128i b_i = _mm_cvtps_epi32(b);
 
-                       __m128i sub_32 = _mm_set_epi32(32768, 32768, 32768, 
32768);
-                       __m128i signxor = _mm_set_epi32(0x80008000, 0x80008000, 
0x80008000, 0x80008000);
+                       __m128i sub_32 = 
_mm_load_si128((__m128i*)_15_bit_epi32);
+                       __m128i signxor = 
_mm_load_si128((__m128i*)_16_bit_sign);
 
                        /* Subtract 32768 to avoid saturation */
                        r_i = _mm_sub_epi32(r_i, sub_32);


_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit

Reply via email to