Author: post
Date: 2010-03-14 19:23:23 +0100 (Sun, 14 Mar 2010)
New Revision: 3255
Modified:
trunk/plugins/dcp/dcp-sse2.c
Log:
Use Integer compares in RGB to HSV conversion, they have smaller latency and
gives the same result.
Modified: trunk/plugins/dcp/dcp-sse2.c
===================================================================
--- trunk/plugins/dcp/dcp-sse2.c 2010-03-11 22:49:58 UTC (rev 3254)
+++ trunk/plugins/dcp/dcp-sse2.c 2010-03-14 18:23:23 UTC (rev 3255)
@@ -33,7 +33,11 @@
static gfloat _six_ps[4] __attribute__ ((aligned (16))) = {6.0f-1e-15,
6.0f-1e-15, 6.0f-1e-15, 6.0f-1e-15};
static gfloat _very_small_ps[4] __attribute__ ((aligned (16))) = {1e-15,
1e-15, 1e-15, 1e-15};
static const gfloat _two_to_23_ps[4] __attribute__ ((aligned (16))) = {
0x1.0p23f, 0x1.0p23f, 0x1.0p23f, 0x1.0p23f };
+static guint _ps_mask_sign[4] __attribute__ ((aligned (16))) =
{0x4fffffff,0x4fffffff,0x4fffffff,0x4fffffff};
+#define DW(A) _mm_castps_si128(A)
+#define PS(A) _mm_castsi128_ps(A)
+
/* Floor for positive numbers */
static inline __m128 _mm_floor_positive_ps( __m128 v )
{
@@ -42,12 +46,13 @@
}
static inline void
-RGBtoHSV_SSE(__m128 *c0, __m128 *c1, __m128 *c2)
+RGBtoHSV_SSE2(__m128 *c0, __m128 *c1, __m128 *c2)
{
- __m128 zero_ps = _mm_setzero_ps();
+ __m128i zero_i = _mm_setzero_si128();
__m128 small_ps = _mm_load_ps(_very_small_ps);
__m128 ones_ps = _mm_load_ps(_ones_ps);
+ __m128i ps_mask_sign = _mm_load_si128((__m128i*)_ps_mask_sign);
// Any number > 1
__m128 add_v = _mm_load_ps(_two_ps);
@@ -61,16 +66,12 @@
g = _mm_min_ps(_mm_max_ps(g, small_ps),ones_ps);
b = _mm_min_ps(_mm_max_ps(b, small_ps),ones_ps);
- __m128 h, v;
- v = _mm_max_ps(b,_mm_max_ps(r,g));
-
+ __m128 v = _mm_max_ps(b,_mm_max_ps(r,g));
__m128 m = _mm_min_ps(b,_mm_min_ps(r,g));
__m128 gap = _mm_sub_ps(v,m);
- __m128 v_mask = _mm_cmpeq_ps(gap, zero_ps);
+ __m128 v_mask = PS(_mm_cmpeq_epi32(_mm_and_si128(DW(gap),
ps_mask_sign), zero_i));
v = _mm_add_ps(v, _mm_and_ps(add_v, v_mask));
- h = _mm_setzero_ps();
-
/* Set gap to one where sat = 0, this will avoid divisions by zero,
these values will not be used */
ones_ps = _mm_and_ps(ones_ps, v_mask);
gap = _mm_or_ps(gap, ones_ps);
@@ -79,32 +80,32 @@
/* if r == v */
/* h = (g - b) / gap; */
- __m128 mask = _mm_cmpeq_ps(r, v);
+ __m128i mask = _mm_cmpeq_epi32(DW(r), DW(v));
__m128 val = _mm_mul_ps(gap_inv, _mm_sub_ps(g, b));
/* fill h */
- v = _mm_add_ps(v, _mm_and_ps(add_v, mask));
- h = _mm_or_ps(h, _mm_and_ps(val, mask));
+ v = _mm_add_ps(v, _mm_and_ps(add_v, PS(mask)));
+ __m128i h = _mm_and_si128(DW(val), mask);
/* if g == v */
/* h = 2.0f + (b - r) / gap; */
__m128 two_ps = _mm_load_ps(_two_ps);
- mask = _mm_cmpeq_ps(g, v);
+ mask = _mm_cmpeq_epi32(DW(g), DW(v));
val = _mm_sub_ps(b, r);
val = _mm_mul_ps(val, gap_inv);
val = _mm_add_ps(val, two_ps);
- v = _mm_add_ps(v, _mm_and_ps(add_v, mask));
- h = _mm_or_ps(h, _mm_and_ps(val, mask));
+ v = _mm_add_ps(v, _mm_and_ps(add_v, PS(mask)));
+ h = _mm_or_si128(h, _mm_and_si128(DW(val), mask));
/* If (b == v) */
/* h = 4.0f + (r - g) / gap; */
__m128 four_ps = _mm_add_ps(two_ps, two_ps);
- mask = _mm_cmpeq_ps(b, v);
+ mask = _mm_cmpeq_epi32(DW(b), DW(v));
val = _mm_add_ps(four_ps, _mm_mul_ps(gap_inv, _mm_sub_ps(r, g)));
- v = _mm_add_ps(v, _mm_and_ps(add_v, mask));
- h = _mm_or_ps(h, _mm_and_ps(val, mask));
+ h = _mm_or_si128(h, _mm_and_si128(DW(val), mask));
+ v = _mm_add_ps(v, _mm_and_ps(add_v, PS(mask)));
__m128 s;
/* Fill s, if gap > 0 */
@@ -113,12 +114,13 @@
s = _mm_andnot_ps(v_mask, val );
/* Check if h < 0 */
- zero_ps = _mm_setzero_ps();
- __m128 six_ps = _mm_load_ps(_six_ps);
- mask = _mm_cmplt_ps(h, zero_ps);
- h = _mm_add_ps(h, _mm_and_ps(mask, six_ps));
+ zero_i = _mm_setzero_si128();
+ __m128i six_ps_i = _mm_load_si128((__m128i*)_six_ps);
+ /* We can use integer comparision, since we are checking if h < 0*/
+ mask = _mm_cmplt_epi32(h, zero_i);
+ __m128 h2 = _mm_add_ps(PS(h), PS(_mm_and_si128(mask, six_ps_i)));
- *c0 = h;
+ *c0 = h2;
*c1 = s;
*c2 = v;
}
@@ -438,8 +440,6 @@
*_s = s;
*_v = v;
}
-#define DW(A) _mm_castps_si128(A)
-#define PS(A) _mm_castsi128_ps(A)
static gfloat _16_bit_ps[4] __attribute__ ((aligned (16))) = {65535.0,
65535.0, 65535.0, 65535.0};
static gfloat _thousand_24_ps[4] __attribute__ ((aligned (16))) =
{1023.99999f, 1023.99999f, 1023.99999f, 1023.99999f};
@@ -702,7 +702,7 @@
b2 = _mm_mul_ps(_mm_load_ps(_cm_b), b);
}
- RGBtoHSV_SSE(&r2, &g2, &b2);
+ RGBtoHSV_SSE2(&r2, &g2, &b2);
h = r2; s = g2; v = b2;
if (dcp->huesatmap)
@@ -793,7 +793,7 @@
}
/* Convert to HSV */
- RGBtoHSV_SSE(&r, &g, &b);
+ RGBtoHSV_SSE2(&r, &g, &b);
h = r; s = g; v = b;
if (!dcp->curve_is_flat)
_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit