Author: post
Date: 2012-02-16 19:07:44 +0100 (Thu, 16 Feb 2012)
New Revision: 4134
Modified:
trunk/plugins/dcp/dcp-avx.c
trunk/plugins/dcp/dcp.c
Log:
Use SSE4 functions for the AVX code path and re-enable it, but use the SSE2 tonemapping function
(huesat_map_SSE2), since GCC 4.6.1 seems to miscompile the AVX version of this function.
Modified: trunk/plugins/dcp/dcp-avx.c
===================================================================
--- trunk/plugins/dcp/dcp-avx.c 2012-02-13 17:08:23 UTC (rev 4133)
+++ trunk/plugins/dcp/dcp-avx.c 2012-02-16 18:07:44 UTC (rev 4134)
@@ -21,7 +21,7 @@
#ifdef __AVX__
-#include <emmintrin.h>
+#include <smmintrin.h>
#include <math.h> /* powf() */
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
@@ -38,26 +38,17 @@
static gfloat _six_ps[4] __attribute__ ((aligned (16))) = {6.0f-1e-15,
6.0f-1e-15, 6.0f-1e-15, 6.0f-1e-15};
static gfloat _very_small_ps[4] __attribute__ ((aligned (16))) = {1e-15,
1e-15, 1e-15, 1e-15};
static const gfloat _two_to_23_ps[4] __attribute__ ((aligned (16))) = {
0x1.0p23f, 0x1.0p23f, 0x1.0p23f, 0x1.0p23f };
-static guint _ps_mask_sign[4] __attribute__ ((aligned (16))) =
{0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff};
#define DW(A) _mm_castps_si128(A)
#define PS(A) _mm_castsi128_ps(A)
-/* Floor for positive numbers */
-static inline __m128 _mm_floor_positive_ps( __m128 v )
-{
- __m128 two_to_23_ps = _mm_load_ps(_two_to_23_ps);
- return _mm_sub_ps( _mm_add_ps( v, two_to_23_ps ), two_to_23_ps );
-}
static inline void
-RGBtoHSV_AVX(__m128 *c0, __m128 *c1, __m128 *c2)
+RGBtoHSV_SSE4(__m128 *c0, __m128 *c1, __m128 *c2)
{
-
- __m128i zero_i = _mm_setzero_si128();
+ __m128 zero_ps = _mm_setzero_ps();
__m128 small_ps = _mm_load_ps(_very_small_ps);
__m128 ones_ps = _mm_load_ps(_ones_ps);
- __m128i ps_mask_sign = _mm_load_si128((__m128i*)_ps_mask_sign);
// Any number > 1
__m128 add_v = _mm_load_ps(_two_ps);
@@ -71,12 +62,16 @@
g = _mm_min_ps(_mm_max_ps(g, small_ps),ones_ps);
b = _mm_min_ps(_mm_max_ps(b, small_ps),ones_ps);
- __m128 v = _mm_max_ps(b,_mm_max_ps(r,g));
+ __m128 h, v;
+ v = _mm_max_ps(b,_mm_max_ps(r,g));
+
__m128 m = _mm_min_ps(b,_mm_min_ps(r,g));
__m128 gap = _mm_sub_ps(v,m);
- __m128 v_mask = PS(_mm_cmpeq_epi32(_mm_and_si128(DW(gap),
ps_mask_sign), zero_i));
+ __m128 v_mask = _mm_cmpeq_ps(gap, zero_ps);
v = _mm_add_ps(v, _mm_and_ps(add_v, v_mask));
+ h = _mm_setzero_ps();
+
/* Set gap to one where sat = 0, this will avoid divisions by zero,
these values will not be used */
ones_ps = _mm_and_ps(ones_ps, v_mask);
gap = _mm_or_ps(gap, ones_ps);
@@ -85,32 +80,32 @@
/* if r == v */
/* h = (g - b) / gap; */
- __m128i mask = _mm_cmpeq_epi32(DW(r), DW(v));
+ __m128 mask = _mm_cmpeq_ps(r, v);
__m128 val = _mm_mul_ps(gap_inv, _mm_sub_ps(g, b));
/* fill h */
- v = _mm_add_ps(v, _mm_and_ps(add_v, PS(mask)));
- __m128i h = _mm_and_si128(DW(val), mask);
+ v = _mm_add_ps(v, _mm_and_ps(add_v, mask));
+ h = _mm_blendv_ps(h, val, mask);
/* if g == v */
/* h = 2.0f + (b - r) / gap; */
__m128 two_ps = _mm_load_ps(_two_ps);
- mask = _mm_cmpeq_epi32(DW(g), DW(v));
+ mask = _mm_cmpeq_ps(g, v);
val = _mm_sub_ps(b, r);
val = _mm_mul_ps(val, gap_inv);
val = _mm_add_ps(val, two_ps);
- v = _mm_add_ps(v, _mm_and_ps(add_v, PS(mask)));
- h = _mm_or_si128(h, _mm_and_si128(DW(val), mask));
+ v = _mm_add_ps(v, _mm_and_ps(add_v, mask));
+ h = _mm_blendv_ps(h, val, mask);
/* If (b == v) */
/* h = 4.0f + (r - g) / gap; */
__m128 four_ps = _mm_add_ps(two_ps, two_ps);
- mask = _mm_cmpeq_epi32(DW(b), DW(v));
+ mask = _mm_cmpeq_ps(b, v);
val = _mm_add_ps(four_ps, _mm_mul_ps(gap_inv, _mm_sub_ps(r, g)));
- h = _mm_or_si128(h, _mm_and_si128(DW(val), mask));
- v = _mm_add_ps(v, _mm_and_ps(add_v, PS(mask)));
+ v = _mm_add_ps(v, _mm_and_ps(add_v, mask));
+ h = _mm_blendv_ps(h, val, mask);
__m128 s;
/* Fill s, if gap > 0 */
@@ -119,20 +114,19 @@
s = _mm_andnot_ps(v_mask, val );
/* Check if h < 0 */
- zero_i = _mm_setzero_si128();
- __m128i six_ps_i = _mm_load_si128((__m128i*)_six_ps);
- /* We can use integer comparision, since we are checking if h < 0,
since the sign bit is same in integer */
- mask = _mm_cmplt_epi32(h, zero_i);
- __m128 h2 = _mm_add_ps(PS(h), PS(_mm_and_si128(mask, six_ps_i)));
+ zero_ps = _mm_setzero_ps();
+ __m128 six_ps = _mm_load_ps(_six_ps);
+ mask = _mm_cmplt_ps(h, zero_ps);
+ h = _mm_add_ps(h, _mm_and_ps(mask, six_ps));
- *c0 = h2;
+ *c0 = h;
*c1 = s;
*c2 = v;
}
static inline void
-HSVtoRGB_SSE(__m128 *c0, __m128 *c1, __m128 *c2)
+HSVtoRGB_SSE4(__m128 *c0, __m128 *c1, __m128 *c2)
{
__m128 h = *c0;
__m128 s = *c1;
@@ -140,9 +134,9 @@
__m128 r, g, b;
/* Convert get the fraction of h
- * h_fraction = h - floor(h) */
+ * h_fraction = h - (float)(int)h */
__m128 ones_ps = _mm_load_ps(_ones_ps);
- __m128 h_fraction = _mm_sub_ps(h,_mm_floor_positive_ps(h));
+ __m128 h_fraction = _mm_sub_ps(h, _mm_floor_ps(h));
/* p = v * (1.0f - s) */
__m128 p = _mm_mul_ps(v, _mm_sub_ps(ones_ps, s));
@@ -164,9 +158,9 @@
__m128 m = _mm_cmplt_ps(h, h_threshold);
h_threshold = _mm_add_ps(h_threshold, ones_ps);
m = _mm_andnot_ps(out_mask, m);
- r = _mm_or_ps(r, _mm_and_ps(q, m));
- g = _mm_or_ps(g, _mm_and_ps(v, m));
- b = _mm_or_ps(b, _mm_and_ps(p, m));
+ r = _mm_blendv_ps(r, q, m);
+ g = _mm_blendv_ps(g, v, m);
+ b = _mm_blendv_ps(b, p, m);
out_mask = _mm_or_ps(out_mask, m);
/* h < 3 (case 2)*/
@@ -174,9 +168,9 @@
m = _mm_cmplt_ps(h, h_threshold);
h_threshold = _mm_add_ps(h_threshold, ones_ps);
m = _mm_andnot_ps(out_mask, m);
- r = _mm_or_ps(r, _mm_and_ps(p, m));
- g = _mm_or_ps(g, _mm_and_ps(v, m));
- b = _mm_or_ps(b, _mm_and_ps(t, m));
+ r = _mm_blendv_ps(r, p, m);
+ g = _mm_blendv_ps(g, v, m);
+ b = _mm_blendv_ps(b, t, m);
out_mask = _mm_or_ps(out_mask, m);
/* h < 4 (case 3)*/
@@ -184,9 +178,9 @@
m = _mm_cmplt_ps(h, h_threshold);
h_threshold = _mm_add_ps(h_threshold, ones_ps);
m = _mm_andnot_ps(out_mask, m);
- r = _mm_or_ps(r, _mm_and_ps(p, m));
- g = _mm_or_ps(g, _mm_and_ps(q, m));
- b = _mm_or_ps(b, _mm_and_ps(v, m));
+ r = _mm_blendv_ps(r, p, m);
+ g = _mm_blendv_ps(g, q, m);
+ b = _mm_blendv_ps(b, v, m);
out_mask = _mm_or_ps(out_mask, m);
/* h < 5 (case 4)*/
@@ -201,9 +195,11 @@
/* Remainder (case 5) */
/* case 5: *r = v; *g = p; *b = q; break; */
- r = _mm_or_ps(r, _mm_andnot_ps(out_mask,v));
- g = _mm_or_ps(g, _mm_andnot_ps(out_mask,p));
- b = _mm_or_ps(b, _mm_andnot_ps(out_mask,q));
+ __m128 all_ones = _mm_cmpeq_ps(h,h);
+ m = _mm_xor_ps(out_mask, all_ones);
+ r = _mm_blendv_ps(r, v, m);
+ g = _mm_blendv_ps(g, p, m);
+ b = _mm_blendv_ps(b, q, m);
*c0 = r;
*c1 = g;
@@ -211,10 +207,10 @@
}
+/* GCC 4.6.1 seems to miscompile this function with AVX, so disabled for now */
+#if 0
static gint _ones_epi32[4] __attribute__ ((aligned (16))) = {1,1,1,1};
-/* Until now the same as the SSE2 version, but included here to allow AVX
compilation */
-/* to utilize 3-paramater instructions */
void
huesat_map_AVX(RSHuesatMap *map, const PrecalcHSM* precalc, __m128 *_h, __m128
*_s, __m128 *_v)
@@ -331,7 +327,6 @@
__m128 hScaled = _mm_mul_ps(h, _mm_load_ps(precalc->hScale));
__m128 sScaled = _mm_mul_ps(s, _mm_load_ps(precalc->sScale));
__m128 vScaled = _mm_mul_ps(v, _mm_load_ps(precalc->vScale));
-
__m128i hIndex0 = _mm_cvttps_epi32(hScaled);
__m128i sIndex0 = _mm_cvttps_epi32(sScaled);
__m128i vIndex0 = _mm_cvttps_epi32(vScaled);
@@ -480,7 +475,9 @@
*_s = s;
*_v = v;
}
+#endif
+
static gfloat _16_bit_ps[4] __attribute__ ((aligned (16))) = {65535.0,
65535.0, 65535.0, 65535.0};
static gfloat _thousand_24_ps[4] __attribute__ ((aligned (16))) =
{1023.99999f, 1023.99999f, 1023.99999f, 1023.99999f};
@@ -494,7 +491,7 @@
_mm_store_si128((__m128i*)&xfer[0], lookup);
/* Calculate fractions */
- __m128 frac = _mm_sub_ps(mul, _mm_floor_positive_ps(mul));
+ __m128 frac = _mm_sub_ps(mul, _mm_floor_ps(mul));
__m128 inv_frac = _mm_sub_ps(_mm_load_ps(_ones_ps), frac);
/* Load two adjacent curve values and interpolate between them */
@@ -614,6 +611,8 @@
#define SETFLOAT4_SAME(N, A) float N[4] __attribute__ ((aligned (16))); \
N[0] = A; N[1] = A; N[2] = A; N[3] = A;
+extern void huesat_map_SSE2(RSHuesatMap *map, const PrecalcHSM* precalc,
__m128 *_h, __m128 *_s, __m128 *_v);
+
gboolean
render_AVX(ThreadInfo* t)
{
@@ -728,12 +727,12 @@
g2 = sse_matrix3_mul(&cam_prof[12], r, g, b);
b2 = sse_matrix3_mul(&cam_prof[24], r, g, b);
- RGBtoHSV_AVX(&r2, &g2, &b2);
+ RGBtoHSV_SSE4(&r2, &g2, &b2);
h = r2; s = g2; v = b2;
if (dcp->huesatmap)
{
- huesat_map_AVX(dcp->huesatmap,
dcp->huesatmap_precalc, &h, &s, &v);
+ huesat_map_SSE2(dcp->huesatmap,
dcp->huesatmap_precalc, &h, &s, &v);
}
/* Saturation */
@@ -766,7 +765,7 @@
h = _mm_add_ps(h, six_masked_lt);
__m128 v_stored = v;
- HSVtoRGB_SSE(&h, &s, &v);
+ HSVtoRGB_SSE4(&h, &s, &v);
r = h; g = s; b = v;
/* Exposure */
@@ -849,7 +848,7 @@
}
/* Convert to HSV */
- RGBtoHSV_AVX(&r, &g, &b);
+ RGBtoHSV_SSE4(&r, &g, &b);
h = r; s = g; v = b;
if (!dcp->curve_is_flat)
@@ -860,7 +859,7 @@
_mm_store_si128((__m128i*)&xfer[0], lookup);
/* Calculate fractions */
- __m128 frac = _mm_sub_ps(v_mul,
_mm_floor_positive_ps(v_mul));
+ __m128 frac = _mm_sub_ps(v_mul,
_mm_floor_ps(v_mul));
__m128 inv_frac =
_mm_sub_ps(_mm_load_ps(_ones_ps), frac);
/* Load two adjacent curve values and
interpolate between them */
@@ -877,7 +876,7 @@
/* Apply looktable */
if (dcp->looktable) {
- huesat_map_AVX(dcp->looktable,
dcp->looktable_precalc, &h, &s, &v);
+ huesat_map_SSE2(dcp->looktable,
dcp->looktable_precalc, &h, &s, &v);
}
/* Ensure that hue is within range */
@@ -892,7 +891,7 @@
/* s always slightly > 0 when converting to RGB */
s = _mm_max_ps(s, min_val);
- HSVtoRGB_SSE(&h, &s, &v);
+ HSVtoRGB_SSE4(&h, &s, &v);
r = h; g = s; b = v;
/* Apply Tone Curve in RGB space*/
Modified: trunk/plugins/dcp/dcp.c
===================================================================
--- trunk/plugins/dcp/dcp.c 2012-02-13 17:08:23 UTC (rev 4133)
+++ trunk/plugins/dcp/dcp.c 2012-02-16 18:07:44 UTC (rev 4134)
@@ -471,7 +471,7 @@
pre_cache_tables(t->dcp);
if (tmp->pixelsize == 4 && (rs_detect_cpu_features() &
RS_CPU_FLAG_SSE2) && !t->dcp->read_out_curve)
{
- if (FALSE && (rs_detect_cpu_features() & RS_CPU_FLAG_AVX) &&
render_AVX(t))
+ if ((rs_detect_cpu_features() & RS_CPU_FLAG_AVX) &&
render_AVX(t))
{
/* AVX routine renders 4 pixels in parallel, but any
remaining must be */
/* calculated using C routines */
_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit