Author: post
Date: 2009-10-13 18:00:09 +0200 (Tue, 13 Oct 2009)
New Revision: 2709
Modified:
trunk/plugins/dcp/dcp.c
Log:
DCP-renderer: Moved table constant calculation out of pixel loop, and load all
values in one go. Moved hue+sat+exposure constants out of loop. Overall 5-10%
speedup.
Modified: trunk/plugins/dcp/dcp.c
===================================================================
--- trunk/plugins/dcp/dcp.c 2009-10-13 15:57:51 UTC (rev 2708)
+++ trunk/plugins/dcp/dcp.c 2009-10-13 16:00:09 UTC (rev 2709)
@@ -34,6 +34,22 @@
typedef struct _RSDcp RSDcp;
typedef struct _RSDcpClass RSDcpClass;
+#if defined (__SSE2__)
+
+typedef struct {
+ //Precalc:
+ gfloat hScale[4] __attribute__ ((aligned (16)));
+ gfloat sScale[4] __attribute__ ((aligned (16)));
+ gfloat vScale[4] __attribute__ ((aligned (16)));
+ gint maxHueIndex0[4] __attribute__ ((aligned (16)));
+ gint maxSatIndex0[4] __attribute__ ((aligned (16)));
+ gint maxValIndex0[4] __attribute__ ((aligned (16)));
+ gint hueStep[4] __attribute__ ((aligned (16)));
+ gint valStep[4] __attribute__ ((aligned (16)));
+} PrecalcHSM;
+
+#endif // defined (__SSE2__)
+
struct _RSDcp {
RSFilter parent;
@@ -73,6 +89,11 @@
RS_VECTOR3 camera_white;
RS_MATRIX3 camera_to_prophoto;
+
+#if defined (__SSE2__)
+ PrecalcHSM huesatmap_precalc;
+ PrecalcHSM looktable_precalc;
+#endif // defined (__SSE2__)
};
struct _RSDcpClass {
@@ -110,6 +131,7 @@
static void render(ThreadInfo* t);
#if defined (__SSE2__)
static void render_SSE2(ThreadInfo* t);
+static void calc_hsm_constants(const RSHuesatMap *map, PrecalcHSM* table);
#endif
static void read_profile(RSDcp *dcp, RSDcpFile *dcp_file);
static RSIccProfile *get_icc_profile(RSFilter *filter);
@@ -741,11 +763,30 @@
/* SSE2 implementation, matches the reference implementation pretty closely */
+
+static void
+calc_hsm_constants(const RSHuesatMap *map, PrecalcHSM* table)
+{
+ g_assert(RS_IS_HUESAT_MAP(map));
+ int i;
+ for (i = 0; i < 4; i++)
+ {
+ table->hScale[i] = (map->hue_divisions < 2) ? 0.0f :
(map->hue_divisions * (1.0f / 6.0f));
+ table->sScale[i] = (gfloat) (map->sat_divisions - 1);
+ table->vScale[i] = (gfloat) (map->val_divisions - 1);
+ table->maxHueIndex0[i] = map->hue_divisions - 1;
+ table->maxSatIndex0[i] = map->sat_divisions - 2;
+ table->maxValIndex0[i] = map->val_divisions - 2;
+ table->hueStep[i] = map->sat_divisions;
+ table->valStep[i] = map->hue_divisions * map->sat_divisions;
+ }
+}
+
static gfloat _mul_hue_ps[4] __attribute__ ((aligned (16))) = {6.0f / 360.0f,
6.0f / 360.0f, 6.0f / 360.0f, 6.0f / 360.0f};
static gint _ones_epi32[4] __attribute__ ((aligned (16))) = {1,1,1,1};
static void
-huesat_map_SSE2(RSHuesatMap *map, __m128 *_h, __m128 *_s, __m128 *_v)
+huesat_map_SSE2(RSHuesatMap *map, const PrecalcHSM* precalc, __m128 *_h,
__m128 *_s, __m128 *_v)
{
g_assert(RS_IS_HUESAT_MAP(map));
@@ -756,31 +797,19 @@
gint xfer_0[4] __attribute__ ((aligned (16)));
gint xfer_1[4] __attribute__ ((aligned (16)));
- //TODO: Precalc BEGIN
- gfloat hScale = (map->hue_divisions < 2) ? 0.0f : (map->hue_divisions *
(1.0f / 6.0f));
- gfloat sScale = (gfloat) (map->sat_divisions - 1);
- gfloat vScale = (gfloat) (map->val_divisions - 1);
- // END precalc this.
-
- gint _maxHueIndex0 = map->hue_divisions - 1;
- gint _maxSatIndex0 = map->sat_divisions - 2;
- gint _maxValIndex0 = map->val_divisions - 2;
-
const RS_VECTOR3 *tableBase = map->deltas;
- gint _valStep = map->hue_divisions * map->sat_divisions;
-
__m128 hueShift;
__m128 satScale;
__m128 valScale;
if (map->val_divisions < 2)
{
- __m128 hScaled = _mm_mul_ps(h, _mm_set_ps( hScale, hScale,
hScale, hScale));
- __m128 sScaled = _mm_mul_ps(s, _mm_set_ps( sScale, sScale,
sScale, sScale));
+ __m128 hScaled = _mm_mul_ps(h, _mm_load_ps(precalc->hScale));
+ __m128 sScaled = _mm_mul_ps(s, _mm_load_ps(precalc->sScale));
- __m128i maxHueIndex0 = _mm_set_epi32(_maxHueIndex0,
_maxHueIndex0, _maxHueIndex0, _maxHueIndex0);
- __m128i maxSatIndex0 = _mm_set_epi32(_maxSatIndex0,
_maxSatIndex0, _maxSatIndex0, _maxSatIndex0);
+ __m128i maxHueIndex0 =
_mm_load_si128((__m128i*)precalc->maxHueIndex0);
+ __m128i maxSatIndex0 =
_mm_load_si128((__m128i*)precalc->maxSatIndex0);
__m128i hIndex0 = _mm_cvttps_epi32( hScaled );
__m128i sIndex0 = _mm_cvttps_epi32( sScaled );
@@ -802,7 +831,7 @@
__m128 hFract0 = _mm_sub_ps(ones_ps, hFract1);
__m128 sFract0 = _mm_sub_ps(ones_ps, sFract1);
- __m128i hueStep = _mm_set_epi32(map->sat_divisions,
map->sat_divisions, map->sat_divisions, map->sat_divisions);
+ __m128i hueStep = _mm_load_si128((__m128i*)precalc->hueStep);
__m128i table_offsets = _mm_add_epi32(sIndex0,
_mm_mullo_epi16(hIndex0, hueStep));
__m128i next_offsets = _mm_add_epi32(sIndex0,
_mm_mullo_epi16(hIndex1, hueStep));
@@ -827,7 +856,6 @@
__m128 valScale0 = _mm_add_ps(_mm_mul_ps(vs0, hFract0),
_mm_mul_ps(vs1, hFract1));
valScale0 = _mm_mul_ps(valScale0, sFract0);
-
for (i = 0; i < 4; i++) {
entry00[i]++;
entry01[i]++;
@@ -851,13 +879,14 @@
}
else
{
- __m128 hScaled = _mm_mul_ps(h, _mm_set_ps( hScale, hScale,
hScale, hScale));
- __m128 sScaled = _mm_mul_ps(s, _mm_set_ps( sScale, sScale,
sScale, sScale));
- __m128 vScaled = _mm_mul_ps(v, _mm_set_ps( vScale, vScale,
vScale, vScale));
+ __m128 hScaled = _mm_mul_ps(h, _mm_load_ps(precalc->hScale));
+ __m128 sScaled = _mm_mul_ps(s, _mm_load_ps(precalc->sScale));
+ __m128 vScaled = _mm_mul_ps(v, _mm_load_ps(precalc->vScale));
- __m128i maxHueIndex0 = _mm_set_epi32(_maxHueIndex0,
_maxHueIndex0, _maxHueIndex0, _maxHueIndex0);
- __m128i maxSatIndex0 = _mm_set_epi32(_maxSatIndex0,
_maxSatIndex0, _maxSatIndex0, _maxSatIndex0);
- __m128i maxValIndex0 = _mm_set_epi32(_maxValIndex0,
_maxValIndex0, _maxValIndex0, _maxValIndex0);
+ __m128i maxHueIndex0 =
_mm_load_si128((__m128i*)precalc->maxHueIndex0);
+ __m128i maxSatIndex0 =
_mm_load_si128((__m128i*)precalc->maxSatIndex0);
+ __m128i maxValIndex0 =
_mm_load_si128((__m128i*)precalc->maxValIndex0);
+
__m128i hIndex0 = _mm_cvttps_epi32(hScaled);
__m128i sIndex0 = _mm_cvttps_epi32(sScaled);
__m128i vIndex0 = _mm_cvttps_epi32(vScaled);
@@ -865,7 +894,7 @@
// Requires that maxSatIndex0 and sIndex0 can be contained
within a 16 bit signed word.
sIndex0 = _mm_min_epi16(sIndex0, maxSatIndex0);
vIndex0 = _mm_min_epi16(vIndex0, maxValIndex0);
- __m128i ones_epi32 = _mm_set_epi32(1,1,1,1);
+ __m128i ones_epi32 = _mm_load_si128((__m128i*)_ones_epi32);
__m128i hIndex1 = _mm_add_epi32(hIndex0, ones_epi32);
/* if (hIndex0 > (maxHueIndex0 - 1)) */
@@ -886,8 +915,8 @@
__m128 sFract0 = _mm_sub_ps(ones_ps, sFract1);
__m128 vFract0 = _mm_sub_ps(ones_ps, vFract1);
- __m128i hueStep = _mm_set_epi32(map->sat_divisions,
map->sat_divisions, map->sat_divisions, map->sat_divisions);
- __m128i valStep = _mm_set_epi32(_valStep, _valStep, _valStep,
_valStep);
+ __m128i hueStep = _mm_load_si128((__m128i*)precalc->hueStep);
+ __m128i valStep = _mm_load_si128((__m128i*)precalc->valStep);
// This requires that hueStep and valStep can be contained in a
16 bit signed integer.
__m128i table_offsets = _mm_add_epi32(sIndex0,
_mm_mullo_epi16(vIndex0, valStep));
@@ -898,7 +927,8 @@
// TODO: This will result in a store->load forward size
mismatch penalty, if possible, avoid.
_mm_store_si128((__m128i*)xfer_0, table_offsets);
_mm_store_si128((__m128i*)xfer_1, next_offsets);
-
+ gint _valStep = precalc->valStep[0];
+
const RS_VECTOR3 *entry00[4] = { tableBase + xfer_0[0],
tableBase + xfer_0[1], tableBase + xfer_0[2], tableBase + xfer_0[3]};
const RS_VECTOR3 *entry01[4] = { tableBase + xfer_1[0],
tableBase + xfer_1[1], tableBase + xfer_1[2], tableBase + xfer_1[3]};
const RS_VECTOR3 *entry10[4] = { entry00[0] + _valStep,
entry00[1] + _valStep, entry00[2] + _valStep, entry00[3] + _valStep};
@@ -1122,8 +1152,9 @@
int xfer[4] __attribute__ ((aligned (16)));
const gfloat exposure_comp = pow(2.0, dcp->exposure);
- const gfloat saturation = dcp->saturation;
- const gfloat hue = dcp->hue;
+ __m128 exp = _mm_set_ps(exposure_comp, exposure_comp, exposure_comp,
exposure_comp);
+ __m128 hue_add = _mm_set_ps(dcp->hue, dcp->hue, dcp->hue, dcp->hue);
+ __m128 sat = _mm_set_ps(dcp->saturation, dcp->saturation,
dcp->saturation, dcp->saturation);
float cam_prof[4*4*3] __attribute__ ((aligned (16)));
for (x = 0; x < 4; x++ ) {
@@ -1198,20 +1229,17 @@
if (dcp->huesatmap)
{
- huesat_map_SSE2(dcp->huesatmap, &h, &s, &v);
+ huesat_map_SSE2(dcp->huesatmap,
&dcp->huesatmap_precalc, &h, &s, &v);
}
/* Exposure */
- __m128 exp = _mm_set_ps(exposure_comp, exposure_comp,
exposure_comp, exposure_comp);
v = _mm_min_ps(max_val, _mm_mul_ps(v, exp));
/* Saturation */
- __m128 sat = _mm_set_ps(saturation, saturation,
saturation, saturation);
s = _mm_min_ps(max_val, _mm_mul_ps(s, sat));
/* Hue */
- __m128 hue_add = _mm_set_ps(hue, hue, hue, hue);
__m128 six_ps = _mm_load_ps(_six_ps);
__m128 zero_ps = _mm_load_ps(_zero_ps);
h = _mm_add_ps(h, hue_add);
@@ -1240,14 +1268,12 @@
v_p[2] = dcp->curve_samples[xfer[2]];
v_p[3] = dcp->curve_samples[xfer[3]];
-
+ /* Apply looktable */
if (dcp->looktable) {
- huesat_map_SSE2(dcp->looktable, &h, &s, &v);
+ huesat_map_SSE2(dcp->looktable,
&dcp->looktable_precalc, &h, &s, &v);
}
-
/* Back to RGB */
-
/* ensure that hue is within range */
h_mask_gt = _mm_cmpgt_ps(h, six_ps);
h_mask_lt = _mm_cmplt_ps(h, zero_ps);
@@ -1549,6 +1575,13 @@
/* Camera to ProPhoto */
matrix3_multiply(&xyz_to_prophoto, &dcp->camera_to_pcs,
&dcp->camera_to_prophoto); /* verified by SDK */
+#if defined (__SSE2__)
+ if (dcp->huesatmap)
+ calc_hsm_constants(dcp->huesatmap,
&dcp->huesatmap_precalc);
+ if (dcp->looktable)
+ calc_hsm_constants(dcp->looktable,
&dcp->looktable_precalc);
+#endif
+
}
static void
_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit