Author: post
Date: 2009-12-30 16:22:01 +0100 (Wed, 30 Dec 2009)
New Revision: 2891
Modified:
branches/rawstudio-ng-color/plugins/dcp/Makefile.am
branches/rawstudio-ng-color/plugins/dcp/dcp-sse.c
Log:
DCP: Restructured huesatmap lookups for much better code generation and pointer
re-use. Enabled GCC loop unrolling, and removed pointer aliasing warning. Huge
speedup on SSE2 machines.
Modified: branches/rawstudio-ng-color/plugins/dcp/Makefile.am
===================================================================
--- branches/rawstudio-ng-color/plugins/dcp/Makefile.am 2009-12-30 02:06:28 UTC
(rev 2890)
+++ branches/rawstudio-ng-color/plugins/dcp/Makefile.am 2009-12-30 15:22:01 UTC
(rev 2891)
@@ -1,7 +1,7 @@
plugindir = $(libdir)/rawstudio/plugins
AM_CFLAGS = -O4 -Wall -ffast-math -ffinite-math-only \
- -funsafe-math-optimizations
+ -funsafe-math-optimizations -funroll-loops
AM_CXXFLAGS = $(AM_CFLAGS)
Modified: branches/rawstudio-ng-color/plugins/dcp/dcp-sse.c
===================================================================
--- branches/rawstudio-ng-color/plugins/dcp/dcp-sse.c 2009-12-30 02:06:28 UTC
(rev 2890)
+++ branches/rawstudio-ng-color/plugins/dcp/dcp-sse.c 2009-12-30 15:22:01 UTC
(rev 2891)
@@ -27,6 +27,11 @@
#include <emmintrin.h>
#endif
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+/* We silence this warning, because we are casting a pointer from float to int
to pass a float using */
+/* _mm_insert_epi32, since no-one was kind enough to include "insertps xmm,
mem32, imm8" */
+/* as a valid intrinsic. So we use the integer equivalent instead */
+
static gfloat _zero_ps[4] __attribute__ ((aligned (16))) = {0.0f, 0.0f, 0.0f,
0.0f};
static gfloat _ones_ps[4] __attribute__ ((aligned (16))) = {1.0f, 1.0f, 1.0f,
1.0f};
static gfloat _two_ps[4] __attribute__ ((aligned (16))) = {2.0f, 2.0f, 2.0f,
2.0f};
@@ -228,7 +233,6 @@
__m128 h = *_h;
__m128 s = *_s;
__m128 v = *_v;
- gint i;
gint xfer_0[4] __attribute__ ((aligned (16)));
gint xfer_1[4] __attribute__ ((aligned (16)));
@@ -280,34 +284,26 @@
__m128 hs1 = _mm_set_ps(entry01[3]->fHueShift,
entry01[2]->fHueShift, entry01[1]->fHueShift, entry01[0]->fHueShift);
__m128 hueShift0 = _mm_add_ps(_mm_mul_ps(hs0, hFract0),
_mm_mul_ps(hs1, hFract1));
hueShift0 = _mm_mul_ps(hueShift0, sFract0);
-
+ hs0 = _mm_set_ps(entry00[3][1].fHueShift,
entry00[2][1].fHueShift, entry00[1][1].fHueShift, entry00[0][1].fHueShift);
+ hs1 = _mm_set_ps(entry01[3][1].fHueShift,
entry01[2][1].fHueShift, entry01[1][1].fHueShift, entry01[0][1].fHueShift);
+ __m128 hueShift1 = _mm_add_ps(_mm_mul_ps(hs0, hFract0),
_mm_mul_ps(hs1, hFract1));
+ hueShift = _mm_add_ps(hueShift0, _mm_mul_ps(hueShift1,
sFract1));
+
__m128 ss0 = _mm_set_ps(entry00[3]->fSatScale,
entry00[2]->fSatScale, entry00[1]->fSatScale, entry00[0]->fSatScale);
__m128 ss1 = _mm_set_ps(entry01[3]->fSatScale,
entry01[2]->fSatScale, entry01[1]->fSatScale, entry01[0]->fSatScale);
__m128 satScale0 = _mm_add_ps(_mm_mul_ps(ss0, hFract0),
_mm_mul_ps(ss1, hFract1));
satScale0 = _mm_mul_ps(satScale0, sFract0);
+ ss0 = _mm_set_ps(entry00[3][1].fSatScale,
entry00[2][1].fSatScale, entry00[1][1].fSatScale, entry00[0][1].fSatScale);
+ ss1 = _mm_set_ps(entry01[3][1].fSatScale,
entry01[2][1].fSatScale, entry01[1][1].fSatScale, entry01[0][1].fSatScale);
+ __m128 satScale1 = _mm_add_ps(_mm_mul_ps(ss0, hFract0),
_mm_mul_ps(ss1, hFract1));
+ satScale = _mm_add_ps(satScale0, _mm_mul_ps(satScale1,
sFract1));
__m128 vs0 = _mm_set_ps(entry00[3]->fValScale,
entry00[2]->fValScale, entry00[1]->fValScale, entry00[0]->fValScale);
__m128 vs1 = _mm_set_ps(entry01[3]->fValScale,
entry01[2]->fValScale, entry01[1]->fValScale, entry01[0]->fValScale);
__m128 valScale0 = _mm_add_ps(_mm_mul_ps(vs0, hFract0),
_mm_mul_ps(vs1, hFract1));
valScale0 = _mm_mul_ps(valScale0, sFract0);
-
- for (i = 0; i < 4; i++) {
- entry00[i]++;
- entry01[i]++;
- }
-
- hs0 = _mm_set_ps(entry00[3]->fHueShift, entry00[2]->fHueShift,
entry00[1]->fHueShift, entry00[0]->fHueShift);
- hs1 = _mm_set_ps(entry01[3]->fHueShift, entry01[2]->fHueShift,
entry01[1]->fHueShift, entry01[0]->fHueShift);
- __m128 hueShift1 = _mm_add_ps(_mm_mul_ps(hs0, hFract0),
_mm_mul_ps(hs1, hFract1));
- hueShift = _mm_add_ps(hueShift0, _mm_mul_ps(hueShift1,
sFract1));
-
- ss0 = _mm_set_ps(entry00[3]->fSatScale, entry00[2]->fSatScale,
entry00[1]->fSatScale, entry00[0]->fSatScale);
- ss1 = _mm_set_ps(entry01[3]->fSatScale, entry01[2]->fSatScale,
entry01[1]->fSatScale, entry01[0]->fSatScale);
- __m128 satScale1 = _mm_add_ps(_mm_mul_ps(ss0, hFract0),
_mm_mul_ps(ss1, hFract1));
- satScale = _mm_add_ps(satScale0, _mm_mul_ps(satScale1,
sFract1));
-
- vs0 = _mm_set_ps(entry00[3]->fValScale, entry00[2]->fValScale,
entry00[1]->fValScale, entry00[0]->fValScale);
- vs1 = _mm_set_ps(entry01[3]->fValScale, entry01[2]->fValScale,
entry01[1]->fValScale, entry01[0]->fValScale);
+ vs0 = _mm_set_ps(entry00[3][1].fValScale,
entry00[2][1].fValScale, entry00[1][1].fValScale, entry00[0][1].fValScale);
+ vs1 = _mm_set_ps(entry01[3][1].fValScale,
entry01[2][1].fValScale, entry01[1][1].fValScale, entry01[0][1].fValScale);
__m128 valScale1 = _mm_add_ps(_mm_mul_ps(vs0, hFract0),
_mm_mul_ps(vs1, hFract1));
valScale = _mm_add_ps(valScale0, _mm_mul_ps(valScale1,
sFract1));
@@ -369,6 +365,7 @@
const RS_VECTOR3 *entry10[4] = { entry00[0] + _valStep,
entry00[1] + _valStep, entry00[2] + _valStep, entry00[3] + _valStep};
const RS_VECTOR3 *entry11[4] = { entry01[0] + _valStep,
entry01[1] + _valStep, entry01[2] + _valStep, entry01[3] + _valStep};
+ /* Hue first element */
__m128 hs00 = _mm_set_ps(entry00[3]->fHueShift,
entry00[2]->fHueShift, entry00[1]->fHueShift, entry00[0]->fHueShift);
__m128 hs01 = _mm_set_ps(entry01[3]->fHueShift,
entry01[2]->fHueShift, entry01[1]->fHueShift, entry01[0]->fHueShift);
__m128 hs10 = _mm_set_ps(entry10[3]->fHueShift,
entry10[2]->fHueShift, entry10[1]->fHueShift, entry10[0]->fHueShift);
@@ -377,6 +374,16 @@
__m128 hueShift1 = _mm_mul_ps(vFract1,
_mm_add_ps(_mm_mul_ps(hs10, hFract0), _mm_mul_ps(hs11, hFract1)));
hueShift = _mm_mul_ps(sFract0, _mm_add_ps(hueShift0,
hueShift1));
+ /* Hue second element */
+ hs00 = _mm_set_ps(entry00[3][1].fHueShift,
entry00[2][1].fHueShift, entry00[1][1].fHueShift, entry00[0][1].fHueShift);
+ hs01 = _mm_set_ps(entry01[3][1].fHueShift,
entry01[2][1].fHueShift, entry01[1][1].fHueShift, entry01[0][1].fHueShift);
+ hs10 = _mm_set_ps(entry10[3][1].fHueShift,
entry10[2][1].fHueShift, entry10[1][1].fHueShift, entry10[0][1].fHueShift);
+ hs11 = _mm_set_ps(entry11[3][1].fHueShift,
entry11[2][1].fHueShift, entry11[1][1].fHueShift, entry11[0][1].fHueShift);
+ hueShift0 = _mm_mul_ps(vFract0, _mm_add_ps(_mm_mul_ps(hs00,
hFract0), _mm_mul_ps(hs01, hFract1)));
+ hueShift1 = _mm_mul_ps(vFract1, _mm_add_ps(_mm_mul_ps(hs10,
hFract0), _mm_mul_ps(hs11, hFract1)));
+ hueShift = _mm_add_ps(hueShift, _mm_mul_ps(sFract1,
_mm_add_ps(hueShift0, hueShift1)));
+
+ /* Sat first element */
__m128 ss00 = _mm_set_ps(entry00[3]->fSatScale,
entry00[2]->fSatScale, entry00[1]->fSatScale, entry00[0]->fSatScale);
__m128 ss01 = _mm_set_ps(entry01[3]->fSatScale,
entry01[2]->fSatScale, entry01[1]->fSatScale, entry01[0]->fSatScale);
__m128 ss10 = _mm_set_ps(entry10[3]->fSatScale,
entry10[2]->fSatScale, entry10[1]->fSatScale, entry10[0]->fSatScale);
@@ -385,6 +392,16 @@
__m128 satScale1 = _mm_mul_ps(vFract1,
_mm_add_ps(_mm_mul_ps(ss10, hFract0), _mm_mul_ps(ss11, hFract1)));
satScale = _mm_mul_ps(sFract0, _mm_add_ps(satScale0,
satScale1));
+ /* Sat second element */
+ ss00 = _mm_set_ps(entry00[3][1].fSatScale,
entry00[2][1].fSatScale, entry00[1][1].fSatScale, entry00[0][1].fSatScale);
+ ss01 = _mm_set_ps(entry01[3][1].fSatScale,
entry01[2][1].fSatScale, entry01[1][1].fSatScale, entry01[0][1].fSatScale);
+ ss10 = _mm_set_ps(entry10[3][1].fSatScale,
entry10[2][1].fSatScale, entry10[1][1].fSatScale, entry10[0][1].fSatScale);
+ ss11 = _mm_set_ps(entry11[3][1].fSatScale,
entry11[2][1].fSatScale, entry11[1][1].fSatScale, entry11[0][1].fSatScale);
+ satScale0 = _mm_mul_ps(vFract0, _mm_add_ps(_mm_mul_ps(ss00,
hFract0), _mm_mul_ps(ss01, hFract1)));
+ satScale1 = _mm_mul_ps(vFract1, _mm_add_ps(_mm_mul_ps(ss10,
hFract0), _mm_mul_ps(ss11, hFract1)));
+ satScale = _mm_add_ps(satScale, _mm_mul_ps(sFract1,
_mm_add_ps(satScale0, satScale1)));
+
+ /* Val first element */
__m128 vs00 = _mm_set_ps(entry00[3]->fValScale,
entry00[2]->fValScale, entry00[1]->fValScale, entry00[0]->fValScale);
__m128 vs01 = _mm_set_ps(entry01[3]->fValScale,
entry01[2]->fValScale, entry01[1]->fValScale, entry01[0]->fValScale);
__m128 vs10 = _mm_set_ps(entry10[3]->fValScale,
entry10[2]->fValScale, entry10[1]->fValScale, entry10[0]->fValScale);
@@ -393,33 +410,11 @@
__m128 valScale1 = _mm_mul_ps(vFract1,
_mm_add_ps(_mm_mul_ps(vs10, hFract0), _mm_mul_ps(vs11, hFract1)));
valScale = _mm_mul_ps(sFract0, _mm_add_ps(valScale0,
valScale1));
- for (i = 0; i < 4; i++) {
- entry00[i]++;
- entry01[i]++;
- entry10[i]++;
- entry11[i]++;
- }
-
- hs00 = _mm_set_ps(entry00[3]->fHueShift, entry00[2]->fHueShift,
entry00[1]->fHueShift, entry00[0]->fHueShift);
- hs01 = _mm_set_ps(entry01[3]->fHueShift, entry01[2]->fHueShift,
entry01[1]->fHueShift, entry01[0]->fHueShift);
- hs10 = _mm_set_ps(entry10[3]->fHueShift, entry10[2]->fHueShift,
entry10[1]->fHueShift, entry10[0]->fHueShift);
- hs11 = _mm_set_ps(entry11[3]->fHueShift, entry11[2]->fHueShift,
entry11[1]->fHueShift, entry11[0]->fHueShift);
- hueShift0 = _mm_mul_ps(vFract0, _mm_add_ps(_mm_mul_ps(hs00,
hFract0), _mm_mul_ps(hs01, hFract1)));
- hueShift1 = _mm_mul_ps(vFract1, _mm_add_ps(_mm_mul_ps(hs10,
hFract0), _mm_mul_ps(hs11, hFract1)));
- hueShift = _mm_add_ps(hueShift, _mm_mul_ps(sFract1,
_mm_add_ps(hueShift0, hueShift1)));
-
- ss00 = _mm_set_ps(entry00[3]->fSatScale, entry00[2]->fSatScale,
entry00[1]->fSatScale, entry00[0]->fSatScale);
- ss01 = _mm_set_ps(entry01[3]->fSatScale, entry01[2]->fSatScale,
entry01[1]->fSatScale, entry01[0]->fSatScale);
- ss10 = _mm_set_ps(entry10[3]->fSatScale, entry10[2]->fSatScale,
entry10[1]->fSatScale, entry10[0]->fSatScale);
- ss11 = _mm_set_ps(entry11[3]->fSatScale, entry11[2]->fSatScale,
entry11[1]->fSatScale, entry11[0]->fSatScale);
- satScale0 = _mm_mul_ps(vFract0, _mm_add_ps(_mm_mul_ps(ss00,
hFract0), _mm_mul_ps(ss01, hFract1)));
- satScale1 = _mm_mul_ps(vFract1, _mm_add_ps(_mm_mul_ps(ss10,
hFract0), _mm_mul_ps(ss11, hFract1)));
- satScale = _mm_add_ps(satScale, _mm_mul_ps(sFract1,
_mm_add_ps(satScale0, satScale1)));
-
- vs00 = _mm_set_ps(entry00[3]->fValScale, entry00[2]->fValScale,
entry00[1]->fValScale, entry00[0]->fValScale);
- vs01 = _mm_set_ps(entry01[3]->fValScale, entry01[2]->fValScale,
entry01[1]->fValScale, entry01[0]->fValScale);
- vs10 = _mm_set_ps(entry10[3]->fValScale, entry10[2]->fValScale,
entry10[1]->fValScale, entry10[0]->fValScale);
- vs11 = _mm_set_ps(entry11[3]->fValScale, entry11[2]->fValScale,
entry11[1]->fValScale, entry11[0]->fValScale);
+ /* Val second element */
+ vs00 = _mm_set_ps(entry00[3][1].fValScale,
entry00[2][1].fValScale, entry00[1][1].fValScale, entry00[0][1].fValScale);
+ vs01 = _mm_set_ps(entry01[3][1].fValScale,
entry01[2][1].fValScale, entry01[1][1].fValScale, entry01[0][1].fValScale);
+ vs10 = _mm_set_ps(entry10[3][1].fValScale,
entry10[2][1].fValScale, entry10[1][1].fValScale, entry10[0][1].fValScale);
+ vs11 = _mm_set_ps(entry11[3][1].fValScale,
entry11[2][1].fValScale, entry11[1][1].fValScale, entry11[0][1].fValScale);
valScale0 = _mm_mul_ps(vFract0, _mm_add_ps(_mm_mul_ps(vs00,
hFract0), _mm_mul_ps(vs01, hFract1)));
valScale1 = _mm_mul_ps(vFract1, _mm_add_ps(_mm_mul_ps(vs10,
hFract0), _mm_mul_ps(vs11, hFract1)));
valScale = _mm_add_ps(valScale, _mm_mul_ps(sFract1,
_mm_add_ps(valScale0, valScale1)));
@@ -1002,7 +997,6 @@
__m128 h = *_h;
__m128 s = *_s;
__m128 v = *_v;
- gint i;
const RS_VECTOR3 *tableBase = map->deltas;
@@ -1055,53 +1049,47 @@
LOOK_SINGLE(A, B, C, 2);\
LOOK_SINGLE(A, B, C, 3);
+#define LOOK_SINGLE_ONE(A,B,C,D) A = _mm_insert_epi32( A,
*(gint32*)&C[D][1].B, D)
+
+#define LOOKUP_FOUR_ONE(A, B, C) LOOK_SINGLE_ONE(A, B, C, 0);\
+ LOOK_SINGLE_ONE(A, B, C, 1);\
+ LOOK_SINGLE_ONE(A, B, C, 2);\
+ LOOK_SINGLE_ONE(A, B, C, 3);
+
/* Initialize to something (will be overwritten) */
- __m128i h00 = next_offsets;
- __m128i h01 = next_offsets;
+ __m128i h00 = _mm_setzero_si128();
+ __m128i h01 = _mm_setzero_si128();
LOOKUP_FOUR(h00, fHueShift, entry00);
LOOKUP_FOUR(h01, fHueShift, entry01);
__m128 hueShift0 = _mm_add_ps(_mm_mul_ps(PS(h00), hFract0),
_mm_mul_ps(PS(h01), hFract1));
hueShift0 = _mm_mul_ps(hueShift0, sFract0);
-
- __m128i s00 = h00;
- __m128i s01 = h00;
- LOOKUP_FOUR(s00, fSatScale, entry00);
- LOOKUP_FOUR(s01, fSatScale, entry01);
- __m128 satScale0 = _mm_add_ps(_mm_mul_ps(PS(s00), hFract0),
_mm_mul_ps(PS(s01), hFract1));
- satScale0 = _mm_mul_ps(satScale0, sFract0);
-
- __m128i v00 = h00;
- __m128i v01 = h00;
- LOOKUP_FOUR(v00, fValScale, entry00);
- LOOKUP_FOUR(v01, fValScale, entry01);
- __m128 valScale0 = _mm_add_ps(_mm_mul_ps(PS(v00), hFract0),
_mm_mul_ps(PS(v01), hFract1));
- valScale0 = _mm_mul_ps(valScale0, sFract0);
-
- for (i = 0; i < 4; i++) {
- entry00[i]++;
- entry01[i]++;
- }
-
- h00 = v00;
- h01 = v00;
-
- LOOKUP_FOUR(h00, fHueShift, entry00);
- LOOKUP_FOUR(h01, fHueShift, entry01);
+
+ LOOKUP_FOUR_ONE(h00, fHueShift, entry00);
+ LOOKUP_FOUR_ONE(h01, fHueShift, entry01);
__m128 hueShift1 = _mm_add_ps(_mm_mul_ps(PS(h00), hFract0),
_mm_mul_ps(PS(h01), hFract1));
hueShift = _mm_add_ps(hueShift0, _mm_mul_ps(hueShift1,
sFract1));
- s00 = v00;
- s01 = v00;
+ __m128i s00 = _mm_setzero_si128();
+ __m128i s01 = _mm_setzero_si128();
LOOKUP_FOUR(s00, fSatScale, entry00);
LOOKUP_FOUR(s01, fSatScale, entry01);
+ __m128 satScale0 = _mm_add_ps(_mm_mul_ps(PS(s00), hFract0),
_mm_mul_ps(PS(s01), hFract1));
+ satScale0 = _mm_mul_ps(satScale0, sFract0);
+ LOOKUP_FOUR_ONE(s00, fSatScale, entry00);
+ LOOKUP_FOUR_ONE(s01, fSatScale, entry01);
__m128 satScale1 = _mm_add_ps(_mm_mul_ps(PS(s00), hFract0),
_mm_mul_ps(PS(s01), hFract1));
satScale = _mm_add_ps(satScale0, _mm_mul_ps(satScale1,
sFract1));
- v00 = s00;
- v01 = s00;
+ __m128i v00 = _mm_setzero_si128();
+ __m128i v01 = _mm_setzero_si128();
LOOKUP_FOUR(v00, fValScale, entry00);
LOOKUP_FOUR(v01, fValScale, entry01);
+ __m128 valScale0 = _mm_add_ps(_mm_mul_ps(PS(v00), hFract0),
_mm_mul_ps(PS(v01), hFract1));
+ valScale0 = _mm_mul_ps(valScale0, sFract0);
+
+ LOOKUP_FOUR_ONE(v00, fValScale, entry00);
+ LOOKUP_FOUR_ONE(v01, fValScale, entry01);
__m128 valScale1 = _mm_add_ps(_mm_mul_ps(PS(v00), hFract0),
_mm_mul_ps(PS(v01), hFract1));
valScale = _mm_add_ps(valScale0, _mm_mul_ps(valScale1,
sFract1));
@@ -1171,6 +1159,7 @@
__m128i temp_10 = next_offsets;
__m128i temp_11 = next_offsets;
+ /* Hue first element */
LOOKUP_FOUR(temp_00, fHueShift, entry00);
LOOKUP_FOUR(temp_01, fHueShift, entry01);
LOOKUP_FOUR(temp_10, fHueShift, entry10);
@@ -1180,6 +1169,16 @@
__m128 hueShift1 = _mm_mul_ps(vFract1,
_mm_add_ps(_mm_mul_ps(PS(temp_10), hFract0), _mm_mul_ps(PS(temp_11), hFract1)));
hueShift = _mm_mul_ps(sFract0, _mm_add_ps(hueShift0,
hueShift1));
+ /* Hue second element */
+ LOOKUP_FOUR_ONE(temp_00, fHueShift, entry00);
+ LOOKUP_FOUR_ONE(temp_01, fHueShift, entry01);
+ LOOKUP_FOUR_ONE(temp_10, fHueShift, entry10);
+ LOOKUP_FOUR_ONE(temp_11, fHueShift, entry11);
+ hueShift0 = _mm_mul_ps(vFract0,
_mm_add_ps(_mm_mul_ps(PS(temp_00), hFract0), _mm_mul_ps(PS(temp_01), hFract1)));
+ hueShift1 = _mm_mul_ps(vFract1,
_mm_add_ps(_mm_mul_ps(PS(temp_10), hFract0), _mm_mul_ps(PS(temp_11), hFract1)));
+ hueShift = _mm_add_ps(hueShift, _mm_mul_ps(sFract1,
_mm_add_ps(hueShift0, hueShift1)));
+
+ /* Sat first element */
LOOKUP_FOUR(temp_00, fSatScale, entry00);
LOOKUP_FOUR(temp_01, fSatScale, entry01);
LOOKUP_FOUR(temp_10, fSatScale, entry10);
@@ -1188,6 +1187,16 @@
__m128 satScale1 = _mm_mul_ps(vFract1,
_mm_add_ps(_mm_mul_ps(PS(temp_10), hFract0), _mm_mul_ps(PS(temp_11), hFract1)));
satScale = _mm_mul_ps(sFract0, _mm_add_ps(satScale0,
satScale1));
+ /* Sat second element */
+ LOOKUP_FOUR_ONE(temp_00, fSatScale, entry00);
+ LOOKUP_FOUR_ONE(temp_01, fSatScale, entry01);
+ LOOKUP_FOUR_ONE(temp_10, fSatScale, entry10);
+ LOOKUP_FOUR_ONE(temp_11, fSatScale, entry11);
+ satScale0 = _mm_mul_ps(vFract0,
_mm_add_ps(_mm_mul_ps(PS(temp_00), hFract0), _mm_mul_ps(PS(temp_01), hFract1)));
+ satScale1 = _mm_mul_ps(vFract1,
_mm_add_ps(_mm_mul_ps(PS(temp_10), hFract0), _mm_mul_ps(PS(temp_11), hFract1)));
+ satScale = _mm_add_ps(satScale, _mm_mul_ps(sFract1,
_mm_add_ps(satScale0, satScale1)));
+
+ /* Val first element */
LOOKUP_FOUR(temp_00, fValScale, entry00);
LOOKUP_FOUR(temp_01, fValScale, entry01);
LOOKUP_FOUR(temp_10, fValScale, entry10);
@@ -1196,36 +1205,15 @@
__m128 valScale1 = _mm_mul_ps(vFract1,
_mm_add_ps(_mm_mul_ps(PS(temp_10), hFract0), _mm_mul_ps(PS(temp_11), hFract1)));
valScale = _mm_mul_ps(sFract0, _mm_add_ps(valScale0,
valScale1));
- for (i = 0; i < 4; i++) {
- entry00[i]++;
- entry01[i]++;
- entry10[i]++;
- entry11[i]++;
- }
-
- LOOKUP_FOUR(temp_00, fHueShift, entry00);
- LOOKUP_FOUR(temp_01, fHueShift, entry01);
- LOOKUP_FOUR(temp_10, fHueShift, entry10);
- LOOKUP_FOUR(temp_11, fHueShift, entry11);
- hueShift0 = _mm_mul_ps(vFract0,
_mm_add_ps(_mm_mul_ps(PS(temp_00), hFract0), _mm_mul_ps(PS(temp_01), hFract1)));
- hueShift1 = _mm_mul_ps(vFract1,
_mm_add_ps(_mm_mul_ps(PS(temp_10), hFract0), _mm_mul_ps(PS(temp_11), hFract1)));
- hueShift = _mm_add_ps(hueShift, _mm_mul_ps(sFract1,
_mm_add_ps(hueShift0, hueShift1)));
-
- LOOKUP_FOUR(temp_00, fSatScale, entry00);
- LOOKUP_FOUR(temp_01, fSatScale, entry01);
- LOOKUP_FOUR(temp_10, fSatScale, entry10);
- LOOKUP_FOUR(temp_11, fSatScale, entry11);
- satScale0 = _mm_mul_ps(vFract0,
_mm_add_ps(_mm_mul_ps(PS(temp_00), hFract0), _mm_mul_ps(PS(temp_01), hFract1)));
- satScale1 = _mm_mul_ps(vFract1,
_mm_add_ps(_mm_mul_ps(PS(temp_10), hFract0), _mm_mul_ps(PS(temp_11), hFract1)));
- satScale = _mm_add_ps(satScale, _mm_mul_ps(sFract1,
_mm_add_ps(satScale0, satScale1)));
-
- LOOKUP_FOUR(temp_00, fValScale, entry00);
- LOOKUP_FOUR(temp_01, fValScale, entry01);
- LOOKUP_FOUR(temp_10, fValScale, entry10);
- LOOKUP_FOUR(temp_11, fValScale, entry11);
+ /* Val second element */
+ LOOKUP_FOUR_ONE(temp_00, fValScale, entry00);
+ LOOKUP_FOUR_ONE(temp_01, fValScale, entry01);
+ LOOKUP_FOUR_ONE(temp_10, fValScale, entry10);
+ LOOKUP_FOUR_ONE(temp_11, fValScale, entry11);
valScale0 = _mm_mul_ps(vFract0,
_mm_add_ps(_mm_mul_ps(PS(temp_00), hFract0), _mm_mul_ps(PS(temp_01), hFract1)));
valScale1 = _mm_mul_ps(vFract1,
_mm_add_ps(_mm_mul_ps(PS(temp_10), hFract0), _mm_mul_ps(PS(temp_11), hFract1)));
valScale = _mm_add_ps(valScale, _mm_mul_ps(sFract1,
_mm_add_ps(valScale0, valScale1)));
+
}
__m128 mul_hue = _mm_load_ps(_mul_hue_ps);
_______________________________________________
Rawstudio-commit mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-commit