Ah, I forgot to test the -m32 option... the ia32 asm doesn't allow
clobbering registers that are also used as outputs... with this version,
it should be OK (fingers crossed).

# HG changeset patch
# User Edouard Gomez <[EMAIL PROTECTED]>
# Date 1188602356 -7200
# Node ID b62e3facb809541427bc1455105b21a8dbd816fb
# Parent  a846a231fbc4d48865286ec28b2ab9fc5d784ff5
Use the curve as a luminance curve only

diff -r a846a231fbc4 -r b62e3facb809 src/rs-color-transform.c
--- a/src/rs-color-transform.c  Thu Aug 30 01:15:46 2007 +0200
+++ b/src/rs-color-transform.c  Sat Sep 01 01:19:16 2007 +0200
@@ -25,6 +25,21 @@ static void make_tables(RS_COLOR_TRANSFO
 static void make_tables(RS_COLOR_TRANSFORM *rct);
 static gboolean select_render(RS_COLOR_TRANSFORM *rct);
 
+/* The XYZ tri stimulus luminance coefficients */
+#define LUM_XYZ_R (0.212671f)
+#define LUM_XYZ_G (0.715160f)
+#define LUM_XYZ_B (0.072169f)
+
+/* Fixed-precision version for better performance.
+ * Cannot be > 13 or the code overflows in some places,
+ * and values above 15 can cause segfaults (SIGSEGV). */
+#define LUM_PRECISION 13
+#define LUM_FIXED(a) ((gint)((a)*(1<<LUM_PRECISION)))
+#define LUM_FIXED_HALFONE LUM_FIXED(0.5f)
+#define LUM_FIXED_XYZ_R   LUM_FIXED(LUM_XYZ_R)
+#define LUM_FIXED_XYZ_G   LUM_FIXED(LUM_XYZ_G)
+#define LUM_FIXED_XYZ_B   LUM_FIXED(LUM_XYZ_B)
+
 /* Function pointers - initialized by arch binders */
 COLOR_TRANSFORM(*transform_nocms8);
 COLOR_TRANSFORM(*transform_cms8);
@@ -50,6 +65,8 @@ struct _RS_COLOR_TRANSFORM_PRIVATE {
        gint nknots;
        gfloat *knots;
        gfloat curve_samples[65536];
+       guint luminance_fixed[65536];
+       gfloat luminance_float[65535];
        void *transform;
 };
 
@@ -308,8 +325,48 @@ make_tables(RS_COLOR_TRANSFORM *rct)
                nd = ((gdouble) n) * rec65535;
                nd = pow(nd, gammavalue);
 
-               if (likely(rct->priv->curve_samples))
-                       nd = (gdouble) rct->priv->curve_samples[((gint) 
(nd*65535.0f))];
+               /* The idea is to use the curve to boost/decrease
+                * luminance only.
+                * So we have to compute Y, map it to its new value
+                * and then compute a factor that would keep this
+                * new luminance if applied to the RGB triplet.
+                *
+                * Quite straightforward; let's do it quick...
+                * Y  = a.R + b.G + c.B
+                *
+                * We map Y to Y' according to the curve:
+                * Y' = curve(Y)
+                *
+                * let's compute a real 'd' such that:
+                * Y' = d.Y
+                *
+                * Then we have
+                * Y' = d.(a.R + b.G + c.B)
+                * or written a bit differently:
+                * Y' = a.(d.R) + b.(d.G) + c.(d.B)
+                *
+                * So the RGB triplet we are looking for is (d.R, d.G, d.B)
+                *
+                * The luminance LUTs store curve(Y)/Y as floating and fixed
+                * values so it's very easy to directly multiply the mapped
+                * value and the RGB vectors w/o any need for div except the
+                * one when generating the table. Which represents 65536 divs,
+                * which is equivalent to a 256x256 picture. So computing the
+                * LUTs saves divs in most cases (any raw smaller than
+                * 2048x2048 ? :-))
+                *
+                * NB: luminance curve is applied in linear space before
+                * any gamma */
+               if (likely(rct->priv->curve_samples)) {
+                       gfloat Yp;
+                       gfloat Y;
+
+                       Y = (n > 0) ? (gfloat)n : 1.f;
+                       Yp = rct->priv->curve_samples[n];
+                       Yp = Yp < 0.f ? 0.f : (Yp > 1.f) ? 1.f : Yp;
+                       rct->priv->luminance_float[n] = Yp*65536.f/Y;
+                       rct->priv->luminance_fixed[n] = 
LUM_FIXED(rct->priv->luminance_float[n]);
+               }
 
                nd = nd*contrast+postadd;
 
@@ -431,6 +488,8 @@ COLOR_TRANSFORM(transform_nocms_float)
                srcoffset = y * in_rowstride;
                for(x=0 ; x<width ; x++)
                {
+                       gfloat Y;
+
                        /* pre multipliers */
                        r1 = in[srcoffset+R] * rct->priv->pre_mul[R];
                        g1 = in[srcoffset+G] * rct->priv->pre_mul[G];
@@ -456,6 +515,18 @@ COLOR_TRANSFORM(transform_nocms_float)
                        r = r2;
                        g = g2;
                        b = b2;
+
+                       /* clamp to unsigned short */
+                       _CLAMP65535_TRIPLET(r,g,b);
+
+                       // Compute luminance and extract from the LUT curve[Y]/Y
+                       Y = LUM_XYZ_R*r + LUM_XYZ_G*g + LUM_XYZ_B*b + 0.5f;
+                       Y = 
rct->priv->luminance_float[(Y<0.f)?0:(Y>65535.f)?65535:(int)Y];
+
+                       // Scale RGB according to that factor
+                       r = (int)((gfloat)r*Y);
+                       g = (int)((gfloat)g*Y);
+                       b = (int)((gfloat)b*Y);
 
                        /* clamp to unsigned short */
                        _CLAMP65535_TRIPLET(r,g,b);
@@ -485,29 +556,23 @@ COLOR_TRANSFORM(transform_nocms_float)
 #if defined (__i386__) || defined (__x86_64__)
 COLOR_TRANSFORM(transform_nocms8_sse)
 {
-       register glong r,g,b;
+       register glong r, g, b;
        gint destoffset;
        gint col;
        gfloat top[4] align(16) = {65535.0, 65535.0, 65535.0, 65535.0};
        gfloat mat[12] align(16) = {
+               LUM_XYZ_R,
                rct->priv->color_matrix.coeff[0][0],
                rct->priv->color_matrix.coeff[1][0],
                rct->priv->color_matrix.coeff[2][0],
-               RLUM * (rct->priv->color_matrix.coeff[0][0]
-                       + rct->priv->color_matrix.coeff[0][1]
-                       + rct->priv->color_matrix.coeff[0][2]),
+               LUM_XYZ_G,
                rct->priv->color_matrix.coeff[0][1],
                rct->priv->color_matrix.coeff[1][1],
                rct->priv->color_matrix.coeff[2][1],
-               GLUM * (rct->priv->color_matrix.coeff[1][0]
-                       + rct->priv->color_matrix.coeff[1][1]
-                       + rct->priv->color_matrix.coeff[1][2]),
+               LUM_XYZ_B,
                rct->priv->color_matrix.coeff[0][2],
                rct->priv->color_matrix.coeff[1][2],
                rct->priv->color_matrix.coeff[2][2],
-               BLUM * (rct->priv->color_matrix.coeff[2][0]
-                       + rct->priv->color_matrix.coeff[2][1]
-                       + rct->priv->color_matrix.coeff[2][2])
        };
        asm volatile (
                "movups (%2), %%xmm2\n\t" /* rs->pre_mul */
@@ -515,11 +580,9 @@ COLOR_TRANSFORM(transform_nocms8_sse)
                "movaps 16(%0), %%xmm4\n\t"
                "movaps 32(%0), %%xmm5\n\t"
                "movaps (%1), %%xmm6\n\t" /* top */
-               "pxor %%mm7, %%mm7\n\t" /* 0x0 */
                :
-               : "r" (mat), "r" (top), "r" (rct->priv->pre_mul)
-               : "memory"
-       );
+               : "r" (mat), "r" (top), "r" (rct->priv->pre_mul));
+
        while(height--)
        {
                destoffset = 0;
@@ -529,13 +592,15 @@ COLOR_TRANSFORM(transform_nocms8_sse)
                while(col--)
                {
                        asm volatile (
+                               "pxor %%mm7, %%mm7\n\t" /* 0x0 */
+
                                /* load */
                                "movq (%3), %%mm0\n\t" /* R | G | B | G2 */
-                               "movq %%mm0, %%mm1\n\t" /* R | G | B | G2 */
+                               "movq (%3), %%mm1\n\t" /* R | G | B | G2 */
                                "punpcklwd %%mm7, %%mm0\n\t" /* R | G */
                                "punpckhwd %%mm7, %%mm1\n\t" /* B | G2 */
                                "cvtpi2ps %%mm1, %%xmm0\n\t" /* B | G2 | ? | ? 
*/
-                               "shufps $0x4E, %%xmm0, %%xmm0\n\t" /* ? | ? | B 
| G2 */
+                               "shufps $0x4e, %%xmm0, %%xmm0\n\t" /* ? | ? | B 
| G2 */
                                "cvtpi2ps %%mm0, %%xmm0\n\t" /* R | G | B | G2 
*/
 
                                "mulps %%xmm2, %%xmm0\n\t" /* (R | G | B | _) * 
premul */
@@ -553,7 +618,7 @@ COLOR_TRANSFORM(transform_nocms8_sse)
                                "addps %%xmm1, %%xmm7\n\t"
 
                                "movaps %%xmm0, %%xmm1\n\t"
-                               "shufps $0xAA, %%xmm1, %%xmm1\n\t" /* B | B | B 
| B */
+                               "shufps $0xaa, %%xmm1, %%xmm1\n\t" /* B | B | B 
| B */
                                "mulps %%xmm5, %%xmm1\n\t"
                                "addps %%xmm7, %%xmm1\n\t"
 
@@ -561,24 +626,33 @@ COLOR_TRANSFORM(transform_nocms8_sse)
                                "minps %%xmm6, %%xmm1\n\t" /* MIN (65535.0, in) 
*/
                                "maxps %%xmm7, %%xmm1\n\t" /* MAX (0.0, in) */
 
-                               /* xmm1: R | G | B | _ */
-//                             "shufps $0xFF, %%xmm1, %%xmm1\n\t"
-                               "cvtss2si %%xmm1, %0\n\t"
-                               "shufps $0xF9, %%xmm1, %%xmm1\n\t" /* xmm1: G | 
B | _ | _ */
-                               "cvtss2si %%xmm1, %1\n\t"
-                               "shufps $0xF9, %%xmm1, %%xmm1\n\t" /* xmm1: B | 
_ | _ | _ */
-                               "cvtss2si %%xmm1, %2\n\t"
+                               /* xmm1: Y | R | G | B */
+                               "cvtss2si %%xmm1, %%"REG_a"\n\t" /* Extract 
Luminance integer value */
+                               "movss (%4, %%"REG_a", 4), %%xmm0\n\t" /* xmm0 
= Y'/Y | _ | _ | _ */
+                               "shufps $0, %%xmm0, %%xmm0\n\t" /* xmm0 = Y'/Y 
| Y'/Y | Y'/Y | Y'/Y */
+                               "mulps %%xmm0, %%xmm1\n\t" /* xmm1 = Y' | R' | 
G' | B' */
+                               "minps %%xmm6, %%xmm1\n\t" /* MIN (65535.0, in) 
*/
+                               "maxps %%xmm7, %%xmm1\n\t" /* MAX (0.0, in) */
+                               "shufps $0xf9, %%xmm1, %%xmm1\n\t" /* xmm1 = R' 
| G' | B' | _ */
+                               "cvtss2si %%xmm1, %0\n\t" /* Extract R' */
+                               "shufps $0xf9, %%xmm1, %%xmm1\n\t" /* xmm1 = G' 
| B' | _ | _ */
+                               "cvtss2si %%xmm1, %1\n\t" /* Extract G' */
+                               "shufps $0xf9, %%xmm1, %%xmm1\n\t" /* xmm1 = B' 
| _ | _ | _ */
+                               "cvtss2si %%xmm1, %2\n\t" /* Extract B' */
                                : "=r" (r), "=r" (g), "=r" (b)
-                               : "r" (s)
-                               : "memory"
-                       );
+                               : "r" (s), "r" (rct->priv->luminance_float)
+                               : "%"REG_a,
+                                 "memory");
+
                        d[destoffset++] = rct->priv->table8[r];
                        d[destoffset++] = rct->priv->table8[g];
                        d[destoffset++] = rct->priv->table8[b];
                        s += 4;
                }
        }
-       asm volatile("emms\n\t");
+
+       asm volatile ("emms\n\t");
+
        return;
 }
 
@@ -586,32 +660,34 @@ COLOR_TRANSFORM(transform_nocms8_3dnow)
 {
        gint destoffset;
        gint col;
-       register glong r=0,g=0,b=0;
-       gfloat mat[12] align(8);
-       gfloat top[2] align(8);
-       mat[0] = rct->priv->color_matrix.coeff[0][0];
-       mat[1] = rct->priv->color_matrix.coeff[0][1];
-       mat[2] = rct->priv->color_matrix.coeff[0][2];
-       mat[3] = 0.0;
-       mat[4] = rct->priv->color_matrix.coeff[1][0];
-       mat[5] = rct->priv->color_matrix.coeff[1][1];
-       mat[6] = rct->priv->color_matrix.coeff[1][2];
-       mat[7] = 0.0;
-       mat[8] = rct->priv->color_matrix.coeff[2][0];
-       mat[9] = rct->priv->color_matrix.coeff[2][1];
-       mat[10] = rct->priv->color_matrix.coeff[2][2];
-       mat[11] = 0.0;
-       top[0] = 65535.0;
-       top[1] = 65535.0;
-       asm volatile (
-               "femms\n\t"
-               "pxor %%mm7, %%mm7\n\t" /* 0x0 */
-               "movq (%0), %%mm2\n\t" /* pre_mul R | pre_mul G */
-               "movq 8(%0), %%mm3\n\t" /* pre_mul B | pre_mul G2 */
-               "movq (%1), %%mm6\n\t" /* 65535.0 | 65535.0 */
-               :
-               : "r" (rct->priv->pre_mul), "r" (&top)
-       );
+       gint rgbx[4] align(8) = {0, 0, 0, 0};
+       gfloat tmp[4] align(8) = { 0.f, 0.f, 0.f, 0.f};
+       const gfloat mat[] align(8) = {
+               rct->priv->color_matrix.coeff[0][0],
+               rct->priv->color_matrix.coeff[0][1],
+               rct->priv->color_matrix.coeff[0][2],
+               0.f,
+               rct->priv->color_matrix.coeff[1][0],
+               rct->priv->color_matrix.coeff[1][1],
+               rct->priv->color_matrix.coeff[1][2],
+               0.f,
+               rct->priv->color_matrix.coeff[2][0],
+               rct->priv->color_matrix.coeff[2][1],
+               rct->priv->color_matrix.coeff[2][2],
+               0.f,
+               LUM_XYZ_R,
+               LUM_XYZ_G,
+               LUM_XYZ_B,
+               0.f,
+               /* Not really part of the matrix... but kept here to reduce
+                * register pressure in the following asm blocks. It helps to
+                * have only one register holding multiple addresses. */
+               65535.f, 65535.f,
+               .5f, .5f,
+               rct->priv->pre_mul[0], rct->priv->pre_mul[1],
+               rct->priv->pre_mul[2], rct->priv->pre_mul[3],
+       };
+
        while(height--)
        {
                destoffset = 0;
@@ -621,9 +697,14 @@ COLOR_TRANSFORM(transform_nocms8_3dnow)
                while(col--)
                {
                        asm volatile (
+                               "pxor %%mm7, %%mm7\n\t" /* 0x0 */
+                               "movq 80(%1), %%mm2\n\t" /* pre_mul R | pre_mul 
G */
+                               "movq 88(%1), %%mm3\n\t" /* pre_mul B | pre_mul 
G2 */
+                               "movq 64(%1), %%mm6\n\t" /* 65535.0 | 65535.0 */
+
                                /* pre multiply */
                                "movq (%0), %%mm0\n\t" /* R | G | B | G2 */
-                               "movq %%mm0, %%mm1\n\t" /* R | G | B | G2 */
+                               "movq (%0), %%mm1\n\t" /* R | G | B | G2 */
                                "punpcklwd %%mm7, %%mm0\n\t" /* R, G */
                                "punpckhwd %%mm7, %%mm1\n\t" /* B, G2 */
                                "pi2fd %%mm0, %%mm0\n\t" /* to float */
@@ -635,53 +716,87 @@ COLOR_TRANSFORM(transform_nocms8_3dnow)
                                "pfmax %%mm7, %%mm0\n\t"
                                "pfmax %%mm7, %%mm1\n\t"
 
-                               "add $8, %0\n\t" /* increment offset */
+                               "movq 72(%1), %%mm3\n\t" /* mm3 = halfone */
 
                                /* red */
-                               "movq (%4), %%mm4\n\t" /* mat[0] | mat[1] */
-                               "movq 8(%4), %%mm5\n\t" /* mat[2] | mat[3] */
+                               "movq (%1), %%mm4\n\t" /* mat[0] | mat[1] */
+                               "movq 8(%1), %%mm5\n\t" /* mat[2] | mat[3] */
                                "pfmul %%mm0, %%mm4\n\t" /* R*[0] | G*[1] */
                                "pfmul %%mm1, %%mm5\n\t" /* B*[2] | G2*[3] */
                                "pfadd %%mm4, %%mm5\n\t" /* R*[0] + B*[2] | 
G*[1] + G2*[3] */
                                "pfacc %%mm5, %%mm5\n\t" /* R*[0] + B*[2] + 
G*[1] + G2*[3] | ? */
                                "pfmin %%mm6, %%mm5\n\t"
                                "pfmax %%mm7, %%mm5\n\t"
-                               "pf2id %%mm5, %%mm5\n\t" /* to integer */
-                               "movd %%mm5, %1\n\t" /* write r */
+                               "movd %%mm5, (%3)\n\t" /* write to tmp[0] */
 
                                /* green */
-                               "movq 16(%4), %%mm4\n\t"
-                               "movq 24(%4), %%mm5\n\t"
+                               "movq 16(%1), %%mm4\n\t"
+                               "movq 24(%1), %%mm5\n\t"
                                "pfmul %%mm0, %%mm4\n\t"
                                "pfmul %%mm1, %%mm5\n\t"
                                "pfadd %%mm4, %%mm5\n\t"
                                "pfacc %%mm5, %%mm5\n\t"
                                "pfmin %%mm6, %%mm5\n\t"
                                "pfmax %%mm7, %%mm5\n\t"
-                               "pf2id %%mm5, %%mm5\n\t"
-                               "movd %%mm5, %2\n\t"
+                               "movd %%mm5, 4(%3)\n\t"  /* write to tmp[1] */
 
                                /* blue */
-                               "movq 32(%4), %%mm4\n\t"
-                               "movq 40(%4), %%mm5\n\t"
+                               "movq 32(%1), %%mm4\n\t"
+                               "movq 40(%1), %%mm5\n\t"
                                "pfmul %%mm0, %%mm4\n\t"
                                "pfmul %%mm1, %%mm5\n\t"
                                "pfadd %%mm4, %%mm5\n\t"
                                "pfacc %%mm5, %%mm5\n\t"
                                "pfmin %%mm6, %%mm5\n\t"
                                "pfmax %%mm7, %%mm5\n\t"
+                               "movd %%mm5, 8(%3)\n\t"  /* write to tmp[2] */
+
+                               /* Luminance */
+                               "movq 48(%1), %%mm4\n\t"
+                               "movq 56(%1), %%mm5\n\t"
+                               "pfmul %%mm0, %%mm4\n\t"
+                               "pfmul %%mm1, %%mm5\n\t"
+                               "pfadd %%mm4, %%mm5\n\t"
+                               "pfacc %%mm5, %%mm5\n\t"
+                               "pfmin %%mm6, %%mm5\n\t"
+                               "pfmax %%mm7, %%mm5\n\t"
+                               "pfadd %%mm3, %%mm5\n\t" /* +0.5 to have real 
round */
                                "pf2id %%mm5, %%mm5\n\t"
-                               "movd %%mm5, %3\n\t"
-                               : "+r" (s), "+r" (r), "+r" (g), "+r" (b)
-                               : "r" (&mat)
-                       );
-                       d[destoffset++] = rct->priv->table8[r];
-                       d[destoffset++] = rct->priv->table8[g];
-                       d[destoffset++] = rct->priv->table8[b];
-               }
-       }
-       asm volatile ("femms\n\t");
-
+                               "movd %%mm5, %%eax\n\t" /* write (int)Y to eax 
*/
+
+                               /* Compute luminance adjusted pixel */
+                               "movq (%3), %%mm0\n\t" /* R, G */
+                               "movq 8(%3), %%mm1\n\t" /* B, 0 */
+                               "movd (%2, %%"REG_a", 4), %%mm2\n\t" /* mm2 = x 
| a */
+                               "pshufw $0x44, %%mm2, %%mm2\n\t" /* mm2 = a | a 
*/
+                               "pfmul %%mm2, %%mm0\n\t" /* mm0 = aR | aG */
+                               "pfmul %%mm2, %%mm1\n\t" /* mm1 = aB | 0  */
+                               "pfmin %%mm6, %%mm0\n\t" /* mm0 = min(mm0, 
65535.f) */
+                               "pfmin %%mm6, %%mm1\n\t" /* mm1 = min(mm1, 
65535.f) */
+                               "pfmax %%mm7, %%mm0\n\t" /* mm0 = max(mm0, 0) */
+                               "pfmax %%mm7, %%mm1\n\t" /* mm1 = max(mm1, 0) */
+                               "pfadd %%mm3, %%mm0\n\t" /* +0.5 */
+                               "pfadd %%mm3, %%mm1\n\t" /* +0.5 */
+                               "pf2id %%mm0, %%mm0\n\t" /* mm0 = (int)aR | 
(int)aG */
+                               "pf2id %%mm1, %%mm1\n\t" /* mm1 = (int)aB | 0 */
+                               "movq %%mm0, (%4)\n\t" /* write new R/G to rgbx 
*/
+                               "movq %%mm1, 8(%4)\n\t" /* write new B/0 to 
rgbx */
+                               :
+                               : "r" (s),
+                                 "r" (mat),
+                                 "r" (rct->priv->luminance_float),
+                                 "r" (tmp),
+                                 "r" (rgbx)
+                               : "%"REG_a,
+                                 "memory");
+                       d[destoffset++] = rct->priv->table8[rgbx[0]];
+                       d[destoffset++] = rct->priv->table8[rgbx[1]];
+                       d[destoffset++] = rct->priv->table8[rgbx[2]];
+                       s += 4;
+               }
+       }
+
+       asm volatile("femms\n\t");
        return;
 }
 
@@ -693,29 +808,28 @@ COLOR_TRANSFORM(transform_cms8_sse)
        gint col;
        gfloat top[4] align(16) = {65535.0, 65535.0, 65535.0, 65535.0};
        gfloat mat[12] align(16) = {
+               LUM_XYZ_R,
                rct->priv->color_matrix.coeff[0][0],
                rct->priv->color_matrix.coeff[1][0],
                rct->priv->color_matrix.coeff[2][0],
-               0.0,
+               LUM_XYZ_G,
                rct->priv->color_matrix.coeff[0][1],
                rct->priv->color_matrix.coeff[1][1],
                rct->priv->color_matrix.coeff[2][1],
-               0.0,
+               LUM_XYZ_B,
                rct->priv->color_matrix.coeff[0][2],
                rct->priv->color_matrix.coeff[1][2],
                rct->priv->color_matrix.coeff[2][2],
-               0.0 };
+       };
        asm volatile (
                "movups (%2), %%xmm2\n\t" /* rs->pre_mul */
                "movaps (%0), %%xmm3\n\t" /* matrix */
                "movaps 16(%0), %%xmm4\n\t"
                "movaps 32(%0), %%xmm5\n\t"
                "movaps (%1), %%xmm6\n\t" /* top */
-               "pxor %%mm7, %%mm7\n\t" /* 0x0 */
                :
-               : "r" (mat), "r" (top), "r" (rct->priv->pre_mul)
-               : "memory"
-       );
+               : "r" (mat), "r" (top), "r" (rct->priv->pre_mul));
+
        while(height--)
        {
                destoffset = 0;
@@ -724,6 +838,8 @@ COLOR_TRANSFORM(transform_cms8_sse)
                while(col--)
                {
                        asm volatile (
+                               "pxor %%mm7, %%mm7\n\t" /* 0x0 */
+
                                /* load */
                                "movq (%3), %%mm0\n\t" /* R | G | B | G2 */
                                "movq %%mm0, %%mm1\n\t" /* R | G | B | G2 */
@@ -748,7 +864,7 @@ COLOR_TRANSFORM(transform_cms8_sse)
                                "addps %%xmm1, %%xmm7\n\t"
 
                                "movaps %%xmm0, %%xmm1\n\t"
-                               "shufps $0xAA, %%xmm1, %%xmm1\n\t"
+                               "shufps $0xaa, %%xmm1, %%xmm1\n\t"
                                "mulps %%xmm5, %%xmm1\n\t"
                                "addps %%xmm7, %%xmm1\n\t"
 
@@ -756,23 +872,32 @@ COLOR_TRANSFORM(transform_cms8_sse)
                                "minps %%xmm6, %%xmm1\n\t"
                                "maxps %%xmm7, %%xmm1\n\t"
 
-                               "cvtss2si %%xmm1, %0\n\t"
-                               "shufps $0xF9, %%xmm1, %%xmm1\n\t"
-                               "cvtss2si %%xmm1, %1\n\t"
-                               "shufps $0xF9, %%xmm1, %%xmm1\n\t"
-                               "cvtss2si %%xmm1, %2\n\t"
+                               /* xmm1: Y | R | G | B */
+                               "cvtss2si %%xmm1, %%"REG_a"\n\t" /* Extract 
Luminance integer value */
+                               "movss (%4, %%"REG_a", 4), %%xmm0\n\t" /* xmm0 
= Y'/Y | _ | _ | _ */
+                               "shufps $0, %%xmm0, %%xmm0\n\t" /* xmm0 = Y'/Y 
| Y'/Y | Y'/Y | Y'/Y */
+                               "mulps %%xmm0, %%xmm1\n\t" /* xmm1 = Y' | R' | 
G' | B' */
+                               "minps %%xmm6, %%xmm1\n\t" /* MIN (65535.0, in) 
*/
+                               "maxps %%xmm7, %%xmm1\n\t" /* MAX (0.0, in) */
+                               "shufps $0xf9, %%xmm1, %%xmm1\n\t" /* xmm1 = R' 
| G' | B' | _ */
+                               "cvtss2si %%xmm1, %0\n\t" /* Extract R' */
+                               "shufps $0xf9, %%xmm1, %%xmm1\n\t" /* xmm1 = G' 
| B' | _ | _ */
+                               "cvtss2si %%xmm1, %1\n\t" /* Extract G' */
+                               "shufps $0xf9, %%xmm1, %%xmm1\n\t" /* xmm1 = B' 
| _ | _ | _ */
+                               "cvtss2si %%xmm1, %2\n\t" /* Extract B' */
                                : "=r" (r), "=r" (g), "=r" (b)
-                               : "r" (s)
-                               : "memory"
-                       );
+                               : "r" (s), "r" (rct->priv->luminance_float)
+                               : "%"REG_a,
+                                 "memory");
+
                        buffer[destoffset++] = rct->priv->table16[r];
                        buffer[destoffset++] = rct->priv->table16[g];
                        buffer[destoffset++] = rct->priv->table16[b];
                        s += 4;
                }
+               asm volatile("emms\n\t");
                cmsDoTransform((cmsHPROFILE) rct->priv->transform, buffer, 
out+height * out_rowstride, width);
        }
-       asm volatile("emms\n\t");
        g_free(buffer);
        return;
 }
@@ -782,32 +907,34 @@ COLOR_TRANSFORM(transform_cms8_3dnow)
        gushort *buffer = g_malloc(width*3*sizeof(gushort));
        gint destoffset;
        gint col;
-       register glong r=0,g=0,b=0;
-       gfloat mat[12] align(8);
-       gfloat top[2] align(8);
-       mat[0] = rct->priv->color_matrix.coeff[0][0];
-       mat[1] = rct->priv->color_matrix.coeff[0][1];
-       mat[2] = rct->priv->color_matrix.coeff[0][2];
-       mat[3] = 0.0;
-       mat[4] = rct->priv->color_matrix.coeff[1][0];
-       mat[5] = rct->priv->color_matrix.coeff[1][1];
-       mat[6] = rct->priv->color_matrix.coeff[1][2];
-       mat[7] = 0.0;
-       mat[8] = rct->priv->color_matrix.coeff[2][0];
-       mat[9] = rct->priv->color_matrix.coeff[2][1];
-       mat[10] = rct->priv->color_matrix.coeff[2][2];
-       mat[11] = 0.0;
-       top[0] = 65535.0;
-       top[1] = 65535.0;
-       asm volatile (
-               "femms\n\t"
-               "pxor %%mm7, %%mm7\n\t" /* 0x0 */
-               "movq (%0), %%mm2\n\t" /* pre_mul R | pre_mul G */
-               "movq 8(%0), %%mm3\n\t" /* pre_mul B | pre_mul G2 */
-               "movq (%1), %%mm6\n\t" /* 65535.0 | 65535.0 */
-               :
-               : "r" (rct->priv->pre_mul), "r" (&top)
-       );
+       gint rgbx[4] align(8) = { 0, 0, 0, 0};
+       gfloat tmp[4] align(8) = { 0.f, 0.f, 0.f, 0.f};
+       gfloat mat[] align(8) = {
+               rct->priv->color_matrix.coeff[0][0],
+               rct->priv->color_matrix.coeff[0][1],
+               rct->priv->color_matrix.coeff[0][2],
+               0.f,
+               rct->priv->color_matrix.coeff[1][0],
+               rct->priv->color_matrix.coeff[1][1],
+               rct->priv->color_matrix.coeff[1][2],
+               0.f,
+               rct->priv->color_matrix.coeff[2][0],
+               rct->priv->color_matrix.coeff[2][1],
+               rct->priv->color_matrix.coeff[2][2],
+               0.f,
+               LUM_XYZ_R,
+               LUM_XYZ_G,
+               LUM_XYZ_B,
+               0.f,
+               /* Not really part of the matrix... but kept here to reduce
+                * register pressure in the following asm blocks. It helps to
+                * have only one register holding multiple addresses. */
+               65535.f, 65535.f,
+               .5f, .5f,
+               rct->priv->pre_mul[0], rct->priv->pre_mul[0], 
+               rct->priv->pre_mul[1], rct->priv->pre_mul[2], 
+       };
+
        while(height--)
        {
                destoffset = 0;
@@ -816,9 +943,14 @@ COLOR_TRANSFORM(transform_cms8_3dnow)
                while(col--)
                {
                        asm volatile (
+                               "pxor %%mm7, %%mm7\n\t" /* 0x0 */
+                               "movq 80(%1), %%mm2\n\t" /* pre_mul R | pre_mul 
G */
+                               "movq 88(%1), %%mm3\n\t" /* pre_mul B | pre_mul 
G2 */
+                               "movq 64(%1), %%mm6\n\t" /* 65535.0 | 65535.0 */
+
                                /* pre multiply */
                                "movq (%0), %%mm0\n\t" /* R | G | B | G2 */
-                               "movq %%mm0, %%mm1\n\t" /* R | G | B | G2 */
+                               "movq (%0), %%mm1\n\t" /* R | G | B | G2 */
                                "punpcklwd %%mm7, %%mm0\n\t" /* R, G */
                                "punpckhwd %%mm7, %%mm1\n\t" /* B, G2 */
                                "pi2fd %%mm0, %%mm0\n\t" /* to float */
@@ -830,23 +962,44 @@ COLOR_TRANSFORM(transform_cms8_3dnow)
                                "pfmax %%mm7, %%mm0\n\t"
                                "pfmax %%mm7, %%mm1\n\t"
 
-                               "add $8, %0\n\t" /* increment offset */
+                               "movq 72(%1), %%mm3\n\t" /* mm3 = halfone */
 
                                /* red */
-                               "movq (%4), %%mm4\n\t" /* mat[0] | mat[1] */
-                               "movq 8(%4), %%mm5\n\t" /* mat[2] | mat[3] */
+                               "movq (%1), %%mm4\n\t" /* mat[0] | mat[1] */
+                               "movq 8(%1), %%mm5\n\t" /* mat[2] | mat[3] */
                                "pfmul %%mm0, %%mm4\n\t" /* R*[0] | G*[1] */
                                "pfmul %%mm1, %%mm5\n\t" /* B*[2] | G2*[3] */
                                "pfadd %%mm4, %%mm5\n\t" /* R*[0] + B*[2] | 
G*[1] + G2*[3] */
                                "pfacc %%mm5, %%mm5\n\t" /* R*[0] + B*[2] + 
G*[1] + G2*[3] | ? */
                                "pfmin %%mm6, %%mm5\n\t"
                                "pfmax %%mm7, %%mm5\n\t"
-                               "pf2id %%mm5, %%mm5\n\t" /* to integer */
-                               "movd %%mm5, %1\n\t" /* write r */
+                               "movd %%mm5, (%3)\n\t" /* write to tmp[0] */
 
                                /* green */
-                               "movq 16(%4), %%mm4\n\t"
-                               "movq 24(%4), %%mm5\n\t"
+                               "movq 16(%1), %%mm4\n\t"
+                               "movq 24(%1), %%mm5\n\t"
+                               "pfmul %%mm0, %%mm4\n\t"
+                               "pfmul %%mm1, %%mm5\n\t"
+                               "pfadd %%mm4, %%mm5\n\t"
+                               "pfacc %%mm5, %%mm5\n\t"
+                               "pfmin %%mm6, %%mm5\n\t"
+                               "pfmax %%mm7, %%mm5\n\t"
+                               "movd %%mm5, 4(%3)\n\t" /* write to tmp[1] */
+
+                               /* blue */
+                               "movq 32(%1), %%mm4\n\t"
+                               "movq 40(%1), %%mm5\n\t"
+                               "pfmul %%mm0, %%mm4\n\t"
+                               "pfmul %%mm1, %%mm5\n\t"
+                               "pfadd %%mm4, %%mm5\n\t"
+                               "pfacc %%mm5, %%mm5\n\t"
+                               "pfmin %%mm6, %%mm5\n\t"
+                               "pfmax %%mm7, %%mm5\n\t"
+                               "movd %%mm5, 8(%3)\n\t"  /* write to tmp[2] */
+
+                               /* Luminance */
+                               "movq 48(%1), %%mm4\n\t"
+                               "movq 56(%1), %%mm5\n\t"
                                "pfmul %%mm0, %%mm4\n\t"
                                "pfmul %%mm1, %%mm5\n\t"
                                "pfadd %%mm4, %%mm5\n\t"
@@ -854,29 +1007,42 @@ COLOR_TRANSFORM(transform_cms8_3dnow)
                                "pfmin %%mm6, %%mm5\n\t"
                                "pfmax %%mm7, %%mm5\n\t"
                                "pf2id %%mm5, %%mm5\n\t"
-                               "movd %%mm5, %2\n\t"
-
-                               /* blue */
-                               "movq 32(%4), %%mm4\n\t"
-                               "movq 40(%4), %%mm5\n\t"
-                               "pfmul %%mm0, %%mm4\n\t"
-                               "pfmul %%mm1, %%mm5\n\t"
-                               "pfadd %%mm4, %%mm5\n\t"
-                               "pfacc %%mm5, %%mm5\n\t"
-                               "pfmin %%mm6, %%mm5\n\t"
-                               "pfmax %%mm7, %%mm5\n\t"
-                               "pf2id %%mm5, %%mm5\n\t"
-                               "movd %%mm5, %3\n\t"
-                               : "+r" (s), "+r" (r), "+r" (g), "+r" (b)
-                               : "r" (&mat)
-                       );
-                       buffer[destoffset++] = rct->priv->table16[r];
-                       buffer[destoffset++] = rct->priv->table16[g];
-                       buffer[destoffset++] = rct->priv->table16[b];
-               }
+                               "pfadd %%mm3, %%mm5\n\t" /* +0.5 to have real round */
+                               "movd %%mm5, %%eax\n\t"
+
+                               /* Compute luminance corrected values */
+                               "movq (%3), %%mm0\n\t" /* R, G */
+                               "movq 8(%3), %%mm1\n\t" /* B, 0 */
+                               "movd (%2, %%"REG_a", 4), %%mm2\n\t" /* mm2 = x | a */
+                               "pshufw $0x44, %%mm2, %%mm2\n\t" /* mm2 = a | a */
+                               "pfmul %%mm2, %%mm0\n\t" /* mm0 = aR | aG */
+                               "pfmul %%mm2, %%mm1\n\t" /* mm1 = aB | 0  */
+                               "pfmin %%mm6, %%mm0\n\t" /* mm0 = min(mm0, 65535.f) */
+                               "pfmin %%mm6, %%mm1\n\t" /* mm1 = min(mm1, 65535.f) */
+                               "pfmax %%mm7, %%mm0\n\t" /* mm0 = max(mm0, 0) */
+                               "pfmax %%mm7, %%mm1\n\t" /* mm1 = max(mm1, 0) */
+                               "pfadd %%mm3, %%mm0\n\t" /* +0.5 */
+                               "pfadd %%mm3, %%mm1\n\t" /* +0.5 */
+                               "pf2id %%mm0, %%mm0\n\t" /* mm0 = (int)aR | (int)aG */
+                               "pf2id %%mm1, %%mm1\n\t" /* mm1 = (int)aB | 0 */
+                               "movq %%mm0, (%4)\n\t" /* write new R/G to rgbx */
+                               "movq %%mm1, 8(%4)\n\t" /* write new B/0 to rgbx */
+                               :
+                               : "r" (s),
+                                 "r" (mat),
+                                 "r" (rct->priv->luminance_float),
+                                 "r" (tmp),
+                                 "r" (rgbx)
+                               : "%"REG_a,
+                                 "memory");
+                       buffer[destoffset++] = rct->priv->table16[rgbx[0]];
+                       buffer[destoffset++] = rct->priv->table16[rgbx[1]];
+                       buffer[destoffset++] = rct->priv->table16[rgbx[2]];
+                       s += 4;
+               }
+               asm volatile("femms\n\t");
                cmsDoTransform((cmsHPROFILE) rct->priv->transform, buffer, 
out+height * out_rowstride, width);
        }
-       asm volatile ("femms\n\t");
        g_free(buffer);
        return;
 }
@@ -901,6 +1067,8 @@ COLOR_TRANSFORM(transform_cms_c)
                srcoffset = y * in_rowstride;
                for(x=0 ; x<width ; x++)
                {
+                       guint Y;
+
                        rr = (in[srcoffset+R]*pre_muli[R])>>7;
                        gg = (in[srcoffset+G]*pre_muli[G])>>7;
                        bb = (in[srcoffset+B]*pre_muli[B])>>7;
@@ -915,6 +1083,23 @@ COLOR_TRANSFORM(transform_cms_c)
                                + gg*mati.coeff[2][1]
                                + bb*mati.coeff[2][2])>>MATRIX_RESOLUTION;
                        _CLAMP65535_TRIPLET(r,g,b);
+
+                       // Compute luminance
+                       Y = (     LUM_FIXED_XYZ_R*r
+                               + LUM_FIXED_XYZ_G*g
+                               + LUM_FIXED_XYZ_B*b
+                               + LUM_FIXED_HALFONE)>>LUM_PRECISION;
+
+                       // Find the factor to apply to the RGB triplet
+                       Y = rct->priv->luminance_fixed[Y];
+
+                       // Multiply the RGB triplet using fixed point arithmetic
+                       r = (r*Y)>>LUM_PRECISION;
+                       g = (g*Y)>>LUM_PRECISION;
+                       b = (b*Y)>>LUM_PRECISION;
+
+                       _CLAMP65535_TRIPLET(r,g,b);
+
                        buffer[destoffset++] = rct->priv->table16[r];
                        buffer[destoffset++] = rct->priv->table16[g];
                        buffer[destoffset++] = rct->priv->table16[b];
@@ -945,6 +1130,8 @@ COLOR_TRANSFORM(transform_nocms_c)
                srcoffset = y * in_rowstride;
                for(x=0 ; x<width ; x++)
                {
+                       guint Y;
+
                        rr = (in[srcoffset+R]*pre_muli[R]+64)>>7;
                        gg = (in[srcoffset+G]*pre_muli[G]+64)>>7;
                        bb = (in[srcoffset+B]*pre_muli[B]+64)>>7;
@@ -959,6 +1146,23 @@ COLOR_TRANSFORM(transform_nocms_c)
                                + gg*mati.coeff[2][1]
                                + bb*mati.coeff[2][2])>>MATRIX_RESOLUTION;
                        _CLAMP65535_TRIPLET(r,g,b);
+
+                       // Compute luminance
+                       Y = (     LUM_FIXED_XYZ_R*r
+                               + LUM_FIXED_XYZ_G*g
+                               + LUM_FIXED_XYZ_B*b
+                               + LUM_FIXED_HALFONE)>>LUM_PRECISION;
+
+                       // Find the factor to apply to the RGB triplet
+                       Y = rct->priv->luminance_fixed[Y];
+
+                       // Multiply the RGB triplet using fixed point arithmetic
+                       r = (r*Y)>>LUM_PRECISION;
+                       g = (g*Y)>>LUM_PRECISION;
+                       b = (b*Y)>>LUM_PRECISION;
+
+                       _CLAMP65535_TRIPLET(r,g,b);
+                               
                        d[destoffset++] = rct->priv->table8[r];
                        d[destoffset++] = rct->priv->table8[g];
                        d[destoffset++] = rct->priv->table8[b];
@@ -1005,6 +1209,8 @@ rs_color_transform_make_histogram(RS_COL
                srcoffset = y * input->rowstride;
                for(x=0 ; x<input->w ; x++)
                {
+                       guint Y;
+
                        rr = (in[srcoffset+R]*pre_muli[R])>>7;
                        gg = (in[srcoffset+G]*pre_muli[G])>>7;
                        bb = (in[srcoffset+B]*pre_muli[B])>>7;
@@ -1020,6 +1226,22 @@ rs_color_transform_make_histogram(RS_COL
                                + bb*mati.coeff[2][2])>>MATRIX_RESOLUTION;
                        _CLAMP65535_TRIPLET(r,g,b);
 
+                       // Compute luminance
+                       Y = (     LUM_FIXED_XYZ_R*r
+                               + LUM_FIXED_XYZ_G*g
+                               + LUM_FIXED_XYZ_B*b
+                               + LUM_FIXED_HALFONE)>>LUM_PRECISION;
+
+                       // Find the factor to apply to the RGB triplet
+                       Y = rct->priv->luminance_fixed[Y];
+
+                       // Multiply the RGB triplet using fixed point arithmetic
+                       r = (r*Y)>>LUM_PRECISION;
+                       g = (g*Y)>>LUM_PRECISION;
+                       b = (b*Y)>>LUM_PRECISION;
+
+                       _CLAMP65535_TRIPLET(r,g,b);
+
                        if (rct->priv->transform != NULL)
                        {
                                buffer16[x*3+R] = r;


-- 
Edouard Gomez

_______________________________________________
Rawstudio-dev mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-dev

Reply via email to