[Rawstudio-dev] In the end, it's a fix for everyone !

Edouard Gomez Fri, 31 Aug 2007 15:36:22 -0700

Edouard Gomez ([EMAIL PROTECTED]) wrote:
> So give me an hour more, and I'll send another patch to the list.


Here it is...

Give me feedback if it works ok for you too.

On AMD64, I'm not sure it's safe, because x87 is deprecated and a lot
of x87 code is turned into SSE scalar code, while we have 4 registers
that we assume stay constant across cms calls.

For ia32 it's less likely to cause problems, except for Gentooists
who compiled all their libs with the gcc option to use SSE registers
instead of x87 code (-msse, iirc).

# HG changeset patch
# User Edouard Gomez <[EMAIL PROTECTED]>
# Date 1188599409 -7200
# Node ID b807f302db9d8b804767b587e6a1a7e4ecd4e805
# Parent  a846a231fbc4d48865286ec28b2ab9fc5d784ff5
Use the curve as a luminance curve only

diff -r a846a231fbc4 -r b807f302db9d src/rs-color-transform.c
--- a/src/rs-color-transform.c  Thu Aug 30 01:15:46 2007 +0200
+++ b/src/rs-color-transform.c  Sat Sep 01 00:30:09 2007 +0200
@@ -25,6 +25,21 @@ static void make_tables(RS_COLOR_TRANSFO
 static void make_tables(RS_COLOR_TRANSFORM *rct);
 static gboolean select_render(RS_COLOR_TRANSFORM *rct);
 
+/* The XYZ tri stimulus luminance coefficients */
+#define LUM_XYZ_R (0.212671f)
+#define LUM_XYZ_G (0.715160f)
+#define LUM_XYZ_B (0.072169f)
+
+/* Fixed precision version for better performance
+ * Cannot be >13 or the code overflows in some places
+ * and above 15 can generate SIGSEGVs */
+#define LUM_PRECISION 13
+#define LUM_FIXED(a) ((gint)((a)*(1<<LUM_PRECISION)))
+#define LUM_FIXED_HALFONE LUM_FIXED(0.5f)
+#define LUM_FIXED_XYZ_R   LUM_FIXED(LUM_XYZ_R)
+#define LUM_FIXED_XYZ_G   LUM_FIXED(LUM_XYZ_G)
+#define LUM_FIXED_XYZ_B   LUM_FIXED(LUM_XYZ_B)
+
 /* Function pointers - initialized by arch binders */
 COLOR_TRANSFORM(*transform_nocms8);
 COLOR_TRANSFORM(*transform_cms8);
@@ -50,6 +65,8 @@ struct _RS_COLOR_TRANSFORM_PRIVATE {
        gint nknots;
        gfloat *knots;
        gfloat curve_samples[65536];
+       guint luminance_fixed[65536];
+       gfloat luminance_float[65535];
        void *transform;
 };
 
@@ -308,8 +325,48 @@ make_tables(RS_COLOR_TRANSFORM *rct)
                nd = ((gdouble) n) * rec65535;
                nd = pow(nd, gammavalue);
 
-               if (likely(rct->priv->curve_samples))
-                       nd = (gdouble) rct->priv->curve_samples[((gint) 
(nd*65535.0f))];
+               /* The idea is to use the curve to boost/decrease
+                * luminance only.
+                * So we have to compute Y, map it to its new value
+                * and then compute a factor that would keep this
+                * new luminance if applied to the RGB triplet.
+                *
+                * Quite straight forward; let's do it quick...
+                * Y  = a.R + b.G + c.B
+                *
+                * We map Y to Y' according to the curve:
+                * Y' = curve(Y)
+                *
+                * let's compute a real 'd' such that:
+                * Y' = d.Y
+                *
+                * Then we have
+                * Y' = d.(a.R + b.G + c.B)
+                * or written a bit differently:
+                * Y' = a.(d.R) + b.(d.G) + c.(d.B)
+                *
+                * So the RGB triplet we are looking for is (d.R, d.G, d.B)
+                *
+                * The luminance LUTs store curve(Y)/Y as floating and fixed
+                * values so it's very easy to directly multiply the mapped
+                * value and the RGB vectors w/o any need for div except the
+                * one when generating the table. Which represents 65536 divs,
+                * which is equivalent to a 256x256 picture. So computing the
+                * LUTs saves divs in most cases (any raw smaller than
+                * 2048x2048 ? :-))
+                *
+                * NB: luminance curve is applied in linear space before
+                * any gamma */
+               if (likely(rct->priv->curve_samples)) {
+                       gfloat Yp;
+                       gfloat Y;
+
+                       Y = (n > 0) ? (gfloat)n : 1.f;
+                       Yp = rct->priv->curve_samples[n];
+                       Yp = Yp < 0.f ? 0.f : (Yp > 1.f) ? 1.f : Yp;
+                       rct->priv->luminance_float[n] = Yp*65536.f/Y;
+                       rct->priv->luminance_fixed[n] = 
LUM_FIXED(rct->priv->luminance_float[n]);
+               }
 
                nd = nd*contrast+postadd;
 
@@ -431,6 +488,8 @@ COLOR_TRANSFORM(transform_nocms_float)
                srcoffset = y * in_rowstride;
                for(x=0 ; x<width ; x++)
                {
+                       gfloat Y;
+
                        /* pre multipliers */
                        r1 = in[srcoffset+R] * rct->priv->pre_mul[R];
                        g1 = in[srcoffset+G] * rct->priv->pre_mul[G];
@@ -456,6 +515,18 @@ COLOR_TRANSFORM(transform_nocms_float)
                        r = r2;
                        g = g2;
                        b = b2;
+
+                       /* clamp to unsigned short */
+                       _CLAMP65535_TRIPLET(r,g,b);
+
+                       // Compute luminance and extract from the LUT curve[Y]/Y
+                       Y = LUM_XYZ_R*r + LUM_XYZ_G*g + LUM_XYZ_B*b + 0.5f;
+                       Y = 
rct->priv->luminance_float[(Y<0.f)?0:(Y>65535.f)?65535:(int)Y];
+
+                       // Scale RGB according to that factor
+                       r = (int)((gfloat)r*Y);
+                       g = (int)((gfloat)g*Y);
+                       b = (int)((gfloat)b*Y);
 
                        /* clamp to unsigned short */
                        _CLAMP65535_TRIPLET(r,g,b);
@@ -485,29 +556,23 @@ COLOR_TRANSFORM(transform_nocms_float)
 #if defined (__i386__) || defined (__x86_64__)
 COLOR_TRANSFORM(transform_nocms8_sse)
 {
-       register glong r,g,b;
+       register glong r, g, b;
        gint destoffset;
        gint col;
        gfloat top[4] align(16) = {65535.0, 65535.0, 65535.0, 65535.0};
        gfloat mat[12] align(16) = {
+               LUM_XYZ_R,
                rct->priv->color_matrix.coeff[0][0],
                rct->priv->color_matrix.coeff[1][0],
                rct->priv->color_matrix.coeff[2][0],
-               RLUM * (rct->priv->color_matrix.coeff[0][0]
-                       + rct->priv->color_matrix.coeff[0][1]
-                       + rct->priv->color_matrix.coeff[0][2]),
+               LUM_XYZ_G,
                rct->priv->color_matrix.coeff[0][1],
                rct->priv->color_matrix.coeff[1][1],
                rct->priv->color_matrix.coeff[2][1],
-               GLUM * (rct->priv->color_matrix.coeff[1][0]
-                       + rct->priv->color_matrix.coeff[1][1]
-                       + rct->priv->color_matrix.coeff[1][2]),
+               LUM_XYZ_B,
                rct->priv->color_matrix.coeff[0][2],
                rct->priv->color_matrix.coeff[1][2],
                rct->priv->color_matrix.coeff[2][2],
-               BLUM * (rct->priv->color_matrix.coeff[2][0]
-                       + rct->priv->color_matrix.coeff[2][1]
-                       + rct->priv->color_matrix.coeff[2][2])
        };
        asm volatile (
                "movups (%2), %%xmm2\n\t" /* rs->pre_mul */
@@ -515,11 +580,10 @@ COLOR_TRANSFORM(transform_nocms8_sse)
                "movaps 16(%0), %%xmm4\n\t"
                "movaps 32(%0), %%xmm5\n\t"
                "movaps (%1), %%xmm6\n\t" /* top */
-               "pxor %%mm7, %%mm7\n\t" /* 0x0 */
                :
                : "r" (mat), "r" (top), "r" (rct->priv->pre_mul)
-               : "memory"
-       );
+               : "%xmm2", "%xmm3", "%xmm4", "%xmm5");
+
        while(height--)
        {
                destoffset = 0;
@@ -529,13 +593,15 @@ COLOR_TRANSFORM(transform_nocms8_sse)
                while(col--)
                {
                        asm volatile (
+                               "pxor %%mm7, %%mm7\n\t" /* 0x0 */
+
                                /* load */
                                "movq (%3), %%mm0\n\t" /* R | G | B | G2 */
-                               "movq %%mm0, %%mm1\n\t" /* R | G | B | G2 */
+                               "movq (%3), %%mm1\n\t" /* R | G | B | G2 */
                                "punpcklwd %%mm7, %%mm0\n\t" /* R | G */
                                "punpckhwd %%mm7, %%mm1\n\t" /* B | G2 */
                                "cvtpi2ps %%mm1, %%xmm0\n\t" /* B | G2 | ? | ? 
*/
-                               "shufps $0x4E, %%xmm0, %%xmm0\n\t" /* ? | ? | B 
| G2 */
+                               "shufps $0x4e, %%xmm0, %%xmm0\n\t" /* ? | ? | B 
| G2 */
                                "cvtpi2ps %%mm0, %%xmm0\n\t" /* R | G | B | G2 
*/
 
                                "mulps %%xmm2, %%xmm0\n\t" /* (R | G | B | _) * 
premul */
@@ -553,7 +619,7 @@ COLOR_TRANSFORM(transform_nocms8_sse)
                                "addps %%xmm1, %%xmm7\n\t"
 
                                "movaps %%xmm0, %%xmm1\n\t"
-                               "shufps $0xAA, %%xmm1, %%xmm1\n\t" /* B | B | B 
| B */
+                               "shufps $0xaa, %%xmm1, %%xmm1\n\t" /* B | B | B 
| B */
                                "mulps %%xmm5, %%xmm1\n\t"
                                "addps %%xmm7, %%xmm1\n\t"
 
@@ -561,24 +627,36 @@ COLOR_TRANSFORM(transform_nocms8_sse)
                                "minps %%xmm6, %%xmm1\n\t" /* MIN (65535.0, in) 
*/
                                "maxps %%xmm7, %%xmm1\n\t" /* MAX (0.0, in) */
 
-                               /* xmm1: R | G | B | _ */
-//                             "shufps $0xFF, %%xmm1, %%xmm1\n\t"
-                               "cvtss2si %%xmm1, %0\n\t"
-                               "shufps $0xF9, %%xmm1, %%xmm1\n\t" /* xmm1: G | 
B | _ | _ */
-                               "cvtss2si %%xmm1, %1\n\t"
-                               "shufps $0xF9, %%xmm1, %%xmm1\n\t" /* xmm1: B | 
_ | _ | _ */
-                               "cvtss2si %%xmm1, %2\n\t"
+                               /* xmm1: Y | R | G | B */
+                               "cvtss2si %%xmm1, %%"REG_a"\n\t" /* Extract 
Luminance integer value */
+                               "movss (%4, %%"REG_a", 4), %%xmm0\n\t" /* xmm0 
= Y'/Y | _ | _ | _ */
+                               "shufps $0, %%xmm0, %%xmm0\n\t" /* xmm0 = Y'/Y 
| Y'/Y | Y'/Y | Y'/Y */
+                               "mulps %%xmm0, %%xmm1\n\t" /* xmm1 = Y' | R' | 
G' | B' */
+                               "minps %%xmm6, %%xmm1\n\t" /* MIN (65535.0, in) 
*/
+                               "maxps %%xmm7, %%xmm1\n\t" /* MAX (0.0, in) */
+                               "shufps $0xf9, %%xmm1, %%xmm1\n\t" /* xmm1 = R' 
| G' | B' | _ */
+                               "cvtss2si %%xmm1, %0\n\t" /* Extract R' */
+                               "shufps $0xf9, %%xmm1, %%xmm1\n\t" /* xmm1 = G' 
| B' | _ | _ */
+                               "cvtss2si %%xmm1, %1\n\t" /* Extract G' */
+                               "shufps $0xf9, %%xmm1, %%xmm1\n\t" /* xmm1 = B' 
| _ | _ | _ */
+                               "cvtss2si %%xmm1, %2\n\t" /* Extract B' */
                                : "=r" (r), "=r" (g), "=r" (b)
-                               : "r" (s)
-                               : "memory"
-                       );
+                               : "r" (s), "r" (rct->priv->luminance_float)
+                               : "%"REG_a,
+                                 "%mm0", "%mm1", "%mm7",
+                                 "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+                                 "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+                                 "memory");
+
                        d[destoffset++] = rct->priv->table8[r];
                        d[destoffset++] = rct->priv->table8[g];
                        d[destoffset++] = rct->priv->table8[b];
                        s += 4;
                }
        }
-       asm volatile("emms\n\t");
+
+       asm volatile ("emms\n\t");
+
        return;
 }
 
@@ -586,32 +664,34 @@ COLOR_TRANSFORM(transform_nocms8_3dnow)
 {
        gint destoffset;
        gint col;
-       register glong r=0,g=0,b=0;
-       gfloat mat[12] align(8);
-       gfloat top[2] align(8);
-       mat[0] = rct->priv->color_matrix.coeff[0][0];
-       mat[1] = rct->priv->color_matrix.coeff[0][1];
-       mat[2] = rct->priv->color_matrix.coeff[0][2];
-       mat[3] = 0.0;
-       mat[4] = rct->priv->color_matrix.coeff[1][0];
-       mat[5] = rct->priv->color_matrix.coeff[1][1];
-       mat[6] = rct->priv->color_matrix.coeff[1][2];
-       mat[7] = 0.0;
-       mat[8] = rct->priv->color_matrix.coeff[2][0];
-       mat[9] = rct->priv->color_matrix.coeff[2][1];
-       mat[10] = rct->priv->color_matrix.coeff[2][2];
-       mat[11] = 0.0;
-       top[0] = 65535.0;
-       top[1] = 65535.0;
-       asm volatile (
-               "femms\n\t"
-               "pxor %%mm7, %%mm7\n\t" /* 0x0 */
-               "movq (%0), %%mm2\n\t" /* pre_mul R | pre_mul G */
-               "movq 8(%0), %%mm3\n\t" /* pre_mul B | pre_mul G2 */
-               "movq (%1), %%mm6\n\t" /* 65535.0 | 65535.0 */
-               :
-               : "r" (rct->priv->pre_mul), "r" (&top)
-       );
+       gint rgbx[4] align(8) = {0, 0, 0, 0};
+       gfloat tmp[4] align(8) = { 0.f, 0.f, 0.f, 0.f};
+       const gfloat mat[] align(8) = {
+               rct->priv->color_matrix.coeff[0][0],
+               rct->priv->color_matrix.coeff[0][1],
+               rct->priv->color_matrix.coeff[0][2],
+               0.f,
+               rct->priv->color_matrix.coeff[1][0],
+               rct->priv->color_matrix.coeff[1][1],
+               rct->priv->color_matrix.coeff[1][2],
+               0.f,
+               rct->priv->color_matrix.coeff[2][0],
+               rct->priv->color_matrix.coeff[2][1],
+               rct->priv->color_matrix.coeff[2][2],
+               0.f,
+               LUM_XYZ_R,
+               LUM_XYZ_G,
+               LUM_XYZ_B,
+               0.f,
+               /* Not really matrix... but to avoid register pressure in the
+                * following asm blocks. It helps having only one register to
+                * hold multiple addresses. */
+               65535.f, 65535.f,
+               .5f, .5f,
+               rct->priv->pre_mul[0], rct->priv->pre_mul[1],
+               rct->priv->pre_mul[2], rct->priv->pre_mul[3],
+       };
+
        while(height--)
        {
                destoffset = 0;
@@ -621,9 +701,14 @@ COLOR_TRANSFORM(transform_nocms8_3dnow)
                while(col--)
                {
                        asm volatile (
+                               "pxor %%mm7, %%mm7\n\t" /* 0x0 */
+                               "movq 80(%1), %%mm2\n\t" /* pre_mul R | pre_mul 
G */
+                               "movq 88(%1), %%mm3\n\t" /* pre_mul B | pre_mul 
G2 */
+                               "movq 64(%1), %%mm6\n\t" /* 65535.0 | 65535.0 */
+
                                /* pre multiply */
                                "movq (%0), %%mm0\n\t" /* R | G | B | G2 */
-                               "movq %%mm0, %%mm1\n\t" /* R | G | B | G2 */
+                               "movq (%0), %%mm1\n\t" /* R | G | B | G2 */
                                "punpcklwd %%mm7, %%mm0\n\t" /* R, G */
                                "punpckhwd %%mm7, %%mm1\n\t" /* B, G2 */
                                "pi2fd %%mm0, %%mm0\n\t" /* to float */
@@ -635,53 +720,89 @@ COLOR_TRANSFORM(transform_nocms8_3dnow)
                                "pfmax %%mm7, %%mm0\n\t"
                                "pfmax %%mm7, %%mm1\n\t"
 
-                               "add $8, %0\n\t" /* increment offset */
+                               "movq 72(%1), %%mm3\n\t" /* mm3 = halfone */
 
                                /* red */
-                               "movq (%4), %%mm4\n\t" /* mat[0] | mat[1] */
-                               "movq 8(%4), %%mm5\n\t" /* mat[2] | mat[3] */
+                               "movq (%1), %%mm4\n\t" /* mat[0] | mat[1] */
+                               "movq 8(%1), %%mm5\n\t" /* mat[2] | mat[3] */
                                "pfmul %%mm0, %%mm4\n\t" /* R*[0] | G*[1] */
                                "pfmul %%mm1, %%mm5\n\t" /* B*[2] | G2*[3] */
                                "pfadd %%mm4, %%mm5\n\t" /* R*[0] + B*[2] | 
G*[1] + G2*[3] */
                                "pfacc %%mm5, %%mm5\n\t" /* R*[0] + B*[2] + 
G*[1] + G2*[3] | ? */
                                "pfmin %%mm6, %%mm5\n\t"
                                "pfmax %%mm7, %%mm5\n\t"
-                               "pf2id %%mm5, %%mm5\n\t" /* to integer */
-                               "movd %%mm5, %1\n\t" /* write r */
+                               "movd %%mm5, (%3)\n\t" /* write to tmp[0] */
 
                                /* green */
-                               "movq 16(%4), %%mm4\n\t"
-                               "movq 24(%4), %%mm5\n\t"
+                               "movq 16(%1), %%mm4\n\t"
+                               "movq 24(%1), %%mm5\n\t"
                                "pfmul %%mm0, %%mm4\n\t"
                                "pfmul %%mm1, %%mm5\n\t"
                                "pfadd %%mm4, %%mm5\n\t"
                                "pfacc %%mm5, %%mm5\n\t"
                                "pfmin %%mm6, %%mm5\n\t"
                                "pfmax %%mm7, %%mm5\n\t"
-                               "pf2id %%mm5, %%mm5\n\t"
-                               "movd %%mm5, %2\n\t"
+                               "movd %%mm5, 4(%3)\n\t"  /* write to tmp[1] */
 
                                /* blue */
-                               "movq 32(%4), %%mm4\n\t"
-                               "movq 40(%4), %%mm5\n\t"
+                               "movq 32(%1), %%mm4\n\t"
+                               "movq 40(%1), %%mm5\n\t"
                                "pfmul %%mm0, %%mm4\n\t"
                                "pfmul %%mm1, %%mm5\n\t"
                                "pfadd %%mm4, %%mm5\n\t"
                                "pfacc %%mm5, %%mm5\n\t"
                                "pfmin %%mm6, %%mm5\n\t"
                                "pfmax %%mm7, %%mm5\n\t"
+                               "movd %%mm5, 8(%3)\n\t"  /* write to tmp[2] */
+
+                               /* Luminance */
+                               "movq 48(%1), %%mm4\n\t"
+                               "movq 56(%1), %%mm5\n\t"
+                               "pfmul %%mm0, %%mm4\n\t"
+                               "pfmul %%mm1, %%mm5\n\t"
+                               "pfadd %%mm4, %%mm5\n\t"
+                               "pfacc %%mm5, %%mm5\n\t"
+                               "pfmin %%mm6, %%mm5\n\t"
+                               "pfmax %%mm7, %%mm5\n\t"
+                               "pfadd %%mm3, %%mm5\n\t" /* +0.5 to have real 
round */
                                "pf2id %%mm5, %%mm5\n\t"
-                               "movd %%mm5, %3\n\t"
-                               : "+r" (s), "+r" (r), "+r" (g), "+r" (b)
-                               : "r" (&mat)
-                       );
-                       d[destoffset++] = rct->priv->table8[r];
-                       d[destoffset++] = rct->priv->table8[g];
-                       d[destoffset++] = rct->priv->table8[b];
-               }
-       }
-       asm volatile ("femms\n\t");
-
+                               "movd %%mm5, %%eax\n\t" /* write (int)Y to eax 
*/
+
+                               /* Compute luminance adjusted pixel */
+                               "movq (%3), %%mm0\n\t" /* R, G */
+                               "movq 8(%3), %%mm1\n\t" /* B, 0 */
+                               "movd (%2, %%"REG_a", 4), %%mm2\n\t" /* mm2 = x 
| a */
+                               "pshufw $0x44, %%mm2, %%mm2\n\t" /* mm2 = a | a 
*/
+                               "pfmul %%mm2, %%mm0\n\t" /* mm0 = aR | aG */
+                               "pfmul %%mm2, %%mm1\n\t" /* mm1 = aB | 0  */
+                               "pfmin %%mm6, %%mm0\n\t" /* mm0 = min(mm0, 
65535.f) */
+                               "pfmin %%mm6, %%mm1\n\t" /* mm1 = min(mm1, 
65535.f) */
+                               "pfmax %%mm7, %%mm0\n\t" /* mm0 = max(mm0, 0) */
+                               "pfmax %%mm7, %%mm1\n\t" /* mm1 = max(mm1, 0) */
+                               "pfadd %%mm3, %%mm0\n\t" /* +0.5 */
+                               "pfadd %%mm3, %%mm1\n\t" /* +0.5 */
+                               "pf2id %%mm0, %%mm0\n\t" /* mm0 = (int)aR | 
(int)aG */
+                               "pf2id %%mm1, %%mm1\n\t" /* mm1 = (int)aB | 0 */
+                               "movq %%mm0, (%4)\n\t" /* write new R/G to rgbx 
*/
+                               "movq %%mm1, 8(%4)\n\t" /* write new B/0 to 
rgbx */
+                               :
+                               : "r" (s),
+                                 "r" (mat),
+                                 "r" (rct->priv->luminance_float),
+                                 "r" (tmp),
+                                 "r" (rgbx)
+                               : "%"REG_a,
+                                 "%mm0", "%mm1", "%mm2", "%mm3",
+                                 "%mm4", "%mm5", "%mm6", "%mm7",
+                                 "memory");
+                       d[destoffset++] = rct->priv->table8[rgbx[0]];
+                       d[destoffset++] = rct->priv->table8[rgbx[1]];
+                       d[destoffset++] = rct->priv->table8[rgbx[2]];
+                       s += 4;
+               }
+       }
+
+       asm volatile("femms\n\t");
        return;
 }
 
@@ -693,29 +814,29 @@ COLOR_TRANSFORM(transform_cms8_sse)
        gint col;
        gfloat top[4] align(16) = {65535.0, 65535.0, 65535.0, 65535.0};
        gfloat mat[12] align(16) = {
+               LUM_XYZ_R,
                rct->priv->color_matrix.coeff[0][0],
                rct->priv->color_matrix.coeff[1][0],
                rct->priv->color_matrix.coeff[2][0],
-               0.0,
+               LUM_XYZ_G,
                rct->priv->color_matrix.coeff[0][1],
                rct->priv->color_matrix.coeff[1][1],
                rct->priv->color_matrix.coeff[2][1],
-               0.0,
+               LUM_XYZ_B,
                rct->priv->color_matrix.coeff[0][2],
                rct->priv->color_matrix.coeff[1][2],
                rct->priv->color_matrix.coeff[2][2],
-               0.0 };
+       };
        asm volatile (
                "movups (%2), %%xmm2\n\t" /* rs->pre_mul */
                "movaps (%0), %%xmm3\n\t" /* matrix */
                "movaps 16(%0), %%xmm4\n\t"
                "movaps 32(%0), %%xmm5\n\t"
                "movaps (%1), %%xmm6\n\t" /* top */
-               "pxor %%mm7, %%mm7\n\t" /* 0x0 */
                :
                : "r" (mat), "r" (top), "r" (rct->priv->pre_mul)
-               : "memory"
-       );
+               : "%xmm2", "%xmm3", "%xmm4", "%xmm5");
+
        while(height--)
        {
                destoffset = 0;
@@ -724,6 +845,8 @@ COLOR_TRANSFORM(transform_cms8_sse)
                while(col--)
                {
                        asm volatile (
+                               "pxor %%mm7, %%mm7\n\t" /* 0x0 */
+
                                /* load */
                                "movq (%3), %%mm0\n\t" /* R | G | B | G2 */
                                "movq %%mm0, %%mm1\n\t" /* R | G | B | G2 */
@@ -748,7 +871,7 @@ COLOR_TRANSFORM(transform_cms8_sse)
                                "addps %%xmm1, %%xmm7\n\t"
 
                                "movaps %%xmm0, %%xmm1\n\t"
-                               "shufps $0xAA, %%xmm1, %%xmm1\n\t"
+                               "shufps $0xaa, %%xmm1, %%xmm1\n\t"
                                "mulps %%xmm5, %%xmm1\n\t"
                                "addps %%xmm7, %%xmm1\n\t"
 
@@ -756,23 +879,35 @@ COLOR_TRANSFORM(transform_cms8_sse)
                                "minps %%xmm6, %%xmm1\n\t"
                                "maxps %%xmm7, %%xmm1\n\t"
 
-                               "cvtss2si %%xmm1, %0\n\t"
-                               "shufps $0xF9, %%xmm1, %%xmm1\n\t"
-                               "cvtss2si %%xmm1, %1\n\t"
-                               "shufps $0xF9, %%xmm1, %%xmm1\n\t"
-                               "cvtss2si %%xmm1, %2\n\t"
+                               /* xmm1: Y | R | G | B */
+                               "cvtss2si %%xmm1, %%"REG_a"\n\t" /* Extract 
Luminance integer value */
+                               "movss (%4, %%"REG_a", 4), %%xmm0\n\t" /* xmm0 
= Y'/Y | _ | _ | _ */
+                               "shufps $0, %%xmm0, %%xmm0\n\t" /* xmm0 = Y'/Y 
| Y'/Y | Y'/Y | Y'/Y */
+                               "mulps %%xmm0, %%xmm1\n\t" /* xmm1 = Y' | R' | 
G' | B' */
+                               "minps %%xmm6, %%xmm1\n\t" /* MIN (65535.0, in) 
*/
+                               "maxps %%xmm7, %%xmm1\n\t" /* MAX (0.0, in) */
+                               "shufps $0xf9, %%xmm1, %%xmm1\n\t" /* xmm1 = R' 
| G' | B' | _ */
+                               "cvtss2si %%xmm1, %0\n\t" /* Extract R' */
+                               "shufps $0xf9, %%xmm1, %%xmm1\n\t" /* xmm1 = G' 
| B' | _ | _ */
+                               "cvtss2si %%xmm1, %1\n\t" /* Extract G' */
+                               "shufps $0xf9, %%xmm1, %%xmm1\n\t" /* xmm1 = B' 
| _ | _ | _ */
+                               "cvtss2si %%xmm1, %2\n\t" /* Extract B' */
                                : "=r" (r), "=r" (g), "=r" (b)
-                               : "r" (s)
-                               : "memory"
-                       );
+                               : "r" (s), "r" (rct->priv->luminance_float)
+                               : "%"REG_a,
+                                 "%mm0", "%mm1", "%mm7",
+                                 "%xmm0", "%xmm1", "%xmm2", "%xmm3", 
+                                 "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
+                                 "memory");
+
                        buffer[destoffset++] = rct->priv->table16[r];
                        buffer[destoffset++] = rct->priv->table16[g];
                        buffer[destoffset++] = rct->priv->table16[b];
                        s += 4;
                }
+               asm volatile("emms\n\t");
                cmsDoTransform((cmsHPROFILE) rct->priv->transform, buffer, 
out+height * out_rowstride, width);
        }
-       asm volatile("emms\n\t");
        g_free(buffer);
        return;
 }
@@ -782,32 +917,34 @@ COLOR_TRANSFORM(transform_cms8_3dnow)
        gushort *buffer = g_malloc(width*3*sizeof(gushort));
        gint destoffset;
        gint col;
-       register glong r=0,g=0,b=0;
-       gfloat mat[12] align(8);
-       gfloat top[2] align(8);
-       mat[0] = rct->priv->color_matrix.coeff[0][0];
-       mat[1] = rct->priv->color_matrix.coeff[0][1];
-       mat[2] = rct->priv->color_matrix.coeff[0][2];
-       mat[3] = 0.0;
-       mat[4] = rct->priv->color_matrix.coeff[1][0];
-       mat[5] = rct->priv->color_matrix.coeff[1][1];
-       mat[6] = rct->priv->color_matrix.coeff[1][2];
-       mat[7] = 0.0;
-       mat[8] = rct->priv->color_matrix.coeff[2][0];
-       mat[9] = rct->priv->color_matrix.coeff[2][1];
-       mat[10] = rct->priv->color_matrix.coeff[2][2];
-       mat[11] = 0.0;
-       top[0] = 65535.0;
-       top[1] = 65535.0;
-       asm volatile (
-               "femms\n\t"
-               "pxor %%mm7, %%mm7\n\t" /* 0x0 */
-               "movq (%0), %%mm2\n\t" /* pre_mul R | pre_mul G */
-               "movq 8(%0), %%mm3\n\t" /* pre_mul B | pre_mul G2 */
-               "movq (%1), %%mm6\n\t" /* 65535.0 | 65535.0 */
-               :
-               : "r" (rct->priv->pre_mul), "r" (&top)
-       );
+       gint rgbx[4] align(8) = { 0, 0, 0, 0};
+       gfloat tmp[4] align(8) = { 0.f, 0.f, 0.f, 0.f};
+       gfloat mat[] align(8) = {
+               rct->priv->color_matrix.coeff[0][0],
+               rct->priv->color_matrix.coeff[0][1],
+               rct->priv->color_matrix.coeff[0][2],
+               0.f,
+               rct->priv->color_matrix.coeff[1][0],
+               rct->priv->color_matrix.coeff[1][1],
+               rct->priv->color_matrix.coeff[1][2],
+               0.f,
+               rct->priv->color_matrix.coeff[2][0],
+               rct->priv->color_matrix.coeff[2][1],
+               rct->priv->color_matrix.coeff[2][2],
+               0.f,
+               LUM_XYZ_R,
+               LUM_XYZ_G,
+               LUM_XYZ_B,
+               0.f,
+               /* Not really matrix... but to avoid register pressure in the
+                * following asm blocks. It helps having only one register to
+                * hold multiple addresses. */
+               65535.f, 65535.f,
+               .5f, .5f,
+               rct->priv->pre_mul[0], rct->priv->pre_mul[0], 
+               rct->priv->pre_mul[1], rct->priv->pre_mul[2], 
+       };
+
        while(height--)
        {
                destoffset = 0;
@@ -816,9 +953,14 @@ COLOR_TRANSFORM(transform_cms8_3dnow)
                while(col--)
                {
                        asm volatile (
+                               "pxor %%mm7, %%mm7\n\t" /* 0x0 */
+                               "movq 80(%1), %%mm2\n\t" /* pre_mul R | pre_mul 
G */
+                               "movq 88(%1), %%mm3\n\t" /* pre_mul B | pre_mul 
G2 */
+                               "movq 64(%1), %%mm6\n\t" /* 65535.0 | 65535.0 */
+
                                /* pre multiply */
                                "movq (%0), %%mm0\n\t" /* R | G | B | G2 */
-                               "movq %%mm0, %%mm1\n\t" /* R | G | B | G2 */
+                               "movq (%0), %%mm1\n\t" /* R | G | B | G2 */
                                "punpcklwd %%mm7, %%mm0\n\t" /* R, G */
                                "punpckhwd %%mm7, %%mm1\n\t" /* B, G2 */
                                "pi2fd %%mm0, %%mm0\n\t" /* to float */
@@ -830,23 +972,44 @@ COLOR_TRANSFORM(transform_cms8_3dnow)
                                "pfmax %%mm7, %%mm0\n\t"
                                "pfmax %%mm7, %%mm1\n\t"
 
-                               "add $8, %0\n\t" /* increment offset */
+                               "movq 72(%1), %%mm3\n\t" /* mm3 = halfone */
 
                                /* red */
-                               "movq (%4), %%mm4\n\t" /* mat[0] | mat[1] */
-                               "movq 8(%4), %%mm5\n\t" /* mat[2] | mat[3] */
+                               "movq (%1), %%mm4\n\t" /* mat[0] | mat[1] */
+                               "movq 8(%1), %%mm5\n\t" /* mat[2] | mat[3] */
                                "pfmul %%mm0, %%mm4\n\t" /* R*[0] | G*[1] */
                                "pfmul %%mm1, %%mm5\n\t" /* B*[2] | G2*[3] */
                                "pfadd %%mm4, %%mm5\n\t" /* R*[0] + B*[2] | 
G*[1] + G2*[3] */
                                "pfacc %%mm5, %%mm5\n\t" /* R*[0] + B*[2] + 
G*[1] + G2*[3] | ? */
                                "pfmin %%mm6, %%mm5\n\t"
                                "pfmax %%mm7, %%mm5\n\t"
-                               "pf2id %%mm5, %%mm5\n\t" /* to integer */
-                               "movd %%mm5, %1\n\t" /* write r */
+                               "movd %%mm5, (%3)\n\t" /* write to tmp[0] */
 
                                /* green */
-                               "movq 16(%4), %%mm4\n\t"
-                               "movq 24(%4), %%mm5\n\t"
+                               "movq 16(%1), %%mm4\n\t"
+                               "movq 24(%1), %%mm5\n\t"
+                               "pfmul %%mm0, %%mm4\n\t"
+                               "pfmul %%mm1, %%mm5\n\t"
+                               "pfadd %%mm4, %%mm5\n\t"
+                               "pfacc %%mm5, %%mm5\n\t"
+                               "pfmin %%mm6, %%mm5\n\t"
+                               "pfmax %%mm7, %%mm5\n\t"
+                               "movd %%mm5, 4(%3)\n\t" /* write to tmp[1] */
+
+                               /* blue */
+                               "movq 32(%1), %%mm4\n\t"
+                               "movq 40(%1), %%mm5\n\t"
+                               "pfmul %%mm0, %%mm4\n\t"
+                               "pfmul %%mm1, %%mm5\n\t"
+                               "pfadd %%mm4, %%mm5\n\t"
+                               "pfacc %%mm5, %%mm5\n\t"
+                               "pfmin %%mm6, %%mm5\n\t"
+                               "pfmax %%mm7, %%mm5\n\t"
+                               "movd %%mm5, 8(%3)\n\t"  /* write to tmp[2] */
+
+                               /* Luminance */
+                               "movq 48(%1), %%mm4\n\t"
+                               "movq 56(%1), %%mm5\n\t"
                                "pfmul %%mm0, %%mm4\n\t"
                                "pfmul %%mm1, %%mm5\n\t"
                                "pfadd %%mm4, %%mm5\n\t"
@@ -854,29 +1017,44 @@ COLOR_TRANSFORM(transform_cms8_3dnow)
                                "pfmin %%mm6, %%mm5\n\t"
                                "pfmax %%mm7, %%mm5\n\t"
                                "pf2id %%mm5, %%mm5\n\t"
-                               "movd %%mm5, %2\n\t"
-
-                               /* blue */
-                               "movq 32(%4), %%mm4\n\t"
-                               "movq 40(%4), %%mm5\n\t"
-                               "pfmul %%mm0, %%mm4\n\t"
-                               "pfmul %%mm1, %%mm5\n\t"
-                               "pfadd %%mm4, %%mm5\n\t"
-                               "pfacc %%mm5, %%mm5\n\t"
-                               "pfmin %%mm6, %%mm5\n\t"
-                               "pfmax %%mm7, %%mm5\n\t"
-                               "pf2id %%mm5, %%mm5\n\t"
-                               "movd %%mm5, %3\n\t"
-                               : "+r" (s), "+r" (r), "+r" (g), "+r" (b)
-                               : "r" (&mat)
-                       );
-                       buffer[destoffset++] = rct->priv->table16[r];
-                       buffer[destoffset++] = rct->priv->table16[g];
-                       buffer[destoffset++] = rct->priv->table16[b];
-               }
+                               "pfadd %%mm3, %%mm5\n\t" /* +0.5 to have real round */
+                               "movd %%mm5, %%eax\n\t"
+
+                               /* Compute luminance corrected values */
+                               "movq (%3), %%mm0\n\t" /* R, G */
+                               "movq 8(%3), %%mm1\n\t" /* B, 0 */
+                               "movd (%2, %%"REG_a", 4), %%mm2\n\t" /* mm2 = x | a */
+                               "pshufw $0x44, %%mm2, %%mm2\n\t" /* mm2 = a | a */
+                               "pfmul %%mm2, %%mm0\n\t" /* mm0 = aR | aG */
+                               "pfmul %%mm2, %%mm1\n\t" /* mm1 = aB | 0  */
+                               "pfmin %%mm6, %%mm0\n\t" /* mm0 = min(mm0, 65535.f) */
+                               "pfmin %%mm6, %%mm1\n\t" /* mm1 = min(mm1, 65535.f) */
+                               "pfmax %%mm7, %%mm0\n\t" /* mm0 = max(mm0, 0) */
+                               "pfmax %%mm7, %%mm1\n\t" /* mm1 = max(mm1, 0) */
+                               "pfadd %%mm3, %%mm0\n\t" /* +0.5 */
+                               "pfadd %%mm3, %%mm1\n\t" /* +0.5 */
+                               "pf2id %%mm0, %%mm0\n\t" /* mm0 = (int)aR | (int)aG */
+                               "pf2id %%mm1, %%mm1\n\t" /* mm1 = (int)aB | 0 */
+                               "movq %%mm0, (%4)\n\t" /* write new R/G to rgbx */
+                               "movq %%mm1, 8(%4)\n\t" /* write new B/0 to rgbx */
+                               :
+                               : "r" (s),
+                                 "r" (mat),
+                                 "r" (rct->priv->luminance_float),
+                                 "r" (tmp),
+                                 "r" (rgbx)
+                               : "%"REG_a,
+                                 "%mm0", "%mm1", "%mm2", "%mm3",
+                                 "%mm4", "%mm5", "%mm6", "%mm7",
+                                 "memory");
+                       buffer[destoffset++] = rct->priv->table16[rgbx[0]];
+                       buffer[destoffset++] = rct->priv->table16[rgbx[1]];
+                       buffer[destoffset++] = rct->priv->table16[rgbx[2]];
+                       s += 4;
+               }
+               asm volatile("femms\n\t");
                cmsDoTransform((cmsHPROFILE) rct->priv->transform, buffer, out+height * out_rowstride, width);
        }
-       asm volatile ("femms\n\t");
        g_free(buffer);
        return;
 }
@@ -901,6 +1079,8 @@ COLOR_TRANSFORM(transform_cms_c)
                srcoffset = y * in_rowstride;
                for(x=0 ; x<width ; x++)
                {
+                       guint Y;
+
                        rr = (in[srcoffset+R]*pre_muli[R])>>7;
                        gg = (in[srcoffset+G]*pre_muli[G])>>7;
                        bb = (in[srcoffset+B]*pre_muli[B])>>7;
@@ -915,6 +1095,23 @@ COLOR_TRANSFORM(transform_cms_c)
                                + gg*mati.coeff[2][1]
                                + bb*mati.coeff[2][2])>>MATRIX_RESOLUTION;
                        _CLAMP65535_TRIPLET(r,g,b);
+
+                       // Compute luminance
+                       Y = (     LUM_FIXED_XYZ_R*r
+                               + LUM_FIXED_XYZ_G*g
+                               + LUM_FIXED_XYZ_B*b
+                               + LUM_FIXED_HALFONE)>>LUM_PRECISION;
+
+                       // Find the factor to apply to the RGB triplet
+                       Y = rct->priv->luminance_fixed[Y];
+
+                       // Multiply the RGB triplet using fixed point arithmetic
+                       r = (r*Y)>>LUM_PRECISION;
+                       g = (g*Y)>>LUM_PRECISION;
+                       b = (b*Y)>>LUM_PRECISION;
+
+                       _CLAMP65535_TRIPLET(r,g,b);
+
                        buffer[destoffset++] = rct->priv->table16[r];
                        buffer[destoffset++] = rct->priv->table16[g];
                        buffer[destoffset++] = rct->priv->table16[b];
@@ -945,6 +1142,8 @@ COLOR_TRANSFORM(transform_nocms_c)
                srcoffset = y * in_rowstride;
                for(x=0 ; x<width ; x++)
                {
+                       guint Y;
+
                        rr = (in[srcoffset+R]*pre_muli[R]+64)>>7;
                        gg = (in[srcoffset+G]*pre_muli[G]+64)>>7;
                        bb = (in[srcoffset+B]*pre_muli[B]+64)>>7;
@@ -959,6 +1158,23 @@ COLOR_TRANSFORM(transform_nocms_c)
                                + gg*mati.coeff[2][1]
                                + bb*mati.coeff[2][2])>>MATRIX_RESOLUTION;
                        _CLAMP65535_TRIPLET(r,g,b);
+
+                       // Compute luminance
+                       Y = (     LUM_FIXED_XYZ_R*r
+                               + LUM_FIXED_XYZ_G*g
+                               + LUM_FIXED_XYZ_B*b
+                               + LUM_FIXED_HALFONE)>>LUM_PRECISION;
+
+                       // Find the factor to apply to the RGB triplet
+                       Y = rct->priv->luminance_fixed[Y];
+
+                       // Multiply the RGB triplet using fixed point arithmetic
+                       r = (r*Y)>>LUM_PRECISION;
+                       g = (g*Y)>>LUM_PRECISION;
+                       b = (b*Y)>>LUM_PRECISION;
+
+                       _CLAMP65535_TRIPLET(r,g,b);
+                               
                        d[destoffset++] = rct->priv->table8[r];
                        d[destoffset++] = rct->priv->table8[g];
                        d[destoffset++] = rct->priv->table8[b];
@@ -1005,6 +1221,8 @@ rs_color_transform_make_histogram(RS_COL
                srcoffset = y * input->rowstride;
                for(x=0 ; x<input->w ; x++)
                {
+                       guint Y;
+
                        rr = (in[srcoffset+R]*pre_muli[R])>>7;
                        gg = (in[srcoffset+G]*pre_muli[G])>>7;
                        bb = (in[srcoffset+B]*pre_muli[B])>>7;
@@ -1020,6 +1238,22 @@ rs_color_transform_make_histogram(RS_COL
                                + bb*mati.coeff[2][2])>>MATRIX_RESOLUTION;
                        _CLAMP65535_TRIPLET(r,g,b);
 
+                       // Compute luminance
+                       Y = (     LUM_FIXED_XYZ_R*r
+                               + LUM_FIXED_XYZ_G*g
+                               + LUM_FIXED_XYZ_B*b
+                               + LUM_FIXED_HALFONE)>>LUM_PRECISION;
+
+                       // Find the factor to apply to the RGB triplet
+                       Y = rct->priv->luminance_fixed[Y];
+
+                       // Multiply the RGB triplet using fixed point arithmetic
+                       r = (r*Y)>>LUM_PRECISION;
+                       g = (g*Y)>>LUM_PRECISION;
+                       b = (b*Y)>>LUM_PRECISION;
+
+                       _CLAMP65535_TRIPLET(r,g,b);
+
                        if (rct->priv->transform != NULL)
                        {
                                buffer16[x*3+R] = r;


-- 
Edouard Gomez

_______________________________________________
Rawstudio-dev mailing list
[email protected]
http://rawstudio.org/cgi-bin/mailman/listinfo/rawstudio-dev

[Rawstudio-dev] In the end, it's a fix for everyone !

Reply via email to