In these last few days I have been working on the Mesa software blending 
and the existing MMX bug. I've made some progress.

I made a small test program which calls the relevant functions directly, as 
Alex suggested. In the process I added comments to the assembly code 
(which had none). The error is due to the fact that the inner loop blends 
two pixels at a time, so if the mask of the first pixel is zero 
then both are skipped. I also spotted some errors in the run-in section; 
e.g., it ANDs a value with 4 and compares the result with 8, which can 
never match (the result of the AND can only be 0 or 4). 
I still have to study x86 optimization a little further 
to know how best to fix both of these problems.

I also made two optimizations in blend_transparency (s_blend.c) which have 
no effect on the precision of the result but which achieved an overall 
speedup of 30% in the function. These optimizations are in the C code and 
benefit all architectures.

The first was to avoid repeating the macro argument inside DIV255. 
At least my version of gcc (2.96) wasn't factoring out the common 
subexpression, so evaluating it only once yielded a 17% speedup.

The second was to factor the blending equation, halving the 
number of multiplications: (src*t + dest*(CHAN_MAX - t))/255 becomes 
(src - dest)*t/255 + dest. This optimization can be applied in other 
places in this file as well.

A third optimization that I'll try is the "double blend" trick (performing 
two 8-bit multiplications at the same time in a 32-bit register) as 
documented by Michael Herf (http://www.stereopsis.com/doubleblend.html - a 
quite interesting site pointed out to me by Brian).


I would like to keep improving Mesa's software rendering performance. I know 
that, due to its versatility and power, Mesa will never rival a 
dedicated, non-conformant software 3D engine such as Unreal's; 
nevertheless, I think that it's possible to make it useful for simple 
real-time rendering. Regards,

José Fonseca
Index: swrast/s_blend.c
===================================================================
RCS file: /cvsroot/mesa3d/Mesa/src/swrast/s_blend.c,v
retrieving revision 1.14
diff -u -r1.14 s_blend.c
--- swrast/s_blend.c    27 Mar 2002 15:49:27 -0000      1.14
+++ swrast/s_blend.c    1 Apr 2002 00:34:20 -0000
@@ -132,12 +132,24 @@
 #if CHAN_BITS == 8
             /* This satisfies Glean and should be reasonably fast */
             /* Contributed by Nathan Hand */
+#if 0
 #define DIV255(X)  (((X) << 8) + (X) + 256) >> 16
+#else
+           const GLint temp;
+#define DIV255(X)  (temp = (X), ((temp << 8) + temp + 256) >> 16)
+#endif
+#if 0
             const GLint s = CHAN_MAX - t;
             const GLint r = DIV255(rgba[i][RCOMP] * t + dest[i][RCOMP] * s);
             const GLint g = DIV255(rgba[i][GCOMP] * t + dest[i][GCOMP] * s);
             const GLint b = DIV255(rgba[i][BCOMP] * t + dest[i][BCOMP] * s);
             const GLint a = DIV255(rgba[i][ACOMP] * t + dest[i][ACOMP] * s);
+#else
+            const GLint r = DIV255((rgba[i][RCOMP] - dest[i][RCOMP]) * t) + 
+dest[i][RCOMP];
+            const GLint g = DIV255((rgba[i][GCOMP] - dest[i][GCOMP]) * t) + 
+dest[i][GCOMP];
+            const GLint b = DIV255((rgba[i][BCOMP] - dest[i][BCOMP]) * t) + 
+dest[i][BCOMP];
+            const GLint a = DIV255((rgba[i][ACOMP] - dest[i][ACOMP]) * t) + 
+dest[i][ACOMP]; 
+#endif
 #undef DIV255
 #elif CHAN_BITS == 16
             const GLfloat tt = (GLfloat) t / CHAN_MAXF;
Index: X86/mmx_blend.S
===================================================================
RCS file: /cvsroot/mesa3d/Mesa/src/X86/mmx_blend.S,v
retrieving revision 1.5
diff -u -r1.5 mmx_blend.S
--- X86/mmx_blend.S     28 Mar 2001 20:44:44 -0000      1.5
+++ X86/mmx_blend.S     1 Apr 2002 00:35:13 -0000
@@ -7,25 +7,35 @@
 ALIGNTEXT16
 GLOBL GLNAME(_mesa_mmx_blend_transparency)
 
+/*
+ * void blend_transparency( GLcontext *ctx,
+ *                          GLuint n, 
+ *                          const GLubyte mask[],
+ *                          GLchan rgba[][4], 
+ *                          CONST GLchan dest[][4] )
+ * 
+ * Common transparency blending mode.
+ */
 GLNAME( _mesa_mmx_blend_transparency ):
     PUSH_L    ( EBP )
     MOV_L     ( ESP, EBP )
     SUB_L     ( CONST(52), ESP )
     PUSH_L    ( EBX )
+
     MOV_L     ( CONST(16711680), REGOFF(-8, EBP) )
     MOV_L     ( CONST(16711680), REGOFF(-4, EBP) )
     MOV_L     ( CONST(0), REGOFF(-16, EBP) )
     MOV_L     ( CONST(-1), REGOFF(-12, EBP) )
     MOV_L     ( CONST(-1), REGOFF(-24, EBP) )
     MOV_L     ( CONST(0), REGOFF(-20, EBP) )
-    MOV_L     ( REGOFF(24, EBP), EAX )
+    MOV_L     ( REGOFF(24, EBP), EAX )         /* rgba */
     ADD_L     ( CONST(4), EAX )
     MOV_L     ( EAX, EDX )
-    AND_L     ( REGOFF(20, EBP), EDX )
+    AND_L     ( REGOFF(20, EBP), EDX )         /* mask */
     MOV_L     ( EDX, EAX )
     AND_L     ( CONST(4), EAX )
     CMP_L     ( CONST(8), EAX )
-    JNE       ( LLBL(GMBT_2) )
+    JNE       ( LLBL(GMBT_no_align) )
     MOV_L     ( REGOFF(20, EBP), EAX )
     ADD_L     ( CONST(3), EAX )
     XOR_L     ( EDX, EDX )
@@ -116,28 +126,28 @@
     ADD_L     ( CONST(4), REGOFF(20, EBP) )
     ADD_L     ( CONST(4), REGOFF(24, EBP) )
     DEC_L     ( REGOFF(12, EBP) )
-LLBL(GMBT_2):
+LLBL(GMBT_skip_runin):
 
-    CMP_L     ( CONST(0), REGOFF(12, EBP) )
-    JE        ( LLBL(GMBT_3) )
-    MOV_L     ( CONST(0), REGOFF(-28, EBP) )
+    CMP_L     ( CONST(0), REGOFF(12, EBP) )    /* n == 0 */
+    JE        ( LLBL(GMBT_zero_length) )
+    MOV_L     ( CONST(0), REGOFF(-28, EBP) )   
 ALIGNTEXT4
-LLBL(GMBT_4):
+LLBL(GMBT_main_loop):
 
     MOV_L     ( REGOFF(12, EBP), EDX )
     MOV_L     ( EDX, EAX )
-    SHR_L     ( CONST(1), EAX )
+    SHR_L     ( CONST(1), EAX )                        /* eax = n/2 */
     CMP_L     ( EAX, REGOFF(-28, EBP) )
-    JB        ( LLBL(GMBT_7) )
-    JMP       ( LLBL(GMBT_5) )
+    JB        ( LLBL(GMBT_no_jump) )
+    JMP       ( LLBL(GMBT_end_loop) )
 ALIGNTEXT16
-LLBL(GMBT_7):
+LLBL(GMBT_nojump):
 
     MOV_L     ( REGOFF(-28, EBP), EAX )
     LEA_L     ( REGDIS(0,EAX,2), EDX )
-    MOV_L     ( REGOFF(16, EBP), EAX )
+    MOV_L     ( REGOFF(16, EBP), EAX )         /* mask */
     CMP_B     ( CONST(0), REGBI(EAX,EDX) )
-    JE        ( LLBL(GMBT_6) )
+    JE        ( LLBL(GMBT_masked) )
     MOV_L     ( REGOFF(-28, EBP), EAX )
     MOV_L     ( EAX, EDX )
     LEA_L     ( REGDIS(0,EDX,8), ECX )
@@ -195,25 +205,22 @@
     POR       ( MM1, MM4 )
     MOVQ      ( MM4, REGIND(EAX) )
 
-
-LLBL(GMBT_8):
-
-LLBL(GMBT_6):
+LLBL(GMBT_masked):
 
     INC_L     ( REGOFF(-28, EBP) )
-    JMP       ( LLBL(GMBT_4) )
+    JMP       ( LLBL(GMBT_main_loop) )
 ALIGNTEXT16
-LLBL(GMBT_5):
+LLBL(GMBT_end_loop):
 
 
     EMMS
 
-LLBL(GMBT_3):
+LLBL(GMBT_runout):
 
     MOV_L     ( REGOFF(12, EBP), EAX )
     AND_L     ( CONST(1), EAX )
     TEST_L    ( EAX, EAX )
-    JE        ( LLBL(GMBT_9) )
+    JE        ( LLBL(GMBT_skip_runout) )
     MOV_L     ( REGOFF(12, EBP), EAX )
     LEA_L     ( REGDIS(0,EAX,4), EDX )
     MOV_L     ( EDX, EAX )
@@ -342,9 +349,7 @@
     LEA_L     ( REGOFF(-1, EAX), EDX )
     MOV_B     ( REGOFF(-32, EBP), AL )
     MOV_B     ( AL, REGIND(EDX) )
-LLBL(GMBT_9):
-
-LLBL(GMBT_1):
+LLBL(GMBT_skip_runout):
 
     MOV_L     ( REGOFF(-56, EBP), EBX )
     MOV_L     ( EBP, ESP )

Reply via email to