On 2/11/07, Jerome Glisse <[EMAIL PROTECTED]> wrote:
On 2/10/07, Rune Petersen <[EMAIL PROTECTED]> wrote:
> Hi,
>
> Getting proper SIN and COS wasn't as easy as it appeared. I had to
> make some changes to the fragment program code.
>
> general FP changes:
> - support HHH swizzle for vector instructions.
> - don't copy a source to a temp when it is not XYZW swizzled, but
>   combine the two and have the swizzle resolve any issues.
>   (saves temps/instructions with more elaborate shader code)
> - Disable refcounting of temps.
>   The temp R0 in progs/fp/tri-cos.c is freed prematurely.
>   This should be resolved properly.
> - fix overflow in cnstv[].
>
>
> SIN & COS:
> they are based on:
>         http://www.devmaster.net/forums/showthread.php?t=5784
>
> There is a fast and a slow (high precision) version of SIN & COS.
>
> For SIN:
> fast = 2 vector instructions
> slow = 5 vector instructions
>
> For COS:
> fast = 5 vector instructions + 2 scalar instructions
> slow = 8 vector instructions + 2 scalar instructions
>
> The fast version appears to do a fine enough job, at least with the
> simple test I have made.
>
>
> Rune Petersen

Nice to see this tackled :) A few comments: maybe we could add a driconf
option to switch between the fast and slow versions (or a more general
conf option to enable or disable fragprog optimizations, in case we come
up with more optimizations like this in the future).

For the refcounting, I wonder whether I bumped into that in
the past; I used gdb to trace fragprog construction at that
time and found some strange interactions (which led me to
the rework I did on fragprog).

Anyway, from limited testing here your patch seems good;
you should commit it.

best,
Jerome Glisse


Attached is a patch to fix refcounting. Basically, whenever a temporary
source was used multiple times inside an instruction, that led to
multiple calls to t_hw_src. That is correct, but because we also
decremented the use counter in that function, we over-decremented the
refcount.

The patch decrements the refcount after instruction decoding, which
avoids over-decrementing it.

(The patch applies on top of yours.)

best,
Jerome
--- r300_fragprog.c	2007-02-11 14:26:42.000000000 +0100
+++ /home/glisse/code/r300/mesa/src/mesa/drivers/dri/r300/r300_fragprog.c	2007-02-11 14:25:15.000000000 +0100
@@ -773,12 +773,6 @@
 			cs->temps[index].reg = get_hw_temp(rp);
 
 		idx = cs->temps[index].reg;
-
-/*
-		if (!REG_GET_NO_USE(src) &&
-		    (--cs->temps[index].refcount == 0))
-			free_temp(rp, src);
-*/
 		break;
 	case REG_TYPE_INPUT:
 		idx = cs->inputs[index].reg;
@@ -819,13 +813,6 @@
 			}
 		}
 		idx = cs->temps[index].reg;
-
-/*
-		if (!REG_GET_NO_USE(dest) &&
-		    (--cs->temps[index].refcount == 0))
-			free_temp(rp, dest);
-*/
-
 		cs->dest_in_node |= (1 << idx);
 		cs->used_in_node |= (1 << idx);
 		break;
@@ -1215,10 +1202,11 @@
 
 static GLboolean parse_program(struct r300_fragment_program *rp)
 {	
+	COMPILE_STATE;
 	struct gl_fragment_program *mp = &rp->mesa_program;
 	const struct prog_instruction *inst = mp->Base.Instructions;
 	struct prog_instruction *fpi;
-	GLuint src[3], dest, temp;
+	GLuint src[3], dest, temp, srccount;
 	GLuint cnst;
 	int flags, mask = 0;
 	GLfloat cnstv[4] = {0.0, 0.0, 0.0, 0.0};
@@ -1228,6 +1216,7 @@
 		return GL_FALSE;
 	}
 
+
 	for (fpi=mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
 		if (fpi->SaturateMode == SATURATE_ZERO_ONE)
 			flags = PFS_FLAG_SAT;
@@ -1238,14 +1227,17 @@
 			mask = fpi->DstReg.WriteMask;
 		}
 
+		srccount = 0;
 		switch (fpi->Opcode) {
 		case OPCODE_ABS:
+			srccount = 1;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			emit_arith(rp, PFS_OP_MAD, dest, mask,
 				   absolute(src[0]), pfs_one, pfs_zero,
 				   flags);
 			break;
 		case OPCODE_ADD:
+			srccount = 2;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			src[1] = t_src(rp, fpi->SrcReg[1]);
 			emit_arith(rp, PFS_OP_MAD, dest, mask,
@@ -1253,6 +1245,7 @@
 				   flags);
 			break;
 		case OPCODE_CMP:
+			srccount = 3;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			src[1] = t_src(rp, fpi->SrcReg[1]);
 			src[2] = t_src(rp, fpi->SrcReg[2]);
@@ -1271,6 +1264,7 @@
 			 *   x = (x < PI)?x : x-2*PI
 			 *   result = sin(x)
 			 */
+			srccount = 1;
 			temp = get_temp_reg(rp);
 			if(rp->const_sin == -1){
 			    cnstv[0] = 1.273239545;
@@ -1350,6 +1344,7 @@
 			free_temp(rp, temp);
 			break;
 		case OPCODE_DP3:
+			srccount = 2;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			src[1] = t_src(rp, fpi->SrcReg[1]);
 			emit_arith(rp, PFS_OP_DP3, dest, mask,
@@ -1357,6 +1352,7 @@
 				   flags);
 			break;
 		case OPCODE_DP4:
+			srccount = 2;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			src[1] = t_src(rp, fpi->SrcReg[1]);
 			emit_arith(rp, PFS_OP_DP4, dest, mask,
@@ -1364,6 +1360,7 @@
 				   flags);
 			break;
 		case OPCODE_DPH:
+			srccount = 2;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			src[1] = t_src(rp, fpi->SrcReg[1]);
 			/* src0.xyz1 -> temp
@@ -1386,6 +1383,7 @@
 #endif
 			break;
 		case OPCODE_DST:
+			srccount = 2;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			src[1] = t_src(rp, fpi->SrcReg[1]);
 			/* dest.y = src0.y * src1.y */
@@ -1408,12 +1406,14 @@
 			}
 			break;
 		case OPCODE_EX2:
+			srccount = 1;
 			src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
 			emit_arith(rp, PFS_OP_EX2, dest, mask,
 				   src[0], undef, undef,
 				   flags);
 			break;
 		case OPCODE_FLR:		
+			srccount = 1;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			temp = get_temp_reg(rp);
 			/* FRC temp, src0
@@ -1428,6 +1428,7 @@
 			free_temp(rp, temp);
 			break;
 		case OPCODE_FRC:
+			srccount = 1;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			emit_arith(rp, PFS_OP_FRC, dest, mask,
 				   src[0], undef, undef,
@@ -1437,6 +1438,7 @@
 			emit_tex(rp, fpi, R300_FPITX_OP_KIL);
 			break;
 		case OPCODE_LG2:
+			srccount = 1;
 			src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
 			emit_arith(rp, PFS_OP_LG2, dest, mask,
 				   src[0], undef, undef,
@@ -1455,6 +1457,7 @@
 			 * change the compare to (t.x + 0.5) > 0.5 we may
 			 * save one instruction by doing CMP -t.x 
 			 */
+			srccount = 1;
 			cnstv[0] = cnstv[1] = cnstv[2] = cnstv[3] = 0.50001;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			temp = get_temp_reg(rp);
@@ -1505,6 +1508,7 @@
 			free_temp(rp, temp);
 			break;
 		case OPCODE_LRP:
+			srccount = 3;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			src[1] = t_src(rp, fpi->SrcReg[1]);
 			src[2] = t_src(rp, fpi->SrcReg[2]);
@@ -1523,6 +1527,7 @@
 			free_temp(rp, temp);
 			break;			
 		case OPCODE_MAD:
+			srccount = 3;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			src[1] = t_src(rp, fpi->SrcReg[1]);
 			src[2] = t_src(rp, fpi->SrcReg[2]);
@@ -1531,6 +1536,7 @@
 				   flags);
 			break;
 		case OPCODE_MAX:
+			srccount = 2;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			src[1] = t_src(rp, fpi->SrcReg[1]);
 			emit_arith(rp, PFS_OP_MAX, dest, mask,
@@ -1538,6 +1544,7 @@
 				   flags);
 			break;
 		case OPCODE_MIN:
+			srccount = 2;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			src[1] = t_src(rp, fpi->SrcReg[1]);
 			emit_arith(rp, PFS_OP_MIN, dest, mask,
@@ -1546,12 +1553,14 @@
 			break;
 		case OPCODE_MOV:
 		case OPCODE_SWZ:
+			srccount = 1;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			emit_arith(rp, PFS_OP_MAD, dest, mask,
 				   src[0], pfs_one, pfs_zero, 
 				   flags);
 			break;
 		case OPCODE_MUL:
+			srccount = 2;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			src[1] = t_src(rp, fpi->SrcReg[1]);
 			emit_arith(rp, PFS_OP_MAD, dest, mask,
@@ -1559,6 +1568,7 @@
 				   flags);
 			break;
 		case OPCODE_POW:
+			srccount = 2;
 			src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
 			src[1] = t_scalar_src(rp, fpi->SrcReg[1]);
 			temp = get_temp_reg(rp);	
@@ -1574,12 +1584,14 @@
 			free_temp(rp, temp);
 			break;
 		case OPCODE_RCP:
+			srccount = 1;
 			src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
 			emit_arith(rp, PFS_OP_RCP, dest, mask,
 				   src[0], undef, undef,
 				   flags);
 			break;
 		case OPCODE_RSQ:
+			srccount = 1;
 			src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
 			emit_arith(rp, PFS_OP_RSQ, dest, mask,
 				   absolute(src[0]), pfs_zero, pfs_zero,
@@ -1589,6 +1601,7 @@
 			ERROR("SCS not implemented\n");
 			break;
 		case OPCODE_SGE:
+			srccount = 2;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			src[1] = t_src(rp, fpi->SrcReg[1]);
 			temp = get_temp_reg(rp);
@@ -1610,7 +1623,7 @@
 			 * extra precision is obtained by weighting against
 			 * itself squared.
 			 */
-
+			srccount = 1;
 			temp = get_temp_reg(rp);
 			if(rp->const_sin == -1){
 			    cnstv[0] = 1.273239545;
@@ -1657,6 +1670,7 @@
 			free_temp(rp, temp);
 			break;
 		case OPCODE_SLT:
+			srccount = 2;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			src[1] = t_src(rp, fpi->SrcReg[1]);
 			temp = get_temp_reg(rp);
@@ -1672,6 +1686,7 @@
 			free_temp(rp, temp);
 			break;
 		case OPCODE_SUB:
+			srccount = 2;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			src[1] = t_src(rp, fpi->SrcReg[1]);
 			emit_arith(rp, PFS_OP_MAD, dest, mask,
@@ -1688,6 +1703,7 @@
 			emit_tex(rp, fpi, R300_FPITX_OP_TXP);
 			break;
 		case OPCODE_XPD: {
+			srccount = 2;
 			src[0] = t_src(rp, fpi->SrcReg[0]);
 			src[1] = t_src(rp, fpi->SrcReg[1]);
 			temp = get_temp_reg(rp);
@@ -1714,6 +1730,17 @@
 			break;
 		}
 
+		if (REG_GET_TYPE(dest) == REG_TYPE_TEMP &&
+		    !REG_GET_NO_USE(dest) &&
+		    (--cs->temps[REG_GET_INDEX(dest)].refcount == 0))
+			free_temp(rp, dest);
+		for (int i = 0; i < srccount; i++) {
+			if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP &&
+			    !REG_GET_NO_USE(src[i]) &&
+			    (--cs->temps[REG_GET_INDEX(src[i])].refcount == 0))
+				free_temp(rp, src[i]);
+		}
+
 		if (rp->error)
 			return GL_FALSE;
 
@@ -1823,8 +1850,9 @@
 					cs->temps[idx].reg = -1;
 					cs->temps[idx].refcount = 1;
 					temps_used |= (1 << idx);
-				} else
+				} else {
 					cs->temps[idx].refcount++;
+				}
 				break;
 			case PROGRAM_INPUT:
 				cs->inputs[idx].refcount++;
@@ -1839,8 +1867,9 @@
 				cs->temps[idx].reg = -1;
 				cs->temps[idx].refcount = 1;
 				temps_used |= (1 << idx);
-			} else
+			} else {
 				cs->temps[idx].refcount++;
+			}
 		}
 	}
 	cs->temp_in_use = temps_used;
-------------------------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier.
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
--
_______________________________________________
Dri-devel mailing list
Dri-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/dri-devel

Reply via email to