On Tue, 8 Feb 2005 15:31:59 -0500, Daniel Phillips <[EMAIL PROTECTED]> wrote:
> Hi Timothy,
> 
> On Sunday 06 February 2005 10:39, Timothy Miller wrote:
> > Well, say it took 4 cycles to compute one sum.  Then what you need is
> > dZ, dZ*2, dZ*3, and dZ*4, all of which are either trivial or easy to
> > compute.  You use dZ*4 to get to the next loop, and send Z,0; Z,dZ;
> > Z,dZ*2; and Z,dZ*3 down the pipeline.
> 
> OK, I just want to tie this one off.  It's clear how this will work with
> floating point adders: say the adder requires 4 clocks, and delivers
> one result every clock.  For some interpolant T, four steps of dTdx can
> be in the pipeline, and because we compute two pixels on each step, we
> need two adders.  The increment will always be 8*dTdx.
> 
> We've got something like 17 interpolants, so that's 34 simplified fp
> adders.  Interpolating vertically between scan lines will additionally
> use twice as many FP adders as interpolants, because two edges have to
> be interpolated.  The edge setup requires a multiply and add per
> interpolant.  Should we worry about this number of components, or is it
> no sweat?  The vertical rasterization doesn't necessarily have to
> deliver a span per clock but it would be nice if it did, to keep up
> with one and two pixel-wide triangles.

We only interpolate the left edge vertically, but you do have a good
point that the vertical will double the number of adders.  While a
generalized adder takes up a whopping 1% of the design, the simplified
ones won't be nearly so bad.  There may be grounds for combining
horizontal and vertical and sharing some of the logic.

> 
> Finally, have we clawed our way back to 200 MHz yet?

Yes and no.  I'd taken true-zero and over/underflow detection out. 
Like that, I was able to get the generalized adder down to 5ns.  But
when I added true-zero and over/underflow back in, and then did more
tweaking, I'm back up to about 6ns.  As is the case with modern VLSI,
routing/wiring delay is the dominant performance factor.

Normally, 3 levels of logic isn't a big deal, but when the routing for
each level of logic exceeds 1ns, because it just can't seem to route
the whole design very well, you get into trouble.  There's a certain
aspect of the adder I've been tinkering with (on and off here) that is
purely academic, because pipelined adders can be any length and looped
ones will be simplified.


Here's the current design.  Mere mortals, bow at my feet because of my
great chip design m4d 5ki11z.  (Har har)


// float_add: fully pipelined adder for a 25-bit floating-point format.
//
// Operand layout, as used throughout this module:
//   [24:17]  exponent (8 bits); an exponent of zero is treated as the
//            value zero (see Anzero1/Bnzero1)
//   [16]     sign
//   [15:0]   mantissa, with an implied leading one above bit 15
//
// Ports:
//   clock  - all registers are clocked on the rising edge
//   Ainx   - operand A
//   Binx   - operand B
//   Coutx  - A + B, valid 8 clocks after the operands are sampled
//            (input register, six pipeline stages, output register)
//
// Both possible orderings (A larger / B larger) are computed in parallel
// through stage 3; the correct one is selected in stage 4, and the result
// is renormalised in stages 4-6.
//
// NOTE: a result whose exponent is zero can still leave this module with
// mantissa bits set (stage 6 does not clear them).  Such values are
// accepted as zero on input because stage 1 flushes their mantissa.
module float_add(
   clock,
   Ainx, Binx,
   Coutx);

input clock;
input [24:0] Ainx, Binx;
output [24:0] Coutx;
reg [24:0] Coutx, Cout;


reg [24:0] Ain, Bin;


// Decouple the inputs from the inferred IOBs,
// Otherwise, we get bad (and misleading) routing delays.
always @(posedge clock) begin
    Ain <= Ainx;
    Bin <= Binx;
end



/************** STAGE 1 **************/
// Exponent differences in both directions, zero detection, and a split
// (upper byte / lower byte) mantissa comparison.

reg [8:0] Bshift1, Ashift1;
reg BmantGT1u, BmantGT1l, BmantEQ1u;
reg [24:0] A1, B1;
reg Anzero1, Bnzero1;
always @(posedge clock) begin
    // For if A is greater.  The 9th bit is the borrow: it is set when
    // A's exponent is smaller than B's.
    Bshift1 <= Ain[24:17] - Bin[24:17];

    // For if B is greater.  Bit 8 is set when B's exponent is smaller.
    Ashift1 <= Bin[24:17] - Ain[24:17];

    // Check for (non)zeros
    Anzero1 <= |Ain[24:17];
    Bnzero1 <= |Bin[24:17];

    // Compare mantissas
    BmantGT1u <= Bin[15:8] > Ain[15:8];
    BmantGT1l <= Bin[7:0] > Ain[7:0];
    BmantEQ1u <= Bin[15:8] == Ain[15:8];

    // Forward A and B.  An operand with a zero exponent stands for zero,
    // so its mantissa is flushed here.  Without this, leftover mantissa
    // bits of a zero operand would be added to (or subtracted from) the
    // other operand, and two such zeros could carry out and produce a
    // result with a nonzero exponent.
    A1 <= {Ain[24:16], (|Ain[24:17]) ? Ain[15:0] : 16'b0};
    B1 <= {Bin[24:16], (|Bin[24:17]) ? Bin[15:0] : 16'b0};
end


/************** STAGE 2 **************/
// Align the smaller operand (both candidates), decide add vs. subtract,
// and decide which operand has the larger magnitude.

reg [16:0] Ashifted2, Bshifted2;
reg sub2, PickB2;
reg [24:0] A2, B2;

// Full 16-bit "B mantissa > A mantissa" from the two byte-wide compares.
wire BmantGT2 = BmantGT1u || (BmantEQ1u && BmantGT1l);

// Mantissa with its leading bit made explicit; the leading bit is zero
// for an operand whose exponent is zero.
wire [16:0] Btoshift2 = {Bnzero1, B1[15:0]};
wire [16:0] Atoshift2 = {Anzero1, A1[15:0]};

always @(posedge clock) begin
    // For if A is greater
    Bshifted2 <= Btoshift2 >> Bshift1[7:0];

    // For if B is greater
    Ashifted2 <= Atoshift2 >> Ashift1[7:0];

    // Subtract if signs are different
    sub2 <= A1[16] ^ B1[16];

    // Compare A and B: B wins if its exponent is larger, or if the
    // exponents are equal and its mantissa is larger.
    PickB2 <= Bshift1[8] || (!Ashift1[8] && BmantGT2);

    // Forward
    A2 <= A1;
    B2 <= B1;
end


/************** STAGE 3 **************/
// Add or subtract the aligned mantissas for both orderings.  Bit 17 of
// each result is the carry out of an addition.

// For if A is greater, addsub: A,B
wire [17:0] A_as0, B_as0, C_as0;
assign A_as0 = {1'b1, A2[15:0]};
assign B_as0 = Bshifted2;
assign C_as0 = sub2 ? (A_as0-B_as0) : (A_as0+B_as0);

// For if B is greater, addsub: B,A
wire [17:0] A_as1, B_as1, C_as1;
assign B_as1 = {1'b1, B2[15:0]};
assign A_as1 = Ashifted2;
assign C_as1 = sub2 ? (B_as1-A_as1) : (B_as1+A_as1);

reg [17:0] Amant3, Bmant3;
reg Asign3, Bsign3, PickB3;
reg [7:0] Aexp3, Bexp3;

always @(posedge clock) begin
    // For if A is greater
    Amant3 <= C_as0;
    Asign3 <= A2[16];
    Aexp3 <= A2[24:17];

    // For if B is greater
    Bmant3 <= C_as1;
    Bsign3 <= B2[16];
    Bexp3 <= B2[24:17];

    // Forward
    PickB3 <= PickB2;
end


/************** STAGE 4 **************/
// Select the result from the larger operand's path and start the
// leading-one search: the mantissa is examined in four 4-bit groups.
// For each group, extra_shift[] holds the position of its first set bit
// (0..3 from the top of the group) and hasbits4[] says whether the group
// contains any set bit.


wire [17:0] mant4i = PickB3 ? Bmant3 : Amant3;
wire [7:0] exp4i = PickB3 ? Bexp3 : Aexp3;
wire sign4i = PickB3 ? Bsign3 : Asign3;


reg sign4;
reg [17:0] mant4;
reg [7:0] exp4;
reg [3:0] hasbits4;
reg [1:0] extra_shift [0:3];

always @(posedge clock) begin
    sign4 <= sign4i;

    mant4 <= mant4i;
    exp4 <= exp4i;

    // Each "case (1'b1)" is a priority encoder: the first listed bit
    // that is set wins.  The assignment before the case is the default
    // used when no bit in the group is set.
    extra_shift[0] <= 0;
    case (1'b1)
        mant4i[16]: extra_shift[0] <= 0;
        mant4i[15]: extra_shift[0] <= 1;
        mant4i[14]: extra_shift[0] <= 2;
        mant4i[13]: extra_shift[0] <= 3;
    endcase

    extra_shift[1] <= 0;
    case (1'b1)
        mant4i[12]: extra_shift[1] <= 0;
        mant4i[11]: extra_shift[1] <= 1;
        mant4i[10]: extra_shift[1] <= 2;
        mant4i[ 9]: extra_shift[1] <= 3;
    endcase

    extra_shift[2] <= 0;
    case (1'b1)
        mant4i[ 8]: extra_shift[2] <= 0;
        mant4i[ 7]: extra_shift[2] <= 1;
        mant4i[ 6]: extra_shift[2] <= 2;
        mant4i[ 5]: extra_shift[2] <= 3;
    endcase

    extra_shift[3] <= 0;
    case (1'b1)
        mant4i[ 4]: extra_shift[3] <= 0;
        mant4i[ 3]: extra_shift[3] <= 1;
        mant4i[ 2]: extra_shift[3] <= 2;
        mant4i[ 1]: extra_shift[3] <= 3;
    endcase

    // Bit 17 is included in the top group here, but stage 5 tests
    // mant4[17] first, so it never reaches the hasbits4[3] branch.
    hasbits4[3] <= |mant4i[17:13];
    hasbits4[2] <= |mant4i[12:9];
    hasbits4[1] <= |mant4i[8:5];
    hasbits4[0] <= |mant4i[4:1];
end



/************** STAGE 5 **************/
// Coarse normalisation: shift left by 0, 4, 8 or 12 according to the
// first non-empty group, and record the remaining 0..3 bit shift (lsh5)
// and the total exponent adjustment (exp_diff5) for stage 6.


reg [8:0] exp5;
reg [4:0] exp_diff5;
reg sign5;
reg [15:0] mant5;
reg [1:0] lsh5;


always @(posedge clock) begin
    sign5 <= sign4;

    // Default; overridden below on carry-out and on an all-zero result.
    exp5 <= exp4;
    if (mant4[17]) begin
        // Carry out of the add: shift right by one, bump the exponent.
        mant5 <= mant4[16:1];
        lsh5 <= 0;
        exp5 <= exp4 + 1;
        exp_diff5 <= 0;
    end else if (hasbits4[3]) begin
        mant5 <= mant4[15:0];
        lsh5 <= extra_shift[0];
        exp_diff5 <= extra_shift[0];
    end else if (hasbits4[2]) begin
        mant5 <= {mant4[11:0], 4'b0};
        lsh5 <= extra_shift[1];
        exp_diff5 <= {1'b1, extra_shift[1]};   // 4 + fine shift
    end else if (hasbits4[1]) begin
        mant5 <= {mant4[7:0], 8'b0};
        lsh5 <= extra_shift[2];
        exp_diff5 <= {2'd2, extra_shift[2]};   // 8 + fine shift
    end else if (hasbits4[0]) begin
        mant5 <= {mant4[3:0], 12'b0};
        lsh5 <= extra_shift[3];
        exp_diff5 <= {2'd3, extra_shift[3]};   // 12 + fine shift
    end else begin
        mant5 <= 0;
        lsh5 <= 0;
        if (mant4[0]) begin
            // Only the lowest bit is set: it becomes the implied one.
            exp_diff5 <= 16;
        end else begin
            // Result is exactly zero.
            exp5 <= 0;
            exp_diff5 <= 0;
        end
    end
end


/************** STAGE 6 **************/
// Apply the exponent adjustment, saturate on overflow/underflow, and
// perform the final 0..3 bit mantissa shift.


// Two extra bits: bit 8 flags overflow past 255, bit 9 flags a negative
// (underflowed) exponent.
wire [9:0] exp6 = exp5 - exp_diff5;

always @(posedge clock) begin
    Cout[16] <= sign5;
    case (exp6[9:8])
        0: Cout[24:17] <= exp6[7:0];  // in bounds
        1: Cout[24:17] <= 255;        // overflow
        2,3: Cout[24:17] <= 0;        // underflow
    endcase
    Cout[15:0] <= mant5 << lsh5;
end




// Decouple outputs from inferred IOBs.
always @(posedge clock) begin
    Coutx <= Cout;
end


endmodule
_______________________________________________
Open-graphics mailing list
[email protected]
http://lists.duskglow.com/mailman/listinfo/open-graphics
List service provided by Duskglow Consulting, LLC (www.duskglow.com)

Reply via email to