On Monday, 28 January 2013 at 23:11:11 UTC, Walter Bright wrote:
http://www.drdobbs.com/cpp/implementing-half-floats-in-d/240146674


Since it got lost in the old thread on this topic, I'll repost my versions of floatToShort and shortToFloat, which are extremely fast (no unpredictable branches, no lookup tables) and respect the current rounding mode:

-----------------

// Expands a half-precision value, passed as its raw 16 bits, into a float.
// Zero and subnormal inputs are rebuilt with one floating-point subtraction;
// every other input is converted purely by bit manipulation.
float shortToFloat(ushort s)
{
    // Widen with sign extension and then shift: the half's mantissa and
    // exponent land where a float expects them, and the sign bit is smeared
    // across the top bits, including bit 31.
    const uint widened = (cast(int)cast(short)s) << 13;
    const halfExp = s & EXPMASK;

    if (halfExp != 0)
    {
        // Normal, infinity or NaN: keep the sign and the low 28 bits, then
        // rebias the exponent (0x3800_0000 is 112 in the exponent field).
        uint bits = (widened & 0x8FFF_FFFF) + 0x3800_0000;

        // An all-ones half exponent must become an all-ones float exponent.
        if (halfExp == EXPMASK)
            bits |= FEXPMASK;

        return *cast(float *)&bits;
    }

    // Zero or subnormal. Reinterpreting the shifted bits directly would be
    // wrong twice over: the float format supplies a hidden leading bit that
    // the half value does not have, and the mantissa has moved to make room
    // for it. Build 0x1p-14 * (1 + fraction) instead (0x3880_0000 is the
    // bit pattern of 0x1p-14) and subtract the unwanted 0x1p-14.
    uint biased = (widened & 0x0FFF_FFFF) + 0x3880_0000;
    float magnitude = *cast(float *)&biased - 0x1p-14;

    // Put the original sign back on top of the magnitude.
    uint signedBits = (widened & 0x8000_0000) | *cast(uint *)&magnitude;
    return *cast(float *)&signedBits;
}


-----------------
NOTE: this only works on a 64-bit runtime; for 32-bit or CTFE with 80-bit intermediates, the constants need to be changed. Unfortunately, I don't know of a nice way to detect the size of the intermediates.
-----

// Converts a float to half precision, returned as its raw 16 bits.
//
// The mantissa is shortened with floating-point arithmetic, so rounding
// follows the current rounding mode. Values too large for a half overflow
// to infinity and values too small underflow. Infinity and NaN are
// converted by copying bits, and a NaN input always yields a NaN result.
//
// NOTE: the arithmetic below assumes intermediates are evaluated at float
// precision; with 80-bit intermediates the constants need to be changed.
ushort floatToShort(float f)
{
    // Remember the sign
    uint x = *cast(uint *)&f;

    ushort sgn = (x >> 16) & 0x8000;

    // Need to treat NaN and Inf specially, otherwise the
    // mantissa shortening step would generate a new NaN.
    if ( (x & FEXPMASK) == FEXPMASK)
    {
        ushort h = ( (x >> 13) & 0x7FFF) | sgn;

        // A NaN whose payload lies entirely in the 13 discarded low bits
        // would be left with an all-zero mantissa, which encodes infinity.
        // Set the top mantissa bit so the result is still a NaN.
        if ( (x & 0x007F_FFFF) != 0 && (h & 0x03FF) == 0 )
            h |= 0x0200;

        return h;
    }

    // Shorten the mantissa, rounding it according to the current rounding mode

    f = (f * (1.0f + 0x1p-13f) -f) * 0x1p13f;

    // Force large numbers to overflow by moving near float.max

    f *= 0x1p112f;
    f *= 0x1p-112f; // Then undo it

    // Force small numbers to underflow, and shift into position

    f *= 0x1p-112f;

    uint u = *cast(uint *)&f;

    // Drop the 13 low mantissa bits and reattach the sign.
    return ((u>>13) & 0x7FFF) | sgn;
}

Reply via email to