So after reading the translation of RYU I was interested too see if the decimalLength() function can be written to be faster, as it cascades up to 8 CMP.

After struggling with bad ideas I finally find something that looks nice:

- count the leading zero of the input
- switch() this count to have in the worst of the cases only 1 CMP (when the bit number possibly changes the digit count, e.g 9 -> 10 (when bsr(input) == 4)

after writing all the values allowing to know the cases where a comparison is necessary...

  min2 = 0b10
  max2 = 0b11
  min3 = 0b100
  max3 = 0b111
  ...
  ...
  min32 = 0b100.......0
  max32 = 0b111.......1

...I finally write the "nice" thing.

---
#!dmd -boundscheck=off -O -release -inline
import std.stdio;

ubyte decimalLength9(const uint v)
{
    if (v >= 100000000) { return 9; }
    if (v >= 10000000) { return 8; }
    if (v >= 1000000) { return 7; }
    if (v >= 100000) { return 6; }
    if (v >= 10000) { return 5; }
    if (v >= 1000) { return 4; }
    if (v >= 100) { return 3; }
    if (v >= 10) { return 2; }
    return 1;
}

ubyte fdecimalLength9(const uint v) pure nothrow
{
    import core.bitop;
    const ubyte lzc = cast(ubyte) (bsr(v) + 1);
    ubyte result;
    switch (lzc)
    {
        case 0 :
        case 1 :
        case 2 :
        case 3 : result =  1; break;
        case 4 : result =  v >= 10  ? 2 : 1; break;
        case 5 :
        case 6 : result =  2; break;
        case 7 : result =  v >= 100  ? 3 : 2; break;
        case 8 :
        case 9 : result =  3; break;
        case 10: result =  v >= 1000  ? 4 : 3; break;
        case 11:
        case 12:
        case 13: result =  4; break;
        case 14: result =  v >= 10000  ? 5 : 4; break;
        case 15:
        case 16: result =  5; break;
        case 17: result =  v >= 100000  ? 6 : 5; break;
        case 18:
        case 19: result =  6; break;
        case 20: result =  v >= 1000000  ? 7 : 6; break;
        case 21:
        case 22:
        case 23: result =  7; break;
        case 24: result =  v >= 10000000  ? 8 : 7; break;
        case 25:
        case 26: result =  8; break;
        case 27: result =  v >= 100000000  ? 9 : 8; break;
        case 28:
        case 29:
        case 30:
        case 31:
        case 32: result =  9; break;
        default: assert(false);
    }
    return result;
}

private ubyte ffdecimalLength9(const uint v) pure nothrow
{
    import core.bitop;
    const ubyte lzc = cast(ubyte) (bsr(v) + 1);
    static immutable pure nothrow ubyte function(uint)[33] tbl =
    [
    0 : (uint a) => ubyte(1),
    1 : (uint a) => ubyte(1),
    2 : (uint a) => ubyte(1),
    3 : (uint a) => ubyte(1),
    4 : (uint a) => a >= 10  ? ubyte(2) : ubyte(1),
    5 : (uint a) => ubyte(2),
    6 : (uint a) => ubyte(2),
    7 : (uint a) => a >= 100  ? ubyte(3) : ubyte(2),
    8 : (uint a) => ubyte(3),
    9 : (uint a) => ubyte(3),
    10: (uint a) => a >= 1000  ? ubyte(4) : ubyte(3),
    11: (uint a) => ubyte(4),
    12: (uint a) => ubyte(4),
    13: (uint a) => ubyte(4),
    14: (uint a) => a >= 10000  ? ubyte(5) : ubyte(4),
    15: (uint a) => ubyte(5),
    16: (uint a) => ubyte(5),
    17: (uint a) => a >= 100000  ? ubyte(6) : ubyte(5),
    18: (uint a) => ubyte(6),
    19: (uint a) => ubyte(6),
    20: (uint a) => a >= 1000000  ? ubyte(7) : ubyte(6),
    21: (uint a) => ubyte(7),
    22: (uint a) => ubyte(7),
    23: (uint a) => ubyte(7),
    24: (uint a) => a >= 10000000  ? ubyte(8) : ubyte(7),
    25: (uint a) => ubyte(8),
    26: (uint a) => ubyte(8),
    27: (uint a) => a >= 100000000  ? ubyte(9) : ubyte(8),
    28: (uint a) => ubyte(9),
    29: (uint a) => ubyte(9),
    30: (uint a) => ubyte(9),
    31: (uint a) => ubyte(9),
    32: (uint a) => ubyte(9),
    ];
    return tbl[lzc](v);
}

void main()
{
    import std.datetime.stopwatch, std.range, std.algorithm;

    int s1, s2, s3;
benchmark!({ iota(100000000u).each!(a => s1 += decimalLength9(a+1)); })(10).writeln; benchmark!({ iota(100000000u).each!(a => s2 += fdecimalLength9(a+1));})(10).writeln; benchmark!({ iota(100000000u).each!(a => s3 += ffdecimalLength9(a+1));})(10).writeln;
    assert(s1 == s2);
    assert(s1 == s3);
}
---

Then bad surprise. Even with ldmd (so ldc2 basically) feeded with the args from the script line. Maybe the fdecimalLength9 version is slightly faster. Only *slightly*. Good news, I've lost my time. So I try an alternative version that uses a table of delegates instead of a switch (ffdecimalLength9) and surprise, "tada", it is like **100x** slower then the two others.

How is that possible ?

Reply via email to