I'm exploring the possibility of implementing some of the basic software building blocks (memcpy, memcmp, memmove, etc...) that D utilizes from the C library with D implementations. There are many reasons to do this, one of which is to leverage information available at compile-time and in D's type system (type sizes, alignment, etc...) in order to optimize the implementation of these functions, and allow them to be used from @safe code.

The prevailing wisdom has been that there is no way to improve on C's memcpy implementation given that it has been mirco-optimized to death over several decades by many talented members of the human race.

So, I threw the following benchmark together to try to get a clue about what I was up against, and in a very short time, I beat the snot of C's memcpy. The benefit seems to disappear as the array sizes increase, but I believe the vast majority of calls to memcpy are probably quite small.

import std.datetime.stopwatch;
import std.stdio;
import core.stdc.string;
import std.random;
import std.algorithm;

enum length = 4096 * 2;
ubyte[length] dst;
ubyte[length] src;
auto rnd = Random(42);
ubyte[] src2;
ubyte[] dst2;

void verifyResults()
{
    assert(memcmp(dst.ptr, src.ptr, length) == 0);
    assert(memcmp(dst2.ptr, src2.ptr, length) == 0);
}

void randomizeData()
{
    for(int i = 0; i < length; i++)
    {
        src[i] = uniform!ubyte;
        dst[i] = uniform!ubyte;
    }

    src2 = src;
    dst2 = dst;
}

void memcpyD()
{
    dst = src.dup;
}

void memcpyDstdAlg()
{
    copy(src2, dst2);
}

void memcpyC()
{
    memcpy(dst.ptr, src.ptr, length);
}

void memcpyNaive()
{
    for(int i = 0; i < length; i++)
    {
        dst[i] = src[i];
    }
}

void memcpyASM()
{
    auto s = src.ptr;
    auto d = dst.ptr;
    size_t len = length;
    asm pure nothrow @nogc
    {
        mov RSI, s;
        mov RDI, d;
        cld;
        mov RCX, len;
        rep;
        movsb;
    }
}

void main()
{
    // verify the integrity of the algorithm
    randomizeData();
    memcpyD();
    verifyResults();

    randomizeData();
    memcpyDstdAlg();
    verifyResults();

    randomizeData();
    memcpyC();
    verifyResults();

    randomizeData();
    memcpyNaive();
    verifyResults();

    randomizeData();
    memcpyASM();
    verifyResults();

    // test the performance of the algorithm
auto r = benchmark!(memcpyD, memcpyDstdAlg, memcpyC, memcpyNaive, memcpyASM)(1000);
    Duration memcpyDResult = r[0];
    Duration memcpyDstdAlgResult = r[1];
    Duration memcpyCResult = r[2];
    Duration memcpyNaiveResult = r[3];
    Duration memcpyASMResult = r[4];

    writeln("memcpyD: ", memcpyDResult);
    writeln("memcpyDstdAlg: ", memcpyDstdAlgResult);
    writeln("memcpyC: ", memcpyCResult);
    writeln("memcpyNaive: ", memcpyNaiveResult);
    writeln("memcpyASM: ", memcpyASMResult);
}


------ Output --------
memcpyD:         1 ms, 772 μs, and 4 hnsecs
memcpyDstdAlg: 531 μs and 8 hnsecs
memcpyC:       371 μs and 3 hnsecs
memcpyNaive:    21 ms, 572 μs, and 2 hnsecs
memcpyASM:     119 μs and 6 hnsecs



I'm not experienced with this kind of programming, so I'm doubting these results. Have I done something wrong? Am I overlooking something?

Thanks,
Mike

Reply via email to