I'm exploring the possibility of implementing some of the basic
software building blocks (memcpy, memcmp, memmove, etc...) that D
utilizes from the C library with D implementations. There are
many reasons to do this, one of which is to leverage information
available at compile-time and in D's type system (type sizes,
alignment, etc...) in order to optimize the implementation of
these functions, and allow them to be used from @safe code.
The prevailing wisdom has been that there is no way to improve on
C's memcpy implementation given that it has been mirco-optimized
to death over several decades by many talented members of the
human race.
So, I threw the following benchmark together to try to get a clue
about what I was up against, and in a very short time, I beat the
snot of C's memcpy. The benefit seems to disappear as the array
sizes increase, but I believe the vast majority of calls to
memcpy are probably quite small.
import std.datetime.stopwatch;
import std.stdio;
import core.stdc.string;
import std.random;
import std.algorithm;
enum length = 4096 * 2;
ubyte[length] dst;
ubyte[length] src;
auto rnd = Random(42);
ubyte[] src2;
ubyte[] dst2;
void verifyResults()
{
assert(memcmp(dst.ptr, src.ptr, length) == 0);
assert(memcmp(dst2.ptr, src2.ptr, length) == 0);
}
void randomizeData()
{
for(int i = 0; i < length; i++)
{
src[i] = uniform!ubyte;
dst[i] = uniform!ubyte;
}
src2 = src;
dst2 = dst;
}
void memcpyD()
{
dst = src.dup;
}
void memcpyDstdAlg()
{
copy(src2, dst2);
}
void memcpyC()
{
memcpy(dst.ptr, src.ptr, length);
}
void memcpyNaive()
{
for(int i = 0; i < length; i++)
{
dst[i] = src[i];
}
}
void memcpyASM()
{
auto s = src.ptr;
auto d = dst.ptr;
size_t len = length;
asm pure nothrow @nogc
{
mov RSI, s;
mov RDI, d;
cld;
mov RCX, len;
rep;
movsb;
}
}
void main()
{
// verify the integrity of the algorithm
randomizeData();
memcpyD();
verifyResults();
randomizeData();
memcpyDstdAlg();
verifyResults();
randomizeData();
memcpyC();
verifyResults();
randomizeData();
memcpyNaive();
verifyResults();
randomizeData();
memcpyASM();
verifyResults();
// test the performance of the algorithm
auto r = benchmark!(memcpyD, memcpyDstdAlg, memcpyC,
memcpyNaive, memcpyASM)(1000);
Duration memcpyDResult = r[0];
Duration memcpyDstdAlgResult = r[1];
Duration memcpyCResult = r[2];
Duration memcpyNaiveResult = r[3];
Duration memcpyASMResult = r[4];
writeln("memcpyD: ", memcpyDResult);
writeln("memcpyDstdAlg: ", memcpyDstdAlgResult);
writeln("memcpyC: ", memcpyCResult);
writeln("memcpyNaive: ", memcpyNaiveResult);
writeln("memcpyASM: ", memcpyASMResult);
}
------ Output --------
memcpyD: 1 ms, 772 μs, and 4 hnsecs
memcpyDstdAlg: 531 μs and 8 hnsecs
memcpyC: 371 μs and 3 hnsecs
memcpyNaive: 21 ms, 572 μs, and 2 hnsecs
memcpyASM: 119 μs and 6 hnsecs
I'm not experienced with this kind of programming, so I'm
doubting these results. Have I done something wrong? Am I
overlooking something?
Thanks,
Mike