My crappy old Pentium 4 has gone totally mad: Sorting with Array.opIndex: 45080 Sorting with pointers: 45608 4.092e+16 percent faster (??)
Compiled with dmd -profile -O -release -inline -noboundscheck On 12/13/10, Iain Buclaw <ibuc...@ubuntu.com> wrote: > == Quote from Craig Black (craigbla...@cox.net)'s article >> The following program illustrates the problems with inlining in the dmd >> compiler. Perhaps with some more work I can reduce it to a smaller test >> case. I was playing around with a simple Array template, and noticed that >> similar C++ code performs much better. This is due, at least in part, to >> opIndex not being properly inlined by dmd. There are two sort functions, >> quickSort1 and quickSort2. quickSort1 indexes an Array data structure. >> quickSort2 indexes raw pointers. quickSort2 is roughly 20% faster on my >> core i7. >> import std.stdio; >> import std.date; >> import std.random; >> import std.conv; >> import std.algorithm; >> import std.range; >> struct Array(T) >> { >> public: >> this(int length) { resize(length); } >> this(T[] a) { append(a); } >> this(this) >> { >> if(!base.array) return; >> ArrayBase!T old; >> old = base; >> base.array = null; >> reserve(old.length(), old.length()); >> copyData(old); >> old.array = null; >> } >> void clear() { base.clear(); } >> void resize(int sz) >> { >> assert(sz >= 0); >> if(sz == 0) return clear(); >> if(!base.array) reserve(sz, sz); >> *base.lengthPtr() = sz; >> } >> void reserve(int capacity, int length) >> { >> assert(length <= capacity); >> if(capacity == 0) return clear(); >> ArrayBase!T old; >> if(base.array) >> { >> if(base.capacity() >= capacity) return; >> old.array = base.array; >> base.array = null; >> } >> base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]); >> *base.lengthPtr() = length; >> *base.capacityPtr() = capacity; >> for(int i = 0; i < capacity; i++) emplace!T(base.data()+i); >> if(old.array) copyData(old); >> } >> int length() const { return base.length(); } >> int capacity() const { return base.capacity(); } >> bool empty() const { return base.array == null; } >> ref T at(int i) >> { >> assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) >> ~ >> " out of bounds of empty array"); >> assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ >> to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1)); >> return base.data()[i]; >> } >> ref const T at(int i) const >> { >> assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) >> ~ >> " out of bounds of empty array"); >> assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ >> to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1)); >> return base.data()[i]; >> } >> const ref T opIndex(int i) const { return at(i); } >> void opIndexAssign(T t, int i) { at(i) = t; } >> void opIndexAssign(ref T t, int i) { at(i) = t; } >> void opAssign(ref const Array!T array) >> { >> copy(array); >> } >> void opAssign(T[] array) >> { >> int len = array.length; >> resize(len); >> for(int i = 0; i < len; i++) at(i) = array[i]; >> } >> void copy(ref const Array!T array) >> { >> if(array.empty()) return clear(); >> int len = array.length(); >> reserve(len, len); >> for(int i = 0; i < len; i++) at(i) = array[i]; >> } >> void opOpAssign(string op, A...)(A a) if(op == "~") >> { >> appendComposite(a); >> } >> ref T front() { return at(0); } >> const ref T front() const { return at(0); } >> ref T back() { return at(length()-1); } >> const ref T back() const { return at(length()-1); } >> ref T appendOne() >> { >> int sz = length(); >> int sp = capacity(); >> if(sp > sz) (*base.lengthPtr())++; >> else >> { >> sz++; >> sp = max(2, sp+sp/2, sz); >> reserve(sp, sz); >> } >> return back(); >> } >> void append(A...)(A a) >> { >> static if(a.length == 1 && (is(typeof(a[0]) == Array!T) || >> is(typeof(a[0]) == T[]))) >> { >> appendComposite(a[0]); >> } else { >> appendTuple(a); >> } >> } >> void appendTuple(A...)(A a) >> { >> foreach(x; a) appendOne() = x; >> } >> void appendComposite(A)(A a) >> { >> int prevLen = length(); >> resize(prevLen + a.length); >> for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i]; >> } >> private: >> static struct ArrayBase(T) >> { >> public: >> ~this() { clear(); } >> void clear() { if(array) delete array; } >> int length() const { return array ? *lengthPtr() : 0; } >> int capacity() const { return array ? *capacityPtr() : 0; } >> int* lengthPtr() const { return cast(int*)(array); } >> int* capacityPtr() const { return cast(int*)(array+4); } >> T* data() const { return cast(T*)(array+8); } >> ubyte* array; >> } >> ArrayBase!T base; >> void copyData(ref ArrayBase!T array) >> { >> int copyLen = min(length, array.length()); >> for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; } >> } >> } >> static bool less(T)(T a, T b) { return a < b; } >> void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high) >> { >> for(int i = low; i <= high; i++) >> { >> int min = i; >> for(int j = i + 1; j <= high; j++) >> if(L(a[j], a[min])) min = j; >> swap(a[i], a[min]); >> } >> } >> int partition1(T, alias L = less!T)(ref Array!T a, int p, int r) >> { >> T x = a[r]; >> int j = p - 1; >> for (int i = p; i < r; i++) >> { >> if (L(x, a[i])) continue; >> swap(a[i], a[++j]); >> } >> a[r] = a[j + 1]; >> a[j + 1] = x; >> return j + 1; >> } >> void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r) >> { >> if(p + 7 > r) return insertionSort1!(T, L)(a, p, r); >> if (p < r) >> { >> int q = partition1!(T, L)(a, p, r); >> quickSort1!(T, L)(a, p, q - 1); >> quickSort1!(T, L)(a, q + 1, r); >> } >> } >> void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0, >> a.length()-1); } >> void insertionSort2(T, alias L = less!T)(T *a, int low, int high) >> { >> for(int i = low; i <= high; i++) >> { >> int min = i; >> for(int j = i + 1; j <= high; j++) >> if(L(a[j], a[min])) min = j; >> swap(a[i], a[min]); >> } >> } >> int partition2(T, alias L = less!T)(T *a, int p, int r) >> { >> T x = a[r]; >> int j = p - 1; >> for (int i = p; i < r; i++) >> { >> if (L(x, a[i])) continue; >> swap(a[i], a[++j]); >> } >> a[r] = a[j + 1]; >> a[j + 1] = x; >> return j + 1; >> } >> void quickSort2(T, alias L = less!T)(T *a, int p, int r) >> { >> if(p + 7 > r) return insertionSort2!(T, L)(a, p, r); >> if (p < r) >> { >> int q = partition2!(T, L)(a, p, r); >> quickSort2!(T, L)(a, p, q - 1); >> quickSort2!(T, L)(a, q + 1, r); >> } >> } >> void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, >> 0, >> length-1); } >> double[] vals; >> void bench1() >> { >> Array!double v; >> for(int i = 0; i < 100; i++) >> { >> v = vals; >> sort1(v); >> } >> } >> void bench2() >> { >> Array!double v; >> for(int i = 0; i < 100; i++) >> { >> v = vals; >> sort2(&v[0], v.length); >> } >> } >> void main() >> { >> Mt19937 gen; >> vals.length = 1000; >> for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0); >> ulong[] times = [0, 0]; >> for(int i = 0; i < 100; i++) >> { >> auto times2 = benchmark!(bench1, bench2)(1); >> times[0] += times2[0]; >> times[1] += times2[1]; >> } >> writeln("Sorting with Array.opIndex: ", times[0]); >> writeln("Sorting with pointers: ", times[1]); >> writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster"); >> } > > Testing on GDC with a Netbook, results from 3 consecutive runs are: > > Without -frelease > ------- > Sorting with Array.opIndex: 27981 > Sorting with pointers: 5602 > 79.9793 percent faster > > Sorting with Array.opIndex: 25565 > Sorting with pointers: 5179 > 79.7418 percent faster > > Sorting with Array.opIndex: 28657 > Sorting with pointers: 5772 > 79.8583 percent faster > ------- > > > With -frelease > ------- > Sorting with Array.opIndex: 10591 > Sorting with pointers: 4771 > 54.9523 percent faster > > Sorting with Array.opIndex: 10289 > Sorting with pointers: 4710 > 54.223 percent faster > > Sorting with Array.opIndex: 11305 > Sorting with pointers: 5216 > 53.8611 percent faster > ------- > > > With -frelease -fno-bounds-check > ------- > Sorting with Array.opIndex: 11651 > Sorting with pointers: 5236 > 55.0597 percent faster > > Sorting with Array.opIndex: 9873 > Sorting with pointers: 4559 > 53.8236 percent faster > > Sorting with Array.opIndex: 10361 > Sorting with pointers: 4745 > 54.2033 percent faster > ------- > > > GDC doesn't use DMD's FE inliner, but results from the GCC backend's > inliner: > ------- > Considering inline candidate check. > Inlining check into fillUp. > Merging blocks 9 and 10 > Merging blocks 9 and 11 > > Considering inline candidate initialize. > Inlining initialize into emplace. > Merging blocks 2 and 3 > Merging blocks 2 and 4 > > Considering inline candidate bench2. > Not inlining: code size would grow by 77 insns. > Considering inline candidate bench1. > Not inlining: code size would grow by 49 insns. > ------- > > > So there's _me_ seriously doubting that inlining has anything to do with the > 50% increase. > > Regards > Iain >