Ye that's better: Sorting with Array.opIndex: 2859 Sorting with pointers: 1765 38.2651 percent faster
And it only took 2 seconds. With profile it takes a full minute. On 12/13/10, Craig Black <craigbla...@cox.net> wrote: > Try it without -profile. That tends to slow everything down to a halt. > > -Craig > > "Andrej Mitrovic" <andrej.mitrov...@gmail.com> wrote in message > news:mailman.948.1292198993.21107.digitalmar...@puremagic.com... >> My crappy old Pentium 4 has gone totally mad: >> >> Sorting with Array.opIndex: 45080 >> Sorting with pointers: 45608 >> 4.092e+16 percent faster (??) >> >> Compiled with dmd -profile -O -release -inline -noboundscheck >> >> On 12/13/10, Iain Buclaw <ibuc...@ubuntu.com> wrote: >>> == Quote from Craig Black (craigbla...@cox.net)'s article >>>> The following program illustrates the problems with inlining in the dmd >>>> compiler. Perhaps with some more work I can reduce it to a smaller test >>>> case. I was playing around with a simple Array template, and noticed >>>> that >>>> similar C++ code performs much better. This is due, at least in part, >>>> to >>>> opIndex not being properly inlined by dmd. There are two sort >>>> functions, >>>> quickSort1 and quickSort2. quickSort1 indexes an Array data structure. >>>> quickSort2 indexes raw pointers. quickSort2 is roughly 20% faster on my >>>> core i7. >>>> import std.stdio; >>>> import std.date; >>>> import std.random; >>>> import std.conv; >>>> import std.algorithm; >>>> import std.range; >>>> struct Array(T) >>>> { >>>> public: >>>> this(int length) { resize(length); } >>>> this(T[] a) { append(a); } >>>> this(this) >>>> { >>>> if(!base.array) return; >>>> ArrayBase!T old; >>>> old = base; >>>> base.array = null; >>>> reserve(old.length(), old.length()); >>>> copyData(old); >>>> old.array = null; >>>> } >>>> void clear() { base.clear(); } >>>> void resize(int sz) >>>> { >>>> assert(sz >= 0); >>>> if(sz == 0) return clear(); >>>> if(!base.array) reserve(sz, sz); >>>> *base.lengthPtr() = sz; >>>> } >>>> void reserve(int capacity, int length) >>>> { >>>> assert(length <= capacity); >>>> if(capacity == 0) return clear(); >>>> ArrayBase!T old; >>>> if(base.array) >>>> { >>>> if(base.capacity() >= capacity) return; >>>> old.array = base.array; >>>> base.array = null; >>>> } >>>> base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]); >>>> *base.lengthPtr() = length; >>>> *base.capacityPtr() = capacity; >>>> for(int i = 0; i < capacity; i++) emplace!T(base.data()+i); >>>> if(old.array) copyData(old); >>>> } >>>> int length() const { return base.length(); } >>>> int capacity() const { return base.capacity(); } >>>> bool empty() const { return base.array == null; } >>>> ref T at(int i) >>>> { >>>> assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ >>>> to!string(i) >>>> ~ >>>> " out of bounds of empty array"); >>>> assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " >>>> >>>> ~ >>>> to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1)); >>>> return base.data()[i]; >>>> } >>>> ref const T at(int i) const >>>> { >>>> assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ >>>> to!string(i) >>>> ~ >>>> " out of bounds of empty array"); >>>> assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " >>>> >>>> ~ >>>> to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1)); >>>> return base.data()[i]; >>>> } >>>> const ref T opIndex(int i) const { return at(i); } >>>> void opIndexAssign(T t, int i) { at(i) = t; } >>>> void opIndexAssign(ref T t, int i) { at(i) = t; } >>>> void opAssign(ref const Array!T array) >>>> { >>>> copy(array); >>>> } >>>> void opAssign(T[] array) >>>> { >>>> int len = array.length; >>>> resize(len); >>>> for(int i = 0; i < len; i++) at(i) = array[i]; >>>> } >>>> void copy(ref const Array!T array) >>>> { >>>> if(array.empty()) return clear(); >>>> int len = array.length(); >>>> reserve(len, len); >>>> for(int i = 0; i < len; i++) at(i) = array[i]; >>>> } >>>> void opOpAssign(string op, A...)(A a) if(op == "~") >>>> { >>>> appendComposite(a); >>>> } >>>> ref T front() { return at(0); } >>>> const ref T front() const { return at(0); } >>>> ref T back() { return at(length()-1); } >>>> const ref T back() const { return at(length()-1); } >>>> ref T appendOne() >>>> { >>>> int sz = length(); >>>> int sp = capacity(); >>>> if(sp > sz) (*base.lengthPtr())++; >>>> else >>>> { >>>> sz++; >>>> sp = max(2, sp+sp/2, sz); >>>> reserve(sp, sz); >>>> } >>>> return back(); >>>> } >>>> void append(A...)(A a) >>>> { >>>> static if(a.length == 1 && (is(typeof(a[0]) == Array!T) || >>>> is(typeof(a[0]) == T[]))) >>>> { >>>> appendComposite(a[0]); >>>> } else { >>>> appendTuple(a); >>>> } >>>> } >>>> void appendTuple(A...)(A a) >>>> { >>>> foreach(x; a) appendOne() = x; >>>> } >>>> void appendComposite(A)(A a) >>>> { >>>> int prevLen = length(); >>>> resize(prevLen + a.length); >>>> for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i]; >>>> } >>>> private: >>>> static struct ArrayBase(T) >>>> { >>>> public: >>>> ~this() { clear(); } >>>> void clear() { if(array) delete array; } >>>> int length() const { return array ? *lengthPtr() : 0; } >>>> int capacity() const { return array ? *capacityPtr() : 0; } >>>> int* lengthPtr() const { return cast(int*)(array); } >>>> int* capacityPtr() const { return cast(int*)(array+4); } >>>> T* data() const { return cast(T*)(array+8); } >>>> ubyte* array; >>>> } >>>> ArrayBase!T base; >>>> void copyData(ref ArrayBase!T array) >>>> { >>>> int copyLen = min(length, array.length()); >>>> for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; } >>>> } >>>> } >>>> static bool less(T)(T a, T b) { return a < b; } >>>> void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int >>>> high) >>>> { >>>> for(int i = low; i <= high; i++) >>>> { >>>> int min = i; >>>> for(int j = i + 1; j <= high; j++) >>>> if(L(a[j], a[min])) min = j; >>>> swap(a[i], a[min]); >>>> } >>>> } >>>> int partition1(T, alias L = less!T)(ref Array!T a, int p, int r) >>>> { >>>> T x = a[r]; >>>> int j = p - 1; >>>> for (int i = p; i < r; i++) >>>> { >>>> if (L(x, a[i])) continue; >>>> swap(a[i], a[++j]); >>>> } >>>> a[r] = a[j + 1]; >>>> a[j + 1] = x; >>>> return j + 1; >>>> } >>>> void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r) >>>> { >>>> if(p + 7 > r) return insertionSort1!(T, L)(a, p, r); >>>> if (p < r) >>>> { >>>> int q = partition1!(T, L)(a, p, r); >>>> quickSort1!(T, L)(a, p, q - 1); >>>> quickSort1!(T, L)(a, q + 1, r); >>>> } >>>> } >>>> void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0, >>>> a.length()-1); } >>>> void insertionSort2(T, alias L = less!T)(T *a, int low, int high) >>>> { >>>> for(int i = low; i <= high; i++) >>>> { >>>> int min = i; >>>> for(int j = i + 1; j <= high; j++) >>>> if(L(a[j], a[min])) min = j; >>>> swap(a[i], a[min]); >>>> } >>>> } >>>> int partition2(T, alias L = less!T)(T *a, int p, int r) >>>> { >>>> T x = a[r]; >>>> int j = p - 1; >>>> for (int i = p; i < r; i++) >>>> { >>>> if (L(x, a[i])) continue; >>>> swap(a[i], a[++j]); >>>> } >>>> a[r] = a[j + 1]; >>>> a[j + 1] = x; >>>> return j + 1; >>>> } >>>> void quickSort2(T, alias L = less!T)(T *a, int p, int r) >>>> { >>>> if(p + 7 > r) return insertionSort2!(T, L)(a, p, r); >>>> if (p < r) >>>> { >>>> int q = partition2!(T, L)(a, p, r); >>>> quickSort2!(T, L)(a, p, q - 1); >>>> quickSort2!(T, L)(a, q + 1, r); >>>> } >>>> } >>>> void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, >>>> 0, >>>> length-1); } >>>> double[] vals; >>>> void bench1() >>>> { >>>> Array!double v; >>>> for(int i = 0; i < 100; i++) >>>> { >>>> v = vals; >>>> sort1(v); >>>> } >>>> } >>>> void bench2() >>>> { >>>> Array!double v; >>>> for(int i = 0; i < 100; i++) >>>> { >>>> v = vals; >>>> sort2(&v[0], v.length); >>>> } >>>> } >>>> void main() >>>> { >>>> Mt19937 gen; >>>> vals.length = 1000; >>>> for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0); >>>> ulong[] times = [0, 0]; >>>> for(int i = 0; i < 100; i++) >>>> { >>>> auto times2 = benchmark!(bench1, bench2)(1); >>>> times[0] += times2[0]; >>>> times[1] += times2[1]; >>>> } >>>> writeln("Sorting with Array.opIndex: ", times[0]); >>>> writeln("Sorting with pointers: ", times[1]); >>>> writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster"); >>>> } >>> >>> Testing on GDC with a Netbook, results from 3 consecutive runs are: >>> >>> Without -frelease >>> ------- >>> Sorting with Array.opIndex: 27981 >>> Sorting with pointers: 5602 >>> 79.9793 percent faster >>> >>> Sorting with Array.opIndex: 25565 >>> Sorting with pointers: 5179 >>> 79.7418 percent faster >>> >>> Sorting with Array.opIndex: 28657 >>> Sorting with pointers: 5772 >>> 79.8583 percent faster >>> ------- >>> >>> >>> With -frelease >>> ------- >>> Sorting with Array.opIndex: 10591 >>> Sorting with pointers: 4771 >>> 54.9523 percent faster >>> >>> Sorting with Array.opIndex: 10289 >>> Sorting with pointers: 4710 >>> 54.223 percent faster >>> >>> Sorting with Array.opIndex: 11305 >>> Sorting with pointers: 5216 >>> 53.8611 percent faster >>> ------- >>> >>> >>> With -frelease -fno-bounds-check >>> ------- >>> Sorting with Array.opIndex: 11651 >>> Sorting with pointers: 5236 >>> 55.0597 percent faster >>> >>> Sorting with Array.opIndex: 9873 >>> Sorting with pointers: 4559 >>> 53.8236 percent faster >>> >>> Sorting with Array.opIndex: 10361 >>> Sorting with pointers: 4745 >>> 54.2033 percent faster >>> ------- >>> >>> >>> GDC doesn't use DMD's FE inliner, but results from the GCC backend's >>> inliner: >>> ------- >>> Considering inline candidate check. >>> Inlining check into fillUp. >>> Merging blocks 9 and 10 >>> Merging blocks 9 and 11 >>> >>> Considering inline candidate initialize. >>> Inlining initialize into emplace. >>> Merging blocks 2 and 3 >>> Merging blocks 2 and 4 >>> >>> Considering inline candidate bench2. >>> Not inlining: code size would grow by 77 insns. >>> Considering inline candidate bench1. >>> Not inlining: code size would grow by 49 insns. >>> ------- >>> >>> >>> So there's _me_ seriously doubting that inlining has anything to do with >>> the >>> 50% increase. >>> >>> Regards >>> Iain >>> > >