== Quote from Craig Black (craigbla...@cox.net)'s article > If the problem is not inlining, then why does the same code in C++ does not > suffer from this performance limitation (when compiled with Visual C++ > 2008)? > #include <stdio.h> > #include <assert.h> > #include <stdlib.h> > template <class T> > T min(T a, T b) { return a < b ? a : b; } > template <class T> > T max(T a, T b) { return a > b ? a : b; } > inline void *operator new(size_t, void *ptr) { return ptr; } > inline void operator delete(void *, void *) {} > template <class T> > class Array > { > public: > Array() {} > Array(int length) { resize(length); } > Array(const Array<T> &a) { append(a); } > void clear() { base.clear(); } > void resize(int sz) > { > assert(sz >= 0); > if(sz == 0) return clear(); > if(!base.array) reserve(sz, sz); > *base.lengthPtr() = sz; > } > void reserve(int capacity, int length) > { > assert(length <= capacity); > if(capacity == 0) return clear(); > ArrayBase old; > if(base.array) > { > if(base.capacity() >= capacity) return; > old.array = base.array; > base.array = 0; > } > base.array = (char*)(new char[capacity*sizeof(T)+8]); > *base.lengthPtr() = length; > *base.capacityPtr() = capacity; > for(int i = 0; i < capacity; i++) new(base.data()+i)T; > if(old.array) copyData(old); > } > int length() const { return base.length(); } > int capacity() const { return base.capacity(); } > bool empty() const { return base.array == 0; } > T &at(int i) > { > assert(!empty()); > assert(i >= 0 && i < length()); > return base.data()[i]; > } > const T &at(int i) const > { > assert(!empty()); > assert(i >= 0 && i < length()); > return base.data()[i]; > } > T &operator [](int i) { return at(i); } > const T &operator [](int i) const { return at(i); } > void operator = (const Array &array) > { > copy(array); > } > void copy(const Array &array) > { > if(array.empty()) return clear(); > int len = array.length(); > reserve(len, len); > for(int i = 0; i < len; i++) at(i) = array[i]; > } > T &front() { return at(0); } > const T &front() const { return at(0); } > T &back() { return at(length()-1); } > const T &back() const { return at(length()-1); } > T &append() > { > int sz = length(); > int sp = capacity(); > if(sp > sz) (*base.lengthPtr())++; > else > { > sz++; > sp = max(max(2, sp+sp/2), sz); > reserve(sp, sz); > } > return back(); > } > void append(const Array<T> &a) > { > int prevLen = length(); > resize(prevLen + a.length()); > for(int i = 0; i < a.length(); i++) at(i+prevLen) = a[i]; > } > private: > class ArrayBase > { > public: > ArrayBase() { array = 0; } > ~ArrayBase() { clear(); } > void clear() { if(array) delete array; } > int length() const { return array ? *lengthPtr() : 0; } > int capacity() const { return array ? *capacityPtr() : 0; } > int* lengthPtr() const { return (int*)array; } > int* capacityPtr() const { return (int*)(array+4); } > T* data() const { return (T*)(array+8); } > char* array; > }; > ArrayBase base; > void copyData(ArrayBase &array) > { > int copyLen = min(length(), array.length()); > for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; } > } > }; > template <class T> > struct Less > { > bool operator() (T a, T b) { return a < b; } > }; > template <class T> > void swap(T &a, T &b) > { > T temp = a; > a = b; > b = temp; > } > template <class T, class L> > void insertionSort1(Array<T> &a, int low, int high, L less) > { > for(int i = low; i <= high; i++) > { > int min = i; > for(int j = i + 1; j <= high; j++) > if(less(a[j], a[min])) min = j; > swap(a[i], a[min]); > } > } > template<class T, class L> > int partition1(Array<T> &a, int p, int r, L less) > { > T x = a[r]; > int j = p - 1; > for (int i = p; i < r; i++) > { > if (less(x, a[i])) continue; > swap(a[i], a[++j]); > } > a[r] = a[j + 1]; > a[j + 1] = x; > return j + 1; > } > template<class T, class L> > void quickSort1(Array<T> &a, int p, int r, L less) > { > if(p + 7 > r) return insertionSort1(a, p, r, less); > if (p < r) > { > int q = partition1(a, p, r, less); > quickSort1(a, p, q - 1, less); > quickSort1(a, q + 1, r, less); > } > } > template <class T, class L> > void sort1(Array<T> &a) { quickSort1(a, 0, a.length()-1, Less<T>()); } > template <class T, class L> > void insertionSort2(T *a, int low, int high, L less) > { > for(int i = low; i <= high; i++) > { > int min = i; > for(int j = i + 1; j <= high; j++) > if(less(a[j], a[min])) min = j; > swap(a[i], a[min]); > } > } > template<class T, class L> > int partition2(T *a, int p, int r, L less) > { > T x = a[r]; > int j = p - 1; > for (int i = p; i < r; i++) > { > if (less(x, a[i])) continue; > swap(a[i], a[++j]); > } > a[r] = a[j + 1]; > a[j + 1] = x; > return j + 1; > } > template<class T, class L> > void quickSort2(T *a, int p, int r, L less) > { > if(p + 7 > r) return insertionSort2(a, p, r, less); > if (p < r) > { > int q = partition2(a, p, r, less); > quickSort2(a, p, q - 1, less); > quickSort2(a, q + 1, r, less); > } > } > template <class T, class L> > void sort2(Array<T> &a) { quickSort2(&a[0], 0, a.length()-1, Less<T>()); } > typedef unsigned long long uint64; > uint64 getCycle() { > __asm { RDTSC } > } > Array<double> vals; > uint64 bench1() > { > uint64 startTime = getCycle(); > Array<double> v; > for(int i = 0; i < 100; i++) > { > v = vals; > sort1<double, Less<double> >(v); > } > return getCycle()-startTime; > } > uint64 bench2() > { > uint64 startTime = getCycle(); > Array<double> v; > for(int i = 0; i < 100; i++) > { > v = vals; > sort2<double, Less<double> >(v); > } > return getCycle()-startTime; > } > void main() > { > vals.resize(1000); > for(int i = 0; i < 1000; i++) vals[i] = (rand() % 1000000) / 1000.0; > uint64 time1 = 0; > uint64 time2 = 0; > for(int i = 0; i < 100; i++) > { > time1 += bench1(); > time2 += bench2(); > } > printf("Sorting with Array: %f\n", time1/100000.0); > printf("Sorting with pointers: %f\n", time2/100000.0); > printf("%f percent faster\n", 100.0 * (time1-time2) / time1); > }
D can use inline asm in much the same way, but that's deterring from your point. =) import std.c.stdio; import std.random; import std.conv; import std.algorithm; import std.range; struct Array(T) { public: this(int length) { resize(length); } this(T[] a) { append(a); } this(this) { if(!base.array) return; ArrayBase!T old; old = base; base.array = null; reserve(old.length(), old.length()); copyData(old); old.array = null; } void clear() { base.clear(); } void resize(int sz) { assert(sz >= 0); if(sz == 0) return clear(); if(!base.array) reserve(sz, sz); *base.lengthPtr() = sz; } void reserve(int capacity, int length) { assert(length <= capacity); if(capacity == 0) return clear(); ArrayBase!T old; if(base.array) { if(base.capacity() >= capacity) return; old.array = base.array; base.array = null; } base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]); *base.lengthPtr() = length; *base.capacityPtr() = capacity; for(int i = 0; i < capacity; i++) emplace!T(base.data()+i); if(old.array) copyData(old); } int length() const { return base.length(); } int capacity() const { return base.capacity(); } bool empty() const { return base.array == null; } ref T at(int i) { assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array"); assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()- 1)); return base.data()[i]; } ref const T at(int i) const { assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array"); assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()- 1)); return base.data()[i]; } const ref T opIndex(int i) const { return at(i); } void opIndexAssign(T t, int i) { at(i) = t; } void opIndexAssign(ref T t, int i) { at(i) = t; } void opAssign(ref const Array!T array) { copy(array); } void opAssign(T[] array) { int len = array.length; resize(len); for(int i = 0; i < len; i++) at(i) = array[i]; } void copy(ref const Array!T array) { if(array.empty()) return clear(); int len = array.length(); reserve(len, len); for(int i = 0; i < len; i++) at(i) = array[i]; } void opOpAssign(string op, A...)(A a) if(op == "~") { appendComposite(a); } ref T front() { return at(0); } const ref T front() const { return at(0); } ref T back() { return at(length()-1); } const ref T back() const { return at(length()-1); } ref T appendOne() { int sz = length(); int sp = capacity(); if(sp > sz) (*base.lengthPtr())++; else { sz++; sp = max(2, sp+sp/2, sz); reserve(sp, sz); } return back(); } void append(A...)(A a) { static if(a.length == 1 && (is(typeof(a[0]) == Array!T) || is(typeof(a[0]) == T[]))) { appendComposite(a[0]); } else { appendTuple(a); } } void appendTuple(A...)(A a) { foreach(x; a) appendOne() = x; } void appendComposite(A)(A a) { int prevLen = length(); resize(prevLen + a.length); for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i]; } private: static struct ArrayBase(T) { public: ~this() { clear(); } void clear() { if(array) delete array; } int length() const { return array ? *lengthPtr() : 0; } int capacity() const { return array ? *capacityPtr() : 0; } int* lengthPtr() const { return cast(int*)(array); } int* capacityPtr() const { return cast(int*)(array+4); } T* data() const { return cast(T*)(array+8); } ubyte* array; } ArrayBase!T base; void copyData(ref ArrayBase!T array) { int copyLen = min(length, array.length()); for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; } } } static bool less(T)(T a, T b) { return a < b; } void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(L(a[j], a[min])) min = j; swap(a[i], a[min]); } } int partition1(T, alias L = less!T)(ref Array!T a, int p, int r) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (L(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r) { if(p + 7 > r) return insertionSort1!(T, L)(a, p, r); if (p < r) { int q = partition1!(T, L)(a, p, r); quickSort1!(T, L)(a, p, q - 1); quickSort1!(T, L)(a, q + 1, r); } } void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0, a.length()-1); } void insertionSort2(T, alias L = less!T)(T *a, int low, int high) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(L(a[j], a[min])) min = j; swap(a[i], a[min]); } } int partition2(T, alias L = less!T)(T *a, int p, int r) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (L(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } void quickSort2(T, alias L = less!T)(T *a, int p, int r) { if(p + 7 > r) return insertionSort2!(T, L)(a, p, r); if (p < r) { int q = partition2!(T, L)(a, p, r); quickSort2!(T, L)(a, p, q - 1); quickSort2!(T, L)(a, q + 1, r); } } void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, 0, length-1); } ulong getCycle() { asm { rdtsc; } } double[] vals; ulong bench1() { ulong startTime = getCycle(); Array!double v; for(int i = 0; i < 100; i++) { v = vals; sort1(v); } return getCycle() - startTime; } ulong bench2() { ulong startTime = getCycle(); Array!double v; for(int i = 0; i < 100; i++) { v = vals; sort2(&v[0], v.length); } return getCycle() - startTime; } void main() { Mt19937 gen; vals.length = 1000; for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0); ulong time1; ulong time2; for(int i = 0; i < 100; i++) { time1 += bench1(); time2 += bench2(); } printf("Sorting with Array.opIndex: %f\n", time1/1e5); printf("Sorting with pointers: %f\n", time2/1e5); printf("%f percent faster\n", 100.0 * (time1-time2) / time1); } Testing your C++ program (altered getCycle() for GCC) Times I get: ------- Sorting with Array: 46869.159840 Sorting with pointers: 38688.937320 17.453316 percent faster Sorting with Array: 46631.903760 Sorting with pointers: 38520.609360 17.394302 percent faster Sorting with Array: 46674.330720 Sorting with pointers: 38545.202520 17.416700 percent faster ------- On a , I thought I might try an older version of GCC for the D program. Really surprisingly, I got: ------- Sorting with Array.opIndex: 43075.059840 Sorting with pointers: 40019.701920 7.093102 percent faster Sorting with Array.opIndex: 42940.085640 Sorting with pointers: 39594.089040 7.792245 percent faster Sorting with Array.opIndex: 44389.127280 Sorting with pointers: 41159.016960 7.276805 percent faster ------- This will need some thinking through as to just what happened between GDC-4.3 -> GDC-4.4 :~) Regards