dsimcha: > import std.stdio, std.perf; [...] > Results: > Direct: 226 > Indirect: 229
Thank you. Following your example I have tried to do a bit more realistic tests. // test1.d, for D2, Phobos import std.stdio: writeln; import std.perf: PerformanceCounter; final class TNew(T) { T* ptr; size_t length, capacity; this(size_t size) { this.ptr = (new T[size]).ptr; this.length = size; this.capacity = size; } ref T opIndex(size_t index) { return this.ptr[index]; } } void main() { // increase N if you have a large L2 cache enum int N = 100_000; enum int NLOOP = 40_000; alias uint T; auto pc = new PerformanceCounter; auto a = new T[N]; auto c = new TNew!T(N); pc.start; T* p = a.ptr; for (uint j; j < NLOOP; j++) for (uint i; i < N; i++) p[i]++; pc.stop; writeln("Direct: ", pc.milliseconds); pc.start; for (uint j; j < NLOOP; j++) for (uint i; i < N; i++) c[i]++; pc.stop; writeln("Class: ", pc.milliseconds); pc.start; for (uint j; j < NLOOP; j++) for (uint i; i < N; i++) a[i]++; pc.stop; writeln("Array: ", pc.milliseconds); writeln(); pc.start; for (uint j; j < NLOOP; j++) for (uint i = N-1; i--; ) p[i]++; pc.stop; writeln("Direct: ", pc.milliseconds); pc.start; for (uint j; j < NLOOP; j++) for (uint i = N-1; i--; ) c[i]++; pc.stop; writeln("Class: ", pc.milliseconds); pc.start; for (uint j; j < NLOOP; j++) for (uint i = N-1; i--; ) a[i]++; pc.stop; writeln("Array: ", pc.milliseconds); } Timings on Windows, DMD v2.031, 2 GHz Core2 PC (use a larger N if you have a large L2 CPU Cache): Direct: 4654 Class: 4670 Array: 4095 Direct: 4650 Class: 4688 Array: 4089 It's curious how the "direct" version is slower than the normal dynamic array (this doesn't happen in LDC, see below). ----------------------------- Then I've tried to create a version that can be compiled with LDC too (with Tango). The LDC that I have compiles D1 code, so I can't use ref T opIndex(). So I have created the following version, if you spot bugs or mistakes please tell me: // test2.d, for D1, Phobos and Tango version (Tango) { import tango.stdc.stdio: printf; import tango.time.StopWatch: StopWatch; } else { import std.c.stdio: printf; import std.perf: PerformanceCounter; } final class TNew(T) { T* ptr; size_t length, capacity; this(size_t size) { this.ptr = (new T[size]).ptr; this.length = size; this.capacity = size; } T* refItem(size_t index) { return &(this.ptr[index]); } } void main() { // increase N if you have a large L2 cache const int N = 100_000; const int NLOOP = 40_000; alias uint T; version (Tango) StopWatch timer; else auto timer = new PerformanceCounter; int deltaTime; auto a = new T[N]; auto c = new TNew!(T)(N); timer.start; T* p = a.ptr; for (uint j; j < NLOOP; j++) for (uint i; i < N; i++) p[i]++; version (Tango) { deltaTime = timer.microsec() / 1000; } else { timer.stop; deltaTime = cast(int)timer.milliseconds; } printf("Direct: %d\n", deltaTime); timer.start; for (uint j; j < NLOOP; j++) for (uint i; i < N; i++) (*c.refItem(i))++; version (Tango) { deltaTime = timer.microsec() / 1000; } else { timer.stop; deltaTime = cast(int)timer.milliseconds; } printf("Class: %d\n", deltaTime); timer.start; for (uint j; j < NLOOP; j++) for (uint i; i < N; i++) a[i]++; version (Tango) { deltaTime = timer.microsec() / 1000; } else { timer.stop; deltaTime = cast(int)timer.milliseconds; } printf("Array: %d\n", deltaTime); printf("\n"); timer.start; for (uint j; j < NLOOP; j++) for (uint i = N-1; i--; ) p[i]++; version (Tango) { deltaTime = timer.microsec() / 1000; } else { timer.stop; deltaTime = cast(int)timer.milliseconds; } printf("Direct: %d\n", deltaTime); timer.start; for (uint j; j < NLOOP; j++) for (uint i = N-1; i--; ) (*c.refItem(i))++; version (Tango) { deltaTime = timer.microsec() / 1000; } else { timer.stop; deltaTime = cast(int)timer.milliseconds; } printf("Class: %d\n", deltaTime); timer.start; for (uint j; j < NLOOP; j++) for (uint i = N-1; i--; ) a[i]++; version (Tango) { deltaTime = timer.microsec() / 1000; } else { timer.stop; deltaTime = cast(int)timer.milliseconds; } printf("Array: %d\n", deltaTime); } Timings on Windows: Direct: 4654 Class: 4810 Array: 4086 Direct: 4665 Class: 4598 Array: 4084 Timings on a 32 bit Linux (ldc -O3 -release -inline): Direct: 4280 Class: 5000 Array: 4280 Direct: 4280 Class: 4860 Array: 4270 If such tests are correct, then the new resizable arrays of D2 will indeed have some performance penality (something like 13%-17% in this test, but results may change with other array sizes). Bye, bearophile