While writing code that works on matrices I have found something curious, so I have written the following little benchmark. As usual keep eyes open for possible bugs and mistakes of mine:
import std.conv: toInt; import std.c.stdlib: rand, malloc; const int RAND_MAX = short.max; // RAND_MAX seems missing from std.c.stdlib void main(string[] args) { int i, j, k; int N = args.length == 2 ? toInt(args[1]) : 400; // alloc A, B, C static if (1) { // Version #1 auto A = new double[][](N, N); auto B = new double[][](N, N); auto C = new double[][](N, N); } else { // Version #2 double** A = cast(double**)malloc(N * (double*).sizeof); for (i = 0; i < N; i++) A[i] = cast(double*)malloc(N * (double).sizeof); double** B = cast(double**)malloc(N * (double*).sizeof); for (i = 0; i < N; i++) B[i] = cast(double*)malloc(N * (double).sizeof); double** C = cast(double**)malloc(N * (double*).sizeof); for (i = 0; i < N; i++) C[i] = cast(double*)malloc(N * (double).sizeof); } // init mats randomly for (i = 0; i < N; i++) for (j = 0; j < N; j++) { A[i][j] = cast(double)rand() / RAND_MAX; B[i][j] = cast(double)rand() / RAND_MAX; } // C = A * B, naive way for (i = 0; i < N; i++) for (j = 0; j < N; j++) for (k = 0; k < N; k++) C[i][k] += A[i][j] * B[j][k]; } Timings, Core2, 2 GHz, DMD V.1.036: N = 100: Version 1: 0.17 s Version 2: 0.03 s N = 400: Version 1: 9.25 s Version 2: 0.26 s N = 512: Version 1: 19.3 s Version 2: 1.0 s -------------------------- Note there are much faster ways to perform A * B: // Alternative mul code double[] bj = new double[N]; for (int j = 0; j < N; j++) { for (int k = 0; k < N; k++) bj[k] = B[k][j]; for (int i = 0; i < N; i++) { double[] ai = A[i]; double s = 0; for (int k = 0; k < N; k++) s += ai[k] * bj[k]; C[i][j] = s; } } -------------------------------- Inner loop asm, version #1: L164: mov ECX,[EAX] fld qword ptr [ESI*8][ECX] mov ECX,0[EBP] fmul qword ptr [EBX*8][ECX] mov ECX,[EDX] fadd qword ptr [EBX*8][ECX] fstp qword ptr [EBX*8][ECX] inc EBX cmp EBX,EDI jl L164 Inner loop asm, version #2: L12B: mov EDX,4[EBX] mov EAX,[EBX] fld qword ptr [EDI*8][EDX] mov ECX,010h[ESP] mov EDX,4[ECX] mov EAX,[ECX] fmul qword ptr [ESI*8][EDX] mov ECX,014h[ESP] mov EDX,4[ECX] mov EAX,[ECX] fadd qword ptr [ESI*8][EDX] fstp qword ptr [ESI*8][EDX] inc ESI cmp ESI,EBP jl L12B Note: compiling the alternative mul code written in C with GCC the running speed is about twice still compared to the Version2. Bye, bearophile