bearophile: > doubles12 : 0.00184026840268 For fun, and to have an approximate baseline reference, I have done a little benchmark in D too:
import std.stdio: writefln; // Using Tango std lib you don't need all this version (Win32) { import std.c.windows.windows; double clock() { long t; QueryPerformanceCounter(&t); return cast(double)t / queryPerformanceFrequency; } long queryPerformanceFrequency; static this() { QueryPerformanceFrequency(&queryPerformanceFrequency); } } union N { struct { int x, y; } long xy; } auto IN = [ N(4, 9), N(5, 0), N(6, 6), N(7, 2), N(3, 6), N(9, 6), N(0, 1), N(1, 6), N(0, 5), N(1, 2), N(8, 9), N(5, 4), N(1, 6), N(7, 6), N(9, 1), N(7, 6), N(0, 1), N(7, 4), N(7, 4), N(8, 4), N(8, 4), N(3, 5), N(9, 6), N(6, 1), N(3, 4), N(4, 5), N(0, 5), N(6, 3), N(2, 4), N(1, 6), N(9, 5), N(1, 2), N(5, 8), N(8, 5), N(3, 1), N(9, 4), N(9, 4), N(3, 3), N(4, 8), N(9, 7), N(8, 4), N(6, 2), N(1, 5), N(5, 8), N(8, 6), N(0, 8), N(5, 2), N(3, 4), N(0, 5), N(4, 4), N(2, 9), N(7, 7), N(1, 0), N(4, 2), N(5, 7), N(0, 4), N(2, 5), N(0, 8), N(7, 3), N(9, 1), N(0, 4), N(5, 0), N(4, 9), N(0, 6), N(3, 0), N(3, 0), N(3, 9), N(8, 3), N(7, 9), N(8, 5), N(7, 6), N(1, 5), N(0, 6), N(5, 9), N(6, 8), N(0, 0), N(4, 1), N(3, 3), N(5, 4), N(5, 3), N(6, 1), N(5, 4), N(4, 5), N(5, 8), N(4, 1), N(3, 6), N(1, 9), N(0, 5), N(6, 5), N(5, 5), N(6, 0), N(0, 9), N(2, 6), N(0, 7), N(5, 9), N(7, 3), N(7, 9), N(5, 4), N(4, 9), N(2, 9) ]; N[] doubles13() { size_t[N] dup; // used as set N[] SN; foreach (item; IN) { if (item in dup) SN ~= item; else dup[item] = 0; } return SN; } N[] doubles0() { N[] SN; for (int i; i < IN.length; i++) for (int j; j < IN.length; j++) if (i != j) if (IN[i] == IN[j]) SN ~= IN[i]; return SN; } void test_results() { size_t[N] set1, set2; foreach (n; doubles0()) set1[n] = 0; foreach (n; doubles13()) set2[n] = 0; if (set1.keys.sort != set2.keys.sort) throw new Error("ERROR"); } void main() { int n = 150_000; test_results(); int count = n; auto t0 = clock(); while (count--) doubles13(); auto t1 = clock(); writefln("%0.10f", cast(double)(t1 - t0) / n); } doubles13() needs 0.000027-0.000037 seconds (*), about 
60-75 times faster than doubles12; this means about 3-4 seconds instead of 15h (on the original computer). Using C++ with GCC (using a <hash_map> for dup and a <vector> for SN) you can probably go 10-40% faster still :-) (*) The timings probably vary this much because the current DMD compiler I have used doesn't add "align" instructions to the asm it produces, as GCC does. Since modern CPUs are very sensitive to code alignment, this can lead to even 20-30% runtime differences in small tight loops. Bye, bearophile -- http://mail.python.org/mailman/listinfo/python-list