Given: a large list (10,000,000) of floating point numbers; Task: fastest python code that finds k (small, e.g. 10) smallest items, preferably with item indexes; Limitations: in python, using only standard libraries (numpy & scipy is Ok);
I've tried several methods. With N = 10,000,000, K = 10 The fastest so far (without item indexes) was pure python implementation nsmallest_slott_bisect (using bisect/insert). And with indexes nargsmallest_numpy_argmin (argmin() in the numpy array k times). Anyone up to the challenge beating my code with some clever selection algorithm? Current Table: 1.66864395142 mins_heapq(items, n): 0.946580886841 nsmallest_slott_bisect(items, n): 1.38014793396 nargsmallest(items, n): 10.0732769966 sorted(items)[:n]: 3.17916202545 nargsmallest_numpy_argsort(items, n): 1.31794500351 nargsmallest_numpy_argmin(items, n): 2.37499308586 nargsmallest_numpy_array_argsort(items, n): 0.524670124054 nargsmallest_numpy_array_argmin(items, n): 0.0525538921356 numpy argmin(items): 1892997 0.364673852921 min(items): 10.0000026786 Code: ---------------- import heapq from random import randint, random import time from bisect import insort from itertools import islice from operator import itemgetter def mins_heapq(items, n): nlesser_items = heapq.nsmallest(n, items) return nlesser_items def nsmallest_slott_bisect(iterable, n, insort=insort): it = iter(iterable) mins = sorted(islice(it, n)) for el in it: if el <= mins[-1]: #NOTE: equal sign is to preserve duplicates insort(mins, el) mins.pop() return mins def nargsmallest(iterable, n, insort=insort): it = enumerate(iterable) mins = sorted(islice(it, n), key = itemgetter(1)) loser = mins[-1][1] # largest of smallest for el in it: if el[1] <= loser: # NOTE: equal sign is to preserve dupl mins.append(el) mins.sort(key = itemgetter(1)) mins.pop() loser = mins[-1][1] return mins def nargsmallest_numpy_argsort(iter, k): distances = N.asarray(iter) return [(i, distances[i]) for i in distances.argsort()[0:k]] def nargsmallest_numpy_array_argsort(array, k): return [(i, array[i]) for i in array.argsort()[0:k]] def nargsmallest_numpy_argmin(iter, k): distances = N.asarray(iter) mins = [] def nargsmallest_numpy_array_argmin(distances, k): mins = [] for i in xrange(k): j = distances.argmin() mins.append((j, distances[j])) distances[j] = float('inf') return mins test_data = [randint(10, 50) + random() for i in range(10000000)] K = 10 init = time.time() mins = mins_heapq(test_data, K) print time.time() - init, 'mins_heapq(items, n):', mins[:2] init = time.time() mins = nsmallest_slott_bisect(test_data, K) print time.time() - init, 'nsmallest_slott_bisect(items, n):', mins[: 2] init = time.time() mins = nargsmallest(test_data, K) print time.time() - init, 'nargsmallest(items, n):', mins[:2] init = time.time() mins = sorted(test_data)[:K] print time.time() - init, 'sorted(items)[:n]:', time.time() - init, mins[:2] import numpy as N init = time.time() mins = nargsmallest_numpy_argsort(test_data, K) print time.time() - init, 'nargsmallest_numpy_argsort(items, n):', mins[:2] init = time.time() mins = nargsmallest_numpy_argmin(test_data, K) print time.time() - init, 'nargsmallest_numpy_argmin(items, n):', mins[:2] print init = time.time() mins = array.argmin() print time.time() - init, 'numpy argmin(items):', mins init = time.time() mins = min(test_data) print time.time() - init, 'min(items):', mins -- http://mail.python.org/mailman/listinfo/python-list