Hi,
I spent some time a while ago on a histogram function for numpy. It uses
digitize and bincount instead of sorting the data. If I remember right, it
was significantly faster than numpy's histogram, but I don't know how it
will behave with very large data sets.
I attached the file if you want to take a look, or if you send me the
benchmark, I'll add my function to it and report the results.
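In short, the idea boils down to something like this (a bare-bones sketch
with made-up names; the attached file is the real thing and also handles
weights, axis, outliers and values falling exactly on the rightmost edge):

from numpy import digitize, bincount, zeros

def quick_hist(a, edges):
    # digitize maps each sample to the slot it falls in: 0 means below the
    # first edge, len(edges) means at or beyond the last edge.
    binindex = digitize(a, edges)
    # bincount then tallies the samples per slot in a single pass, no sort
    # needed.
    count = zeros(len(edges) + 1, dtype=int)
    flat = bincount(binindex)
    count[:len(flat)] = flat
    # Slots 0 and -1 hold the outliers; the middle slots are the histogram.
    return count[1:-1]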
Cheers,
David
2006/12/14, eric jones <[EMAIL PROTECTED]>:
Rick White wrote:
> Just so we don't get too smug about the speed, if I do this in IDL on
> the same machine it is 10 times faster (0.28 seconds instead of 4
> seconds). I'm sure the IDL version uses the much faster approach of
> just sweeping through the array once, incrementing counts in the
> appropriate bins. It only handles equal-sized bins, so it is not as
> general as the numpy version -- but equal-sized bins is a very common
> case. I'd still like to see a C version of histogram (which I guess
> would need to be a ufunc) go into the core numpy.
>
Yes, this gets rid of the search, and indices can just be calculated
from offsets. I've attached a modified weave-based histogram that takes this
approach. Running the snippet below on my machine takes 0.118 sec for
the evenly binned weave algorithm and 0.385 sec for Rick's algorithm on
5 million elements. That is close to 4x faster (but not 10x...), so
there is indeed some speed to be gained for the common special case. I
don't know if the code I wrote has a 2x gain left in it, but I've spent
zero time optimizing it. I'd bet it can be improved substantially.
eric
### test_weave_even_histogram.py
from numpy import arange, product, sum, zeros, uint8
from numpy.random import randint
import weave_even_histogram
import time
shape = 1000,1000,5
size = product(shape)
data = randint(0,256,size).astype(uint8)
bins = arange(256+1)
print 'type:', data.dtype
print 'millions of elements:', size/1e6
bin_start = 0
bin_size = 1
bin_count = 256
t1 = time.clock()
res = weave_even_histogram.histogram(data, bin_start, bin_size, bin_count)
t2 = time.clock()
print 'sec (evenly spaced):', t2-t1, sum(res)
print res
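For reference, the offset calculation for evenly spaced bins boils down to
something like this in pure numpy (just a sketch of the idea; the compiled
version lives in the attached weave_even_histogram module and is not
reproduced here):

from numpy import clip, zeros, bincount

def even_histogram(data, bin_start, bin_size, bin_count):
    # For equal-width bins the bin index is a direct offset calculation,
    # so there is no per-sample search over the bin edges.
    idx = ((data.ravel() - bin_start) / float(bin_size)).astype(int)
    # Clamp out-of-range values into the end bins (the C version could just
    # as well drop them).
    idx = clip(idx, 0, bin_count - 1)
    count = zeros(bin_count, dtype=int)
    flat = bincount(idx)
    count[:len(flat)] = flat
    return count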
> Rick
### David's attached histogram function
# License: Scipy compatible
# Author: David Huard, 2006
from numpy import *
def histogram(a, bins=10, range=None, normed=False, weights=None, axis=None):
    """histogram(a, bins=10, range=None, normed=False, weights=None, axis=None)
                                                                      -> H, dict

    Return the distribution of the sample.

    Parameters
    ----------
    a:       Array sample.
    bins:    Number of bins, or an array of bin edges, in which case the
             range is not used.
    range:   Lower and upper bin edges, default: [min, max].
    normed:  Boolean, if False, return the number of samples in each bin,
             if True, return the density.
    weights: Sample weights. The weights are normed only if normed is True.
             Should weights.sum() not equal len(a), the total bin count will
             not be equal to the number of samples.
    axis:    Specifies the dimension along which the histogram is computed.
             Defaults to None, which aggregates the entire sample array.

    Output
    ------
    H:       The number of samples in each bin.
             If normed is True, H is a frequency distribution.
    dict{
        'edges':      The bin edges, including the rightmost edge.
        'upper':      Upper outliers.
        'lower':      Lower outliers.
        'bincenters': Center of bins.
    }

    Examples
    --------
    x = random.rand(100,10)
    H, Dict = histogram(x, bins=10, range=[0,1], normed=True)
    H2, Dict = histogram(x, bins=10, range=[0,1], normed=True, axis=0)

    See also: histogramnd
    """
    a = asarray(a)
    if axis is None:
        a = atleast_1d(a.ravel())
        axis = 0

    # Bin edges.
    if not iterable(bins):
        if range is None:
            range = (a.min(), a.max())
        mn, mx = [mi+0.0 for mi in range]
        if mn == mx:
            mn -= 0.5
            mx += 0.5
        edges = linspace(mn, mx, bins+1, endpoint=True)
    else:
        edges = asarray(bins, float)

    dedges = diff(edges)
    # Rounding precision used in __hist1d to catch values lying exactly on
    # the rightmost edge.
    decimal = int(-log10(dedges.min())+6)
    bincenters = edges[:-1] + dedges/2.

    # apply_along_axis accepts only one array input, but we need to pass the
    # weights along with the sample. The strategy here is to concatenate the
    # weights array along axis, so the passed array contains [sample, weights].
    # The array is then split back in __hist1d.
    if weights is not None:
        aw = concatenate((a, weights), axis)
        weighted = True
    else:
        aw = a
        weighted = False

    count = apply_along_axis(__hist1d, axis, aw, edges, decimal, weighted, normed)

    # Outlier count.
    upper = count.take(array([-1]), axis)
    lower = count.take(array([0]), axis)

    # Non-outlier count.
    core = a.ndim*[slice(None)]
    core[axis] = slice(1, -1)
    hist = count[tuple(core)]

    if normed:
        normalize = lambda x: atleast_1d(x/(x*dedges).sum())
        hist = apply_along_axis(normalize, axis, hist)

    return hist, {'edges': edges, 'lower': lower, 'upper': upper,
                  'bincenters': bincenters}


def __hist1d(aw, edges, decimal, weighted, normed):
    """Internal routine to compute the 1d histogram.

    aw:       sample, [weights]
    edges:    bin edges
    decimal:  rounding precision used to put values lying on the rightmost
              edge into the last bin.
    weighted: means that the weights are appended to array a.

    Return the bin count, or the frequency if normed.
    """
    nbin = edges.shape[0]+1
    if weighted:
        count = zeros(nbin, dtype=float)
        a, w = hsplit(aw, 2)
        if normed:
            w = w/w.mean()
    else:
        a = aw
        count = zeros(nbin, dtype=int)
        w = None

    binindex = digitize(a, edges)

    # Values that fall on an edge are put in the right bin.
    # For the rightmost bin, we want values equal to the right
    # edge to be counted in the last bin, and not as an outlier.
    on_edge = where(around(a, decimal) == around(edges[-1], decimal))[0]
    binindex[on_edge] -= 1

    # Count the number of identical indices.
    flatcount = bincount(binindex, w)

    # Place the count in the histogram array.
    i = arange(len(flatcount))
    count[i] = flatcount

    return count
from numpy.testing import *


class test_histogram(NumpyTestCase):
    def check_simple(self):
        n = 100
        v = rand(n)
        (a, b) = histogram(v)
        # Check if the sum of the bins equals the number of samples.
        assert(sum(a, axis=0) == n)
        # Check that the bin counts are evenly spaced when the data comes
        # from a linear function.
        (a, b) = histogram(linspace(0, 10, 100))
        assert(all(a == 10))
        # Check the construction of the bin array.
        a, b = histogram(v, bins=4, range=[.2, .8])
        assert(all(b['edges'] == linspace(.2, .8, 5)))
        # Check the number of outliers.
        assert((v < .2).sum() == b['lower'])
        assert((v > .8).sum() == b['upper'])
        # Check the normalization.
        bins = [0, .5, .75, 1]
        a, b = histogram(v, bins, normed=True)
        assert_almost_equal((a*diff(bins)).sum(), 1)

    def check_axis(self):
        n, m = 100, 20
        v = rand(n, m)
        a, b = histogram(v, bins=5)
        # Check dimension is reduced (axis=None).
        assert(a.ndim == 1)
        # Check total count is equal to the number of samples.
        assert(a.sum() == n*m)
        a, b = histogram(v, bins=7, axis=0)
        # Check shape of new array is ok.
        assert(a.ndim == 2)
        assert_array_equal(a.shape, [7, m])
        # Check normalization is consistent.
        a, b = histogram(v, bins=7, axis=0, normed=True)
        assert_array_almost_equal((a.T*diff(b['edges'])).sum(1), ones((m)))
        a, b = histogram(v, bins=7, axis=1, normed=True)
        assert_array_equal(a.shape, [n, 7])
        assert_array_almost_equal((a*diff(b['edges'])).sum(1), ones((n)))
        # Check results are consistent with the 1d estimate.
        a1, b1 = histogram(v[0, :], bins=b['edges'], normed=True)
        assert_array_equal(a1, a[0, :])

    def check_weights(self):
        # Check weights = constant gives the same answer as no weights.
        v = rand(100)
        w = ones(100)*5
        a, b = histogram(v)
        na, nb = histogram(v, normed=True)
        wa, wb = histogram(v, weights=w)
        nwa, nwb = histogram(v, weights=w, normed=True)
        assert_array_equal(a*5, wa)
        assert_array_equal(na, nwa)
        # Check weights are properly applied.
        v = linspace(0, 10, 10)
        w = concatenate((zeros(5), ones(5)))
        wa, wb = histogram(v, bins=arange(11), weights=w)
        assert_array_almost_equal(wa, w)


if __name__ == "__main__":
    NumpyTest().run()
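A quick way to compare this against numpy's built-in histogram on a large
sample, along the lines of the benchmark mentioned at the top of the thread
(a rough sketch only; timings will vary with the machine and numpy version,
and the two routines do not use identical bin conventions):

import time
import numpy
from numpy.random import rand

data = rand(5000000)          # 5 million samples, as in eric's test

t0 = time.time()
ref, ref_edges = numpy.histogram(data, bins=100)
t1 = time.time()
new, info = histogram(data, bins=100)   # the digitize/bincount version above
t2 = time.time()

print('numpy.histogram:    %.3f sec' % (t1 - t0))
print('digitize/bincount:  %.3f sec' % (t2 - t1))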