Here is the code for the new histogram, tests included.
I'll wait for comments or suggestions before submitting a patch (numpy / scipy) ?
Cheers
David
2006/10/18, Tim Hochberg <[EMAIL PROTECTED]>:
My $0.02:
If histogram is going to get a makeover, particularly one that makes it
more complex than at present, it should probably be moved to SciPy.
Failing that, it should be moved to a submodule of numpy with similar
statistical tools. Preferably with consistent interfaces for all of the
functions.
-------------------------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
Numpy-discussion mailing list
Numpy-discussion@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/numpy-discussion
# License: Scipy compatible # Author: David Huard, 2006 from numpy import * def histogram(a, bins=10, range=None, normed=False, weights=None, axis=None): """histogram(a, bins=10, range=None, normed=False, weights=None, axis=None) -> H, dict Return the distribution of sample. Parameters ---------- a: Array sample. bins: Number of bins, or an array of bin edges, in which case the range is not used. range: Lower and upper bin edges, default: [min, max]. normed: Boolean, if False, return the number of samples in each bin, if True, return a frequency distribution. weights: Sample weights. axis: Specifies the dimension along which the histogram is computed. Defaults to None, which aggregates the entire sample array. Output ------ H: The number of samples in each bin. If normed is True, H is a frequency distribution. dict{ 'edges': The bin edges, including the rightmost edge. 'upper': Upper outliers. 'lower': Lower outliers. 'bincenters': Center of bins. } Examples -------- x = random.rand(100,10) H, Dict = histogram(x, bins=10, range=[0,1], normed=True) H2, Dict = histogram(x, bins=10, range=[0,1], normed=True, axis=0) See also: histogramnd """ a = asarray(a) if axis is None: a = atleast_1d(a.ravel()) axis = 0 # Bin edges. if not iterable(bins): if range is None: range = (a.min(), a.max()) mn, mx = [mi+0.0 for mi in range] if mn == mx: mn -= 0.5 mx += 0.5 edges = linspace(mn, mx, bins+1, endpoint=True) else: edges = asarray(bins, float)
dedges = diff(edges) decimal = int(-log10(dedges.min())+6) bincenters = edges[:-1] + dedges/2. # apply_along_axis accepts only one array input, but we need to pass the # weights along with the sample. The strategy here is to concatenate the # weights array along axis, so the passed array contains [sample, weights]. # The array is then split back in __hist1d. if weights is not None: aw = concatenate((a, weights), axis) weighted = True else: aw = a weighted = False count = apply_along_axis(__hist1d, axis, aw, edges, decimal, weighted) # Outlier count upper = count.take(array([-1]), axis) lower = count.take(array([0]), axis) # Non-outlier count core = a.ndim*[slice(None)] core[axis] = slice(1, -1) hist = count[core] if normed: normalize = lambda x: atleast_1d(x/(x*dedges).sum()) hist = apply_along_axis(normalize, axis, hist) return hist, {'edges':edges, 'lower':lower, 'upper':upper, \ 'bincenters':bincenters} def __hist1d(aw, edges, decimal, weighted): """Internal routine to compute the 1d histogram. aw: sample, [weights] edges: bin edges decimal: approximation to put values lying on the rightmost edge in the last bin. weighted: Means that the weights are appended to array a. Return the bin count or frequency if normed. """ nbin = edges.shape[0]+1 if weighted: count = zeros(nbin, dtype=float) a,w = hsplit(aw,2) w = w/w.mean() else: a = aw count = zeros(nbin, dtype=int) w = None binindex = digitize(a, edges) # Values that fall on an edge are put in the right bin. # For the rightmost bin, we want values equal to the right # edge to be counted in the last bin, and not as an outlier. on_edge = where(around(a,decimal) == around(edges[-1], decimal))[0] binindex[on_edge] -= 1 # Count the number of identical indices. flatcount = bincount(binindex, w) # Place the count in the histogram array. i = arange(len(flatcount)) count[i] = flatcount return count from numpy.testing import * class test_histogram(NumpyTestCase): def check_simple(self): n=100 v=rand(n) (a,b)=histogram(v) #check if the sum of the bins equals the number of samples assert(sum(a,axis=0)==n) #check that the bin counts are evenly spaced when the data is from a linear function (a,b)=histogram(linspace(0,10,100)) assert(all(a==10)) #Check the construction of the bin array a, b = histogram(v, bins=4, range=[.2,.8]) assert(all(b['edges']==linspace(.2, .8, 5))) #Check the number of outliers assert((v<.2).sum() == b['lower']) assert((v>.8).sum() == b['upper']) #Check the normalization bins = [0,.5,.75,1] a,b = histogram(v, bins, normed=True) assert_almost_equal((a*diff(bins)).sum(), 1) def check_axis(self): n,m = 100,20 v = rand(n,m) a,b = histogram(v, bins=5) # Check dimension is reduced (axis=None). assert(a.ndim == 1) #Check total number of count is equal to the number of samples. assert(a.sum() == n*m) a,b = histogram(v, bins = 7, axis=0) # Check shape of new array is ok. assert(a.ndim == 2) assert_array_equal(a.shape,[7, m]) # Check normalization is consistent a,b = histogram(v, bins = 7, axis=0, normed=True) assert_array_almost_equal((a.T*diff(b['edges'])).sum(1), ones((m))) a,b = histogram(v, bins = 7, axis=1, normed=True) assert_array_equal(a.shape, [n,7]) assert_array_almost_equal((a*diff(b['edges'])).sum(1), ones((n))) # Check results are consistent with 1d estimate a1, b1 = histogram(v[0,:], bins=b['edges'], normed=True) assert_array_equal(a1, a[0,:]) def check_weights(self): # Check weights = constant gives the same answer as no weights. v = rand(100) w = ones(100)*5 wa,wb = histogram(v, weights=w) a,b = histogram(v) assert_array_equal(a, wa) # Check weights are properly applied. v = linspace(0,10,10) w = concatenate((zeros(5), ones(5))) wa,wb = histogram(v, bins=arange(11),weights=w) assert_array_almost_equal(wa, w*2) if __name__ == "__main__": NumpyTest().run()
------------------------------------------------------------------------- Using Tomcat but need to do more? Need to support web services, security? Get stuff done quickly with pre-integrated technology to make your job easier Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________ Numpy-discussion mailing list Numpy-discussion@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/numpy-discussion