Thanks for the comments,
Here is the code for the new histogram, tests included.
I'll wait for comments or suggestions before submitting a patch (numpy / scipy) ?

Cheers

David





2006/10/18, Tim Hochberg <[EMAIL PROTECTED]>:

My $0.02:

If histogram is going to get a makeover, particularly one that makes it
more complex than at present, it should probably be moved to SciPy.
Failing that, it should be moved to a submodule of numpy with similar
statistical tools. Preferably with consistent interfaces for all of the
functions.


-------------------------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
Numpy-discussion mailing list
Numpy-discussion@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/numpy-discussion

# License: Scipy compatible
# Author: David Huard, 2006
from numpy import *
def histogram(a, bins=10, range=None, normed=False, weights=None, axis=None):
    """histogram(a, bins=10, range=None, normed=False, weights=None, axis=None) 
                                                                   -> H, dict
    
    Return the distribution of sample.
    
    Parameters
    ----------
    a:       Array sample.
    bins:    Number of bins, or 
             an array of bin edges, in which case the range is not used.
    range:   Lower and upper bin edges, default: [min, max].
    normed:  Boolean, if False, return the number of samples in each bin,
             if True, return a frequency distribution.  
    weights: Sample weights.
    axis:    Specifies the dimension along which the histogram is computed. 
             Defaults to None, which aggregates the entire sample array. 
    
    Output
    ------
    H:            The number of samples in each bin. 
                  If normed is True, H is a frequency distribution.
    dict{
    'edges':      The bin edges, including the rightmost edge.
    'upper':      Upper outliers.
    'lower':      Lower outliers.
    'bincenters': Center of bins.
    }
    
    Examples
    --------
    x = random.rand(100,10)
    H, Dict = histogram(x, bins=10, range=[0,1], normed=True)
    H2, Dict = histogram(x, bins=10, range=[0,1], normed=True, axis=0)
    
    See also: histogramnd
    """
    
    a = asarray(a)
    if axis is None:
        a = atleast_1d(a.ravel())
        axis = 0 
        
    # Bin edges.   
    if not iterable(bins):
        if range is None:
            range = (a.min(), a.max())
        mn, mx = [mi+0.0 for mi in range]
        if mn == mx:
            mn -= 0.5
            mx += 0.5
        edges = linspace(mn, mx, bins+1, endpoint=True)
    else:
        edges = asarray(bins, float)

    dedges = diff(edges)
    decimal = int(-log10(dedges.min())+6)
    bincenters = edges[:-1] + dedges/2.

    # apply_along_axis accepts only one array input, but we need to pass the 
    # weights along with the sample. The strategy here is to concatenate the 
    # weights array along axis, so the passed array contains [sample, weights]. 
    # The array is then split back in  __hist1d.
    if weights is not None:
        aw = concatenate((a, weights), axis)
        weighted = True
    else:
        aw = a
        weighted = False
        
    count = apply_along_axis(__hist1d, axis, aw, edges, decimal, weighted)
    
    # Outlier count
    upper = count.take(array([-1]), axis)
    lower = count.take(array([0]), axis)
    
    # Non-outlier count
    core = a.ndim*[slice(None)]
    core[axis] = slice(1, -1)
    hist = count[core]
    
    if normed:
        normalize = lambda x: atleast_1d(x/(x*dedges).sum())
        hist = apply_along_axis(normalize, axis, hist)

    return hist, {'edges':edges, 'lower':lower, 'upper':upper, \
        'bincenters':bincenters}
        
         
def __hist1d(aw, edges, decimal, weighted):
    """Internal routine to compute the 1d histogram.
    aw: sample, [weights]
    edges: bin edges
    decimal: approximation to put values lying on the rightmost edge in the last
             bin.
    weighted: Means that the weights are appended to array a. 
    Return the bin count or frequency if normed.
    """
    nbin = edges.shape[0]+1
    if weighted:
        count = zeros(nbin, dtype=float)
        a,w = hsplit(aw,2)
        w = w/w.mean()
    else:
        a = aw
        count = zeros(nbin, dtype=int)
        w = None
        
    binindex = digitize(a, edges)
    
    # Values that fall on an edge are put in the right bin.
    # For the rightmost bin, we want values equal to the right 
    # edge to be counted in the last bin, and not as an outlier. 
    on_edge = where(around(a,decimal) == around(edges[-1], decimal))[0]
    binindex[on_edge] -= 1
    
    # Count the number of identical indices.
    flatcount = bincount(binindex, w)
    
    # Place the count in the histogram array.
    i = arange(len(flatcount))
    count[i] = flatcount
       
    return count
    

from numpy.testing import *
class test_histogram(NumpyTestCase):
    def check_simple(self):
        n=100
        v=rand(n)
        (a,b)=histogram(v)
        #check if the sum of the bins equals the number of samples
        assert(sum(a,axis=0)==n)
        #check that the bin counts are evenly spaced when the data is from a linear function
        (a,b)=histogram(linspace(0,10,100))
        assert(all(a==10))
        #Check the construction of the bin array
        a, b = histogram(v, bins=4, range=[.2,.8])
        assert(all(b['edges']==linspace(.2, .8, 5)))
        #Check the number of outliers
        assert((v<.2).sum() == b['lower'])
        assert((v>.8).sum() == b['upper'])
        #Check the normalization
        bins = [0,.5,.75,1]
        a,b = histogram(v, bins, normed=True)
        assert_almost_equal((a*diff(bins)).sum(), 1)
        
    def check_axis(self):
        n,m = 100,20
        v = rand(n,m)
        a,b = histogram(v, bins=5)
        # Check dimension is reduced (axis=None).
        assert(a.ndim == 1)
        #Check total number of count is equal to the number of samples.
        assert(a.sum() == n*m)
        a,b = histogram(v, bins = 7, axis=0)
        # Check shape of new array is ok.
        assert(a.ndim == 2)
        assert_array_equal(a.shape,[7, m])
        # Check normalization is consistent 
        a,b = histogram(v, bins = 7, axis=0, normed=True)
        assert_array_almost_equal((a.T*diff(b['edges'])).sum(1), ones((m)))
        a,b = histogram(v, bins = 7, axis=1, normed=True)
        assert_array_equal(a.shape, [n,7])
        assert_array_almost_equal((a*diff(b['edges'])).sum(1), ones((n)))
        # Check results are consistent with 1d estimate
        a1, b1 = histogram(v[0,:], bins=b['edges'], normed=True)
        assert_array_equal(a1, a[0,:])
            
    def check_weights(self):
        # Check weights = constant gives the same answer as no weights.
        v = rand(100)
        w = ones(100)*5
        wa,wb = histogram(v, weights=w)
        a,b = histogram(v)
        assert_array_equal(a, wa)
        # Check weights are properly applied.
        v = linspace(0,10,10)
        w = concatenate((zeros(5), ones(5)))
        wa,wb = histogram(v, bins=arange(11),weights=w)
        assert_array_almost_equal(wa, w*2)
        
if __name__ == "__main__":
    NumpyTest().run()
-------------------------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
Numpy-discussion mailing list
Numpy-discussion@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/numpy-discussion

Reply via email to