#!/usr/bin/python
"""Natural-order sorting, supporting embedded numbers.

foo9bar2 < foo10bar2 < foo10bar10

"""
import random, re, sys

def natsort_key(item):
    chunks = re.split('(\d+(?:\.\d+)?)', item)
    for ii in range(len(chunks)):
        if chunks[ii] and chunks[ii][0] in '0123456789':
            if '.' in chunks[ii]: numtype = float
            else: numtype = int
            # wrap in tuple with '0' to explicitly specify numbers come first
            chunks[ii] = (0, numtype(chunks[ii]))
        else:
            chunks[ii] = (1, chunks[ii])
    return (chunks, item)

def natsort(seq):
    "Sort a sequence of text strings in a reasonable order."
    alist = [item for item in seq]
    alist.sort(key=natsort_key)
    return alist

def ok(a, b): assert a == b, (a, b)
def test():
    ok(natsort('foo10bar10 foo9bar2 foo10bar2'.split()),
       'foo9bar2 foo10bar2 foo10bar10'.split())

    nums = map(str, range(100))
    random.shuffle(nums)
    ok(natsort(nums), map(str, range(100)))
    assert nums != map(str, range(100))  # didn't get mutated...

    # compare lexically when numerical value identical
    ok(natsort('b1 a2 a1 b01 b2'.split()), 'a1 a2 b01 b1 b2'.split())

    # A lazy implementor might support only short numbers; this prevents that:
    ok(natsort(['a100000000000000000000000.txt',
                'a99999999999999999999999.txt']),
       ['a99999999999999999999999.txt', 'a100000000000000000000000.txt'])

    # Don't interpret 033 as octal (decimal 27).
    ok(natsort(['033', '32']), ['32', '033'])
    
    # This one was hard and possibly not worth it: use floating point
    # to handle non-integer numbers.  The cost is that sequences of
    # integers separated by dots will lose precision in some cases,
    # and also may sort funny.
    # e.g. 1234.56789012345678901234567890.34567890.
    ok(natsort(['2.49', '2.5', '2.6']), ['2.49', '2.5', '2.6'])

    # Make sure we can still handle sequences of integers with dots.
    ok(natsort(['1.2.3', '1.3.4']), ['1.2.3', '1.3.4'])

    # There are cases where we lose on floating-point precision,
    # although usually then the fallback to pure lexical comparison
    # saves us:
    ok(natsort(['1.3000000000000000000000000000000000000000000000000000001',
                '1.3000000000000000000000000000000000000000000000000000000']),
       ['1.3000000000000000000000000000000000000000000000000000000',
        '1.3000000000000000000000000000000000000000000000000000001'])

    ### Here are some cases where we get suboptimal answers:
    # Hexadecimal numbers get mangled.
    ok(natsort(['0x99', '0x9a']), ['0x9a', '0x99'])
    # In cases where you expect asciibetical order, you will be
    # disappointed.
    ok(natsort([',', '5', ':']), ['5', ',', ':'])
    # Fallback to pure lexical comparison doesn't always save us on
    # floating-point precision:
    ok(natsort(['01.3000000000000000000000000000000000000000000000000000001',
                '1.3000000000000000000000000000000000000000000000000000000']),
       ['01.3000000000000000000000000000000000000000000000000000001',
        '1.3000000000000000000000000000000000000000000000000000000'])

test()

if __name__ == '__main__':
    sys.stdout.writelines(natsort(sys.stdin))

Reply via email to