#!/usr/bin/python """Natural-order sorting, supporting embedded numbers.
foo9bar2 < foo10bar2 < foo10bar10 """ import random, re, sys def natsort_key(item): chunks = re.split('(\d+(?:\.\d+)?)', item) for ii in range(len(chunks)): if chunks[ii] and chunks[ii][0] in '0123456789': if '.' in chunks[ii]: numtype = float else: numtype = int # wrap in tuple with '0' to explicitly specify numbers come first chunks[ii] = (0, numtype(chunks[ii])) else: chunks[ii] = (1, chunks[ii]) return (chunks, item) def natsort(seq): "Sort a sequence of text strings in a reasonable order." alist = [item for item in seq] alist.sort(key=natsort_key) return alist def ok(a, b): assert a == b, (a, b) def test(): ok(natsort('foo10bar10 foo9bar2 foo10bar2'.split()), 'foo9bar2 foo10bar2 foo10bar10'.split()) nums = map(str, range(100)) random.shuffle(nums) ok(natsort(nums), map(str, range(100))) assert nums != map(str, range(100)) # didn't get mutated... # compare lexically when numerical value identical ok(natsort('b1 a2 a1 b01 b2'.split()), 'a1 a2 b01 b1 b2'.split()) # A lazy implementor might support only short numbers; this prevents that: ok(natsort(['a100000000000000000000000.txt', 'a99999999999999999999999.txt']), ['a99999999999999999999999.txt', 'a100000000000000000000000.txt']) # Don't interpret 033 as octal (decimal 27). ok(natsort(['033', '32']), ['32', '033']) # This one was hard and possibly not worth it: use floating point # to handle non-integer numbers. The cost is that sequences of # integers separated by dots will lose precision in some cases, # and also may sort funny. # e.g. 1234.56789012345678901234567890.34567890. ok(natsort(['2.49', '2.5', '2.6']), ['2.49', '2.5', '2.6']) # Make sure we can still handle sequences of integers with dots. ok(natsort(['1.2.3', '1.3.4']), ['1.2.3', '1.3.4']) # There are cases where we lose on floating-point precision, # although usually then the fallback to pure lexical comparison # saves us: ok(natsort(['1.3000000000000000000000000000000000000000000000000000001', '1.3000000000000000000000000000000000000000000000000000000']), ['1.3000000000000000000000000000000000000000000000000000000', '1.3000000000000000000000000000000000000000000000000000001']) ### Here are some cases where we get suboptimal answers: # Hexadecimal numbers get mangled. ok(natsort(['0x99', '0x9a']), ['0x9a', '0x99']) # In cases where you expect asciibetical order, you will be # disappointed. ok(natsort([',', '5', ':']), ['5', ',', ':']) # Fallback to pure lexical comparison doesn't always save us on # floating-point precision: ok(natsort(['01.3000000000000000000000000000000000000000000000000000001', '1.3000000000000000000000000000000000000000000000000000000']), ['01.3000000000000000000000000000000000000000000000000000001', '1.3000000000000000000000000000000000000000000000000000000']) test() if __name__ == '__main__': sys.stdout.writelines(natsort(sys.stdin))