I've made a few more changes to my little collate module. There might be better ways to handle the options, or better choices for the options themselves. I tried to keep it as general as possible.
# I think it should work with Unicode now too.
#
# Any suggestions in making it faster will be appreciated.
#
# Any minor improvements to wording, spelling, etc. are also welcome.
#
# Many thanks for all the terrific feedback and suggestions!
#
# Cheers,
#    Ron
#
# ---start---
"""
Collate.py - Sorts lists of strings in various ways depending
on options and locale.

Class: Collate(option_string)

Functions:
    collate(text, option_string)           ->  collate in place
    result = collated(text, option_string) ->  get a collated copy

To use collate with your user locale you need to call setlocale()
before collating:

    locale.setlocale(locale.LC_ALL, '')

This will set several global variables such as string constants,
time and monetary formats, as well as numerical formatting to your
user locale.  Instead of '', you can also specify a specific locale.

The settings can be restored back to the default 'C' locale with
the following:

    locale.setlocale(locale.LC_ALL, 'C')

Example:
    locale.setlocale(locale.LC_ALL, '')
    tlist = [ 'Fruit', 'fruit-apple', 'fruit-banana',
              'nut-pecan', 'Nut' ]
    collate(tlist, 'caps_first hyphen_as_space')

* For more examples see doctests in function test().

Author: Ron Adam, [EMAIL PROTECTED]

"""
import re
import locale
import string

__version__ = '0.03 (pre-alpha) 10/23/2006'


class Collate(object):
    """ A general purpose collator class.

    Use:
        collate(text, option_string)
        result = collated(text, option_string)

        text = a list of strings
        option_string = a string of space separated options
                        (a sequence of option names is accepted too).

    Collation can be modified with the following options:

        CAPS_FIRST              -> Aaa, aaa, Bbb, bbb
        LOWERCASE_FIRST         -> aaa, Aaa, bbb, Bbb
        HYPHEN_AS_SPACE         -> hyphens separate words
        PERIOD_AS_SPACE         -> Periods separate numerals
        NUMERICAL               -> Digit sequences as numerals
        IGNORE_COMMAS           -> Allows commas in numerals
        UNDERSCORE_AS_SPACE     -> Underscore as white space
        IGNORE_LEADING_WS       -> Disregard leading white space

    Option names are case insensitive.  The locale settings (decimal
    point, preferred encoding, native case order) are sampled once, when
    the instance is created, so call locale.setlocale() first.

    Raises:
        ValueError if an unknown option name is given.
    """
    options = ( 'CAPS_FIRST', 'LOWERCASE_FIRST', 'NUMERICAL',
                'HYPHEN_AS_SPACE', 'UNDERSCORE_AS_SPACE',
                'IGNORE_LEADING_WS', 'IGNORE_COMMAS', 'PERIOD_AS_SPACE' )

    def __init__(self, flags=""):
        locale_conv = locale.localeconv()
        self.encoding = locale.getpreferredencoding()
        self.dpoint = locale_conv['decimal_point']
        self.thsep = locale_conv['thousands_sep']

        # Split on runs of digits / decimal points.  The decimal point is
        # escaped with re.escape() so it is always taken literally, and
        # '+' (not '*') is used so the result of split() always alternates
        # text, number, text, ... regardless of how the re module treats
        # empty matches.  re.LOCALE is not needed for \d and is rejected
        # for text patterns by Python 3.
        pattern = r'([\d%s]+)' % re.escape(self.dpoint)
        self.numrex = re.compile(pattern)

        # Accept either a space separated string or a sequence of names.
        if not flags:
            flags = []
        elif hasattr(flags, 'split'):
            flags = flags.split()
        flags = [value.upper() for value in flags]
        for value in flags:
            if value not in self.options:
                raise ValueError('Invalid option: %s' % value)

        # Character substitutions applied before any other processing.
        txtable = []
        if 'HYPHEN_AS_SPACE' in flags:
            txtable.append(('-', ' '))
        if 'UNDERSCORE_AS_SPACE' in flags:
            txtable.append(('_', ' '))
        if 'PERIOD_AS_SPACE' in flags:
            txtable.append(('.', ' '))
        if 'IGNORE_COMMAS' in flags:
            txtable.append((',', ''))
        self.txtable = txtable

        # True when the current locale already sorts 'A' before 'a'.
        self.capsfirst = (
            sorted(['A', 'a'], key=locale.strxfrm) == ['A', 'a'] )
        self.flags = flags

        # Resolve the options once here so transform(), which runs for
        # every item being sorted, only has to test plain booleans.
        self._strip_leading = 'IGNORE_LEADING_WS' in flags
        self._swapcase = (
            ('CAPS_FIRST' in flags and not self.capsfirst) or
            ('LOWERCASE_FIRST' in flags and self.capsfirst) )
        self._numerical = 'NUMERICAL' in flags

    def _native(self, s):
        """ Return s as the native str type expected by locale.strxfrm().
        """
        if isinstance(s, str):
            return s
        if isinstance(s, bytes):
            # Python 3 bytes (on Python 2, bytes is str and never gets here).
            return s.decode(self.encoding, 'replace')
        # Python 2 unicode.
        return s.encode(self.encoding, 'replace')

    def _numeric_key(self, s):
        """ Build the NUMERICAL sort key for the already prepared string s.

        The key is a list of (tag, value) pairs: (0, float) for a digit
        run and (1, transformed_text) for everything else.  The tag keeps
        numbers ahead of text and guarantees that a float is never
        compared directly with a string.
        """
        strxfrm = locale.strxfrm
        dpoint = self.dpoint
        key = []
        for i, piece in enumerate(self.numrex.split(s)):
            if i % 2:
                # Odd positions hold the captured digit runs.
                number = piece
                if dpoint:
                    number = number.replace(dpoint, '.')
                try:
                    key.append((0, float(number)))
                    continue
                except ValueError:
                    # Not a valid number (e.g. '1.2.3' or a lone '.');
                    # fall through and order it as text.
                    pass
            key.append((1, strxfrm(piece)))
        return key

    def transform(self, s):
        """ Transform a string for collating.

        Returns an opaque sort key: the locale.strxfrm() form of the
        prepared string, or a list of (tag, value) pairs when the
        NUMERICAL option is active.  Keys are only meaningful when
        compared with keys produced by the same Collate instance.
        """
        s = self._native(s)
        if not self.flags:
            return locale.strxfrm(s)
        for a, b in self.txtable:
            s = s.replace(a, b)
        if self._strip_leading:
            s = s.lstrip()
        if self._swapcase:
            # Swapping case inverts the locale's native caps/lowercase order.
            s = s.swapcase()
        if self._numerical:
            return self._numeric_key(s)
        return locale.strxfrm(s)

    def __call__(self, a):
        """ This allows the Collate class to be used as a sort key.

            USE: list.sort(key=Collate(flags))
        """
        return self.transform(a)


def collate(slist, flags=""):
    """ Collate list of strings in place.

    slist is sorted in place; nothing is returned.
    flags is an option string (or sequence of option names), see Collate.
    """
    slist.sort(key=Collate(flags).transform)


def collated(slist, flags=""):
    """ Return a collated list of strings.

    slist is left unchanged; a new sorted list is returned.
    flags is an option string (or sequence of option names), see Collate.
    """
    return sorted(slist, key=Collate(flags).transform)


def test():
    """
    DOC TESTS AND EXAMPLES:

    Note: the expected results below were recorded with the Windows
    'English_United States.1252' locale; the setlocale() result and the
    ordering of case and punctuation may differ on other platforms
    and locales.

    Initiate the current locale to the User locale settings.

        >>> locale.setlocale(locale.LC_ALL, '')
        'English_United States.1252'

    Sort (and sorted) normally order all words beginning with caps
    before all words beginning with lower case.

        >>> t = ['tuesday', 'Tuesday', 'Monday', 'monday']
        >>> sorted(t)     # regular sort
        ['Monday', 'Tuesday', 'monday', 'tuesday']

    Locale collation puts words beginning with caps after words
    beginning with lower case of the same letter.

        >>> collated(t)
        ['monday', 'Monday', 'tuesday', 'Tuesday']

    The CAPS_FIRST option can be used to put all words beginning
    with caps before words beginning in lowercase.

        >>> collated(t, 'CAPS_FIRST')
        ['Monday', 'monday', 'Tuesday', 'tuesday']

    The LOWERCASE_FIRST option can be used to put all words beginning
    with lowercase before words beginning in caps.

        >>> collated(t, 'LOWERCASE_FIRST')
        ['monday', 'Monday', 'tuesday', 'Tuesday']

    Note: CAPS_FIRST also reverses the ascii behavior of putting all
    lowercase words before all uppercase words.

    The HYPHEN_AS_SPACE option causes hyphens to be equal to space.

        >>> t = ['a-b', 'b-a', 'aa-b', 'bb-a']
        >>> collated(t)
        ['aa-b', 'a-b', 'b-a', 'bb-a']

        >>> collated(t, 'HYPHEN_AS_SPACE')
        ['a-b', 'aa-b', 'b-a', 'bb-a']

    The IGNORE_LEADING_WS and UNDERSCORE_AS_SPACE options can be
    used together to improve ordering in some situations.

        >>> t = ['sum', '__str__', 'about', ' round']
        >>> collated(t)
        [' round', '__str__', 'about', 'sum']

        >>> collated(t, 'IGNORE_LEADING_WS')
        ['__str__', 'about', ' round', 'sum']

        >>> collated(t, 'UNDERSCORE_AS_SPACE')
        [' round', '__str__', 'about', 'sum']

        >>> collated(t, 'IGNORE_LEADING_WS UNDERSCORE_AS_SPACE')
        ['about', ' round', '__str__', 'sum']

    The NUMERICAL option orders sequences of digits as numerals.

        >>> t = ['a5', 'a40', '4abc', '20abc', 'a10.2', '13.5b', 'b2']
        >>> collated(t, 'NUMERICAL')
        ['4abc', '13.5b', '20abc', 'a5', 'a10.2', 'a40', 'b2']

    The IGNORE_COMMAS option prevents commas from separating numerals.

        >>> t = ['a5', 'a4,000', '500b', '100,000b']
        >>> collated(t, 'NUMERICAL IGNORE_COMMAS')
        ['500b', '100,000b', 'a5', 'a4,000']

    The PERIOD_AS_SPACE option can be used to sort version numbers
    and other decimal separated numbers.

        >>> t = ['5.1.1', '5.10.12','5.2.2', '5.2.19' ]
        >>> collated(t, 'NUMERICAL PERIOD_AS_SPACE')
        ['5.1.1', '5.2.2', '5.2.19', '5.10.12']

    Collate also can be done in place by using collate() instead
    of collated().

        >>> t = ['Fred', 'Ron', 'Carol', 'Bob']
        >>> collate(t)
        >>> t
        ['Bob', 'Carol', 'Fred', 'Ron']

    """
    import doctest
    doctest.testmod()


if __name__ == '__main__':
    test()

# ---end---
# -- http://mail.python.org/mailman/listinfo/python-list