John Machin wrote: > Some of the transformations are a little unfortunate :-(
# here's a slightly silly way to map a unicode string to its "unaccented"
# version:

###

import sys
import unicodedata

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() plays the role of unichr()

# Replacements for Latin-1 letters that have no Unicode decomposition and
# therefore cannot be reduced by stripping combining marks.
CHAR_REPLACEMENT = {
    0xc6: u"AE",  # LATIN CAPITAL LETTER AE
    0xd0: u"D",   # LATIN CAPITAL LETTER ETH
    0xd8: u"OE",  # LATIN CAPITAL LETTER O WITH STROKE
    0xde: u"Th",  # LATIN CAPITAL LETTER THORN
    0xdf: u"ss",  # LATIN SMALL LETTER SHARP S
    0xe6: u"ae",  # LATIN SMALL LETTER AE
    0xf0: u"d",   # LATIN SMALL LETTER ETH
    0xf8: u"oe",  # LATIN SMALL LETTER O WITH STROKE
    0xfe: u"th",  # LATIN SMALL LETTER THORN
}


class unaccented_map(dict):
    """Lazily filled translation table that maps characters to unaccented text.

    Pass an instance to the ``translate`` method of a unicode string.  Each
    code point is computed by ``mapchar`` the first time it is looked up and
    then stored in the dict, so later lookups are plain dict hits.
    """

    def mapchar(self, key):
        """Return the unaccented replacement for the code point *key*.

        *key* is an integer code point.  The result is a unicode string; it
        may be longer than one character (see ``CHAR_REPLACEMENT``).  The
        result is cached in ``self`` before it is returned.
        """
        ch = self.get(key)
        if ch is not None:
            return ch
        ch = _unichr(key)
        # Follow the decomposition chain down to the base character.  One
        # step is not enough for characters carrying several marks: U+1EA4
        # decomposes to U+00C2 + U+0301, and U+00C2 is itself accented.
        while True:
            try:
                ch = _unichr(int(unicodedata.decomposition(ch).split()[0], 16))
            except (IndexError, ValueError):
                # IndexError: no decomposition at all.  ValueError: the first
                # field is a tag such as "<compat>", so the character is left
                # as it is.
                break
        # The base may be a ligature or stroke letter (directly, or reached
        # through decomposition as for U+01FC -> U+00C6) that only the
        # replacement table can handle.
        ch = CHAR_REPLACEMENT.get(ord(ch), ch)
        # uncomment the following line if you want to remove remaining
        # non-ascii characters
        # if ch >= u"\x80": return None
        self[key] = ch
        return ch

    # dict.__missing__ is only honoured from Python 2.5 on; older versions
    # need __getitem__ overridden instead.  Compare version_info tuples, not
    # version strings, which would be ordered lexically.
    if sys.version_info >= (2, 5):
        __missing__ = mapchar
    else:
        __getitem__ = mapchar


# usage (mystring stands for your own unicode string):
#
#     assert isinstance(mystring, unicode)
#     print(mystring.translate(unaccented_map()))

###

# if the source string is not unicode, you can use something like
#
#     s = mystring.decode("iso-8859-1")
#     s = s.translate(unaccented_map())
#     s = s.encode("ascii", "ignore")
#
# (this works well for characters in the latin-1 range, at least.  no
# guarantees for other character ranges)
#
# </F>
# --
# http://mail.python.org/mailman/listinfo/python-list