I've written simple code in Python 2.6 and 3.0 to read every character of a set of files and print out some information for each of these characters. I tested each program on a large Cyrillic/Latin text. The result was that the 2.6 version was about 5x faster. Here are the two programs:
#!/usr/bin/env python
# ---------------------------------------------------------------------------
# Program 1 -- Python 2.6 version.
#
# Prints one line per character of every file named on the command line:
# the character itself, its UTF-8 byte representation, its code point and
# its Unicode name.
# ---------------------------------------------------------------------------
import sys
import codecs
import unicodedata

# Control characters that unicodedata cannot name; they are printed as a
# blank together with a short mnemonic instead of 'unknown'.
CONTROL_NAMES = {9: 'tab', 10: 'LF', 13: 'CR'}


def main(paths=None):
    """Dump per-character information for each file in *paths*.

    paths -- iterable of file names; defaults to sys.argv[1:] so that the
             command-line behaviour of the script is unchanged.

    Files are decoded as UTF-8; undecodable bytes are replaced with U+FFFD.
    """
    if paths is None:
        paths = sys.argv[1:]
    for path in paths:
        # 'with' closes the file deterministically; iterating the reader
        # avoids building a list of every line first.
        with codecs.open(path, encoding='UTF-8', errors='replace') as text:
            for line in text:
                for c in line:
                    code_point = ord(c)
                    name = unicodedata.name(c, 'unknown')
                    prnt = prnt_rep = c.encode('utf8')
                    if name == 'unknown':
                        # Unnamed characters are shown as a blank.
                        prnt = ' '
                    if code_point > 127:
                        print('%s %-14r U+%04x %s'
                              % (prnt, prnt_rep, code_point, name))
                    else:
                        if code_point in CONTROL_NAMES:
                            name = CONTROL_NAMES[code_point]
                            prnt = ' '
                        print("{0:s} '\\x{1:02x}' U+{2:04x} {3:s}".format(
                            prnt, code_point, code_point, name))


if __name__ == '__main__':
    main()


#!/usr/bin/env python3
# ---------------------------------------------------------------------------
# Program 2 -- Python 3 version.
#
# Same report as program 1, written for Python 3.
# ---------------------------------------------------------------------------
import sys
import unicodedata

# Control characters that unicodedata cannot name; they are printed as a
# blank together with a short mnemonic instead of 'unknown'.
CONTROL_NAMES = {9: 'tab', 10: 'LF', 13: 'CR'}


def main(paths=None):
    """Dump per-character information for each file in *paths*.

    paths -- iterable of file names; defaults to sys.argv[1:] so that the
             command-line behaviour of the script is unchanged.

    Files are decoded as UTF-8; undecodable bytes are replaced with U+FFFD.
    """
    if paths is None:
        paths = sys.argv[1:]
    for path in paths:
        # encoding='utf-8' matches program 1 instead of depending on the
        # locale; newline='' disables newline translation so that CR
        # characters reach the loop, as they do in program 1.
        with open(path, encoding='utf-8', errors='replace',
                  newline='') as text:
            for line in text:
                for c in line:
                    code_point = ord(c)
                    if code_point <= 127:
                        # Zero-padded escape, same form as program 1.
                        utf8 = "b'\\x{0:02x}'".format(code_point)
                    else:
                        # Format the repr: a bytes object itself does not
                        # accept the '15s' format spec used below.
                        utf8 = repr(c.encode('utf-8'))
                    name = unicodedata.name(c, 'unknown')
                    if name == 'unknown':
                        # Unnamed characters are shown as a blank.
                        c = ' '
                    if code_point in CONTROL_NAMES:
                        c = ' '
                        name = CONTROL_NAMES[code_point]
                    print("{0:s} {1:15s} U+{2:04x} {3:s}".format(
                        c, utf8, code_point, name))


if __name__ == '__main__':
    main()

# --
# http://mail.python.org/mailman/listinfo/python-list