OK, I'll answer myself ;-) I found the information I needed at http://www1.tip.nl/~t876506/utf8tbl.html -- see the new version below,
# Denis
# ________________________________
# la vita e estrany
# http://spir.wikidot.com/
# =============================
# coding: utf8
"""Hand-rolled UTF-8 encoding/decoding of a single Unicode code point.

``toUTF8`` turns an integer code point into its UTF-8 octet string and
``fromUTF8`` does the reverse.  Octet strings are ordinary strings holding
one character per octet (i.e. a byte string under Python 2).
"""
import sys

# Alias kept from the original post; nothing in the code shown here uses it.
end = sys.exit

# constant
max_code = 1114111  # U+10FFFF, the highest code point accepted by toUTF8


def toUTF8(code):
    """Return the UTF-8 octet string for a single unicode code.

    ``code`` is an integer in ``0 .. max_code``.  The result is a string
    with one character per octet (1 to 4 octets).  Surrogate codes
    (U+D800..U+DFFF) are not treated specially: they are encoded as three
    octets like any other code below 65536.

    Raises ValueError when ``code`` is negative or greater than ``max_code``.
    """
    # Reject out-of-range values up front so that negative codes get the
    # same explicit message as too-large ones instead of relying on chr().
    if code < 0 or code > max_code:
        raise ValueError("Invalid unicode code: %s" % (code,))

    if code < 128:
        # case 1 octet: ASCII
        octets = (code,)
    elif code < 2048:
        # case 2 octets: 110xxxxx 10xxxxxx
        octets = (
            code // 64 + 192,
            code % 64 + 128,
        )
    elif code < 65536:
        # case 3 octets: 1110xxxx 10xxxxxx 10xxxxxx
        octets = (
            code // 4096 + 224,
            (code // 64) % 64 + 128,
            code % 64 + 128,
        )
    else:
        # case 4 octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        octets = (
            code // 262144 + 240,
            (code // 4096) % 64 + 128,
            (code // 64) % 64 + 128,
            code % 64 + 128,
        )

    # octet string
    return ''.join(chr(o) for o in octets)


def _utf8_error(octets, detail):
    """Build the ValueError describing a malformed octet sequence."""
    decseq = " ".join(str(v) for v in octets)
    hexseq = " ".join(hex(v)[2:] for v in octets)
    message = "Invalid UTF-8 sequence: %s [%s] = [%s] (hex)." \
              % (detail, decseq, hexseq)
    return ValueError(message)


def fromUTF8(string):
    """Return the unicode code encoded by a single-character octet string.

    ``string`` must hold exactly one UTF-8 encoded character, one string
    character per octet, as produced by ``toUTF8``.

    Raises ValueError when the sequence is empty, starts with an octet that
    cannot begin a character, has the wrong number of octets, contains a
    continuation octet outside 128..191, uses an overlong form, or encodes
    a value greater than ``max_code``.
    """
    octets = [ord(o) for o in string]
    if not octets:
        raise _utf8_error(octets, "empty string")
    o1 = octets[0]

    # Classify the lead octet: total length, value to subtract from the lead
    # octet, and the smallest code that legitimately needs that length.
    if o1 < 128:
        # 0xxxxxxx --> 1 octet: ASCII
        length, offset, lowest = 1, 0, 0
    elif 194 <= o1 < 224:
        # 110xxxxx --> 2 octets (192 and 193 could only start overlong forms)
        length, offset, lowest = 2, 192, 128
    elif 224 <= o1 < 240:
        # 1110xxxx --> 3 octets
        length, offset, lowest = 3, 224, 2048
    elif 240 <= o1 < 245:
        # 11110xxx --> 4 octets (245..247 would always exceed max_code)
        length, offset, lowest = 4, 240, 65536
    else:
        # Passing the lead octet as the detail keeps the message identical to
        # the original one for single-octet input; the original used
        # ord(string), which raised TypeError for longer input.
        raise _utf8_error(octets, o1)

    if len(octets) != length:
        detail = "expected %d octet(s), got %d" % (length, len(octets))
        raise _utf8_error(octets, detail)

    code = o1 - offset
    for octet in octets[1:]:
        # every continuation octet must look like 10xxxxxx
        if not 128 <= octet < 192:
            raise _utf8_error(octets, "bad continuation octet %d" % octet)
        code = code * 64 + (octet - 128)

    # Accept only what toUTF8 can produce: shortest form, within range.
    if not lowest <= code <= max_code:
        raise _utf8_error(octets, "overlong or out-of-range encoding")
    return code


def test():
    """Round-trip sample codes through toUTF8/fromUTF8 and print each step.

    Raises AssertionError if a decoded code differs from the encoded one.
    """
    # ASCII, latin, BMP, >BMP
    codes = [
        9, 10, 32, 57, 65, 97, 126, 127,
        160, 233, 255,
        256, 2048, 65535,
        65536, max_code,
    ]
    for c1 in codes:
        u = toUTF8(c1)
        c2 = fromUTF8(u)
        # Single pre-formatted argument: behaves the same with the Python 2
        # print statement and the Python 3 print function.
        print("%s \t -->  %s \t -->  %s" % (c1, u, c2))
        if c1 != c2:
            raise AssertionError("round trip failed: %s --> %s" % (c1, c2))


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    test()

# _______________________________________________
# Tutor maillist - Tutor@python.org
# To unsubscribe or change subscription options:
# http://mail.python.org/mailman/listinfo/tutor