OK, I'll answer myself ;-)
I found the information I needed at http://www1.tip.nl/~t876506/utf8tbl.html 
The new version is below.

Denis
________________________________

la vita e estrany

http://spir.wikidot.com/



=============================
# coding: utf8
import sys ; end = sys.exit

# constant: highest code accepted by toUTF8
max_code = 1114111              # 0x10FFFF, i.e. U+10FFFF

def toUTF8(code):
        ''' UTF-8 single character octet string, from unicode code

        code: integer unicode code, from 0 to max_code inclusive.
        Returns a string of 1 to 4 characters, one character per octet.
        Raises ValueError when code is negative or greater than max_code.
        '''
        # Every range is bounded from below, so that a negative code ends up
        # in the error branch instead of failing later inside chr().
        # case 1 octet: ASCII
        if 0 <= code < 128:
                return chr(code)
        # case 2 octets: lead octet 110xxxxx
        elif 128 <= code < 2048:
                count, lead = 2, 192
        # case 3 octets: lead octet 1110xxxx
        elif 2048 <= code < 65536:
                count, lead = 3, 224
        # case 4 octets: lead octet 11110xxx
        elif 65536 <= code <= max_code:
                count, lead = 4, 240
        # case error
        else:
                message = "Invalid unicode code: %s" %code
                raise ValueError(message)
        # continuation octets 10xxxxxx carry 6 bits each; they are computed
        # from the lowest bits upward, then the list is put back in order
        octets = []
        rest = code
        for _ in range(count - 1):
                octets.append(rest % 64 + 128)
                rest //= 64
        # what remains are the high bits, stored in the lead octet
        octets.append(rest + lead)
        octets.reverse()
        # octet string
        return ''.join(chr(o) for o in octets)

def fromUTF8(string):
        ''' unicode code, from UTF-8 single character octet string

        string: the 1 to 4 octets that encode exactly one character.
        Returns the character's unicode code as an integer.
        Raises ValueError when the sequence is empty, starts with an invalid
        lead octet, does not have the length announced by its lead octet,
        holds an invalid continuation octet, or is an overlong encoding.
        '''
        # items are 1-character strings, or already integers when a
        # python 3 bytes object is iterated
        octets = [o if isinstance(o, int) else ord(o) for o in string]

        def invalid(reason):
                # build the error from the octet list only: the sequence may
                # hold any number of octets, including none
                decseq = " ".join(str(v) for v in octets)
                hexseq = " ".join(hex(v)[2:] for v in octets)
                message = "Invalid UTF-8 sequence (%s): [%s] = [%s] (hex)." \
                                        % (reason, decseq, hexseq)
                return ValueError(message)

        if not octets:
                raise invalid("empty")
        o1 = octets[0]
        # The lead octet tells the sequence length and carries the high bits.
        # min_code is the smallest code that really needs that length.
        # case o1 = 0xxxxxxx --> 1 octet: ASCII
        if o1 < 128:
                length, code, min_code = 1, o1, 0
        # case o1 = 110xxxxx --> 2 octets
        elif o1 < 192:
                # 10xxxxxx is a continuation octet, never a lead octet
                raise invalid("bad lead octet")
        elif o1 < 224:
                length, code, min_code = 2, o1 - 192, 128
        # case o1 = 1110xxxx --> 3 octets
        elif o1 < 240:
                length, code, min_code = 3, o1 - 224, 2048
        # case o1 = 11110xxx --> 4 octets
        elif o1 < 248:
                length, code, min_code = 4, o1 - 240, 65536
        # case error
        else:
                raise invalid("bad lead octet")
        if len(octets) != length:
                raise invalid("expected %d octet(s)" % length)
        # each continuation octet 10xxxxxx adds 6 low bits
        for o in octets[1:]:
                if not 128 <= o < 192:
                        raise invalid("bad continuation octet")
                code = code * 64 + (o - 128)
        # a code that would fit in fewer octets is an overlong encoding
        if code < min_code:
                raise invalid("overlong encoding")
        return code
#
def test():
        ''' Print a round trip through toUTF8 then fromUTF8 for sample codes.

        One line is printed per sample: the original code, its encoded
        octet string, and the code decoded back from that string.
        Nothing is asserted here: the first and last columns have to be
        compared by eye.
        '''
        # ASCII, latin, BMP, >BMP
        codes  = [      9,10,32,57,65,97,126,127,
                                160,233,255,
                                256,2048,65535,
                                65536,max_code,
                                ]
        for c1 in codes:
                u = toUTF8(c1)
                c2 = fromUTF8(u)
                # python 2 print statement; u is written as raw octets
                print c1,"\t --> ",u,"\t --> ",c2
# run the demonstration as soon as the file is executed or imported
test()
_______________________________________________
Tutor maillist  -  Tutor@python.org
To unsubscribe or change subscription options:
http://mail.python.org/mailman/listinfo/tutor

Reply via email to