[Tutor] unicode ordinals to/from utf8

spir Fri, 25 Dec 2009 09:40:43 -0800

Special offer for coders coding on Christmas day!

I'm looking for the simplest way to decode/encode unicode ordinals (called 
'codes' below) to/from utf8. I find this rather tricky, especially because of 
the variable number of meaningful bits in the first octet. Specifically, for 
encoding, is there a way to avoid passing through binary (and back)?
Below is what I have so far (test by converting to utf8 & back ;-).


Denis
________________________________

la vita e estrany

http://spir.wikidot.com/


=========================================
# coding: utf8
import sys ; end = sys.exit

# Marker bits carried by the first octet of a multi-octet sequence, keyed by
# the total sequence length: 110xxxxx (2), 1110xxxx (3), 11110xxx (4).
sizes_to_values = {2:192, 3:224, 4:240}
def ordinalFromUtf8(s):
        """Return the ordinal encoded by the UTF-8 octet sequence *s*.

        *s* must be a byte string (Python 2 ``str``, ``bytes`` or
        ``bytearray``) holding exactly one encoded character of 1 to 4
        octets.  The sequence length is taken from ``len(s)``, not from
        the first octet, and the octets are not validated: malformed
        input yields a meaningless number rather than an error.

        Raises IndexError if *s* is empty and KeyError if it is longer
        than 4 octets.
        """
        # bytearray yields integers on both Python 2 and Python 3,
        # so no ord() call is needed per octet
        octets = bytearray(s)
        n = len(octets)
        ordinal = octets[0]
        # case ASCII
        if n == 1:
                return ordinal
        # case else
        # strip the length marker from octet #0 ...
        ordinal -= sizes_to_values[n]
        # ... then append the 6 payload bits of every following octet
        # (each one is 10xxxxxx, i.e. 128 + payload)
        for octet in octets[1:]:
                ordinal = ordinal * 64 + (octet - 128)
        return ordinal

def ordinalToUtf8(o):
        """Return the UTF-8 encoding of ordinal *o* as a byte string.

        The result is a ``str`` on Python 2 and ``bytes`` on Python 3.
        The encoding is computed with shifts and masks only; no
        intermediate binary text is built.

        Ordinals up to 0x10FFFF give standard 1 to 4 octet UTF-8.
        Larger values up to 0x7FFFFFFF are still accepted and give the
        legacy 5 and 6 octet forms.  Surrogate ordinals are not rejected.

        Raises ValueError if *o* is negative or above 0x7FFFFFFF.
        """
        if o < 0 or o > 0x7FFFFFFF:
                raise ValueError("ordinal out of range for utf8: %r" % (o,))
        # case ASCII
        if o < 128:
                return bytes(bytearray([o]))
        # case else
        # peel off following octets, lowest 6 bits first,
        # each holding '10' & 6 meaningful bits
        octets = [0x80 | (o & 0x3F)]
        rest = o >> 6
        # first octet of a 2-octet sequence has 5 free bits (values < 32);
        # every additional octet costs the first octet one more bit
        first_octet_limit = 32
        while rest >= first_octet_limit:
                octets.append(0x80 | (rest & 0x3F))
                rest >>= 6
                first_octet_limit >>= 1
        # first octet: as many leading '1' bits as octets overall, then '0',
        # then whatever is left of the ordinal
        marker = (0xFF00 >> (len(octets) + 1)) & 0xFF
        octets.append(marker | rest)
        octets.reverse()
        return bytes(bytearray(octets))

def test():
        """Print a utf8 --> ordinal --> utf8 round trip for sample characters.

        Each output line shows the utf8 input and its repr, the decoded
        ordinal in decimal and hex, then the re-encoded utf8 and its repr.
        Nothing is asserted; the two reprs on a line are meant to be
        compared by eye.
        """
        def ue(u): return u.encode('utf8')
        # ASCII, latin, BMP, >BMP
        # (non-ASCII samples are written as escapes so that they do not
        # depend on the encoding this file happens to be saved in)
        chars = [b'\n', b'\t', b' ', b'A', b'a', b'~',
                        ue(u'\u00a0'),
                        ue(u'\u00a3'),          # pound sign
                        ue(u'\u00b5'),          # micro sign
                        ue(u'\u00bf'),          # inverted question mark
                        ue(u'\u00c0'),          # A grave
                        ue(u'\u00e9'),          # e acute
                        ue(u'\u00ff'),          # y diaeresis
                        ue(u'\u0100'),
                        ue(u'\u20ac'),          # euro sign
                        ue(u'\u1234'), ue(u'\uffff'),
                        ue(u'\U00010000'), ue(u'\U000fffff')]
        for char in chars:
                o = ordinalFromUtf8(char)
                s = ordinalToUtf8(o)
                # a single parenthesised argument keeps this line valid both
                # as a Python 2 print statement and as a Python 3 call
                print("%s %r         --> %s = %s    --> %s %r"
                        % (char, char, o, hex(o), s, s))
test()
_______________________________________________
Tutor maillist  -  Tutor@python.org
To unsubscribe or change subscription options:
http://mail.python.org/mailman/listinfo/tutor

[Tutor] unicode ordinals to/from utf8

Reply via email to