Special offer for coders coding on Christmas day! I'm looking for the simplest way to decode/encode Unicode ordinals (called 'codes' below) to/from UTF-8. I find this rather tricky, especially because of the variable number of meaningful bits in the first octet. Specifically, for encoding, is there a way to avoid passing through binary (and back)? Below is what I have so far (test by converting to UTF-8 & back ;-).
# Denis
# ________________________________
# la vita e estrany
# http://spir.wikidot.com/
# =========================================
# coding: utf8
"""Convert single Unicode ordinals to and from their UTF-8 byte sequence.

The helpers work on byte strings (``str`` on Python 2, ``bytes`` on
Python 3) and use only shifts and masks -- no detour through binary text.
"""

import sys

end = sys.exit

# Marker bits of the lead byte, keyed by total sequence length:
# 110xxxxx for 2 bytes, 1110xxxx for 3 bytes, 11110xxx for 4 bytes.
sizes_to_values = {2: 0xC0, 3: 0xE0, 4: 0xF0}


def ordinalFromUtf8(s):
    """Return the Unicode ordinal encoded by the UTF-8 byte sequence *s*.

    *s* must be a byte string holding exactly one encoded character
    (1 to 4 bytes).

    Raises ValueError if *s* is empty, longer than 4 bytes, starts with a
    lead byte that does not match its length, or contains a continuation
    byte that is not of the form 10xxxxxx.  Overlong encodings are not
    detected.
    """
    # bytearray yields ints when indexed on both Python 2 and Python 3.
    data = bytearray(s)
    n = len(data)
    if n == 0:
        raise ValueError("empty UTF-8 sequence")
    byte0 = data[0]

    # case ASCII
    if n == 1:
        if byte0 >= 0x80:
            raise ValueError("invalid single-byte UTF-8 sequence: %r" % (s,))
        return byte0

    # case else
    if n not in sizes_to_values:
        raise ValueError("UTF-8 sequence too long (%d bytes): %r" % (n, s))
    marker = sizes_to_values[n]
    # The lead byte of an n-byte sequence is n one-bits followed by a zero
    # bit, so its top n+1 bits must be exactly the marker.
    lead_mask = (0xFF << (7 - n)) & 0xFF
    if byte0 & lead_mask != marker:
        raise ValueError(
            "lead byte does not announce %d bytes: %r" % (n, s))

    # Payload bits of the lead byte, then 6 more bits per following byte.
    ordinal = byte0 - marker
    for byte in data[1:]:
        if byte & 0xC0 != 0x80:
            raise ValueError("invalid continuation byte in %r" % (s,))
        ordinal = (ordinal << 6) | (byte & 0x3F)
    return ordinal


def ordinalToUtf8(o):
    """Return the UTF-8 byte string encoding the Unicode ordinal *o*.

    Raises ValueError if *o* lies outside 0..0x10FFFF.  Surrogate ordinals
    (0xD800-0xDFFF) are encoded like any other 3-byte value.
    """
    if o < 0 or o > 0x10FFFF:
        raise ValueError("not a Unicode ordinal: %r" % (o,))

    # case ASCII
    if o < 0x80:
        return bytes(bytearray([o]))

    # case else: the magnitude alone decides the sequence length.
    if o < 0x800:
        n = 2
    elif o < 0x10000:
        n = 3
    else:
        n = 4

    # Peel off 6 bits at a time, least significant first; each trailing
    # octet is '10' followed by those 6 bits.
    octets = []
    for _ in range(n - 1):
        octets.append(0x80 | (o & 0x3F))
        o >>= 6
    # Whatever is left fits in the free bits of the lead byte.
    octets.append(sizes_to_values[n] | o)
    octets.reverse()
    return bytes(bytearray(octets))


def test():
    """Decode and re-encode sample characters, printing and checking each."""
    # ASCII, latin, BMP, >BMP
    samples = [
        u'\n', u'\t', u' ', u'A', u'a', u'~',
        u'\u00a0', u'\u00a3', u'\u00b5', u'\u00bf',
        u'\u00c0', u'\u00e9', u'\u00ff',
        u'\u0100', u'\u20ac', u'\u1234', u'\uffff',
        u'\U00010000', u'\U000fffff',
    ]
    for sample in samples:
        char = sample.encode('utf8')
        o = ordinalFromUtf8(char)
        s = ordinalToUtf8(o)
        # A single parenthesised argument prints the same on Python 2 and 3.
        print("%r  --> %d = %s  --> %r" % (char, o, hex(o), s))
        assert s == char, "round trip failed for %r" % (char,)


if __name__ == "__main__":
    test()

# _______________________________________________
# Tutor maillist - Tutor@python.org
# To unsubscribe or change subscription options:
# http://mail.python.org/mailman/listinfo/tutor