Serhiy Storchaka added the comment:

Good catch Antoine!

Here is a sample of a more complicated implementation.

----------
title: Add a convert_surrogates function to "clean" surrogate escaped strings 
-> Add codecs.convert_surrogateescape to "clean" surrogate escaped strings
Added file: http://bugs.python.org/file36700/convert_surrogates.py

_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue18814>
_______________________________________
import codecs
import re

def convert_surrogates(data, errors='strict'):
    """Pass every run of surrogate code points in *data* to an error handler.

    Each maximal run of characters in U+D800..U+DFFF is reported to the
    codecs error handler registered under *errors* as a
    ``UnicodeTranslateError`` whose ``start``/``end`` delimit the run in
    *data*.  The handler's replacement string is spliced into the result
    and scanning resumes at the position the handler returns.

    Args:
        data: The string to clean.
        errors: Name of a registered codecs error handler.  It is looked
            up only when a surrogate is actually found.

    Returns:
        *data* itself when it contains no surrogates, otherwise a new
        string with every run replaced by the handler's output.

    Raises:
        UnicodeTranslateError: Raised by the handler (e.g. the built-in
            ``'strict'`` handler re-raises the error it is given).
        LookupError: If *errors* is not a registered handler name and a
            surrogate is found.
        IndexError: If the handler returns a resume position outside
            *data*.
    """
    # Surrogates are exactly U+D800..U+DFFF.  A wider class would also
    # catch private-use characters (U+E000 and up), which are valid text.
    pattern = re.compile(r'[\ud800-\udfff]+')
    match = pattern.search(data)
    if match is None:
        # Nothing to convert: hand back the very same object.
        return data

    handler = codecs.lookup_error(errors)
    size = len(data)
    parts = []
    pos = 0
    while match is not None:
        parts.append(data[pos:match.start()])
        repl, pos = handler(UnicodeTranslateError(
            data, match.start(), match.end(), 'lone surrogates'))
        # Follow the codecs handler protocol: a negative position is
        # relative to the end of the input, anything else out of range is
        # an error.  Passing a negative value straight to search() would
        # restart from 0 and never terminate.
        if pos < 0:
            pos += size
        if not 0 <= pos <= size:
            raise IndexError(
                'position %d from error handler out of bounds' % pos)
        parts.append(repl)
        match = pattern.search(data, pos)
    parts.append(data[pos:])
    return ''.join(parts)

def convert_surrogateescape(data, errors='strict'):
    """Re-handle bytes that were smuggled into *data* via surrogateescape.

    Each maximal run of surrogate code points (U+D800..U+DFFF) is turned
    back into the bytes it stands for by encoding it with the
    ``surrogateescape`` handler.  Those bytes are then reported to the
    codecs error handler registered under *errors* as a
    ``UnicodeDecodeError`` covering the whole byte run, and the handler's
    replacement string is spliced into the result.

    Args:
        data: The string to clean.
        errors: Name of a registered codecs error handler.  It is looked
            up only when a surrogate is actually found.

    Returns:
        *data* itself when it contains no surrogates, otherwise a new
        string with every run replaced by the handler's output.

    Raises:
        UnicodeTranslateError: If a run contains a surrogate that
            surrogateescape cannot encode (outside U+DC80..U+DCFF), or if
            the handler raises ``UnicodeDecodeError``; positions are
            translated back to offsets in *data*.
        LookupError: If *errors* is not a registered handler name and a
            surrogate is found.
        IndexError: If the handler returns a resume position outside the
            byte run it was given.
    """
    # Surrogates are exactly U+D800..U+DFFF.  A wider class would also
    # catch private-use characters (U+E000 and up), which can never be
    # surrogateescape-encoded and would therefore always be rejected.
    pattern = re.compile(r'[\ud800-\udfff]+')
    match = pattern.search(data)
    if match is None:
        # Nothing to convert: hand back the very same object.
        return data

    handler = codecs.lookup_error(errors)
    parts = []
    pos = 0
    while match is not None:
        start = match.start()
        parts.append(data[pos:start])
        try:
            # One escaped surrogate maps to exactly one byte, so offsets
            # into baddata equal offsets into the matched run.
            baddata = match.group().encode('ascii', 'surrogateescape')
        except UnicodeEncodeError as err:
            raise UnicodeTranslateError(
                data, err.start + start, err.end + start,
                r'surrogates not in range \udc80-\udcff') from None
        try:
            repl, offset = handler(UnicodeDecodeError(
                'unicode', baddata, 0, len(baddata), 'lone surrogates'))
        except UnicodeDecodeError as err:
            # Re-express the failure in terms of the caller's string.
            raise UnicodeTranslateError(
                data, err.start + start, err.end + start,
                err.reason) from None
        # Follow the codecs handler protocol: a negative position is
        # relative to the end of the input the handler saw (baddata),
        # anything else out of range is an error.
        if offset < 0:
            offset += len(baddata)
        if not 0 <= offset <= len(baddata):
            raise IndexError(
                'position %d from error handler out of bounds' % offset)
        # The handler may consume only part of the run (surrogateescape
        # takes at most a few bytes at a time); resume right after it.
        pos = start + offset
        parts.append(repl)
        match = pattern.search(data, pos)
    parts.append(data[pos:])
    return ''.join(parts)
_______________________________________________
Python-bugs-list mailing list
Unsubscribe: 
https://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com

Reply via email to