"Carl W. Brown" wrote:
> I am checking out my UTF-8 validation rules to see if they are correct.
> Check each character to be a valid UTF-8 initial character.
> \x00 to \x7f or \xC2 to \xF4
> Allow invalid forms such as \xC0 & \xC1 to decode but consider them invalid.

Unicode 3.1 says that these should not be allowed to decode (see the first
and second notes after C12 added by UAX #27).

> A first byte of \xE0 or \xF0 with a second byte less than \xA0 is also an
> invalid form.
> \xED followed by anything >= \xA0 is an encoded surrogate and not a valid
> character.
> \xEF\xBF\xBE and \xEF\xBF\xBF are invalid Unicode characters.
> Anything greater than \xF4\x80\xBF\xBF is beyond the Unicode range.

It's arguably simpler to convert to a code point, and then check whether the
code point is valid, than to directly check that the UTF-8 encoding is valid
(see the pseudocode below for precisely what I mean).

Also, if you're converting to, say, UTF-16, then non-character sequences
like \xEF\xBF\xBE and \xEF\xBF\xBF should probably be converted to the
corresponding UTF-16 non-characters (\uFFFE and \uFFFF), rather than being
rejected. (Note: Unicode 3.1 and ISO/IEC 10646-1:2000 differ on this point;
10646 requires them to be rejected.)

Here is some C-like pseudocode for a validating converter from UTF-8 to
UTF-16. It is suitable for cases where a bijective mapping between valid
sequences is needed, provided the ALLOW_IRREGULAR flag is *not* set.

// Set STRICT_ISO10646 for strict ISO/IEC 10646-1:2000 Annex D compliance
//   (reject U+FFFE and U+FFFF).
// Set ALLOW_IRREGULAR to tolerate irregular UTF-8 sequences (that is,
//   where UTF-16 surrogates have been incorrectly treated as separate
//   characters).

/*
 * Validating converter from UTF-8 to UTF-16 (C-like pseudocode).
 *
 * "output x;" stands for emitting one UTF-16 code unit. TRUNCATED, INVALID,
 * VALID, ALLOW_IRREGULAR and STRICT_ISO10646 are defined outside this
 * function.
 *
 * utf8    - the input bytes.
 * utf8len - number of input bytes. The type must be signed because the
 *           bounds checks compute utf8len-1 and utf8len-2.
 *
 * Returns:
 *   VALID     - the whole input was consumed without error.
 *   TRUNCATED - the input ends in the middle of a multi-byte sequence.
 *   INVALID   - bad lead byte, bad continuation byte, non-shortest form,
 *               a surrogate code point that is not accepted, a code point
 *               above 0x10FFFF, or (with STRICT_ISO10646) U+FFFE / U+FFFF.
 */
int toUTF16(uint8_t * utf8, int utf8len) { // utf8len type must be signed
    uint8_t b0, b1, b2, b3;
    uint32_t codepoint, temp;
    int i;

    // i is advanced inside the body, once per byte consumed.
    for (i = 0; i < utf8len; ) {
        b0 = utf8[i++];
        if ((b0 & 0x80) == 0) {           // 0xxxxxxx
            output b0;

        } else if ((b0 & 0xE0) == 0xC0) { // 110xxxxx 10xxxxxx
            if (i >= utf8len) {           // need 1 more byte
                return TRUNCATED;
            }
            b1 = utf8[i++];
            if ((b1 & 0xC0) != 0x80) {
                return INVALID;
            }
            codepoint = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
            if (codepoint < 0x80) {
                return INVALID; // non-shortest form
            }
            output codepoint;

        } else if ((b0 & 0xF0) == 0xE0) { // 1110xxxx 10xxxxxx 10xxxxxx
            if (i >= utf8len-1) {         // need 2 more bytes
                return TRUNCATED;
            }
            b1 = utf8[i++];
            b2 = utf8[i++];
            if ((b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80) {
                return INVALID;
            }
            codepoint = ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);

            if (ALLOW_IRREGULAR && codepoint >= 0xD800 && codepoint <= 0xDBFF) {
                // Irregular form: a high surrogate encoded as its own 3-byte
                // sequence must be followed by a 3-byte low surrogate.
                if (i >= utf8len-2) {     // need 3 more bytes
                    return TRUNCATED;
                }
                b0 = utf8[i++];
                b1 = utf8[i++];
                b2 = utf8[i++];
                if ((b0 & 0xF0) != 0xE0 || (b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80) {
                    return INVALID;
                }
                temp = ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
                if (temp < 0xDC00 || temp > 0xDFFF) {
                    return INVALID;
                }
                output codepoint;
                output temp;
            } else if (codepoint < 0x800 // non-shortest form
                       || (codepoint >= 0xD800 && codepoint <= 0xDFFF)
                       || (STRICT_ISO10646 && codepoint >= 0xFFFE)) {
                return INVALID;
            } else {
                output codepoint;
            }

        } else if ((b0 & 0xF8) == 0xF0) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            if (i >= utf8len-2) {         // need 3 more bytes
                return TRUNCATED;
            }
            b1 = utf8[i++];
            b2 = utf8[i++];
            b3 = utf8[i++];
            if ((b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80) {
                return INVALID;
            }
            codepoint = ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) |
                        ((b2 & 0x3F) << 6) | (b3 & 0x3F);
            if (codepoint < 0x10000 // non-shortest form
                || codepoint > 0x10FFFF) {
                return INVALID;
            }
            // Split the supplementary code point into a surrogate pair.
            temp = codepoint - 0x10000;
            output (temp >> 10  ) + 0xD800;
            output (temp & 0x3FF) + 0xDC00;

        } else {
            // Any other lead byte (10xxxxxx or 11111xxx) cannot start a sequence.
            return INVALID;
        }
    } /* for i */

    return VALID;
}

-- 
David Hopwood <[EMAIL PROTECTED]>

Home page & PGP public key: http://www.users.zetnet.co.uk/hopwood/
RSA 2048-bit; fingerprint 71 8E A6 23 0E D3 4C E5  0F 69 8C D4 FA 66 15 01
Nothing in this message is intended to be legally binding. If I revoke a
public key but refuse to specify why, it is because the private key has been
seized under the Regulation of Investigatory Powers Act; see www.fipr.org/rip

Version: 2.6.3i
Charset: noconv


Reply via email to