Stop using UTF8_COUNT array. For most UTF-8 input, especially ASCII text, it's faster to branch on the lead byte than to incur the additional memory load of a table lookup.
Project: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/repo Commit: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/commit/64a10008 Tree: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/tree/64a10008 Diff: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/diff/64a10008 Branch: refs/heads/master Commit: 64a100083d473b498e4fa524360156c47d7008bc Parents: 20d2820 Author: Nick Wellnhofer <wellnho...@aevum.de> Authored: Tue Aug 2 18:22:43 2016 +0200 Committer: Nick Wellnhofer <wellnho...@aevum.de> Committed: Tue Aug 2 18:22:43 2016 +0200 ---------------------------------------------------------------------- runtime/core/Clownfish/String.c | 5 +- runtime/core/Clownfish/Util/StringHelper.c | 89 +++++++++++++------------ 2 files changed, 49 insertions(+), 45 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/64a10008/runtime/core/Clownfish/String.c ---------------------------------------------------------------------- diff --git a/runtime/core/Clownfish/String.c b/runtime/core/Clownfish/String.c index 767a34e..0353ffd 100644 --- a/runtime/core/Clownfish/String.c +++ b/runtime/core/Clownfish/String.c @@ -690,7 +690,10 @@ StrIter_Advance_IMP(StringIterator *self, size_t num) { break; } uint8_t first_byte = ptr[byte_offset]; - byte_offset += StrHelp_UTF8_COUNT[first_byte]; + if (first_byte < 0x80) { byte_offset += 1; } + else if (first_byte < 0xE0) { byte_offset += 2; } + else if (first_byte < 0xF0) { byte_offset += 3; } + else { byte_offset += 4; } ++num_skipped; } http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/64a10008/runtime/core/Clownfish/Util/StringHelper.c ---------------------------------------------------------------------- diff --git a/runtime/core/Clownfish/Util/StringHelper.c b/runtime/core/Clownfish/Util/StringHelper.c index 4af9d84..256c9e0 100644 --- a/runtime/core/Clownfish/Util/StringHelper.c +++ 
b/runtime/core/Clownfish/Util/StringHelper.c @@ -87,54 +87,55 @@ S_find_invalid_utf8(const uint8_t *string, size_t size) { while (string < end) { const uint8_t *start = string; const uint8_t header_byte = *string++; - int count = StrHelp_UTF8_COUNT[header_byte] & 0x7; - switch (count & 0x7) { - case 1: - // ASCII - break; - case 2: - if (string == end) { return start; } - // Disallow non-shortest-form ASCII. - if (!(header_byte & 0x1E)) { return start; } - if ((*string++ & 0xC0) != 0x80) { return start; } - break; - case 3: - if (end - string < 2) { return start; } - if (header_byte == 0xED) { - // Disallow UTF-16 surrogates. - if (*string < 0x80 || *string > 0x9F) { - return start; - } + + if (header_byte < 0x80) { + // ASCII + ; + } + else if (header_byte < 0xE0) { + // Disallow non-shortest-form ASCII and continuation bytes. + if (header_byte < 0xC2) { return start; } + // Two-byte sequence. + if (string == end) { return start; } + if ((*string++ & 0xC0) != 0x80) { return start; } + } + else if (header_byte < 0xF0) { + // Three-byte sequence. + if (end - string < 2) { return start; } + if (header_byte == 0xED) { + // Disallow UTF-16 surrogates. + if (*string < 0x80 || *string > 0x9F) { + return start; } - else if (!(header_byte & 0x0F)) { - // Disallow non-shortest-form. - if (!(*string & 0x20)) { - return start; - } + } + else if (!(header_byte & 0x0F)) { + // Disallow non-shortest-form. + if (!(*string & 0x20)) { + return start; } - if ((*string++ & 0xC0) != 0x80) { return start; } - if ((*string++ & 0xC0) != 0x80) { return start; } - break; - case 4: - if (end - string < 3) { return start; } - if (!(header_byte & 0x07)) { - // Disallow non-shortest-form. - if (!(*string & 0x30)) { - return start; - } + } + if ((*string++ & 0xC0) != 0x80) { return start; } + if ((*string++ & 0xC0) != 0x80) { return start; } + } + else { + if (header_byte > 0xF4) { return start; } + // Four-byte sequence. 
+ if (end - string < 3) { return start; } + if (!(header_byte & 0x07)) { + // Disallow non-shortest-form. + if (!(*string & 0x30)) { + return start; } - else if (header_byte == 0xF4) { - // Code point larger than 0x10FFFF. - if (*string >= 0x90) { - return start; - } + } + else if (header_byte == 0xF4) { + // Code point larger than 0x10FFFF. + if (*string >= 0x90) { + return start; } - if ((*string++ & 0xC0) != 0x80) { return start; } - if ((*string++ & 0xC0) != 0x80) { return start; } - if ((*string++ & 0xC0) != 0x80) { return start; } - break; - default: - return start; + } + if ((*string++ & 0xC0) != 0x80) { return start; } + if ((*string++ & 0xC0) != 0x80) { return start; } + if ((*string++ & 0xC0) != 0x80) { return start; } } }