[3/6] lucy-clownfish git commit: Stop using UTF8_COUNT array

nwellnhof Sat, 06 Aug 2016 07:29:46 -0700

Stop using UTF8_COUNT array

For most UTF-8 input, especially ASCII text, it's faster to avoid
an additional memory load.



Project: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/commit/64a10008
Tree: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/tree/64a10008
Diff: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/diff/64a10008

Branch: refs/heads/master
Commit: 64a100083d473b498e4fa524360156c47d7008bc
Parents: 20d2820
Author: Nick Wellnhofer <wellnho...@aevum.de>
Authored: Tue Aug 2 18:22:43 2016 +0200
Committer: Nick Wellnhofer <wellnho...@aevum.de>
Committed: Tue Aug 2 18:22:43 2016 +0200

----------------------------------------------------------------------
 runtime/core/Clownfish/String.c            |  5 +-
 runtime/core/Clownfish/Util/StringHelper.c | 89 +++++++++++++------------
 2 files changed, 49 insertions(+), 45 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/64a10008/runtime/core/Clownfish/String.c
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/String.c b/runtime/core/Clownfish/String.c
index 767a34e..0353ffd 100644
--- a/runtime/core/Clownfish/String.c
+++ b/runtime/core/Clownfish/String.c
@@ -690,7 +690,10 @@ StrIter_Advance_IMP(StringIterator *self, size_t num) {
             break;
         }
         uint8_t first_byte = ptr[byte_offset];
-        byte_offset += StrHelp_UTF8_COUNT[first_byte];
+        if      (first_byte < 0x80) { byte_offset += 1; }
+        else if (first_byte < 0xE0) { byte_offset += 2; }
+        else if (first_byte < 0xF0) { byte_offset += 3; }
+        else                        { byte_offset += 4; }
         ++num_skipped;
     }
 

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/64a10008/runtime/core/Clownfish/Util/StringHelper.c
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/Util/StringHelper.c b/runtime/core/Clownfish/Util/StringHelper.c
index 4af9d84..256c9e0 100644
--- a/runtime/core/Clownfish/Util/StringHelper.c
+++ b/runtime/core/Clownfish/Util/StringHelper.c
@@ -87,54 +87,55 @@ S_find_invalid_utf8(const uint8_t *string, size_t size) {
     while (string < end) {
         const uint8_t *start = string;
         const uint8_t header_byte = *string++;
-        int count = StrHelp_UTF8_COUNT[header_byte] & 0x7;
-        switch (count & 0x7) {
-            case 1:
-                // ASCII
-                break;
-            case 2:
-                if (string == end)              { return start; }
-                // Disallow non-shortest-form ASCII.
-                if (!(header_byte & 0x1E))      { return start; }
-                if ((*string++ & 0xC0) != 0x80) { return start; }
-                break;
-            case 3:
-                if (end - string < 2)           { return start; }
-                if (header_byte == 0xED) {
-                    // Disallow UTF-16 surrogates.
-                    if (*string < 0x80 || *string > 0x9F) {
-                        return start;
-                    }
+
+        if (header_byte < 0x80) {
+            // ASCII
+            ;
+        }
+        else if (header_byte < 0xE0) {
+            // Disallow non-shortest-form ASCII and continuation bytes.
+            if (header_byte < 0xC2)         { return start; }
+            // Two-byte sequence.
+            if (string == end)              { return start; }
+            if ((*string++ & 0xC0) != 0x80) { return start; }
+        }
+        else if (header_byte < 0xF0) {
+            // Three-byte sequence.
+            if (end - string < 2)           { return start; }
+            if (header_byte == 0xED) {
+                // Disallow UTF-16 surrogates.
+                if (*string < 0x80 || *string > 0x9F) {
+                    return start;
                 }
-                else if (!(header_byte & 0x0F)) {
-                    // Disallow non-shortest-form.
-                    if (!(*string & 0x20)) {
-                        return start;
-                    }
+            }
+            else if (!(header_byte & 0x0F)) {
+                // Disallow non-shortest-form.
+                if (!(*string & 0x20)) {
+                    return start;
                 }
-                if ((*string++ & 0xC0) != 0x80) { return start; }
-                if ((*string++ & 0xC0) != 0x80) { return start; }
-                break;
-            case 4:
-                if (end - string < 3)           { return start; }
-                if (!(header_byte & 0x07)) {
-                    // Disallow non-shortest-form.
-                    if (!(*string & 0x30)) {
-                        return start;
-                    }
+            }
+            if ((*string++ & 0xC0) != 0x80) { return start; }
+            if ((*string++ & 0xC0) != 0x80) { return start; }
+        }
+        else {
+            if (header_byte > 0xF4)         { return start; }
+            // Four-byte sequence.
+            if (end - string < 3)           { return start; }
+            if (!(header_byte & 0x07)) {
+                // Disallow non-shortest-form.
+                if (!(*string & 0x30)) {
+                    return start;
                 }
-                else if (header_byte == 0xF4) {
-                    // Code point larger than 0x10FFFF.
-                    if (*string >= 0x90) {
-                        return start;
-                    }
+            }
+            else if (header_byte == 0xF4) {
+                // Code point larger than 0x10FFFF.
+                if (*string >= 0x90) {
+                    return start;
                 }
-                if ((*string++ & 0xC0) != 0x80) { return start; }
-                if ((*string++ & 0xC0) != 0x80) { return start; }
-                if ((*string++ & 0xC0) != 0x80) { return start; }
-                break;
-            default:
-                return start;
+            }
+            if ((*string++ & 0xC0) != 0x80) { return start; }
+            if ((*string++ & 0xC0) != 0x80) { return start; }
+            if ((*string++ & 0xC0) != 0x80) { return start; }
         }
     }

[3/6] lucy-clownfish git commit: Stop using UTF8_COUNT array

Reply via email to