[1/4] lucy git commit: Use CharBuf to unescape JSON strings

nwellnhof Sat, 06 Aug 2016 07:42:36 -0700

Repository: lucy
Updated Branches:
  refs/heads/master 0a41b29b1 -> 7071a277c



Use CharBuf to unescape JSON strings

Also use a stricter custom decoder for hex escapes. (strtol allows
leading whitespace and plus signs.)


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/3cc03972
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/3cc03972
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/3cc03972

Branch: refs/heads/master
Commit: 3cc039724fa1862aa71333911f2bfc5c84b50f3e
Parents: 0a41b29
Author: Nick Wellnhofer <wellnho...@aevum.de>
Authored: Tue Aug 2 16:58:43 2016 +0200
Committer: Nick Wellnhofer <wellnho...@aevum.de>
Committed: Tue Aug 2 17:07:30 2016 +0200

----------------------------------------------------------------------
 core/Lucy/Util/Json.c          | 94 +++++++++++++++++++++----------------
 test/Lucy/Test/Util/TestJson.c |  3 +-
 2 files changed, 56 insertions(+), 41 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/3cc03972/core/Lucy/Util/Json.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Util/Json.c b/core/Lucy/Util/Json.c
index 6c7473d..f3e7a219 100644
--- a/core/Lucy/Util/Json.c
+++ b/core/Lucy/Util/Json.c
@@ -578,86 +578,100 @@ S_parse_string(const char **json_ptr, const char *limit) {
 
 static String*
 S_unescape_text(const char *top, const char *end) {
-    // The unescaped string will never be longer than the escaped string
-    // because only a \u escape can theoretically be too long and
-    // StrHelp_encode_utf8_char guards against sequences over 4 bytes.
-    // Therefore we can allocate once and not worry about reallocating.
     size_t cap = (size_t)(end - top) + 1;
-    char *target_buf = (char*)MALLOCATE(cap);
-    size_t target_size = 0;
-    for (const char *text = top; text < end; text++) {
+    CharBuf *cb = CB_new(cap);
+    const char *chunk = top;
+    const char *text  = top;
+
+    while (text < end) {
         if (*text != '\\') {
-            target_buf[target_size++] = *text;
+            text++;
         }
         else {
+            if (!StrHelp_utf8_valid(chunk, (size_t)(text - chunk))) {
+                DECREF(cb);
+                String *mess = MAKE_MESS("Bad UTF-8 in JSON");
+                Err_set_error(Err_new(mess));
+                return NULL;
+            }
+            CB_Cat_Trusted_Utf8(cb, chunk, (size_t)(text - chunk));
+
             // Process escape.
             text++;
             switch (*text) {
                 case '"':
-                    target_buf[target_size++] = '"';
-                    break;
                 case '\\':
-                    target_buf[target_size++] = '\\';
-                    break;
                 case '/':
-                    target_buf[target_size++] = '/';
+                    CB_Cat_Trusted_Utf8(cb, text, 1);
                     break;
                 case 'b':
-                    target_buf[target_size++] = '\b';
+                    CB_Cat_Trusted_Utf8(cb, "\b", 1);
                     break;
                 case 'f':
-                    target_buf[target_size++] = '\f';
+                    CB_Cat_Trusted_Utf8(cb, "\f", 1);
                     break;
                 case 'n':
-                    target_buf[target_size++] = '\n';
+                    CB_Cat_Trusted_Utf8(cb, "\n", 1);
                     break;
                 case 'r':
-                    target_buf[target_size++] = '\r';
+                    CB_Cat_Trusted_Utf8(cb, "\r", 1);
                     break;
                 case 't':
-                    target_buf[target_size++] = '\t';
+                    CB_Cat_Trusted_Utf8(cb, "\t", 1);
                     break;
                 case 'u': {
-                        // Copy into a temp buffer because strtol will overrun
-                        // into adjacent text data for e.g. "\uAAAA1".
-                        char temp[5] = { 0, 0, 0, 0, 0 };
-                        memcpy(temp, text + 1, 4);
-                        text += 4;
-                        char *num_end;
-                        long code_point = strtol(temp, &num_end, 16);
-                        char *temp_ptr = temp;
-                        if (num_end != temp_ptr + 4 || code_point < 0) {
-                            FREEMEM(target_buf);
-                            SET_ERROR("Invalid \\u escape", text - 5, end);
-                            return NULL;
+                        int32_t code_point = 0;
+                        for (int i = 1; i < 5; i++) {
+                            char c = text[i];
+                            int32_t digit = 0;
+                            if (c >= '0' && c <= '9') {
+                                digit = c - '0';
+                            }
+                            else if (c >= 'a' && c <= 'f') {
+                                digit = c - 'a' + 10;
+                            }
+                            else if (c >= 'A' && c <= 'F') {
+                                digit = c - 'A' + 10;
+                            }
+                            else {
+                                DECREF(cb);
+                                SET_ERROR("Invalid \\u escape", text - 1, end);
+                                return NULL;
+                            }
+                            code_point = code_point * 16 + digit;
                         }
                         if (code_point >= 0xD800 && code_point <= 0xDFFF) {
-                            FREEMEM(target_buf);
+                            DECREF(cb);
                             SET_ERROR("Surrogate pairs not supported",
-                                      text - 5, end);
+                                      text - 1, end);
                             return NULL;
                         }
-                        target_size += StrHelp_encode_utf8_char((int32_t)code_point,
-                                                                target_buf + target_size);
+                        CB_Cat_Char(cb, code_point);
+                        text += 4;
                     }
                     break;
                 default:
-                    FREEMEM(target_buf);
+                    DECREF(cb);
                     SET_ERROR("Illegal escape", text - 1, end);
                     return NULL;
             }
+
+            text++;
+            chunk = text;
         }
     }
 
-    // NULL-terminate, sanity check, then return the escaped string.
-    target_buf[target_size] = '\0';
-    if (!StrHelp_utf8_valid(target_buf, target_size)) {
-        FREEMEM(target_buf);
+    if (!StrHelp_utf8_valid(chunk, (size_t)(text - chunk))) {
+        DECREF(cb);
         String *mess = MAKE_MESS("Bad UTF-8 in JSON");
         Err_set_error(Err_new(mess));
         return NULL;
     }
-    return Str_new_steal_trusted_utf8(target_buf, target_size);
+    CB_Cat_Trusted_Utf8(cb, chunk, (size_t)(text - chunk));
+
+    String *retval = CB_Yield_String(cb);
+    DECREF(cb);
+    return retval;
 }
 
 static CFISH_INLINE bool

http://git-wip-us.apache.org/repos/asf/lucy/blob/3cc03972/test/Lucy/Test/Util/TestJson.c
----------------------------------------------------------------------
diff --git a/test/Lucy/Test/Util/TestJson.c b/test/Lucy/Test/Util/TestJson.c
index c67e292..78fa959 100644
--- a/test/Lucy/Test/Util/TestJson.c
+++ b/test/Lucy/Test/Util/TestJson.c
@@ -275,6 +275,7 @@ test_syntax_errors(TestBatchRunner *runner) {
     S_verify_bad_syntax(runner, "+1.0 ", "float with prepended plus");
     S_verify_bad_syntax(runner, "\"\\g\"", "invalid char escape");
     S_verify_bad_syntax(runner, "\"\\uAAAZ\"", "invalid \\u escape");
+    S_verify_bad_syntax(runner, "\"\\uAAA\"", "invalid \\u escape");
 }
 
 static void
@@ -342,7 +343,7 @@ void
 TestJson_Run_IMP(TestJson *self, TestBatchRunner *runner) {
     uint32_t num_tests = 105;
 #ifndef LUCY_VALGRIND
-    num_tests += 28; // FIXME: syntax errors leak memory.
+    num_tests += 30; // FIXME: syntax errors leak memory.
 #endif
     TestBatchRunner_Plan(runner, (TestBatch*)self, num_tests);

[1/4] lucy git commit: Use CharBuf to unescape JSON strings

Reply via email to