Repository: lucy
Updated Branches:
  refs/heads/master 0a41b29b1 -> 7071a277c
Use CharBuf to unescape JSON strings Also use a stricter custom decoder for hex escapes. (strtol allows leading whitespace and plus signs.) Project: http://git-wip-us.apache.org/repos/asf/lucy/repo Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/3cc03972 Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/3cc03972 Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/3cc03972 Branch: refs/heads/master Commit: 3cc039724fa1862aa71333911f2bfc5c84b50f3e Parents: 0a41b29 Author: Nick Wellnhofer <wellnho...@aevum.de> Authored: Tue Aug 2 16:58:43 2016 +0200 Committer: Nick Wellnhofer <wellnho...@aevum.de> Committed: Tue Aug 2 17:07:30 2016 +0200 ---------------------------------------------------------------------- core/Lucy/Util/Json.c | 94 +++++++++++++++++++++---------------- test/Lucy/Test/Util/TestJson.c | 3 +- 2 files changed, 56 insertions(+), 41 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucy/blob/3cc03972/core/Lucy/Util/Json.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Util/Json.c b/core/Lucy/Util/Json.c index 6c7473d..f3e7a219 100644 --- a/core/Lucy/Util/Json.c +++ b/core/Lucy/Util/Json.c @@ -578,86 +578,100 @@ S_parse_string(const char **json_ptr, const char *limit) { static String* S_unescape_text(const char *top, const char *end) { - // The unescaped string will never be longer than the escaped string - // because only a \u escape can theoretically be too long and - // StrHelp_encode_utf8_char guards against sequences over 4 bytes. - // Therefore we can allocate once and not worry about reallocating. 
size_t cap = (size_t)(end - top) + 1; - char *target_buf = (char*)MALLOCATE(cap); - size_t target_size = 0; - for (const char *text = top; text < end; text++) { + CharBuf *cb = CB_new(cap); + const char *chunk = top; + const char *text = top; + + while (text < end) { if (*text != '\\') { - target_buf[target_size++] = *text; + text++; } else { + if (!StrHelp_utf8_valid(chunk, (size_t)(text - chunk))) { + DECREF(cb); + String *mess = MAKE_MESS("Bad UTF-8 in JSON"); + Err_set_error(Err_new(mess)); + return NULL; + } + CB_Cat_Trusted_Utf8(cb, chunk, (size_t)(text - chunk)); + // Process escape. text++; switch (*text) { case '"': - target_buf[target_size++] = '"'; - break; case '\\': - target_buf[target_size++] = '\\'; - break; case '/': - target_buf[target_size++] = '/'; + CB_Cat_Trusted_Utf8(cb, text, 1); break; case 'b': - target_buf[target_size++] = '\b'; + CB_Cat_Trusted_Utf8(cb, "\b", 1); break; case 'f': - target_buf[target_size++] = '\f'; + CB_Cat_Trusted_Utf8(cb, "\f", 1); break; case 'n': - target_buf[target_size++] = '\n'; + CB_Cat_Trusted_Utf8(cb, "\n", 1); break; case 'r': - target_buf[target_size++] = '\r'; + CB_Cat_Trusted_Utf8(cb, "\r", 1); break; case 't': - target_buf[target_size++] = '\t'; + CB_Cat_Trusted_Utf8(cb, "\t", 1); break; case 'u': { - // Copy into a temp buffer because strtol will overrun - // into adjacent text data for e.g. "\uAAAA1". 
- char temp[5] = { 0, 0, 0, 0, 0 }; - memcpy(temp, text + 1, 4); - text += 4; - char *num_end; - long code_point = strtol(temp, &num_end, 16); - char *temp_ptr = temp; - if (num_end != temp_ptr + 4 || code_point < 0) { - FREEMEM(target_buf); - SET_ERROR("Invalid \\u escape", text - 5, end); - return NULL; + int32_t code_point = 0; + for (int i = 1; i < 5; i++) { + char c = text[i]; + int32_t digit = 0; + if (c >= '0' && c <= '9') { + digit = c - '0'; + } + else if (c >= 'a' && c <= 'f') { + digit = c - 'a' + 10; + } + else if (c >= 'A' && c <= 'F') { + digit = c - 'A' + 10; + } + else { + DECREF(cb); + SET_ERROR("Invalid \\u escape", text - 1, end); + return NULL; + } + code_point = code_point * 16 + digit; } if (code_point >= 0xD800 && code_point <= 0xDFFF) { - FREEMEM(target_buf); + DECREF(cb); SET_ERROR("Surrogate pairs not supported", - text - 5, end); + text - 1, end); return NULL; } - target_size += StrHelp_encode_utf8_char((int32_t)code_point, - target_buf + target_size); + CB_Cat_Char(cb, code_point); + text += 4; } break; default: - FREEMEM(target_buf); + DECREF(cb); SET_ERROR("Illegal escape", text - 1, end); return NULL; } + + text++; + chunk = text; } } - // NULL-terminate, sanity check, then return the escaped string. 
- target_buf[target_size] = '\0'; - if (!StrHelp_utf8_valid(target_buf, target_size)) { - FREEMEM(target_buf); + if (!StrHelp_utf8_valid(chunk, (size_t)(text - chunk))) { + DECREF(cb); String *mess = MAKE_MESS("Bad UTF-8 in JSON"); Err_set_error(Err_new(mess)); return NULL; } - return Str_new_steal_trusted_utf8(target_buf, target_size); + CB_Cat_Trusted_Utf8(cb, chunk, (size_t)(text - chunk)); + + String *retval = CB_Yield_String(cb); + DECREF(cb); + return retval; } static CFISH_INLINE bool http://git-wip-us.apache.org/repos/asf/lucy/blob/3cc03972/test/Lucy/Test/Util/TestJson.c ---------------------------------------------------------------------- diff --git a/test/Lucy/Test/Util/TestJson.c b/test/Lucy/Test/Util/TestJson.c index c67e292..78fa959 100644 --- a/test/Lucy/Test/Util/TestJson.c +++ b/test/Lucy/Test/Util/TestJson.c @@ -275,6 +275,7 @@ test_syntax_errors(TestBatchRunner *runner) { S_verify_bad_syntax(runner, "+1.0 ", "float with prepended plus"); S_verify_bad_syntax(runner, "\"\\g\"", "invalid char escape"); S_verify_bad_syntax(runner, "\"\\uAAAZ\"", "invalid \\u escape"); + S_verify_bad_syntax(runner, "\"\\uAAA\"", "invalid \\u escape"); } static void @@ -342,7 +343,7 @@ void TestJson_Run_IMP(TestJson *self, TestBatchRunner *runner) { uint32_t num_tests = 105; #ifndef LUCY_VALGRIND - num_tests += 28; // FIXME: syntax errors leak memory. + num_tests += 30; // FIXME: syntax errors leak memory. #endif TestBatchRunner_Plan(runner, (TestBatch*)self, num_tests);