On 3/22/22 21:17, Rosemarie O'Riorden wrote: > Hi Dumitru, > > Thank you for the review! I'm going to send a v2 shortly with some changes. > >> I'm not sure what the benefit is to have 'start' as 'const unsigned char >> *'; it could easily be 'const char *' and we wouldn't have to cast later >> on. Also, a comment might be useful, describing that this points to the >> start of the input that has been preprocessed but not yet copied to >> 'buffer'. > > I agree with this and have updated 'start' to be a const char *, which > took away the need for all those casts. I have also added a comment to > explain what p->start is. > > >> On the other hand, json_lex_keyword(p) calls json_parser_input(p, ..) >> which clears p->buffer at the end. >> >> So I thought that we should set p->start = NULL when we clear p->buffer. >> >> But json_parser_input() changes the lexer state back to JSON_LEX_START >> so p->start will be reset when json_lex_input() is called next. >> >> I don't really have a good suggestion about how to improve this, it just >> took me a bit to figure out why it's working correctly now. > > Maybe I can add a few comments to better explain what is going on. > SInce p->start is updated at the next json_lex_input call, I guess > this logic doesn't need to be changed, but if you think it does, let > me know. > > >> I keep wondering if there's a way to avoid the p->start all together. >> What if we instead pass 'start' as an argument to json_lex_input() along >> with an offset to indicate the next character to process? > > I thought about this for a while. I originally had this idea too, but > I found that it made less sense to me. If I removed start and replaced > it with an offset count, I'd need to track the offset with a pointer > that needs to be incremented for every character that is processed. > The way I have it now, p->start is only assigned once at every state > change, so it makes it quite simple in my opinion. If there is more > input on how I can improve this, please let me know. > > >>> - json_lex_input(p, ' '); >>> + p->start = space; >>> + json_lex_input(p, space); >> >> This can be: >> >> p->start = " "; >> json_lex_input(p, " "); >> > > I've updated this the way you suggested. > > > Thank you, > Rosemarie O'Riorden >
Hi Rosemarie, What about the alternative approach below (incremental on top of your patch)? It might be a bit intrusive because we explicitly inline json_lex_input() but that function was anyway used in a single place (well, actually two places but the call in json_parser_finish() can be replaced with a call to json_parser_feed()). Like this we avoid the 'start' field in 'struct json_parser' and make it all contained, in a single place, in json_parser_feed(). No need to explain how p->start is used and what it does. Thanks, Dumitru diff --git a/lib/json.c b/lib/json.c index f48cb32415bf..2b18ec550d67 100644 --- a/lib/json.c +++ b/lib/json.c @@ -101,7 +101,6 @@ struct json_parser { int line_number; int column_number; int byte_number; - const unsigned char *start; /* Parsing. */ enum json_parse_state parse_state; @@ -977,96 +976,6 @@ json_lex_string(struct json_parser *p) } } -static inline ALWAYS_INLINE bool -json_lex_input(struct json_parser *p, const unsigned char *ch) -{ - struct json_token token; - unsigned char c = *ch; - - switch (p->lex_state) { - case JSON_LEX_START: - switch (c) { - case ' ': case '\t': case '\n': case '\r': - /* Nothing to do. */ - p->start = ch + 1; - return true; - - case 'a': case 'b': case 'c': case 'd': case 'e': - case 'f': case 'g': case 'h': case 'i': case 'j': - case 'k': case 'l': case 'm': case 'n': case 'o': - case 'p': case 'q': case 'r': case 's': case 't': - case 'u': case 'v': case 'w': case 'x': case 'y': - case 'z': - p->lex_state = JSON_LEX_KEYWORD; - p->start = ch; - break; - - case '[': case '{': case ']': case '}': case ':': case ',': - token.type = c; - json_parser_input(p, &token); - p->start = ch + 1; - return true; - - case '-': - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - p->lex_state = JSON_LEX_NUMBER; - p->start = ch; - break; - - case '"': - p->lex_state = JSON_LEX_STRING; - p->start = ch + 1; - return true; - - default: - if (isprint(c)) { - json_error(p, "invalid character '%c'", c); - } else { - json_error(p, "invalid character U+%04x", c); - } - return true; - } - break; - - case JSON_LEX_KEYWORD: - if (!isalpha((unsigned char) c)) { - ds_put_buffer(&p->buffer, (const char *) p->start, ch - p->start); - json_lex_keyword(p); - return false; - } - break; - - case JSON_LEX_NUMBER: - if (!strchr(".0123456789eE-+", c)) { - ds_put_buffer(&p->buffer, (const char *) p->start, ch - p->start); - json_lex_number(p); - return false; - } - break; - - case JSON_LEX_STRING: - if (c == '\\') { - p->lex_state = JSON_LEX_ESCAPE; - } else if (c == '"') { - ds_put_buffer(&p->buffer, (const char *) p->start, ch - p->start); - json_lex_string(p); - return true; - } else if (c < 0x20) { - json_error(p, "U+%04X must be escaped in quoted string", c); - return true; - } - break; - - case JSON_LEX_ESCAPE: - p->lex_state = JSON_LEX_STRING; - break; - - default: - abort(); - } - return true; -} /* Parsing. */ @@ -1169,26 +1078,123 @@ json_parser_create(int flags) return p; } +static inline void ALWAYS_INLINE +json_parser_account_byte(struct json_parser *p, unsigned char c) +{ + p->byte_number++; + if (OVS_UNLIKELY(c == '\n')) { + p->column_number = 0; + p->line_number++; + } else { + p->column_number++; + } +} + size_t json_parser_feed(struct json_parser *p, const char *input, size_t n) { + size_t token_start = 0; size_t i; - p->start = (const unsigned char *) input; + for (i = 0; !p->done && i < n; ) { - if (json_lex_input(p, (unsigned const char *) &input[i])) { - p->byte_number++; - if (OVS_UNLIKELY(input[i] == '\n')) { - p->column_number = 0; - p->line_number++; - } else { - p->column_number++; + bool consumed = true; + + const char *start_p = &input[token_start]; + unsigned char c = input[i]; + struct json_token token; + + switch (p->lex_state) { + case JSON_LEX_START: + switch (c) { + case ' ': case '\t': case '\n': case '\r': + /* Nothing to do. */ + + token_start = i + 1; + break; + + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'l': case 'm': case 'n': case 'o': + case 'p': case 'q': case 'r': case 's': case 't': + case 'u': case 'v': case 'w': case 'x': case 'y': + case 'z': + p->lex_state = JSON_LEX_KEYWORD; + token_start = i; + break; + + case '[': case '{': case ']': case '}': case ':': case ',': + token.type = c; + json_parser_input(p, &token); + token_start = i + 1; + break; + + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + p->lex_state = JSON_LEX_NUMBER; + token_start = i; + break; + + case '"': + p->lex_state = JSON_LEX_STRING; + token_start = i + 1; + break; + + default: + if (isprint(c)) { + json_error(p, "invalid character '%c'", c); + } else { + json_error(p, "invalid character U+%04x", c); + } + break; } + break; + + case JSON_LEX_KEYWORD: + if (!isalpha((unsigned char) c)) { + ds_put_buffer(&p->buffer, start_p, i - token_start); + json_lex_keyword(p); + consumed = false; + break; + } + break; + + case JSON_LEX_NUMBER: + if (!strchr(".0123456789eE-+", c)) { + ds_put_buffer(&p->buffer, start_p, i - token_start); + json_lex_number(p); + consumed = false; + break; + } + break; + + case JSON_LEX_STRING: + if (c == '\\') { + p->lex_state = JSON_LEX_ESCAPE; + } else if (c == '"') { + ds_put_buffer(&p->buffer, start_p, i - token_start); + json_lex_string(p); + } else if (c < 0x20) { + json_error(p, "U+%04X must be escaped in quoted string", c); + } + break; + + case JSON_LEX_ESCAPE: + p->lex_state = JSON_LEX_STRING; + break; + + default: + ovs_abort(0, "unexpected lexer state"); + } + + if (consumed) { + json_parser_account_byte(p, c); i++; } } + if (!p->done) { - ds_put_buffer(&p->buffer, (const char *) p->start, - (const unsigned char *) &input[i] - p->start); + ds_put_buffer(&p->buffer, &input[token_start], i - token_start); } return i; } @@ -1203,7 +1209,6 @@ struct json * json_parser_finish(struct json_parser *p) { struct json *json; - const unsigned char *space = (const unsigned char *) " "; switch (p->lex_state) { case JSON_LEX_START: @@ -1216,8 +1221,7 @@ json_parser_finish(struct json_parser *p) case JSON_LEX_NUMBER: case JSON_LEX_KEYWORD: - p->start = space; - json_lex_input(p, space); + json_parser_feed(p, " ", 1); break; } --- _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev