Re: [ovs-dev] [PATCH v2] json: Improve string parsing

Dumitru Ceara Wed, 30 Mar 2022 02:20:55 -0700

On 3/29/22 18:51, Rosemarie O'Riorden wrote:
> To parse a json string prior to this change, json_lex_input is called
> with each character of the string. If the character needs to be copied
> to the buffer, it is copied individually. This is an expensive
> operation, as often there are multiple characters in a row
> that need to be copied, and copying memory in blocks is more efficient
> than byte by byte. To improve this, the string is now copied
> in blocks with an offset counter. A copy is performed when the parser
> state equals done.
> 
> Functions that are called for each character use a lot of CPU cycles.
> Making these functions inline greatly reduces the cycles used and
> improves overall performance. Since json_lex_input was only needed in
> one place, it doesn't have to be its own function.
> 
> There is also a conditional that checks if the current character is a
> new line, which is quite unlikely. When this was examined with perf, the
> comparison had a very high CPU cycle usage. To improve this, the
> OVS_UNLIKELY macro was used, which forces the compiler to switch the
> order of the instructions.
> 
> Here is the improvement seen in the json-string-benchmark test:
> 
>   SIZE      Q  S         BEFORE       AFTER      CHANGE
> --------------------------------------------------------
> 100000      0  0 :      0.842 ms     0.489 ms   -41.9 %
> 100000      2  1 :      0.917 ms     0.535 ms   -41.7 %
> 100000      10 1 :      1.063 ms     0.656 ms   -38.3 %
> 10000000    0  0 :     85.328 ms    49.878 ms   -41.5 %
> 10000000    2  1 :     92.555 ms    54.778 ms   -40.8 %
> 10000000    10 1 :    106.728 ms    66.735 ms   -37.5 %
> 100000000   0  0 :    955.375 ms   621.950 ms   -34.9 %
> 100000000   2  1 :   1031.700 ms   665.200 ms   -35.5 %
> 100000000   10 1 :   1189.300 ms   796.050 ms   -33.0 %
> 
> Here Q is probability (%) for a character to be a '\"' and
> S is probability (%) to be a special character ( < 32).
> 
> Signed-off-by: Rosemarie O'Riorden <rorior...@redhat.com>
> ---


Hi Rosemarie,

I just have two small style related comments but I guess, if Ilya
agrees, that those can be fixed up at commit time.  I should've
mentioned those on the previous review but I missed them, sorry.

Aside from that the code looks OK to me, thanks!

Acked-by: Dumitru Ceara <dce...@redhat.com>

Thanks,
Dumitru

> Adds a local counter so p->start is no longer needed
> Moves contents of json_lex_input into json_parser_feed
> Adds function json_parser_account_byte
> Performs copy when parser state equals done
> 
>  lib/json.c | 200 +++++++++++++++++++++++++++++------------------------
>  1 file changed, 110 insertions(+), 90 deletions(-)
> 
> diff --git a/lib/json.c b/lib/json.c
> index 720c73d94..2b18ec550 100644
> --- a/lib/json.c
> +++ b/lib/json.c
> @@ -976,88 +976,6 @@ json_lex_string(struct json_parser *p)
>      }
>  }
>  
> -static bool
> -json_lex_input(struct json_parser *p, unsigned char c)
> -{
> -    struct json_token token;
> -
> -    switch (p->lex_state) {
> -    case JSON_LEX_START:
> -        switch (c) {
> -        case ' ': case '\t': case '\n': case '\r':
> -            /* Nothing to do. */
> -            return true;
> -
> -        case 'a': case 'b': case 'c': case 'd': case 'e':
> -        case 'f': case 'g': case 'h': case 'i': case 'j':
> -        case 'k': case 'l': case 'm': case 'n': case 'o':
> -        case 'p': case 'q': case 'r': case 's': case 't':
> -        case 'u': case 'v': case 'w': case 'x': case 'y':
> -        case 'z':
> -            p->lex_state = JSON_LEX_KEYWORD;
> -            break;
> -
> -        case '[': case '{': case ']': case '}': case ':': case ',':
> -            token.type = c;
> -            json_parser_input(p, &token);
> -            return true;
> -
> -        case '-':
> -        case '0': case '1': case '2': case '3': case '4':
> -        case '5': case '6': case '7': case '8': case '9':
> -            p->lex_state = JSON_LEX_NUMBER;
> -            break;
> -
> -        case '"':
> -            p->lex_state = JSON_LEX_STRING;
> -            return true;
> -
> -        default:
> -            if (isprint(c)) {
> -                json_error(p, "invalid character '%c'", c);
> -            } else {
> -                json_error(p, "invalid character U+%04x", c);
> -            }
> -            return true;
> -        }
> -        break;
> -
> -    case JSON_LEX_KEYWORD:
> -        if (!isalpha((unsigned char) c)) {
> -            json_lex_keyword(p);
> -            return false;
> -        }
> -        break;
> -
> -    case JSON_LEX_NUMBER:
> -        if (!strchr(".0123456789eE-+", c)) {
> -            json_lex_number(p);
> -            return false;
> -        }
> -        break;
> -
> -    case JSON_LEX_STRING:
> -        if (c == '\\') {
> -            p->lex_state = JSON_LEX_ESCAPE;
> -        } else if (c == '"') {
> -            json_lex_string(p);
> -            return true;
> -        } else if (c < 0x20) {
> -            json_error(p, "U+%04X must be escaped in quoted string", c);
> -            return true;
> -        }
> -        break;
> -
> -    case JSON_LEX_ESCAPE:
> -        p->lex_state = JSON_LEX_STRING;
> -        break;
> -
> -    default:
> -        abort();
> -    }
> -    ds_put_char(&p->buffer, c);
> -    return true;
> -}
>  
>  /* Parsing. */
>  
> @@ -1160,22 +1078,124 @@ json_parser_create(int flags)
>      return p;
>  }
>  
> +static inline void ALWAYS_INLINE
> +json_parser_account_byte(struct json_parser *p, unsigned char c)
> +{
> +    p->byte_number++;
> +    if (OVS_UNLIKELY(c == '\n')) {
> +        p->column_number = 0;
> +        p->line_number++;
> +    } else {
> +        p->column_number++;
> +    }
> +}
> +

I think this function definition should go in the "static functions
section" above, after json_parser_input_string().

>  size_t
>  json_parser_feed(struct json_parser *p, const char *input, size_t n)
>  {
> +    size_t token_start = 0;
>      size_t i;
> +
>      for (i = 0; !p->done && i < n; ) {
> -        if (json_lex_input(p, input[i])) {
> -            p->byte_number++;
> -            if (input[i] == '\n') {
> -                p->column_number = 0;
> -                p->line_number++;
> -            } else {
> -                p->column_number++;
> +        bool consumed = true;
> +
> +        const char *start_p = &input[token_start];
> +        unsigned char c = input[i];
> +        struct json_token token;
> +
> +        switch (p->lex_state) {
> +        case JSON_LEX_START:
> +            switch (c) {
> +            case ' ': case '\t': case '\n': case '\r':
> +                /* Nothing to do. */
> +

Nit: empty line not really needed.

> +                token_start = i + 1;
> +                break;
> +
> +            case 'a': case 'b': case 'c': case 'd': case 'e':
> +            case 'f': case 'g': case 'h': case 'i': case 'j':
> +            case 'k': case 'l': case 'm': case 'n': case 'o':
> +            case 'p': case 'q': case 'r': case 's': case 't':
> +            case 'u': case 'v': case 'w': case 'x': case 'y':
> +            case 'z':
> +                p->lex_state = JSON_LEX_KEYWORD;
> +                token_start = i;
> +                break;
> +
> +            case '[': case '{': case ']': case '}': case ':': case ',':
> +                token.type = c;
> +                json_parser_input(p, &token);
> +                token_start = i + 1;
> +                break;
> +
> +            case '-':
> +            case '0': case '1': case '2': case '3': case '4':
> +            case '5': case '6': case '7': case '8': case '9':
> +                p->lex_state = JSON_LEX_NUMBER;
> +                token_start = i;
> +                break;
> +
> +            case '"':
> +                p->lex_state = JSON_LEX_STRING;
> +                token_start = i + 1;
> +                break;
> +
> +            default:
> +                if (isprint(c)) {
> +                    json_error(p, "invalid character '%c'", c);
> +                } else {
> +                    json_error(p, "invalid character U+%04x", c);
> +                }
> +                break;
> +            }
> +            break;
> +
> +        case JSON_LEX_KEYWORD:
> +            if (!isalpha((unsigned char) c)) {
> +                ds_put_buffer(&p->buffer, start_p, i - token_start);
> +                json_lex_keyword(p);
> +                consumed = false;
> +                break;
>              }
> +            break;
> +
> +        case JSON_LEX_NUMBER:
> +            if (!strchr(".0123456789eE-+", c)) {
> +                ds_put_buffer(&p->buffer, start_p, i - token_start);
> +                json_lex_number(p);
> +                consumed = false;
> +                break;
> +            }
> +            break;
> +
> +        case JSON_LEX_STRING:
> +            if (c == '\\') {
> +                p->lex_state = JSON_LEX_ESCAPE;
> +            } else if (c == '"') {
> +                ds_put_buffer(&p->buffer, start_p, i - token_start);
> +                json_lex_string(p);
> +            } else if (c < 0x20) {
> +                json_error(p, "U+%04X must be escaped in quoted string", c);
> +            }
> +            break;
> +
> +        case JSON_LEX_ESCAPE:
> +            p->lex_state = JSON_LEX_STRING;
> +            break;
> +
> +        default:
> +            ovs_abort(0, "unexpected lexer state");
> +        }
> +
> +        if (consumed) {
> +            json_parser_account_byte(p, c);
>              i++;
>          }
>      }
> +
> +    if (!p->done) {
> +        ds_put_buffer(&p->buffer, &input[token_start], i - token_start);
> +    }
>      return i;
>  }
>  
> @@ -1201,7 +1221,7 @@ json_parser_finish(struct json_parser *p)
>  
>      case JSON_LEX_NUMBER:
>      case JSON_LEX_KEYWORD:
> -        json_lex_input(p, ' ');
> +        json_parser_feed(p, " ", 1);
>          break;
>      }
>  

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Re: [ovs-dev] [PATCH v2] json: Improve string parsing

Reply via email to