On 01/08/2013 03:12 PM, Andrew Dunstan wrote:
On 01/08/2013 09:58 AM, Andrew Dunstan wrote:
If you have such a datum, parsing it involves having it in memory and
then taking a copy (I wonder if we could avoid that step - will take
a look).
Here is a proof-of-concept patch against my development tip showing what's
involved in getting the JSON lexer not to need a NUL-terminated string
to parse. This passes regression, incidentally. The downside is that
processing is very slightly more complex, and that json_in() would
need to call strlen() on its input. The upside would be that the
processing routines I've been working on would no longer need to
create copies of their json arguments using text_to_cstring() just so
they can get a NUL-terminated string to process.
Consequent changes would modify the signature of makeJsonLexContext()
so its first argument would be a text* instead of a char* (and of
course its logic would change accordingly).
I could go either way. Thoughts?
this time with patch ...
*** a/src/backend/utils/adt/json.c
--- b/src/backend/utils/adt/json.c
***************
*** 212,217 **** makeJsonLexContext(char *json, bool need_escapes)
--- 212,218 ----
lex->input = lex->token_terminator = lex->line_start = json;
lex->line_number = 1;
+ lex->input_length = strlen(json);
if (need_escapes)
lex->strval = makeStringInfo();
return lex;
***************
*** 398,416 **** static void
json_lex(JsonLexContext *lex)
{
char *s;
!
/* Skip leading whitespace. */
s = lex->token_terminator;
! while (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r')
{
if (*s == '\n')
++lex->line_number;
++s;
}
lex->token_start = s;
/* Determine token type. */
! if (*s == '\0')
{
lex->token_start = NULL;
lex->prev_token_terminator = lex->token_terminator;
--- 399,420 ----
json_lex(JsonLexContext *lex)
{
char *s;
! int len;
/* Skip leading whitespace. */
s = lex->token_terminator;
! len = s - lex->input;
! while (len < lex->input_length &&
! (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r'))
{
if (*s == '\n')
++lex->line_number;
++s;
+ ++len;
}
lex->token_start = s;
/* Determine token type. */
! if (len >= lex->input_length)
{
lex->token_start = NULL;
lex->prev_token_terminator = lex->token_terminator;
***************
*** 476,482 **** json_lex(JsonLexContext *lex)
* whole word as an unexpected token, rather than just some
* unintuitive prefix thereof.
*/
! for (p = s; JSON_ALPHANUMERIC_CHAR(*p); p++)
/* skip */ ;
/*
--- 480,486 ----
* whole word as an unexpected token, rather than just some
* unintuitive prefix thereof.
*/
! for (p = s; JSON_ALPHANUMERIC_CHAR(*p) && p - s < lex->input_length - len; p++)
/* skip */ ;
/*
***************
*** 519,539 **** static void
json_lex_string(JsonLexContext *lex)
{
char *s;
!
if (lex->strval != NULL)
resetStringInfo(lex->strval);
! for (s = lex->token_start + 1; *s != '"'; s++)
{
! /* Per RFC4627, these characters MUST be escaped. */
! if ((unsigned char) *s < 32)
{
! /* A NUL byte marks the (premature) end of the string. */
! if (*s == '\0')
! {
! lex->token_terminator = s;
! report_invalid_token(lex);
! }
/* Since *s isn't printable, exclude it from the context string */
lex->token_terminator = s;
ereport(ERROR,
--- 523,545 ----
json_lex_string(JsonLexContext *lex)
{
char *s;
! int len;
if (lex->strval != NULL)
resetStringInfo(lex->strval);
! len = lex->token_start - lex->input;
! len++;
! for (s = lex->token_start + 1; *s != '"'; s++, len++)
{
! /* Premature end of the string. */
! if (len >= lex->input_length)
{
! lex->token_terminator = s;
! report_invalid_token(lex);
! }
! else if ((unsigned char) *s < 32)
! {
! /* Per RFC4627, these characters MUST be escaped. */
/* Since *s isn't printable, exclude it from the context string */
lex->token_terminator = s;
ereport(ERROR,
***************
*** 547,553 **** json_lex_string(JsonLexContext *lex)
{
/* OK, we have an escape character. */
s++;
! if (*s == '\0')
{
lex->token_terminator = s;
report_invalid_token(lex);
--- 553,560 ----
{
/* OK, we have an escape character. */
s++;
! len++;
! if (len >= lex->input_length)
{
lex->token_terminator = s;
report_invalid_token(lex);
***************
*** 560,566 **** json_lex_string(JsonLexContext *lex)
for (i = 1; i <= 4; i++)
{
s++;
! if (*s == '\0')
{
lex->token_terminator = s;
report_invalid_token(lex);
--- 567,574 ----
for (i = 1; i <= 4; i++)
{
s++;
! len++;
! if (len >= lex->input_length)
{
lex->token_terminator = s;
report_invalid_token(lex);
***************
*** 690,696 **** json_lex_number(JsonLexContext *lex, char *s)
--- 698,706 ----
{
bool error = false;
char *p;
+ int len;
+ len = s - lex->input;
/* Part (1): leading sign indicator. */
/* Caller already did this for us; so do nothing. */
***************
*** 702,741 **** json_lex_number(JsonLexContext *lex, char *s)
do
{
s++;
! } while (*s >= '0' && *s <= '9');
}
else
error = true;
/* Part (3): parse optional decimal portion. */
! if (*s == '.')
{
s++;
! if (*s < '0' || *s > '9')
error = true;
else
{
do
{
s++;
! } while (*s >= '0' && *s <= '9');
}
}
/* Part (4): parse optional exponent. */
! if (*s == 'e' || *s == 'E')
{
s++;
! if (*s == '+' || *s == '-')
s++;
! if (*s < '0' || *s > '9')
error = true;
else
{
do
{
s++;
! } while (*s >= '0' && *s <= '9');
}
}
--- 712,759 ----
do
{
s++;
! len++;
! } while (*s >= '0' && *s <= '9' && len < lex->input_length);
}
else
error = true;
/* Part (3): parse optional decimal portion. */
! if (len < lex->input_length && *s == '.')
{
s++;
! len++;
! if (len == lex->input_length || *s < '0' || *s > '9')
error = true;
else
{
do
{
s++;
! len++;
! } while (*s >= '0' && *s <= '9' && len < lex->input_length);
}
}
/* Part (4): parse optional exponent. */
! if (len < lex->input_length && (*s == 'e' || *s == 'E'))
{
s++;
! len++;
! if (len < lex->input_length && (*s == '+' || *s == '-'))
! {
s++;
! len++;
! }
! if (len == lex->input_length || *s < '0' || *s > '9')
error = true;
else
{
do
{
s++;
! len++;
! } while (len < lex->input_length && *s >= '0' && *s <= '9');
}
}
***************
*** 744,750 **** json_lex_number(JsonLexContext *lex, char *s)
* here should be considered part of the token for error-reporting
* purposes.
*/
! for (p = s; JSON_ALPHANUMERIC_CHAR(*p); p++)
error = true;
lex->prev_token_terminator = lex->token_terminator;
lex->token_terminator = p;
--- 762,768 ----
* here should be considered part of the token for error-reporting
* purposes.
*/
! for (p = s; JSON_ALPHANUMERIC_CHAR(*p) && len < lex->input_length; p++, len++)
error = true;
lex->prev_token_terminator = lex->token_terminator;
lex->token_terminator = p;
*** a/src/include/utils/jsonapi.h
--- b/src/include/utils/jsonapi.h
***************
*** 36,41 **** typedef enum
--- 36,42 ----
typedef struct JsonLexContext
{
char *input;
+ int input_length;
char *token_start;
char *token_terminator;
char *prev_token_terminator;
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers