On 01/08/2013 03:12 PM, Andrew Dunstan wrote:

On 01/08/2013 09:58 AM, Andrew Dunstan wrote:

If you have such a datum, parsing it involves having it in memory and then taking a copy (I wonder if we could avoid that step - will take a look).


Here is a proof-of-concept patch against my development tip showing what's involved in getting the JSON lexer not to need a nul-terminated string to parse. This passes regression, incidentally. The downside is that processing is very slightly more complex, and that json_in() would need to call strlen() on its input. The upside would be that the processing routines I've been working on would no longer need to create copies of their json arguments using text_to_cstring() just so they can get a nul-terminated string to process.

Consequent changes would modify the signature of makeJsonLexContext() so its first argument would be a text* instead of a char* (and of course its logic would change accordingly).

I could go either way. Thoughts?



this time with patch ...



*** a/src/backend/utils/adt/json.c
--- b/src/backend/utils/adt/json.c
***************
*** 212,217 **** makeJsonLexContext(char *json, bool need_escapes)
--- 212,218 ----
  
  	lex->input = lex->token_terminator = lex->line_start = json;
  	lex->line_number = 1;
+ 	lex->input_length = strlen(json);
  	if (need_escapes)
  		lex->strval = makeStringInfo();
  	return lex;
***************
*** 398,416 **** static void
  json_lex(JsonLexContext *lex)
  {
  	char	   *s;
! 
  	/* Skip leading whitespace. */
  	s = lex->token_terminator;
! 	while (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r')
  	{
  		if (*s == '\n')
  			++lex->line_number;
  		++s;
  	}
  	lex->token_start = s;
  
  	/* Determine token type. */
! 	if (*s == '\0')
  	{
  		lex->token_start = NULL;
  		lex->prev_token_terminator = lex->token_terminator;
--- 399,420 ----
  json_lex(JsonLexContext *lex)
  {
  	char	   *s;
! 	int         len;
  	/* Skip leading whitespace. */
  	s = lex->token_terminator;
! 	len = s - lex->input;
! 	while (len < lex->input_length &&
! 		   (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r'))
  	{
  		if (*s == '\n')
  			++lex->line_number;
  		++s;
+ 		++len;
  	}
  	lex->token_start = s;
  
  	/* Determine token type. */
! 	if (len >= lex->input_length)
  	{
  		lex->token_start = NULL;
  		lex->prev_token_terminator = lex->token_terminator;
***************
*** 476,482 **** json_lex(JsonLexContext *lex)
  		 * whole word as an unexpected token, rather than just some
  		 * unintuitive prefix thereof.
  		 */
! 		for (p = s; JSON_ALPHANUMERIC_CHAR(*p); p++)
  			 /* skip */ ;
  
  		/*
--- 480,486 ----
  		 * whole word as an unexpected token, rather than just some
  		 * unintuitive prefix thereof.
  		 */
! 		for (p = s; JSON_ALPHANUMERIC_CHAR(*p) && p - s < lex->input_length - len; p++)
  			 /* skip */ ;
  
  		/*
***************
*** 519,539 **** static void
  json_lex_string(JsonLexContext *lex)
  {
  	char	   *s;
! 
  	if (lex->strval != NULL)
  		resetStringInfo(lex->strval);
  
! 	for (s = lex->token_start + 1; *s != '"'; s++)
  	{
! 		/* Per RFC4627, these characters MUST be escaped. */
! 		if ((unsigned char) *s < 32)
  		{
! 			/* A NUL byte marks the (premature) end of the string. */
! 			if (*s == '\0')
! 			{
! 				lex->token_terminator = s;
! 				report_invalid_token(lex);
! 			}
  			/* Since *s isn't printable, exclude it from the context string */
  			lex->token_terminator = s;
  			ereport(ERROR,
--- 523,545 ----
  json_lex_string(JsonLexContext *lex)
  {
  	char	   *s;
! 	int         len;
  	if (lex->strval != NULL)
  		resetStringInfo(lex->strval);
  
! 	len = lex->token_start - lex->input;
! 	len++;
! 	for (s = lex->token_start + 1; *s != '"'; s++, len++)
  	{
! 		/* Premature end of the string. */
! 		if (len >= lex->input_length)
  		{
! 			lex->token_terminator = s;
! 			report_invalid_token(lex);
! 		}
! 		else if ((unsigned char) *s < 32)
! 		{
! 			/* Per RFC4627, these characters MUST be escaped. */
  			/* Since *s isn't printable, exclude it from the context string */
  			lex->token_terminator = s;
  			ereport(ERROR,
***************
*** 547,553 **** json_lex_string(JsonLexContext *lex)
  		{
  			/* OK, we have an escape character. */
  			s++;
! 			if (*s == '\0')
  			{
  				lex->token_terminator = s;
  				report_invalid_token(lex);
--- 553,560 ----
  		{
  			/* OK, we have an escape character. */
  			s++;
! 			len++;
! 			if (len >= lex->input_length)
  			{
  				lex->token_terminator = s;
  				report_invalid_token(lex);
***************
*** 560,566 **** json_lex_string(JsonLexContext *lex)
  				for (i = 1; i <= 4; i++)
  				{
  					s++;
! 					if (*s == '\0')
  					{
  						lex->token_terminator = s;
  						report_invalid_token(lex);
--- 567,574 ----
  				for (i = 1; i <= 4; i++)
  				{
  					s++;
! 					len++;
! 					if (len >= lex->input_length)
  					{
  						lex->token_terminator = s;
  						report_invalid_token(lex);
***************
*** 690,696 **** json_lex_number(JsonLexContext *lex, char *s)
--- 698,706 ----
  {
  	bool		error = false;
  	char	   *p;
+ 	int         len;
  
+ 	len = s - lex->input;
  	/* Part (1): leading sign indicator. */
  	/* Caller already did this for us; so do nothing. */
  
***************
*** 702,741 **** json_lex_number(JsonLexContext *lex, char *s)
  		do
  		{
  			s++;
! 		} while (*s >= '0' && *s <= '9');
  	}
  	else
  		error = true;
  
  	/* Part (3): parse optional decimal portion. */
! 	if (*s == '.')
  	{
  		s++;
! 		if (*s < '0' || *s > '9')
  			error = true;
  		else
  		{
  			do
  			{
  				s++;
! 			} while (*s >= '0' && *s <= '9');
  		}
  	}
  
  	/* Part (4): parse optional exponent. */
! 	if (*s == 'e' || *s == 'E')
  	{
  		s++;
! 		if (*s == '+' || *s == '-')
  			s++;
! 		if (*s < '0' || *s > '9')
  			error = true;
  		else
  		{
  			do
  			{
  				s++;
! 			} while (*s >= '0' && *s <= '9');
  		}
  	}
  
--- 712,759 ----
  		do
  		{
  			s++;
! 			len++;
! 		} while (*s >= '0' && *s <= '9' && len < lex->input_length);
  	}
  	else
  		error = true;
  
  	/* Part (3): parse optional decimal portion. */
! 	if (len < lex->input_length && *s == '.')
  	{
  		s++;
! 		len++;
! 		if (len == lex->input_length || *s < '0' || *s > '9')
  			error = true;
  		else
  		{
  			do
  			{
  				s++;
! 				len++;
! 			} while (*s >= '0' && *s <= '9' && len < lex->input_length);
  		}
  	}
  
  	/* Part (4): parse optional exponent. */
! 	if (len < lex->input_length && (*s == 'e' || *s == 'E'))
  	{
  		s++;
! 		len++;
! 		if (len < lex->input_length && (*s == '+' || *s == '-'))
! 		{
  			s++;
! 			len++;
! 		}
! 		if (len == lex->input_length || *s < '0' || *s > '9')
  			error = true;
  		else
  		{
  			do
  			{
  				s++;
! 				len++;
! 			} while (len < lex->input_length && *s >= '0' && *s <= '9');
  		}
  	}
  
***************
*** 744,750 **** json_lex_number(JsonLexContext *lex, char *s)
  	 * here should be considered part of the token for error-reporting
  	 * purposes.
  	 */
! 	for (p = s; JSON_ALPHANUMERIC_CHAR(*p); p++)
  		error = true;
  	lex->prev_token_terminator = lex->token_terminator;
  	lex->token_terminator = p;
--- 762,768 ----
  	 * here should be considered part of the token for error-reporting
  	 * purposes.
  	 */
! 	for (p = s; JSON_ALPHANUMERIC_CHAR(*p) && len < lex->input_length; p++, len++)
  		error = true;
  	lex->prev_token_terminator = lex->token_terminator;
  	lex->token_terminator = p;
*** a/src/include/utils/jsonapi.h
--- b/src/include/utils/jsonapi.h
***************
*** 36,41 **** typedef enum
--- 36,42 ----
  typedef struct JsonLexContext
  {
  	char	   *input;
+ 	int			input_length;
  	char	   *token_start;
  	char	   *token_terminator;
  	char	   *prev_token_terminator;
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to