Re: [HACKERS] json api WIP patch

Andrew Dunstan Tue, 08 Jan 2013 12:22:54 -0800


On 01/08/2013 03:12 PM, Andrew Dunstan wrote:

On 01/08/2013 09:58 AM, Andrew Dunstan wrote:
If you have such a datum, parsing it involves having it in memory and then taking a copy (I wonder if we could avoid that step - will take a look).
Here is a Proof Of Concept patch against my development tip on what's involved in getting the JSON lexer not to need a nul-terminated string to parse. This passes regression, incidentally. The downside is that processing is very slightly more complex, and that json_in() would need to call strlen() on its input. The upside would be that the processing routines I've been working on would no longer need to create copies of their json arguments using text_to_cstring() just so they can get a null-terminated string to process.
Consequent changes would modify the signature of makeJsonLexContext() so its first argument would be a text* instead of a char* (and of course its logic would change accordingly).
I could go either way. Thoughts?


this time with patch ...

*** a/src/backend/utils/adt/json.c
--- b/src/backend/utils/adt/json.c
***************
*** 212,217 **** makeJsonLexContext(char *json, bool need_escapes)
--- 212,218 ----
  
  	lex->input = lex->token_terminator = lex->line_start = json;
  	lex->line_number = 1;
+ 	lex->input_length = strlen(json);
  	if (need_escapes)
  		lex->strval = makeStringInfo();
  	return lex;
***************
*** 398,416 **** static void
  json_lex(JsonLexContext *lex)
  {
  	char	   *s;
! 
  	/* Skip leading whitespace. */
  	s = lex->token_terminator;
! 	while (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r')
  	{
  		if (*s == '\n')
  			++lex->line_number;
  		++s;
  	}
  	lex->token_start = s;
  
  	/* Determine token type. */
! 	if (*s == '\0')
  	{
  		lex->token_start = NULL;
  		lex->prev_token_terminator = lex->token_terminator;
--- 399,420 ----
  json_lex(JsonLexContext *lex)
  {
  	char	   *s;
! 	int         len;
  	/* Skip leading whitespace. */
  	s = lex->token_terminator;
! 	len = s - lex->input;
! 	while (len < lex->input_length &&
! 		   (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r'))
  	{
  		if (*s == '\n')
  			++lex->line_number;
  		++s;
+ 		++len;
  	}
  	lex->token_start = s;
  
  	/* Determine token type. */
! 	if (len >= lex->input_length)
  	{
  		lex->token_start = NULL;
  		lex->prev_token_terminator = lex->token_terminator;
***************
*** 476,482 **** json_lex(JsonLexContext *lex)
  		 * whole word as an unexpected token, rather than just some
  		 * unintuitive prefix thereof.
  		 */
! 		for (p = s; JSON_ALPHANUMERIC_CHAR(*p); p++)
  			 /* skip */ ;
  
  		/*
--- 480,486 ----
  		 * whole word as an unexpected token, rather than just some
  		 * unintuitive prefix thereof.
  		 */
! 		for (p = s; JSON_ALPHANUMERIC_CHAR(*p) && p - s < lex->input_length - len; p++)
  			 /* skip */ ;
  
  		/*
***************
*** 519,539 **** static void
  json_lex_string(JsonLexContext *lex)
  {
  	char	   *s;
! 
  	if (lex->strval != NULL)
  		resetStringInfo(lex->strval);
  
! 	for (s = lex->token_start + 1; *s != '"'; s++)
  	{
! 		/* Per RFC4627, these characters MUST be escaped. */
! 		if ((unsigned char) *s < 32)
  		{
! 			/* A NUL byte marks the (premature) end of the string. */
! 			if (*s == '\0')
! 			{
! 				lex->token_terminator = s;
! 				report_invalid_token(lex);
! 			}
  			/* Since *s isn't printable, exclude it from the context string */
  			lex->token_terminator = s;
  			ereport(ERROR,
--- 523,545 ----
  json_lex_string(JsonLexContext *lex)
  {
  	char	   *s;
! 	int         len;
  	if (lex->strval != NULL)
  		resetStringInfo(lex->strval);
  
! 	len = lex->token_start - lex->input;
! 	len++;
! 	for (s = lex->token_start + 1; *s != '"'; s++, len++)
  	{
! 		/* Premature end of the string. */
! 		if (len >= lex->input_length)
  		{
! 			lex->token_terminator = s;
! 			report_invalid_token(lex);
! 		}
! 		else if ((unsigned char) *s < 32)
! 		{
! 			/* Per RFC4627, these characters MUST be escaped. */
  			/* Since *s isn't printable, exclude it from the context string */
  			lex->token_terminator = s;
  			ereport(ERROR,
***************
*** 547,553 **** json_lex_string(JsonLexContext *lex)
  		{
  			/* OK, we have an escape character. */
  			s++;
! 			if (*s == '\0')
  			{
  				lex->token_terminator = s;
  				report_invalid_token(lex);
--- 553,560 ----
  		{
  			/* OK, we have an escape character. */
  			s++;
! 			len++;
! 			if (len >= lex->input_length)
  			{
  				lex->token_terminator = s;
  				report_invalid_token(lex);
***************
*** 560,566 **** json_lex_string(JsonLexContext *lex)
  				for (i = 1; i <= 4; i++)
  				{
  					s++;
! 					if (*s == '\0')
  					{
  						lex->token_terminator = s;
  						report_invalid_token(lex);
--- 567,574 ----
  				for (i = 1; i <= 4; i++)
  				{
  					s++;
! 					len++;
! 					if (len >= lex->input_length)
  					{
  						lex->token_terminator = s;
  						report_invalid_token(lex);
***************
*** 690,696 **** json_lex_number(JsonLexContext *lex, char *s)
--- 698,706 ----
  {
  	bool		error = false;
  	char	   *p;
+ 	int         len;
  
+ 	len = s - lex->input;
  	/* Part (1): leading sign indicator. */
  	/* Caller already did this for us; so do nothing. */
  
***************
*** 702,741 **** json_lex_number(JsonLexContext *lex, char *s)
  		do
  		{
  			s++;
! 		} while (*s >= '0' && *s <= '9');
  	}
  	else
  		error = true;
  
  	/* Part (3): parse optional decimal portion. */
! 	if (*s == '.')
  	{
  		s++;
! 		if (*s < '0' || *s > '9')
  			error = true;
  		else
  		{
  			do
  			{
  				s++;
! 			} while (*s >= '0' && *s <= '9');
  		}
  	}
  
  	/* Part (4): parse optional exponent. */
! 	if (*s == 'e' || *s == 'E')
  	{
  		s++;
! 		if (*s == '+' || *s == '-')
  			s++;
! 		if (*s < '0' || *s > '9')
  			error = true;
  		else
  		{
  			do
  			{
  				s++;
! 			} while (*s >= '0' && *s <= '9');
  		}
  	}
  
--- 712,759 ----
  		do
  		{
  			s++;
! 			len++;
! 		} while (*s >= '0' && *s <= '9' && len < lex->input_length);
  	}
  	else
  		error = true;
  
  	/* Part (3): parse optional decimal portion. */
! 	if (len < lex->input_length && *s == '.')
  	{
  		s++;
! 		len++;
! 		if (len == lex->input_length || *s < '0' || *s > '9')
  			error = true;
  		else
  		{
  			do
  			{
  				s++;
! 				len++;
! 			} while (*s >= '0' && *s <= '9' && len < lex->input_length);
  		}
  	}
  
  	/* Part (4): parse optional exponent. */
! 	if (len < lex->input_length && (*s == 'e' || *s == 'E'))
  	{
  		s++;
! 		len++;
! 		if (len < lex->input_length && (*s == '+' || *s == '-'))
! 		{
  			s++;
! 			len++;
! 		}
! 		if (len == lex->input_length || *s < '0' || *s > '9')
  			error = true;
  		else
  		{
  			do
  			{
  				s++;
! 				len++;
! 			} while (len < lex->input_length && *s >= '0' && *s <= '9');
  		}
  	}
  
***************
*** 744,750 **** json_lex_number(JsonLexContext *lex, char *s)
  	 * here should be considered part of the token for error-reporting
  	 * purposes.
  	 */
! 	for (p = s; JSON_ALPHANUMERIC_CHAR(*p); p++)
  		error = true;
  	lex->prev_token_terminator = lex->token_terminator;
  	lex->token_terminator = p;
--- 762,768 ----
  	 * here should be considered part of the token for error-reporting
  	 * purposes.
  	 */
! 	for (p = s; JSON_ALPHANUMERIC_CHAR(*p) && len < lex->input_length; p++, len++)
  		error = true;
  	lex->prev_token_terminator = lex->token_terminator;
  	lex->token_terminator = p;
*** a/src/include/utils/jsonapi.h
--- b/src/include/utils/jsonapi.h
***************
*** 36,41 **** typedef enum
--- 36,42 ----
  typedef struct JsonLexContext
  {
  	char	   *input;
+ 	int			input_length;
  	char	   *token_start;
  	char	   *token_terminator;
  	char	   *prev_token_terminator;

-- 
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] json api WIP patch

Reply via email to