Re: Consider \v to the list of whitespace characters in the parser

Michael Paquier Mon, 03 Jul 2023 17:01:11 -0700

On Mon, Jul 03, 2023 at 12:17:10PM +0200, Peter Eisentraut wrote:
> SQL has "whitespace", which includes any Unicode character with the
> White_Space property (which includes \v), and <newline>, which is
> implementation-defined.
> 
> So nothing there speaks against treating \v as a (white)space character in
> the SQL scanner.


Okay, thanks for confirming.  

> In scan.l, you might want to ponder horiz_space: Even though \v is clearly
> not "horizontal space", horiz_space already includes \f, which is also not
> horizontal IMO.  I think horiz_space is really all space characters except
> newline characters.  Maybe this should be rephrased.

And a few lines above, there is a comment from 2000 (3cfdd8f)
pondering if \f should be handled as a newline, which is kind of
incorrect anyway?

FWIW, I agree that horiz_space is confusing in this context because it
does not completely reflect reality, and \v is not horizontal space
either, so adding it to the existing list felt wrong to me.  Form feed
is also not a newline, from what I understand.  Judging from the parser,
there are two things we want to track to handle comments:
- All space characters, which would be ' ' plus \t\n\r\f\v.
- All space characters that are not newlines, i.e. ' ' plus \t\f\v.

I don't really have a better idea this morning than using the
following terms in the parser, changing the surroundings with similar
terms:
-space          [ \t\n\r\f]
-horiz_space        [ \t\f]
+space          [ \t\n\r\f\v]
+non_newline_space      [ \t\f\v]

Perhaps somebody has a better idea for how to name this split?
--
Michael

diff --git a/src/backend/parser/parse_type.c b/src/backend/parser/parse_type.c
index be75dc6ab0..63b4e96962 100644
--- a/src/backend/parser/parse_type.c
+++ b/src/backend/parser/parse_type.c
@@ -742,7 +742,7 @@ typeStringToTypeName(const char *str, Node *escontext)
 	ErrorContextCallback ptserrcontext;
 
 	/* make sure we give useful error for empty input */
-	if (strspn(str, " \t\n\r\f") == strlen(str))
+	if (strspn(str, " \t\n\r\f\v") == strlen(str))
 		goto fail;
 
 	/*
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index b2216a9eac..0708ba6540 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -213,16 +213,16 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner);
  * versions of Postgres failed to recognize -- as a comment if the input
  * did not end with a newline.
  *
- * XXX perhaps \f (formfeed) should be treated as a newline as well?
+ * non_newline_space tracks all the other space characters except newlines.
  *
  * XXX if you change the set of whitespace characters, fix scanner_isspace()
  * to agree.
  */
 
-space			[ \t\n\r\f]
-horiz_space		[ \t\f]
-newline			[\n\r]
-non_newline		[^\n\r]
+space				[ \t\n\r\f\v]
+non_newline_space	[ \t\f\v]
+newline				[\n\r]
+non_newline			[^\n\r]
 
 comment			("--"{non_newline}*)
 
@@ -236,8 +236,8 @@ whitespace		({space}+|{comment})
  */
 
 special_whitespace		({space}+|{comment}{newline})
-horiz_whitespace		({horiz_space}|{comment})
-whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)
+non_newline_whitespace	({non_newline_space}|{comment})
+whitespace_with_newline	({non_newline_whitespace}*{newline}{special_whitespace}*)
 
 quote			'
 /* If we see {quote} then {quotecontinue}, the quoted string continues */
@@ -1414,6 +1414,8 @@ unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
 			return '\r';
 		case 't':
 			return '\t';
+		case 'v':
+			return '\v';
 		default:
 			/* check for backslash followed by non-7-bit-ASCII */
 			if (c == '\0' || IS_HIGHBIT_SET(c))
diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
index ed67f5f5fe..4f0005a114 100644
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -121,6 +121,7 @@ scanner_isspace(char ch)
 		ch == '\t' ||
 		ch == '\n' ||
 		ch == '\r' ||
+		ch == '\v' ||
 		ch == '\f')
 		return true;
 	return false;
diff --git a/src/backend/replication/repl_scanner.l b/src/backend/replication/repl_scanner.l
index cb467ca46f..1cc7fb858c 100644
--- a/src/backend/replication/repl_scanner.l
+++ b/src/backend/replication/repl_scanner.l
@@ -73,7 +73,7 @@ static void addlitchar(unsigned char ychar);
 %x xd
 %x xq
 
-space			[ \t\n\r\f]
+space			[ \t\n\r\f\v]
 
 quote			'
 quotestop		{quote}
diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c
index 9000f83a83..4359dbd83d 100644
--- a/src/backend/utils/adt/arrayfuncs.c
+++ b/src/backend/utils/adt/arrayfuncs.c
@@ -24,6 +24,7 @@
 #include "nodes/nodeFuncs.h"
 #include "nodes/supportnodes.h"
 #include "optimizer/optimizer.h"
+#include "parser/scansup.h"
 #include "port/pg_bitutils.h"
 #include "utils/array.h"
 #include "utils/arrayaccess.h"
@@ -89,7 +90,6 @@ typedef struct ArrayIteratorData
 	int			current_item;	/* the item # we're at in the array */
 }			ArrayIteratorData;
 
-static bool array_isspace(char ch);
 static int	ArrayCount(const char *str, int *dim, char typdelim,
 					   Node *escontext);
 static bool ReadArrayStr(char *arrayStr, const char *origStr,
@@ -254,7 +254,7 @@ array_in(PG_FUNCTION_ARGS)
 		 * Note: we currently allow whitespace between, but not within,
 		 * dimension items.
 		 */
-		while (array_isspace(*p))
+		while (scanner_isspace(*p))
 			p++;
 		if (*p != '[')
 			break;				/* no more dimension items */
@@ -338,7 +338,7 @@ array_in(PG_FUNCTION_ARGS)
 					 errdetail("Missing \"%s\" after array dimensions.",
 							   ASSGN)));
 		p += strlen(ASSGN);
-		while (array_isspace(*p))
+		while (scanner_isspace(*p))
 			p++;
 
 		/*
@@ -434,27 +434,6 @@ array_in(PG_FUNCTION_ARGS)
 	PG_RETURN_ARRAYTYPE_P(retval);
 }
 
-/*
- * array_isspace() --- a non-locale-dependent isspace()
- *
- * We used to use isspace() for parsing array values, but that has
- * undesirable results: an array value might be silently interpreted
- * differently depending on the locale setting.  Now we just hard-wire
- * the traditional ASCII definition of isspace().
- */
-static bool
-array_isspace(char ch)
-{
-	if (ch == ' ' ||
-		ch == '\t' ||
-		ch == '\n' ||
-		ch == '\r' ||
-		ch == '\v' ||
-		ch == '\f')
-		return true;
-	return false;
-}
-
 /*
  * ArrayCount
  *	 Determines the dimensions for an array string.
@@ -654,7 +633,7 @@ ArrayCount(const char *str, int *dim, char typdelim, Node *escontext)
 							itemdone = true;
 							nelems[nest_level - 1]++;
 						}
-						else if (!array_isspace(*ptr))
+						else if (!scanner_isspace(*ptr))
 						{
 							/*
 							 * Other non-space characters must be after a
@@ -684,7 +663,7 @@ ArrayCount(const char *str, int *dim, char typdelim, Node *escontext)
 	/* only whitespace is allowed after the closing brace */
 	while (*ptr)
 	{
-		if (!array_isspace(*ptr++))
+		if (!scanner_isspace(*ptr++))
 			ereturn(escontext, -1,
 					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 					 errmsg("malformed array literal: \"%s\"", str),
@@ -884,7 +863,7 @@ ReadArrayStr(char *arrayStr,
 						indx[ndim - 1]++;
 						srcptr++;
 					}
-					else if (array_isspace(*srcptr))
+					else if (scanner_isspace(*srcptr))
 					{
 						/*
 						 * If leading space, drop it immediately.  Else, copy
@@ -1176,7 +1155,7 @@ array_out(PG_FUNCTION_ARGS)
 					overall_length += 1;
 				}
 				else if (ch == '{' || ch == '}' || ch == typdelim ||
-						 array_isspace(ch))
+						 scanner_isspace(ch))
 					needquote = true;
 			}
 		}
diff --git a/src/fe_utils/psqlscan.l b/src/fe_utils/psqlscan.l
index 84754aca4a..5dc6fc2fb9 100644
--- a/src/fe_utils/psqlscan.l
+++ b/src/fe_utils/psqlscan.l
@@ -149,16 +149,16 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner);
  * versions of Postgres failed to recognize -- as a comment if the input
  * did not end with a newline.
  *
- * XXX perhaps \f (formfeed) should be treated as a newline as well?
+ * non_newline_space tracks all space characters except newlines.
  *
  * XXX if you change the set of whitespace characters, fix scanner_isspace()
  * to agree.
  */
 
-space			[ \t\n\r\f]
-horiz_space		[ \t\f]
-newline			[\n\r]
-non_newline		[^\n\r]
+space				[ \t\n\r\f\v]
+non_newline_space	[ \t\f\v]
+newline				[\n\r]
+non_newline			[^\n\r]
 
 comment			("--"{non_newline}*)
 
@@ -172,8 +172,8 @@ whitespace		({space}+|{comment})
  */
 
 special_whitespace		({space}+|{comment}{newline})
-horiz_whitespace		({horiz_space}|{comment})
-whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)
+non_newline_whitespace	({non_newline_space}|{comment})
+whitespace_with_newline	({non_newline_whitespace}*{newline}{special_whitespace}*)
 
 quote			'
 /* If we see {quote} then {quotecontinue}, the quoted string continues */
diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c
index 0429a72bfe..58b21c4d6a 100644
--- a/src/fe_utils/string_utils.c
+++ b/src/fe_utils/string_utils.c
@@ -761,7 +761,7 @@ appendPGArray(PQExpBuffer buffer, const char *value)
 
 			if (ch == '"' || ch == '\\' ||
 				ch == '{' || ch == '}' || ch == ',' ||
-			/* these match array_isspace(): */
+			/* these match scanner_isspace(): */
 				ch == ' ' || ch == '\t' || ch == '\n' ||
 				ch == '\r' || ch == '\v' || ch == '\f')
 			{
diff --git a/src/bin/psql/psqlscanslash.l b/src/bin/psql/psqlscanslash.l
index 5c020f30b9..1461fa3d3e 100644
--- a/src/bin/psql/psqlscanslash.l
+++ b/src/bin/psql/psqlscanslash.l
@@ -108,7 +108,7 @@ extern void slash_yyset_column(int column_no, yyscan_t yyscanner);
 /*
  * Assorted character class definitions that should match psqlscan.l.
  */
-space			[ \t\n\r\f]
+space			[ \t\n\r\f\v]
 quote			'
 xeoctesc		[\\][0-7]{1,3}
 xehexesc		[\\]x[0-9A-Fa-f]{1,2}
diff --git a/contrib/cube/cubescan.l b/contrib/cube/cubescan.l
index 49cb699216..a30fbfc311 100644
--- a/contrib/cube/cubescan.l
+++ b/contrib/cube/cubescan.l
@@ -63,7 +63,7 @@ NaN          [nN][aA][nN]
 \(           cube_yylval = "("; return O_PAREN;
 \)           cube_yylval = ")"; return C_PAREN;
 \,           cube_yylval = ","; return COMMA;
-[ \t\n\r\f]+ /* discard spaces */
+[ \t\n\r\f\v]+ /* discard spaces */
 .            return yytext[0]; /* alert parser of the garbage */
 
 %%
diff --git a/contrib/hstore/expected/hstore_utf8.out b/contrib/hstore/expected/hstore_utf8.out
index 4405824413..bbc885a181 100644
--- a/contrib/hstore/expected/hstore_utf8.out
+++ b/contrib/hstore/expected/hstore_utf8.out
@@ -34,3 +34,34 @@ SELECT 'keyąfoo=>valueą'::hstore;
  "keyąfoo"=>"valueą"
 (1 row)
 
+-- More patterns that may depend on isspace() and locales, all discarded.
+SELECT E'key\u000A=>value\u000A'::hstore; -- \n
+     hstore     
+----------------
+ "key"=>"value"
+(1 row)
+
+SELECT E'key\u0009=>value\u0009'::hstore; -- \t
+     hstore     
+----------------
+ "key"=>"value"
+(1 row)
+
+SELECT E'key\u000D=>value\u000D'::hstore; -- \r
+     hstore     
+----------------
+ "key"=>"value"
+(1 row)
+
+SELECT E'key\u000B=>value\u000B'::hstore; -- \v
+     hstore     
+----------------
+ "key"=>"value"
+(1 row)
+
+SELECT E'key\u000C=>value\u000C'::hstore; -- \f
+     hstore     
+----------------
+ "key"=>"value"
+(1 row)
+
diff --git a/contrib/hstore/sql/hstore_utf8.sql b/contrib/hstore/sql/hstore_utf8.sql
index face878324..38c9481ee6 100644
--- a/contrib/hstore/sql/hstore_utf8.sql
+++ b/contrib/hstore/sql/hstore_utf8.sql
@@ -17,3 +17,10 @@ SELECT E'key\u0105=>value\u0105'::hstore;
 SELECT 'keyą=>valueą'::hstore;
 SELECT 'ą=>ą'::hstore;
 SELECT 'keyąfoo=>valueą'::hstore;
+
+-- More patterns that may depend on isspace() and locales, all discarded.
+SELECT E'key\u000A=>value\u000A'::hstore; -- \n
+SELECT E'key\u0009=>value\u0009'::hstore; -- \t
+SELECT E'key\u000D=>value\u000D'::hstore; -- \r
+SELECT E'key\u000B=>value\u000B'::hstore; -- \v
+SELECT E'key\u000C=>value\u000C'::hstore; -- \f
diff --git a/contrib/seg/segscan.l b/contrib/seg/segscan.l
index a1e9e9937e..4ad529eccc 100644
--- a/contrib/seg/segscan.l
+++ b/contrib/seg/segscan.l
@@ -59,7 +59,7 @@ float        ({integer}|{real})([eE]{integer})?
 \<           seg_yylval.text = "<"; return EXTENSION;
 \>           seg_yylval.text = ">"; return EXTENSION;
 \~           seg_yylval.text = "~"; return EXTENSION;
-[ \t\n\r\f]+ /* discard spaces */
+[ \t\n\r\f\v]+ /* discard spaces */
 .            return yytext[0]; /* alert parser of the garbage */
 
 %%
diff --git a/src/interfaces/ecpg/preproc/pgc.l b/src/interfaces/ecpg/preproc/pgc.l
index dcd567e8c3..77bdf4f82f 100644
--- a/src/interfaces/ecpg/preproc/pgc.l
+++ b/src/interfaces/ecpg/preproc/pgc.l
@@ -180,16 +180,16 @@ static struct _if_value
  * versions of Postgres failed to recognize -- as a comment if the input
  * did not end with a newline.
  *
- * XXX perhaps \f (formfeed) should be treated as a newline as well?
+ * non_newline_space tracks all space characters except newlines.
  *
  * XXX if you change the set of whitespace characters, fix ecpg_isspace()
  * to agree.
  */
 
-space			[ \t\n\r\f]
-horiz_space		[ \t\f]
-newline			[\n\r]
-non_newline		[^\n\r]
+space				[ \t\n\r\f\v]
+non_newline_space	[ \t\f\v]
+newline				[\n\r]
+non_newline			[^\n\r]
 
 comment			("--"{non_newline}*)
 
@@ -202,8 +202,8 @@ whitespace		({space}+|{comment})
  * it, whereas {whitespace} should generally have a * after it...
  */
 
-horiz_whitespace		({horiz_space}|{comment})
-whitespace_with_newline	({horiz_whitespace}*{newline}{whitespace}*)
+non_newline_whitespace	({non_newline_space}|{comment})
+whitespace_with_newline	({non_newline_whitespace}*{newline}{whitespace}*)
 
 quote			'
 /* If we see {quote} then {quotecontinue}, the quoted string continues */
@@ -1721,7 +1721,8 @@ ecpg_isspace(char ch)
 		ch == '\t' ||
 		ch == '\n' ||
 		ch == '\r' ||
-		ch == '\f')
+		ch == '\f' ||
+		ch == '\v')
 		return true;
 	return false;
 }

signature.asc
Description: PGP signature

Re: Consider \v to the list of whitespace characters in the parser

Reply via email to