Handle \v as a whitespace character in parsers

This commit comes as a continuation of the discussion that has led to d522b05, as \v was handled inconsistently when parsing array values or anything going through the parsers, and changing a parser behavior in stable branches is a scary thing to do. The parsing of array values now uses the more central scanner_isspace() and array_isspace() is removed. As pointing out by Peter Eisentraut, fix a confusing reference to horizontal space in the parsers with the term "horiz_space". \f was included in this set since 3cfdd8f from 2000, but it is not horizontal. "horiz_space" is renamed to "non_newline_space", to refer to all whitespace characters except newlines. The changes impact the parsers for the backend, psql, seg, cube, ecpg and replication commands. Note that JSON should not escape \v, as per RFC 7159, so these are not touched. Reviewed-by: Peter Eisentraut, Tom Lane Discussion: https://postgr.es/m/ZJKcjNwWHHvw9ksQ@paquier.xyz
2023-07-06 08:16:24 +09:00 · 2023-07-06 08:16:24 +09:00 · ae6d06f096
commit ae6d06f096
parent 4f4d73466d
13 changed files with 77 additions and 56 deletions
--- a/contrib/cube/cubescan.l
+++ b/contrib/cube/cubescan.l
@ -63,7 +63,7 @@ NaN          [nN][aA][nN]
 \(           cube_yylval = "("; return O_PAREN;
 \)           cube_yylval = ")"; return C_PAREN;
 \,           cube_yylval = ","; return COMMA;
-[ \t\n\r\f]+ /* discard spaces */
+[ \t\n\r\f\v]+ /* discard spaces */
 .            return yytext[0]; /* alert parser of the garbage */

 %%
--- a/contrib/hstore/expected/hstore_utf8.out
+++ b/contrib/hstore/expected/hstore_utf8.out
@ -34,3 +34,34 @@ SELECT 'keyąfoo=>valueą'::hstore;
 "keyąfoo"=>"valueą"
 (1 row)

+-- More patterns that may depend on isspace() and locales, all discarded.
+SELECT E'key\u000A=>value\u000A'::hstore; -- \n
+     hstore     
+----------------
+ "key"=>"value"
+(1 row)
+
+SELECT E'key\u0009=>value\u0009'::hstore; -- \t
+     hstore     
+----------------
+ "key"=>"value"
+(1 row)
+
+SELECT E'key\u000D=>value\u000D'::hstore; -- \r
+     hstore     
+----------------
+ "key"=>"value"
+(1 row)
+
+SELECT E'key\u000B=>value\u000B'::hstore; -- \v
+     hstore     
+----------------
+ "key"=>"value"
+(1 row)
+
+SELECT E'key\u000C=>value\u000C'::hstore; -- \f
+     hstore     
+----------------
+ "key"=>"value"
+(1 row)
+
--- a/contrib/hstore/sql/hstore_utf8.sql
+++ b/contrib/hstore/sql/hstore_utf8.sql
@ -17,3 +17,10 @@ SELECT E'key\u0105=>value\u0105'::hstore;
 SELECT 'keyą=>valueą'::hstore;
 SELECT 'ą=>ą'::hstore;
 SELECT 'keyąfoo=>valueą'::hstore;
+
+-- More patterns that may depend on isspace() and locales, all discarded.
+SELECT E'key\u000A=>value\u000A'::hstore; -- \n
+SELECT E'key\u0009=>value\u0009'::hstore; -- \t
+SELECT E'key\u000D=>value\u000D'::hstore; -- \r
+SELECT E'key\u000B=>value\u000B'::hstore; -- \v
+SELECT E'key\u000C=>value\u000C'::hstore; -- \f
--- a/contrib/seg/segscan.l
+++ b/contrib/seg/segscan.l
@ -59,7 +59,7 @@ float        ({integer}|{real})([eE]{integer})?
 \<           seg_yylval.text = "<"; return EXTENSION;
 \>           seg_yylval.text = ">"; return EXTENSION;
 \~           seg_yylval.text = "~"; return EXTENSION;
-[ \t\n\r\f]+ /* discard spaces */
+[ \t\n\r\f\v]+ /* discard spaces */
 .            return yytext[0]; /* alert parser of the garbage */

 %%
--- a/src/backend/parser/parse_type.c
+++ b/src/backend/parser/parse_type.c
@ -742,7 +742,7 @@ typeStringToTypeName(const char *str, Node *escontext)
 	ErrorContextCallback ptserrcontext;

 	/* make sure we give useful error for empty input */
-	if (strspn(str, " \t\n\r\f") == strlen(str))
+	if (strspn(str, " \t\n\r\f\v") == strlen(str))
 		goto fail;

 	/*
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@ -213,16 +213,16 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner);
 * versions of Postgres failed to recognize -- as a comment if the input
 * did not end with a newline.
 *
- * XXX perhaps \f (formfeed) should be treated as a newline as well?
+ * non_newline_space tracks all the other space characters except newlines.
 *
 * XXX if you change the set of whitespace characters, fix scanner_isspace()
 * to agree.
 */

-space			[ \t\n\r\f]
-horiz_space		[ \t\f]
-newline			[\n\r]
-non_newline		[^\n\r]
+space				[ \t\n\r\f\v]
+non_newline_space	[ \t\f\v]
+newline				[\n\r]
+non_newline			[^\n\r]

 comment			("--"{non_newline}*)

@ -236,8 +236,8 @@ whitespace		({space}+|{comment})
 */

 special_whitespace		({space}+|{comment}{newline})
-horiz_whitespace		({horiz_space}|{comment})
-whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)
+non_newline_whitespace	({non_newline_space}|{comment})
+whitespace_with_newline	({non_newline_whitespace}*{newline}{special_whitespace}*)

 quote			'
 /* If we see {quote} then {quotecontinue}, the quoted string continues */
@ -1414,6 +1414,8 @@ unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
 			return '\r';
 		case 't':
 			return '\t';
+		case 'v':
+			return '\v';
 		default:
 			/* check for backslash followed by non-7-bit-ASCII */
 			if (c == '\0' || IS_HIGHBIT_SET(c))
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@ -121,6 +121,7 @@ scanner_isspace(char ch)
 		ch == '\t' ||
 		ch == '\n' ||
 		ch == '\r' ||
+		ch == '\v' ||
 		ch == '\f')
 		return true;
 	return false;
--- a/src/backend/replication/repl_scanner.l
+++ b/src/backend/replication/repl_scanner.l
@ -73,7 +73,7 @@ static void addlitchar(unsigned char ychar);
 %x xd
 %x xq

-space			[ \t\n\r\f]
+space			[ \t\n\r\f\v]

 quote			'
 quotestop		{quote}
--- a/src/backend/utils/adt/arrayfuncs.c
+++ b/src/backend/utils/adt/arrayfuncs.c
@ -24,6 +24,7 @@
 #include "nodes/nodeFuncs.h"
 #include "nodes/supportnodes.h"
 #include "optimizer/optimizer.h"
+#include "parser/scansup.h"
 #include "port/pg_bitutils.h"
 #include "utils/array.h"
 #include "utils/arrayaccess.h"
@ -89,7 +90,6 @@ typedef struct ArrayIteratorData
 	int			current_item;	/* the item # we're at in the array */
 }			ArrayIteratorData;

-static bool array_isspace(char ch);
 static int	ArrayCount(const char *str, int *dim, char typdelim,
 					   Node *escontext);
 static bool ReadArrayStr(char *arrayStr, const char *origStr,
@ -254,7 +254,7 @@ array_in(PG_FUNCTION_ARGS)
 		 * Note: we currently allow whitespace between, but not within,
 		 * dimension items.
 		 */
-		while (array_isspace(*p))
+		while (scanner_isspace(*p))
 			p++;
 		if (*p != '[')
 			break;				/* no more dimension items */
@ -338,7 +338,7 @@ array_in(PG_FUNCTION_ARGS)
 					 errdetail("Missing \"%s\" after array dimensions.",
 							   ASSGN)));
 		p += strlen(ASSGN);
-		while (array_isspace(*p))
+		while (scanner_isspace(*p))
 			p++;

 		/*
@ -434,27 +434,6 @@ array_in(PG_FUNCTION_ARGS)
 	PG_RETURN_ARRAYTYPE_P(retval);
 }

-/*
- * array_isspace() --- a non-locale-dependent isspace()
- *
- * We used to use isspace() for parsing array values, but that has
- * undesirable results: an array value might be silently interpreted
- * differently depending on the locale setting.  Now we just hard-wire
- * the traditional ASCII definition of isspace().
- */
-static bool
-array_isspace(char ch)
-{
-	if (ch == ' ' ||
-		ch == '\t' ||
-		ch == '\n' ||
-		ch == '\r' ||
-		ch == '\v' ||
-		ch == '\f')
-		return true;
-	return false;
-}
-
 /*
 * ArrayCount
 *	 Determines the dimensions for an array string.
@ -654,7 +633,7 @@ ArrayCount(const char *str, int *dim, char typdelim, Node *escontext)
 							itemdone = true;
 							nelems[nest_level - 1]++;
 						}
-						else if (!array_isspace(*ptr))
+						else if (!scanner_isspace(*ptr))
 						{
 							/*
 							 * Other non-space characters must be after a
@ -684,7 +663,7 @@ ArrayCount(const char *str, int *dim, char typdelim, Node *escontext)
 	/* only whitespace is allowed after the closing brace */
 	while (*ptr)
 	{
-		if (!array_isspace(*ptr++))
+		if (!scanner_isspace(*ptr++))
 			ereturn(escontext, -1,
 					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 					 errmsg("malformed array literal: \"%s\"", str),
@ -884,7 +863,7 @@ ReadArrayStr(char *arrayStr,
 						indx[ndim - 1]++;
 						srcptr++;
 					}
-					else if (array_isspace(*srcptr))
+					else if (scanner_isspace(*srcptr))
 					{
 						/*
 						 * If leading space, drop it immediately.  Else, copy
@ -1176,7 +1155,7 @@ array_out(PG_FUNCTION_ARGS)
 					overall_length += 1;
 				}
 				else if (ch == '{' || ch == '}' || ch == typdelim ||
-						 array_isspace(ch))
+						 scanner_isspace(ch))
 					needquote = true;
 			}
 		}
--- a/src/bin/psql/psqlscanslash.l
+++ b/src/bin/psql/psqlscanslash.l
@ -108,7 +108,7 @@ extern void slash_yyset_column(int column_no, yyscan_t yyscanner);
 /*
 * Assorted character class definitions that should match psqlscan.l.
 */
-space			[ \t\n\r\f]
+space			[ \t\n\r\f\v]
 quote			'
 xeoctesc		[\\][0-7]{1,3}
 xehexesc		[\\]x[0-9A-Fa-f]{1,2}
--- a/src/fe_utils/psqlscan.l
+++ b/src/fe_utils/psqlscan.l
@ -149,16 +149,16 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner);
 * versions of Postgres failed to recognize -- as a comment if the input
 * did not end with a newline.
 *
- * XXX perhaps \f (formfeed) should be treated as a newline as well?
+ * non_newline_space tracks all space characters except newlines.
 *
 * XXX if you change the set of whitespace characters, fix scanner_isspace()
 * to agree.
 */

-space			[ \t\n\r\f]
-horiz_space		[ \t\f]
-newline			[\n\r]
-non_newline		[^\n\r]
+space				[ \t\n\r\f\v]
+non_newline_space	[ \t\f\v]
+newline				[\n\r]
+non_newline			[^\n\r]

 comment			("--"{non_newline}*)

@ -172,8 +172,8 @@ whitespace		({space}+|{comment})
 */

 special_whitespace		({space}+|{comment}{newline})
-horiz_whitespace		({horiz_space}|{comment})
-whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)
+non_newline_whitespace	({non_newline_space}|{comment})
+whitespace_with_newline	({non_newline_whitespace}*{newline}{special_whitespace}*)

 quote			'
 /* If we see {quote} then {quotecontinue}, the quoted string continues */
--- a/src/fe_utils/string_utils.c
+++ b/src/fe_utils/string_utils.c
@ -761,7 +761,7 @@ appendPGArray(PQExpBuffer buffer, const char *value)

 			if (ch == '"' || ch == '\\' ||
 				ch == '{' || ch == '}' || ch == ',' ||
-			/* these match array_isspace(): */
+			/* these match scanner_isspace(): */
 				ch == ' ' || ch == '\t' || ch == '\n' ||
 				ch == '\r' || ch == '\v' || ch == '\f')
 			{
--- a/src/interfaces/ecpg/preproc/pgc.l
+++ b/src/interfaces/ecpg/preproc/pgc.l
@ -180,16 +180,16 @@ static struct _if_value
 * versions of Postgres failed to recognize -- as a comment if the input
 * did not end with a newline.
 *
- * XXX perhaps \f (formfeed) should be treated as a newline as well?
+ * non_newline_space tracks all space characters except newlines.
 *
 * XXX if you change the set of whitespace characters, fix ecpg_isspace()
 * to agree.
 */

-space			[ \t\n\r\f]
-horiz_space		[ \t\f]
-newline			[\n\r]
-non_newline		[^\n\r]
+space				[ \t\n\r\f\v]
+non_newline_space	[ \t\f\v]
+newline				[\n\r]
+non_newline			[^\n\r]

 comment			("--"{non_newline}*)

@ -202,8 +202,8 @@ whitespace		({space}+|{comment})
 * it, whereas {whitespace} should generally have a * after it...
 */

-horiz_whitespace		({horiz_space}|{comment})
-whitespace_with_newline	({horiz_whitespace}*{newline}{whitespace}*)
+non_newline_whitespace	({non_newline_space}|{comment})
+whitespace_with_newline	({non_newline_whitespace}*{newline}{whitespace}*)

 quote			'
 /* If we see {quote} then {quotecontinue}, the quoted string continues */
@ -1721,7 +1721,8 @@ ecpg_isspace(char ch)
 		ch == '\t' ||
 		ch == '\n' ||
 		ch == '\r' ||
-		ch == '\f')
+		ch == '\f' ||
+		ch == '\v')
 		return true;
 	return false;
 }