
This reverts the following and makes some associated cleanups: commit f79b803dc: Common SQL/JSON clauses commit f4fb45d15: SQL/JSON constructors commit 5f0adec25: Make STRING an unreserved_keyword. commit 33a377608: IS JSON predicate commit 1a36bc9db: SQL/JSON query functions commit 606948b05: SQL JSON functions commit 49082c2cc: RETURNING clause for JSON() and JSON_SCALAR() commit 4e34747c8: JSON_TABLE commit fadb48b00: PLAN clauses for JSON_TABLE commit 2ef6f11b0: Reduce running time of jsonb_sqljson test commit 14d3f24fa: Further improve jsonb_sqljson parallel test commit a6baa4bad: Documentation for SQL/JSON features commit b46bcf7a4: Improve readability of SQL/JSON documentation. commit 112fdb352: Fix finalization for json_objectagg and friends commit fcdb35c32: Fix transformJsonBehavior commit 4cd8717af: Improve a couple of sql/json error messages commit f7a605f63: Small cleanups in SQL/JSON code commit 9c3d25e17: Fix JSON_OBJECTAGG uniquefying bug commit a79153b7a: Claim SQL standard compliance for SQL/JSON features commit a1e7616d6: Rework SQL/JSON documentation commit 8d9f9634e: Fix errors in copyfuncs/equalfuncs support for JSON node types. commit 3c633f32b: Only allow returning string types or bytea from json_serialize commit 67b26703b: expression eval: Fix EEOP_JSON_CONSTRUCTOR and EEOP_JSONEXPR size. The release notes are also adjusted. Backpatch to release 15. Discussion: https://postgr.es/m/40d2c882-bcac-19a9-754d-4299e1d87ac7@postgresql.org
502 lines
13 KiB
C
502 lines
13 KiB
C
/*-------------------------------------------------------------------------
 *
 * parser.c
 *		Main entry point/driver for PostgreSQL grammar
 *
 * Note that the grammar is not allowed to perform any table access
 * (since we need to be able to do basic parsing even while inside an
 * aborted transaction).  Therefore, the data structures returned by
 * the grammar are "raw" parsetrees that still need to be analyzed by
 * analyze.c and related files.
 *
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/parser/parser.c
 *
 *-------------------------------------------------------------------------
 */
|
|
|
|
#include "postgres.h"
|
|
|
|
#include "mb/pg_wchar.h"
|
|
#include "parser/gramparse.h"
|
|
#include "parser/parser.h"
|
|
#include "parser/scansup.h"
|
|
|
|
static bool check_uescapechar(unsigned char escape);
|
|
static char *str_udeescape(const char *str, char escape,
|
|
int position, core_yyscan_t yyscanner);
|
|
|
|
|
|
/*
|
|
* raw_parser
|
|
* Given a query in string form, do lexical and grammatical analysis.
|
|
*
|
|
* Returns a list of raw (un-analyzed) parse trees. The contents of the
|
|
* list have the form required by the specified RawParseMode.
|
|
*/
|
|
List *
|
|
raw_parser(const char *str, RawParseMode mode)
|
|
{
|
|
core_yyscan_t yyscanner;
|
|
base_yy_extra_type yyextra;
|
|
int yyresult;
|
|
|
|
/* initialize the flex scanner */
|
|
yyscanner = scanner_init(str, &yyextra.core_yy_extra,
|
|
&ScanKeywords, ScanKeywordTokens);
|
|
|
|
/* base_yylex() only needs us to initialize the lookahead token, if any */
|
|
if (mode == RAW_PARSE_DEFAULT)
|
|
yyextra.have_lookahead = false;
|
|
else
|
|
{
|
|
/* this array is indexed by RawParseMode enum */
|
|
static const int mode_token[] = {
|
|
0, /* RAW_PARSE_DEFAULT */
|
|
MODE_TYPE_NAME, /* RAW_PARSE_TYPE_NAME */
|
|
MODE_PLPGSQL_EXPR, /* RAW_PARSE_PLPGSQL_EXPR */
|
|
MODE_PLPGSQL_ASSIGN1, /* RAW_PARSE_PLPGSQL_ASSIGN1 */
|
|
MODE_PLPGSQL_ASSIGN2, /* RAW_PARSE_PLPGSQL_ASSIGN2 */
|
|
MODE_PLPGSQL_ASSIGN3 /* RAW_PARSE_PLPGSQL_ASSIGN3 */
|
|
};
|
|
|
|
yyextra.have_lookahead = true;
|
|
yyextra.lookahead_token = mode_token[mode];
|
|
yyextra.lookahead_yylloc = 0;
|
|
yyextra.lookahead_end = NULL;
|
|
}
|
|
|
|
/* initialize the bison parser */
|
|
parser_init(&yyextra);
|
|
|
|
/* Parse! */
|
|
yyresult = base_yyparse(yyscanner);
|
|
|
|
/* Clean up (release memory) */
|
|
scanner_finish(yyscanner);
|
|
|
|
if (yyresult) /* error */
|
|
return NIL;
|
|
|
|
return yyextra.parsetree;
|
|
}
|
|
|
|
|
|
/*
|
|
* Intermediate filter between parser and core lexer (core_yylex in scan.l).
|
|
*
|
|
* This filter is needed because in some cases the standard SQL grammar
|
|
* requires more than one token lookahead. We reduce these cases to one-token
|
|
* lookahead by replacing tokens here, in order to keep the grammar LALR(1).
|
|
*
|
|
* Using a filter is simpler than trying to recognize multiword tokens
|
|
* directly in scan.l, because we'd have to allow for comments between the
|
|
* words. Furthermore it's not clear how to do that without re-introducing
|
|
* scanner backtrack, which would cost more performance than this filter
|
|
* layer does.
|
|
*
|
|
* We also use this filter to convert UIDENT and USCONST sequences into
|
|
* plain IDENT and SCONST tokens. While that could be handled by additional
|
|
* productions in the main grammar, it's more efficient to do it like this.
|
|
*
|
|
* The filter also provides a convenient place to translate between
|
|
* the core_YYSTYPE and YYSTYPE representations (which are really the
|
|
* same thing anyway, but notationally they're different).
|
|
*/
|
|
int
|
|
base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
|
|
{
|
|
base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
|
|
int cur_token;
|
|
int next_token;
|
|
int cur_token_length;
|
|
YYLTYPE cur_yylloc;
|
|
|
|
/* Get next token --- we might already have it */
|
|
if (yyextra->have_lookahead)
|
|
{
|
|
cur_token = yyextra->lookahead_token;
|
|
lvalp->core_yystype = yyextra->lookahead_yylval;
|
|
*llocp = yyextra->lookahead_yylloc;
|
|
if (yyextra->lookahead_end)
|
|
*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
|
|
yyextra->have_lookahead = false;
|
|
}
|
|
else
|
|
cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
|
|
|
|
/*
|
|
* If this token isn't one that requires lookahead, just return it. If it
|
|
* does, determine the token length. (We could get that via strlen(), but
|
|
* since we have such a small set of possibilities, hardwiring seems
|
|
* feasible and more efficient --- at least for the fixed-length cases.)
|
|
*/
|
|
switch (cur_token)
|
|
{
|
|
case NOT:
|
|
cur_token_length = 3;
|
|
break;
|
|
case NULLS_P:
|
|
cur_token_length = 5;
|
|
break;
|
|
case WITH:
|
|
cur_token_length = 4;
|
|
break;
|
|
case UIDENT:
|
|
case USCONST:
|
|
cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
|
|
break;
|
|
default:
|
|
return cur_token;
|
|
}
|
|
|
|
/*
|
|
* Identify end+1 of current token. core_yylex() has temporarily stored a
|
|
* '\0' here, and will undo that when we call it again. We need to redo
|
|
* it to fully revert the lookahead call for error reporting purposes.
|
|
*/
|
|
yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
|
|
*llocp + cur_token_length;
|
|
Assert(*(yyextra->lookahead_end) == '\0');
|
|
|
|
/*
|
|
* Save and restore *llocp around the call. It might look like we could
|
|
* avoid this by just passing &lookahead_yylloc to core_yylex(), but that
|
|
* does not work because flex actually holds onto the last-passed pointer
|
|
* internally, and will use that for error reporting. We need any error
|
|
* reports to point to the current token, not the next one.
|
|
*/
|
|
cur_yylloc = *llocp;
|
|
|
|
/* Get next token, saving outputs into lookahead variables */
|
|
next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
|
|
yyextra->lookahead_token = next_token;
|
|
yyextra->lookahead_yylloc = *llocp;
|
|
|
|
*llocp = cur_yylloc;
|
|
|
|
/* Now revert the un-truncation of the current token */
|
|
yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
|
|
*(yyextra->lookahead_end) = '\0';
|
|
|
|
yyextra->have_lookahead = true;
|
|
|
|
/* Replace cur_token if needed, based on lookahead */
|
|
switch (cur_token)
|
|
{
|
|
case NOT:
|
|
/* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
|
|
switch (next_token)
|
|
{
|
|
case BETWEEN:
|
|
case IN_P:
|
|
case LIKE:
|
|
case ILIKE:
|
|
case SIMILAR:
|
|
cur_token = NOT_LA;
|
|
break;
|
|
}
|
|
break;
|
|
|
|
case NULLS_P:
|
|
/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
|
|
switch (next_token)
|
|
{
|
|
case FIRST_P:
|
|
case LAST_P:
|
|
cur_token = NULLS_LA;
|
|
break;
|
|
}
|
|
break;
|
|
|
|
case WITH:
|
|
/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
|
|
switch (next_token)
|
|
{
|
|
case TIME:
|
|
case ORDINALITY:
|
|
cur_token = WITH_LA;
|
|
break;
|
|
}
|
|
break;
|
|
|
|
case UIDENT:
|
|
case USCONST:
|
|
/* Look ahead for UESCAPE */
|
|
if (next_token == UESCAPE)
|
|
{
|
|
/* Yup, so get third token, which had better be SCONST */
|
|
const char *escstr;
|
|
|
|
/* Again save and restore *llocp */
|
|
cur_yylloc = *llocp;
|
|
|
|
/* Un-truncate current token so errors point to third token */
|
|
*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
|
|
|
|
/* Get third token */
|
|
next_token = core_yylex(&(yyextra->lookahead_yylval),
|
|
llocp, yyscanner);
|
|
|
|
/* If we throw error here, it will point to third token */
|
|
if (next_token != SCONST)
|
|
scanner_yyerror("UESCAPE must be followed by a simple string literal",
|
|
yyscanner);
|
|
|
|
escstr = yyextra->lookahead_yylval.str;
|
|
if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
|
|
scanner_yyerror("invalid Unicode escape character",
|
|
yyscanner);
|
|
|
|
/* Now restore *llocp; errors will point to first token */
|
|
*llocp = cur_yylloc;
|
|
|
|
/* Apply Unicode conversion */
|
|
lvalp->core_yystype.str =
|
|
str_udeescape(lvalp->core_yystype.str,
|
|
escstr[0],
|
|
*llocp,
|
|
yyscanner);
|
|
|
|
/*
|
|
* We don't need to revert the un-truncation of UESCAPE. What
|
|
* we do want to do is clear have_lookahead, thereby consuming
|
|
* all three tokens.
|
|
*/
|
|
yyextra->have_lookahead = false;
|
|
}
|
|
else
|
|
{
|
|
/* No UESCAPE, so convert using default escape character */
|
|
lvalp->core_yystype.str =
|
|
str_udeescape(lvalp->core_yystype.str,
|
|
'\\',
|
|
*llocp,
|
|
yyscanner);
|
|
}
|
|
|
|
if (cur_token == UIDENT)
|
|
{
|
|
/* It's an identifier, so truncate as appropriate */
|
|
truncate_identifier(lvalp->core_yystype.str,
|
|
strlen(lvalp->core_yystype.str),
|
|
true);
|
|
cur_token = IDENT;
|
|
}
|
|
else if (cur_token == USCONST)
|
|
{
|
|
cur_token = SCONST;
|
|
}
|
|
break;
|
|
}
|
|
|
|
return cur_token;
|
|
}
|
|
|
|
/* convert hex digit (caller should have verified that) to value */
|
|
static unsigned int
|
|
hexval(unsigned char c)
|
|
{
|
|
if (c >= '0' && c <= '9')
|
|
return c - '0';
|
|
if (c >= 'a' && c <= 'f')
|
|
return c - 'a' + 0xA;
|
|
if (c >= 'A' && c <= 'F')
|
|
return c - 'A' + 0xA;
|
|
elog(ERROR, "invalid hexadecimal digit");
|
|
return 0; /* not reached */
|
|
}
|
|
|
|
/* is Unicode code point acceptable? */
|
|
static void
|
|
check_unicode_value(pg_wchar c)
|
|
{
|
|
if (!is_valid_unicode_codepoint(c))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
|
errmsg("invalid Unicode escape value")));
|
|
}
|
|
|
|
/*
 * Decide whether "escape" may serve as the Unicode escape character in
 * UESCAPE syntax.
 *
 * Hex digits, '+', single and double quotes, and scanner whitespace are
 * rejected; every other character is accepted.
 */
static bool
check_uescapechar(unsigned char escape)
{
	bool		forbidden;

	forbidden = isxdigit(escape) ||
		escape == '+' ||
		escape == '\'' ||
		escape == '"' ||
		scanner_isspace(escape);

	return !forbidden;
}
|
|
|
|
/*
|
|
* Process Unicode escapes in "str", producing a palloc'd plain string
|
|
*
|
|
* escape: the escape character to use
|
|
* position: start position of U&'' or U&"" string token
|
|
* yyscanner: context information needed for error reports
|
|
*/
|
|
static char *
|
|
str_udeescape(const char *str, char escape,
|
|
int position, core_yyscan_t yyscanner)
|
|
{
|
|
const char *in;
|
|
char *new,
|
|
*out;
|
|
size_t new_len;
|
|
pg_wchar pair_first = 0;
|
|
ScannerCallbackState scbstate;
|
|
|
|
/*
|
|
* Guesstimate that result will be no longer than input, but allow enough
|
|
* padding for Unicode conversion.
|
|
*/
|
|
new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
|
|
new = palloc(new_len);
|
|
|
|
in = str;
|
|
out = new;
|
|
while (*in)
|
|
{
|
|
/* Enlarge string if needed */
|
|
size_t out_dist = out - new;
|
|
|
|
if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
|
|
{
|
|
new_len *= 2;
|
|
new = repalloc(new, new_len);
|
|
out = new + out_dist;
|
|
}
|
|
|
|
if (in[0] == escape)
|
|
{
|
|
/*
|
|
* Any errors reported while processing this escape sequence will
|
|
* have an error cursor pointing at the escape.
|
|
*/
|
|
setup_scanner_errposition_callback(&scbstate, yyscanner,
|
|
in - str + position + 3); /* 3 for U&" */
|
|
if (in[1] == escape)
|
|
{
|
|
if (pair_first)
|
|
goto invalid_pair;
|
|
*out++ = escape;
|
|
in += 2;
|
|
}
|
|
else if (isxdigit((unsigned char) in[1]) &&
|
|
isxdigit((unsigned char) in[2]) &&
|
|
isxdigit((unsigned char) in[3]) &&
|
|
isxdigit((unsigned char) in[4]))
|
|
{
|
|
pg_wchar unicode;
|
|
|
|
unicode = (hexval(in[1]) << 12) +
|
|
(hexval(in[2]) << 8) +
|
|
(hexval(in[3]) << 4) +
|
|
hexval(in[4]);
|
|
check_unicode_value(unicode);
|
|
if (pair_first)
|
|
{
|
|
if (is_utf16_surrogate_second(unicode))
|
|
{
|
|
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
|
|
pair_first = 0;
|
|
}
|
|
else
|
|
goto invalid_pair;
|
|
}
|
|
else if (is_utf16_surrogate_second(unicode))
|
|
goto invalid_pair;
|
|
|
|
if (is_utf16_surrogate_first(unicode))
|
|
pair_first = unicode;
|
|
else
|
|
{
|
|
pg_unicode_to_server(unicode, (unsigned char *) out);
|
|
out += strlen(out);
|
|
}
|
|
in += 5;
|
|
}
|
|
else if (in[1] == '+' &&
|
|
isxdigit((unsigned char) in[2]) &&
|
|
isxdigit((unsigned char) in[3]) &&
|
|
isxdigit((unsigned char) in[4]) &&
|
|
isxdigit((unsigned char) in[5]) &&
|
|
isxdigit((unsigned char) in[6]) &&
|
|
isxdigit((unsigned char) in[7]))
|
|
{
|
|
pg_wchar unicode;
|
|
|
|
unicode = (hexval(in[2]) << 20) +
|
|
(hexval(in[3]) << 16) +
|
|
(hexval(in[4]) << 12) +
|
|
(hexval(in[5]) << 8) +
|
|
(hexval(in[6]) << 4) +
|
|
hexval(in[7]);
|
|
check_unicode_value(unicode);
|
|
if (pair_first)
|
|
{
|
|
if (is_utf16_surrogate_second(unicode))
|
|
{
|
|
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
|
|
pair_first = 0;
|
|
}
|
|
else
|
|
goto invalid_pair;
|
|
}
|
|
else if (is_utf16_surrogate_second(unicode))
|
|
goto invalid_pair;
|
|
|
|
if (is_utf16_surrogate_first(unicode))
|
|
pair_first = unicode;
|
|
else
|
|
{
|
|
pg_unicode_to_server(unicode, (unsigned char *) out);
|
|
out += strlen(out);
|
|
}
|
|
in += 8;
|
|
}
|
|
else
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
|
errmsg("invalid Unicode escape"),
|
|
errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
|
|
|
|
cancel_scanner_errposition_callback(&scbstate);
|
|
}
|
|
else
|
|
{
|
|
if (pair_first)
|
|
goto invalid_pair;
|
|
|
|
*out++ = *in++;
|
|
}
|
|
}
|
|
|
|
/* unfinished surrogate pair? */
|
|
if (pair_first)
|
|
goto invalid_pair;
|
|
|
|
*out = '\0';
|
|
return new;
|
|
|
|
/*
|
|
* We might get here with the error callback active, or not. Call
|
|
* scanner_errposition to make sure an error cursor appears; if the
|
|
* callback is active, this is duplicative but harmless.
|
|
*/
|
|
invalid_pair:
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
|
errmsg("invalid Unicode surrogate pair"),
|
|
scanner_errposition(in - str + position + 3, /* 3 for U&" */
|
|
yyscanner)));
|
|
return NULL; /* keep compiler quiet */
|
|
}
|