Surrogate pair support for U& string and identifier syntax
This is mainly to make the functionality consistent with the proposed \u escape syntax.
This commit is contained in:
parent
c6bc0feb00
commit
02faeb4ac8
@ -1,4 +1,4 @@
|
|||||||
<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.134 2009/08/27 20:08:02 tgl Exp $ -->
|
<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.135 2009/09/21 22:22:07 petere Exp $ -->
|
||||||
|
|
||||||
<chapter id="sql-syntax">
|
<chapter id="sql-syntax">
|
||||||
<title>SQL Syntax</title>
|
<title>SQL Syntax</title>
|
||||||
@ -238,6 +238,10 @@ U&"d!0061t!+000061" UESCAPE '!'
|
|||||||
The Unicode escape syntax works only when the server encoding is
|
The Unicode escape syntax works only when the server encoding is
|
||||||
UTF8. When other server encodings are used, only code points in
|
UTF8. When other server encodings are used, only code points in
|
||||||
the ASCII range (up to <literal>\007F</literal>) can be specified.
|
the ASCII range (up to <literal>\007F</literal>) can be specified.
|
||||||
|
Both the 4-digit and the 6-digit form can be used to specify
|
||||||
|
UTF-16 surrogate pairs to compose characters with code points
|
||||||
|
larger than <literal>\FFFF</literal> (although the availability of
|
||||||
|
the 6-digit form technically makes this unnecessary).
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
@ -497,6 +501,10 @@ U&'d!0061t!+000061' UESCAPE '!'
|
|||||||
UTF8. When other server encodings are used, only code points in
|
UTF8. When other server encodings are used, only code points in
|
||||||
the ASCII range (up to <literal>\007F</literal>) can be
|
the ASCII range (up to <literal>\007F</literal>) can be
|
||||||
specified.
|
specified.
|
||||||
|
Both the 4-digit and the 6-digit form can be used to specify
|
||||||
|
UTF-16 surrogate pairs to compose characters with code points
|
||||||
|
larger than <literal>\FFFF</literal> (although the availability
|
||||||
|
of the 6-digit form technically makes this unnecessary).
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
|
@ -24,7 +24,7 @@
|
|||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.157 2009/07/14 20:24:10 tgl Exp $
|
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -1097,11 +1097,30 @@ check_unicode_value(pg_wchar c, char *loc, base_yyscan_t yyscanner)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
is_utf16_surrogate_first(pg_wchar c)
|
||||||
|
{
|
||||||
|
return (c >= 0xD800 && c <= 0xDBFF);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
is_utf16_surrogate_second(pg_wchar c)
|
||||||
|
{
|
||||||
|
return (c >= 0xDC00 && c <= 0xDFFF);
|
||||||
|
}
|
||||||
|
|
||||||
|
static pg_wchar
|
||||||
|
surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
|
||||||
|
{
|
||||||
|
return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
|
||||||
|
}
|
||||||
|
|
||||||
static char *
|
static char *
|
||||||
litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
|
litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
|
||||||
{
|
{
|
||||||
char *new;
|
char *new;
|
||||||
char *litbuf, *in, *out;
|
char *litbuf, *in, *out;
|
||||||
|
pg_wchar pair_first = 0;
|
||||||
|
|
||||||
if (isxdigit(escape)
|
if (isxdigit(escape)
|
||||||
|| escape == '+'
|
|| escape == '+'
|
||||||
@ -1131,6 +1150,11 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
|
|||||||
{
|
{
|
||||||
if (in[1] == escape)
|
if (in[1] == escape)
|
||||||
{
|
{
|
||||||
|
if (pair_first)
|
||||||
|
{
|
||||||
|
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
|
||||||
|
yyerror("invalid Unicode surrogate pair");
|
||||||
|
}
|
||||||
*out++ = escape;
|
*out++ = escape;
|
||||||
in += 2;
|
in += 2;
|
||||||
}
|
}
|
||||||
@ -1138,9 +1162,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
|
|||||||
{
|
{
|
||||||
pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
|
pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
|
||||||
check_unicode_value(unicode, in, yyscanner);
|
check_unicode_value(unicode, in, yyscanner);
|
||||||
unicode_to_utf8(unicode, (unsigned char *) out);
|
if (pair_first)
|
||||||
|
{
|
||||||
|
if (is_utf16_surrogate_second(unicode))
|
||||||
|
{
|
||||||
|
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
|
||||||
|
pair_first = 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
|
||||||
|
yyerror("invalid Unicode surrogate pair");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (is_utf16_surrogate_first(unicode))
|
||||||
|
pair_first = unicode;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
unicode_to_utf8(unicode, (unsigned char *) out);
|
||||||
|
out += pg_mblen(out);
|
||||||
|
}
|
||||||
in += 5;
|
in += 5;
|
||||||
out += pg_mblen(out);
|
|
||||||
}
|
}
|
||||||
else if (in[1] == '+'
|
else if (in[1] == '+'
|
||||||
&& isxdigit(in[2]) && isxdigit(in[3])
|
&& isxdigit(in[2]) && isxdigit(in[3])
|
||||||
@ -1150,9 +1192,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
|
|||||||
pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
|
pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
|
||||||
+ hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
|
+ hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
|
||||||
check_unicode_value(unicode, in, yyscanner);
|
check_unicode_value(unicode, in, yyscanner);
|
||||||
unicode_to_utf8(unicode, (unsigned char *) out);
|
if (pair_first)
|
||||||
|
{
|
||||||
|
if (is_utf16_surrogate_second(unicode))
|
||||||
|
{
|
||||||
|
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
|
||||||
|
pair_first = 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
|
||||||
|
yyerror("invalid Unicode surrogate pair");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (is_utf16_surrogate_first(unicode))
|
||||||
|
pair_first = unicode;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
unicode_to_utf8(unicode, (unsigned char *) out);
|
||||||
|
out += pg_mblen(out);
|
||||||
|
}
|
||||||
in += 8;
|
in += 8;
|
||||||
out += pg_mblen(out);
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -1161,7 +1221,14 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
{
|
||||||
|
if (pair_first)
|
||||||
|
{
|
||||||
|
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
|
||||||
|
yyerror("invalid Unicode surrogate pair");
|
||||||
|
}
|
||||||
*out++ = *in++;
|
*out++ = *in++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
*out = '\0';
|
*out = '\0';
|
||||||
|
Loading…
x
Reference in New Issue
Block a user