Surrogate pair support for U& string and identifier syntax

This is mainly to make the functionality consistent with the proposed \u escape syntax.
2009-09-21 22:22:07 +00:00 · 2009-09-21 22:22:07 +00:00 · 02faeb4ac8
commit 02faeb4ac8
parent c6bc0feb00
2 changed files with 81 additions and 6 deletions
--- a/doc/src/sgml/syntax.sgml
+++ b/doc/src/sgml/syntax.sgml
@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.134 2009/08/27 20:08:02 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.135 2009/09/21 22:22:07 petere Exp $ -->
 <chapter id="sql-syntax">
 <title>SQL Syntax</title>
@ -238,6 +238,10 @@ U&amp;"d!0061t!+000061" UESCAPE '!'
    The Unicode escape syntax works only when the server encoding is
    UTF8.  When other server encodings are used, only code points in
    the ASCII range (up to <literal>\007F</literal>) can be specified.
    Both the 4-digit and the 6-digit form can be used to specify
    UTF-16 surrogate pairs to compose characters with code points
    larger than <literal>\FFFF</literal> (although the availability of
    the 6-digit form technically makes this unnecessary).
   </para>
   <para>
@ -497,6 +501,10 @@ U&amp;'d!0061t!+000061' UESCAPE '!'
     UTF8.  When other server encodings are used, only code points in
     the ASCII range (up to <literal>\007F</literal>) can be
     specified.
     Both the 4-digit and the 6-digit form can be used to specify
     UTF-16 surrogate pairs to compose characters with code points
     larger than <literal>\FFFF</literal> (although the availability
     of the 6-digit form technically makes this unnecessary).
    </para>
    <para>
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@ -24,7 +24,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.157 2009/07/14 20:24:10 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -1097,11 +1097,30 @@ check_unicode_value(pg_wchar c, char *loc, base_yyscan_t yyscanner)
 	}
 }
 /*
  * is_utf16_surrogate_first
  *		Report whether c falls in the range 0xD800..0xDBFF (inclusive), i.e.
  *		whether it can serve as the first half of a UTF-16 surrogate pair.
  */
 static bool
 is_utf16_surrogate_first(pg_wchar c)
 {
 	const pg_wchar range_start = 0xD800;
 	const pg_wchar range_end = 0xDBFF;
 	/* inside the range exactly when it is neither below nor above it */
 	return !(c < range_start || c > range_end);
 }
 /*
  * is_utf16_surrogate_second
  *		Report whether c falls in the range 0xDC00..0xDFFF (inclusive), i.e.
  *		whether it can serve as the second half of a UTF-16 surrogate pair.
  */
 static bool
 is_utf16_surrogate_second(pg_wchar c)
 {
 	const pg_wchar range_start = 0xDC00;
 	const pg_wchar range_end = 0xDFFF;
 	/* inside the range exactly when it is neither below nor above it */
 	return !(c < range_start || c > range_end);
 }
 /*
  * surrogate_pair_to_codepoint
  *		Combine the two halves of a UTF-16 surrogate pair into one code point.
  *
  * The low 10 bits of "first" become bits 10..19 of the result, the low 10
  * bits of "second" become bits 0..9, and 0x10000 is added as the base offset.
  * No validation is done here; the arguments are simply masked.
  */
 static pg_wchar
 surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
 {
 	pg_wchar	upper_bits = (first & 0x3FF) << 10;
 	pg_wchar	lower_bits = second & 0x3FF;
 	return upper_bits + 0x10000 + lower_bits;
 }
 static char *
 litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
 {
 	char *new;
 	char *litbuf, *in, *out;
 	pg_wchar pair_first = 0;
 	if (isxdigit(escape)
 		|| escape == '+'
@ -1131,6 +1150,11 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
 		{
 			if (in[1] == escape)
 			{
 				if (pair_first)
 				{
 					ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
 					yyerror("invalid Unicode surrogate pair");
 				}
 				*out++ = escape;
 				in += 2;
 			}
@ -1138,9 +1162,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
 			{
 				pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
 				check_unicode_value(unicode, in, yyscanner);
-				unicode_to_utf8(unicode, (unsigned char *) out);
+				if (pair_first)
 				{
 					if (is_utf16_surrogate_second(unicode))
 					{
 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
 						pair_first = 0;
 					}
 					else
 					{
 						ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
 						yyerror("invalid Unicode surrogate pair");
 					}
 				}
 				if (is_utf16_surrogate_first(unicode))
 					pair_first = unicode;
 				else
 				{
 					unicode_to_utf8(unicode, (unsigned char *) out);
 					out += pg_mblen(out);
 				}
 				in += 5;
 				out += pg_mblen(out);
 			}
 			else if (in[1] == '+'
 					 && isxdigit(in[2]) && isxdigit(in[3])
@ -1150,9 +1192,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
 				pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
 									+ hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
 				check_unicode_value(unicode, in, yyscanner);
-				unicode_to_utf8(unicode, (unsigned char *) out);
+				if (pair_first)
 				{
 					if (is_utf16_surrogate_second(unicode))
 					{
 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
 						pair_first = 0;
 					}
 					else
 					{
 						ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
 						yyerror("invalid Unicode surrogate pair");
 					}
 				}
 				if (is_utf16_surrogate_first(unicode))
 					pair_first = unicode;
 				else
 				{
 					unicode_to_utf8(unicode, (unsigned char *) out);
 					out += pg_mblen(out);
 				}
 				in += 8;
 				out += pg_mblen(out);
 			}
 			else
 			{
@ -1161,7 +1221,14 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
 			}
 		}
 		else
 		{
 			if (pair_first)
 			{
 				ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
 				yyerror("invalid Unicode surrogate pair");
 			}
 			*out++ = *in++;
 		}
 	}
 	*out = '\0';