Unicode escapes in strings and identifiers

2008-10-29 08:04:54 +00:00 · 2008-10-29 08:04:54 +00:00 · 06735e3256
parent 05bba3d176
commit 06735e3256
18 changed files with 638 additions and 59 deletions
--- a/doc/src/sgml/syntax.sgml
+++ b/doc/src/sgml/syntax.sgml
@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.123 2008/06/26 22:24:42 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.124 2008/10/29 08:04:52 petere Exp $ -->

 <chapter id="sql-syntax">
 <title>SQL Syntax</title>
@ -189,6 +189,57 @@ UPDATE "my_table" SET "a" = 5;
    ampersands.  The length limitation still applies.
   </para>

+   <para>
+    <indexterm><primary>Unicode escape</primary><secondary>in
+    identifiers</secondary></indexterm> A variant of quoted
+    identifiers allows including escaped Unicode characters identified
+    by their code points.  This variant starts
+    with <literal>U&</literal> (upper or lower case U followed by
+    ampersand) immediately before the opening double quote, without
+    any spaces in between, for example <literal>U&"foo"</literal>.
+    (Note that this creates an ambiguity with the
+    operator <literal>&</literal>.  Use spaces around the operator to
+    avoid this problem.)  Inside the quotes, Unicode characters can be
+    specified in escaped form by writing a backslash followed by the
+    four-digit hexadecimal code point number or alternatively a
+    backslash followed by a plus sign followed by a six-digit
+    hexadecimal code point number.  For example, the
+    identifier <literal>"data"</literal> could be written as
+<programlisting>
+U&"d\0061t\+000061"
+</programlisting>
+    The following less trivial example writes the Russian
+    word <quote>slon</quote> (elephant) in Cyrillic letters:
+<programlisting>
+U&"\0441\043B\043E\043D"
+</programlisting>
+   </para>
+
+   <para>
+    If a different escape character than backslash is desired, it can
+    be specified using
+    the <literal>UESCAPE</literal><indexterm><primary>UESCAPE</primary></indexterm>
+    clause after the string, for example:
+<programlisting>
+U&"d!0061t!+000061" UESCAPE '!'
+</programlisting>
+    The escape character can be any single character other than a
+    hexadecimal digit, the plus sign, a single quote, a double quote,
+    or a whitespace character.  Note that the escape character is
+    written in single quotes, not double quotes.
+   </para>
+
+   <para>
+    To include the escape character in the identifier literally, write
+    it twice.
+   </para>
+
+   <para>
+    The Unicode escape syntax works fully only when the server
+    encoding is UTF8.  When other server encodings are used, only code
+    points in the ASCII range (up to <literal>\007F</literal>) can be specified.
+   </para>
+
   <para>
    Quoting an identifier also makes it case-sensitive, whereas
    unquoted names are always folded to lower case.  For example, the
@ -245,7 +296,7 @@ UPDATE "my_table" SET "a" = 5;
     write two adjacent single quotes, e.g.
     <literal>'Dianne''s horse'</literal>.
     Note that this is <emphasis>not</> the same as a double-quote
-     character (<literal>"</>).
+     character (<literal>"</>). <!-- font-lock sanity: " -->
    </para>

    <para>
@ -269,14 +320,19 @@ SELECT 'foo'      'bar';
     by <acronym>SQL</acronym>; <productname>PostgreSQL</productname> is
     following the standard.)
    </para>
+   </sect3>

-    <para>
-     <indexterm>
+   <sect3 id="sql-syntax-strings-escape">
+    <title>String Constants with C-Style Escapes</title>
+
+     <indexterm zone="sql-syntax-strings-escape">
      <primary>escape string syntax</primary>
     </indexterm>
-     <indexterm>
+     <indexterm zone="sql-syntax-strings-escape">
      <primary>backslash escapes</primary>
     </indexterm>
+
+    <para>
     <productname>PostgreSQL</productname> also accepts <quote>escape</>
     string constants, which are an extension to the SQL standard.
     An escape string constant is specified by writing the letter
@ -287,7 +343,8 @@ SELECT 'foo'      'bar';
     Within an escape string, a backslash character (<literal>\</>) begins a
     C-like <firstterm>backslash escape</> sequence, in which the combination
     of backslash and following character(s) represent a special byte
-     value:
+     value, as shown in <xref linkend="sql-backslash-table">.
+    </para>

     <table id="sql-backslash-table">
      <title>Backslash Escape Sequences</title>
@ -341,14 +398,24 @@ SELECT 'foo'      'bar';
      </tgroup>
     </table>

-     It is your responsibility that the byte sequences you create are
-     valid characters in the server character set encoding. Any other
+    <para>
+     Any other
     character following a backslash is taken literally. Thus, to
     include a backslash character, write two backslashes (<literal>\\</>).
     Also, a single quote can be included in an escape string by writing
     <literal>\'</literal>, in addition to the normal way of <literal>''</>.
    </para>

+    <para>
+     It is your responsibility that the byte sequences you create are
+     valid characters in the server character set encoding.  When the
+     server encoding is UTF-8, then the alternative Unicode escape
+     syntax, explained in <xref linkend="sql-syntax-strings-uescape">,
+     should be used instead.  (The alternative would be doing the
+     UTF-8 encoding by hand and writing out the bytes, which would be
+     very cumbersome.)
+    </para>
+
    <caution>
    <para>
     If the configuration parameter
@ -379,6 +446,65 @@ SELECT 'foo'      'bar';
    </para>
   </sect3>

+   <sect3 id="sql-syntax-strings-uescape">
+    <title>String Constants with Unicode Escapes</title>
+
+    <indexterm  zone="sql-syntax-strings-uescape">
+     <primary>Unicode escape</primary>
+     <secondary>in string constants</secondary>
+    </indexterm>
+
+    <para>
+     <productname>PostgreSQL</productname> also supports another type
+     of escape syntax for strings that allows specifying arbitrary
+     Unicode characters by code point.  A Unicode escape string
+     constant starts with <literal>U&</literal> (upper or lower case
+     letter U followed by ampersand) immediately before the opening
+     quote, without any spaces in between, for
+     example <literal>U&'foo'</literal>.  (Note that this creates an
+     ambiguity with the operator <literal>&</literal>.  Use spaces
+     around the operator to avoid this problem.)  Inside the quotes,
+     Unicode characters can be specified in escaped form by writing a
+     backslash followed by the four-digit hexadecimal code point
+     number or alternatively a backslash followed by a plus sign
+     followed by a six-digit hexadecimal code point number.  For
+     example, the string <literal>'data'</literal> could be written as
+<programlisting>
+U&'d\0061t\+000061'
+</programlisting>
+     The following less trivial example writes the Russian
+     word <quote>slon</quote> (elephant) in Cyrillic letters:
+<programlisting>
+U&'\0441\043B\043E\043D'
+</programlisting>
+    </para>
+
+    <para>
+     If a different escape character than backslash is desired, it can
+     be specified using
+     the <literal>UESCAPE</literal><indexterm><primary>UESCAPE</primary></indexterm>
+     clause after the string, for example:
+<programlisting>
+U&'d!0061t!+000061' UESCAPE '!'
+</programlisting>
+     The escape character can be any single character other than a
+     hexadecimal digit, the plus sign, a single quote, a double quote,
+     or a whitespace character.
+    </para>
+
+    <para>
+     The Unicode escape syntax works fully only when the server
+     encoding is UTF8.  When other server encodings are used, only
+     code points in the ASCII range (up to <literal>\007F</literal>)
+     can be specified.
+    </para>
+
+    <para>
+     To include the escape character in the string literally, write it
+     twice.
+    </para>
+   </sect3>
+
   <sect3 id="sql-syntax-dollar-quoting">
    <title>Dollar-Quoted String Constants</title>

--- a/src/backend/catalog/sql_features.txt
+++ b/src/backend/catalog/sql_features.txt
@ -238,8 +238,8 @@ F381	Extended schema manipulation	02	ALTER TABLE statement: ADD CONSTRAINT claus
 F381	Extended schema manipulation	03	ALTER TABLE statement: DROP CONSTRAINT clause	YES	
 F382	Alter column data type			YES	
 F391	Long identifiers			YES	
-F392	Unicode escapes in identifiers			NO	
-F393	Unicode escapes in literals			NO	
+F392	Unicode escapes in identifiers			YES	
+F393	Unicode escapes in literals			YES	
 F394	Optional normal form specification			NO	
 F401	Extended joined table			YES	
 F401	Extended joined table	01	NATURAL JOIN	YES	
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@ -24,7 +24,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.146 2008/09/01 20:42:45 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.147 2008/10/29 08:04:52 petere Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -76,6 +76,7 @@ static int		literalalloc;	/* current allocated buffer size */
 static void addlit(char *ytext, int yleng);
 static void addlitchar(unsigned char ychar);
 static char *litbufdup(void);
+static char *litbuf_udeescape(unsigned char escape);

 #define lexer_errposition()  scanner_errposition(yylloc)

@ -125,6 +126,8 @@ static unsigned char unescape_single_char(unsigned char c);
 *  <xq> standard quoted strings
 *  <xe> extended quoted strings (support backslash escape sequences)
 *  <xdolq> $foo$ quoted strings
+ *  <xui> quoted identifier with Unicode escapes
+ *  <xus> quoted string with Unicode escapes
 */

 %x xb
@ -134,6 +137,8 @@ static unsigned char unescape_single_char(unsigned char c);
 %x xe
 %x xq
 %x xdolq
+%x xui
+%x xus

 /*
 * In order to make the world safe for Windows and Mac clients as well as
@ -244,6 +249,25 @@ xdstop			{dquote}
 xddouble		{dquote}{dquote}
 xdinside		[^"]+

+/* Unicode escapes */
+uescape			[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
+/* error rule to avoid backup */
+uescapefail		("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU])
+
+/* Quoted identifier with Unicode escapes */
+xuistart		[uU]&{dquote}
+xuistop1		{dquote}{whitespace}*{uescapefail}?
+xuistop2		{dquote}{whitespace}*{uescape}
+
+/* Quoted string with Unicode escapes */
+xusstart		[uU]&{quote}
+xusstop1		{quote}{whitespace}*{uescapefail}?
+xusstop2		{quote}{whitespace}*{uescape}
+
+/* error rule to avoid backup */
+xufailed		[uU]&
+
+
 /* C-style comments
 *
 * The "extended comment" syntax closely resembles allowable operator syntax.
@ -444,6 +468,11 @@ other			.
 					BEGIN(xe);
 					startlit();
 				}
+{xusstart}		{
+					SET_YYLLOC();
+					BEGIN(xus);
+					startlit();
+				}
 <xq,xe>{quotestop}	|
 <xq,xe>{quotefail} {
 					yyless(1);
@ -456,10 +485,22 @@ other			.
 					yylval.str = litbufdup();
 					return SCONST;
 				}
-<xq,xe>{xqdouble} {
+<xus>{xusstop1} {
+					/* throw back all but the quote */
+					yyless(1);
+					BEGIN(INITIAL);
+					yylval.str = litbuf_udeescape('\\');
+					return SCONST;
+				}
+<xus>{xusstop2} {
+					BEGIN(INITIAL);
+					yylval.str = litbuf_udeescape(yytext[yyleng-2]);
+					return SCONST;
+				}
+<xq,xe,xus>{xqdouble} {
 					addlitchar('\'');
 				}
-<xq>{xqinside}  {
+<xq,xus>{xqinside}  {
 					addlit(yytext, yyleng);
 				}
 <xe>{xeinside}  {
@ -496,14 +537,14 @@ other			.
 					if (IS_HIGHBIT_SET(c))
 						saw_high_bit = true;
 				}
-<xq,xe>{quotecontinue} {
+<xq,xe,xus>{quotecontinue} {
 					/* ignore */
 				}
 <xe>.			{
 					/* This is only needed for \ just before EOF */
 					addlitchar(yytext[0]);
 				}
-<xq,xe><<EOF>>		{ yyerror("unterminated quoted string"); }
+<xq,xe,xus><<EOF>>		{ yyerror("unterminated quoted string"); }

 {dolqdelim}		{
 					SET_YYLLOC();
@ -553,6 +594,11 @@ other			.
 					BEGIN(xd);
 					startlit();
 				}
+{xuistart}		{
+					SET_YYLLOC();
+					BEGIN(xui);
+					startlit();
+				}
 <xd>{xdstop}	{
 					char		   *ident;

@ -565,13 +611,46 @@ other			.
 					yylval.str = ident;
 					return IDENT;
 				}
-<xd>{xddouble}	{
+<xui>{xuistop1}	{
+					char		   *ident;
+					int			identlen;
+
+					/*
+					 * Closing quote of a U&"..." identifier that is not
+					 * followed by a complete UESCAPE clause, so the escape
+					 * character is the default backslash.
+					 */
+					BEGIN(INITIAL);
+					if (literallen == 0)
+						yyerror("zero-length delimited identifier");
+					ident = litbuf_udeescape('\\');
+					/*
+					 * Check the length of the de-escaped name, not literallen:
+					 * literallen counts the raw text including the escape
+					 * sequences, so it can reach NAMEDATALEN even though the
+					 * resulting identifier is shorter, and it does not
+					 * describe the string handed to truncate_identifier().
+					 */
+					identlen = strlen(ident);
+					if (identlen >= NAMEDATALEN)
+						truncate_identifier(ident, identlen, true);
+					yylval.str = ident;
+					/* throw back all but the quote */
+					yyless(1);
+					return IDENT;
+				}
+<xui>{xuistop2}	{
+					char		   *ident;
+					int			identlen;
+
+					/*
+					 * Closing quote of a U&"..." identifier followed by
+					 * UESCAPE 'c'.  The match ends with c and its closing
+					 * quote, so c is the second-to-last matched character.
+					 */
+					BEGIN(INITIAL);
+					if (literallen == 0)
+						yyerror("zero-length delimited identifier");
+					ident = litbuf_udeescape(yytext[yyleng - 2]);
+					/*
+					 * Check the length of the de-escaped name, not literallen:
+					 * literallen counts the raw text including the escape
+					 * sequences, so it can reach NAMEDATALEN even though the
+					 * resulting identifier is shorter, and it does not
+					 * describe the string handed to truncate_identifier().
+					 */
+					identlen = strlen(ident);
+					if (identlen >= NAMEDATALEN)
+						truncate_identifier(ident, identlen, true);
+					yylval.str = ident;
+					return IDENT;
+				}
+<xd,xui>{xddouble}	{
 					addlitchar('"');
 				}
-<xd>{xdinside}	{
+<xd,xui>{xdinside}	{
 					addlit(yytext, yyleng);
 				}
-<xd><<EOF>>		{ yyerror("unterminated quoted identifier"); }
+<xd,xui><<EOF>>		{ yyerror("unterminated quoted identifier"); }
+
+{xufailed}	{
+					/* throw back all but the initial u/U */
+					yyless(1);
+					/* and treat it as {other} */
+					return yytext[0];
+				}

 {typecast}		{
 					SET_YYLLOC();
@ -908,6 +987,99 @@ litbufdup(void)
 	return new;
 }

+/*
+ * hexval
+ *		Return the numeric value (0..15) of the hexadecimal digit c.
+ *
+ * Upper- and lower-case letters are both accepted.  Any other character
+ * is reported with elog(ERROR).
+ */
+static int
+hexval(unsigned char c)
+{
+	if ('0' <= c && c <= '9')
+		return c - '0';
+	else if ('a' <= c && c <= 'f')
+		return 10 + (c - 'a');
+	else if ('A' <= c && c <= 'F')
+		return 10 + (c - 'A');
+
+	elog(ERROR, "invalid hexadecimal digit");
+	return 0;					/* not reached */
+}
+
+/*
+ * check_unicode_value
+ *		Complain if code point c cannot be used with the server encoding.
+ *
+ * When the database encoding is UTF8 nothing is rejected here.  For any
+ * other encoding a value above 0x7F is reported through yyerror().  loc
+ * points at the offending escape sequence inside literalbuf and is used
+ * only to move the reported error position.
+ */
+static void
+check_unicode_value(pg_wchar c, char * loc)
+{
+	if (GetDatabaseEncoding() != PG_UTF8 && c > 0x7F)
+	{
+		/* offset of the escape within the literal, plus the opening delimiter */
+		yylloc += (char *) loc - literalbuf + 3;   /* 3 for U&" */
+		yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
+	}
+}
+
+/*
+ * litbuf_udeescape
+ *		Return a palloc'd, zero-terminated copy of literalbuf with the
+ *		Unicode escape sequences replaced by the characters they denote.
+ *
+ * escape is the escape character in effect: backslash by default, or the
+ * character given in a UESCAPE clause.  It must not be a hexadecimal
+ * digit, '+', a single or double quote, or whitespace.
+ *
+ * Inside the literal:
+ *	escape escape			stands for one escape character
+ *	escape XXXX				code point given by 4 hex digits
+ *	escape + XXXXXX			code point given by 6 hex digits
+ * Any other use of the escape character is reported through yyerror().
+ * Code points are written to the result as UTF-8.
+ */
+static char *
+litbuf_udeescape(unsigned char escape)
+{
+	char *new;
+	char *in, *out;
+
+	if (isxdigit(escape)
+		|| escape == '+'
+		|| escape == '\''
+		|| escape == '"'
+		|| scanner_isspace(escape))
+	{
+		/* presumably moves the error cursor to the UESCAPE character */
+		yylloc += literallen + yyleng + 1;
+		yyerror("invalid Unicode escape character");
+	}
+
+	/*
+	 * This relies on the subtle assumption that a UTF-8 expansion
+	 * cannot be longer than its escaped representation.
+	 */
+	new = palloc(literallen + 1);
+
+	in = literalbuf;
+	out = new;
+	while (*in)
+	{
+		/*
+		 * NOTE(review): in[0] is a plain char while escape is unsigned char,
+		 * so where char is signed an escape byte with the high bit set can
+		 * never compare equal here -- confirm whether that case matters.
+		 */
+		if (in[0] == escape)
+		{
+			if (in[1] == escape)
+			{
+				/* doubled escape character: emit it once */
+				*out++ = escape;
+				in += 2;
+			}
+			/*
+			 * The buffer holds arbitrary bytes as plain char.  The <ctype.h>
+			 * functions require an argument representable as unsigned char
+			 * (or EOF), so cast before classifying.
+			 */
+			else if (isxdigit((unsigned char) in[1])
+					 && isxdigit((unsigned char) in[2])
+					 && isxdigit((unsigned char) in[3])
+					 && isxdigit((unsigned char) in[4]))
+			{
+				pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
+				check_unicode_value(unicode, in);
+				unicode_to_utf8(unicode, (unsigned char *) out);
+				in += 5;
+				/* step over the character just written */
+				out += pg_mblen(out);
+			}
+			else if (in[1] == '+'
+					 && isxdigit((unsigned char) in[2])
+					 && isxdigit((unsigned char) in[3])
+					 && isxdigit((unsigned char) in[4])
+					 && isxdigit((unsigned char) in[5])
+					 && isxdigit((unsigned char) in[6])
+					 && isxdigit((unsigned char) in[7]))
+			{
+				pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
+									+ hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
+				check_unicode_value(unicode, in);
+				unicode_to_utf8(unicode, (unsigned char *) out);
+				in += 8;
+				/* step over the character just written */
+				out += pg_mblen(out);
+			}
+			else
+			{
+				yylloc += in - literalbuf + 3;   /* 3 for U&" */
+				yyerror("invalid Unicode escape value");
+			}
+		}
+		else
+			*out++ = *in++;
+	}
+
+	*out = '\0';
+	/* validate the finished string before handing it back */
+	pg_verifymbstr(new, out - new, false);
+	return new;
+}

 static unsigned char
 unescape_single_char(unsigned char c)
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/backend/utils/adt/xml.c,v 1.79 2008/10/14 17:12:33 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/adt/xml.c,v 1.80 2008/10/29 08:04:53 petere Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -1497,28 +1497,7 @@ unicode_to_sqlchar(pg_wchar c)
 {
 	static unsigned char utf8string[5]; /* need trailing zero */

-	if (c <= 0x7F)
-	{
-		utf8string[0] = c;
-	}
-	else if (c <= 0x7FF)
-	{
-		utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
-		utf8string[1] = 0x80 | (c & 0x3F);
-	}
-	else if (c <= 0xFFFF)
-	{
-		utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
-		utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
-		utf8string[2] = 0x80 | (c & 0x3F);
-	}
-	else
-	{
-		utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
-		utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
-		utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
-		utf8string[3] = 0x80 | (c & 0x3F);
-	}
+	unicode_to_utf8(c, utf8string);

 	return (char *) pg_do_encoding_conversion(utf8string,
 											  pg_mblen((char *) utf8string),
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@ -1,7 +1,7 @@
 /*
 * conversion functions between pg_wchar and multibyte streams.
 * Tatsuo Ishii
- * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.67 2008/10/27 19:37:22 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.68 2008/10/29 08:04:53 petere Exp $
 *
 */
 /* can be used in either frontend or backend */
@ -419,6 +419,41 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 	return cnt;
 }

+
+/*
+ * Map a Unicode code point to UTF-8.  utf8string must have 4 bytes of
+ * space allocated.
+ *
+ * Between one and four bytes are written, depending on whether c is at
+ * most 0x7F, 0x7FF, 0xFFFF, or larger.  No terminating zero byte is
+ * stored.  c is not range-checked: in the four-byte case only its low
+ * 21 bits are encoded.
+ *
+ * Returns utf8string.
+ */
+unsigned char *
+unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
+{
+	if (c <= 0x7F)
+	{
+		utf8string[0] = c;
+	}
+	else if (c <= 0x7FF)
+	{
+		utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
+		utf8string[1] = 0x80 | (c & 0x3F);
+	}
+	else if (c <= 0xFFFF)
+	{
+		utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
+		utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
+		utf8string[2] = 0x80 | (c & 0x3F);
+	}
+	else
+	{
+		utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
+		utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
+		utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
+		utf8string[3] = 0x80 | (c & 0x3F);
+	}
+
+	return utf8string;
+}
+
+
 /*
 * Return the byte length of a UTF8 character pointed to by s
 *
--- a/src/bin/psql/psqlscan.l
+++ b/src/bin/psql/psqlscan.l
@ -33,7 +33,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.25 2008/05/09 15:36:31 petere Exp $
+ *	  $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.26 2008/10/29 08:04:53 petere Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -156,6 +156,8 @@ static void emit(const char *txt, int len);
 *  <xq> standard quoted strings
 *  <xe> extended quoted strings (support backslash escape sequences)
 *  <xdolq> $foo$ quoted strings
+ *  <xui> quoted identifier with Unicode escapes
+ *  <xus> quoted string with Unicode escapes
 */

 %x xb
@ -165,6 +167,8 @@ static void emit(const char *txt, int len);
 %x xe
 %x xq
 %x xdolq
+%x xui
+%x xus
 /* Additional exclusive states for psql only: lex backslash commands */
 %x xslashcmd
 %x xslasharg
@ -281,6 +285,25 @@ xdstop			{dquote}
 xddouble		{dquote}{dquote}
 xdinside		[^"]+

+/* Unicode escapes */
+uescape			[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
+/* error rule to avoid backup */
+uescapefail		("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU])
+
+/* Quoted identifier with Unicode escapes */
+xuistart		[uU]&{dquote}
+xuistop1		{dquote}{whitespace}*{uescapefail}?
+xuistop2		{dquote}{whitespace}*{uescape}
+
+/* Quoted string with Unicode escapes */
+xusstart		[uU]&{quote}
+xusstop1		{quote}{whitespace}*{uescapefail}?
+xusstop2		{quote}{whitespace}*{uescape}
+
+/* error rule to avoid backup */
+xufailed		[uU]&
+
+
 /* C-style comments
 *
 * The "extended comment" syntax closely resembles allowable operator syntax.
@ -460,16 +483,29 @@ other			.
 					BEGIN(xe);
 					ECHO;
 				}
+{xusstart}		{
+					BEGIN(xus);
+					ECHO;
+				}
 <xq,xe>{quotestop}	|
 <xq,xe>{quotefail} {
 					yyless(1);
 					BEGIN(INITIAL);
 					ECHO;
 				}
-<xq,xe>{xqdouble} {
+<xus>{xusstop1} {
+					yyless(1);
+					BEGIN(INITIAL);
 					ECHO;
 				}
-<xq>{xqinside}  {
+<xus>{xusstop2} {
+					BEGIN(INITIAL);
+					ECHO;
+				}
+<xq,xe,xus>{xqdouble} {
+					ECHO;
+				}
+<xq,xus>{xqinside}  {
 					ECHO;
 				}
 <xe>{xeinside}  {
@ -484,7 +520,7 @@ other			.
 <xe>{xehexesc}  {
 					ECHO;
 				}
-<xq,xe>{quotecontinue} {
+<xq,xe,xus>{quotecontinue} {
 					ECHO;
 				}
 <xe>.			{
@ -535,14 +571,33 @@ other			.
 					BEGIN(xd);
 					ECHO;
 				}
+{xuistart}		{
+					BEGIN(xui);
+					ECHO;
+				}
 <xd>{xdstop}	{
 					BEGIN(INITIAL);
 					ECHO;
 				}
-<xd>{xddouble}	{
+<xui>{xuistop1}	{
+					yyless(1);
+					BEGIN(INITIAL);
 					ECHO;
 				}
-<xd>{xdinside}	{
+<xui>{xuistop2}	{
+					BEGIN(INITIAL);
+					ECHO;
+				}
+<xd,xui>{xddouble}	{
+					ECHO;
+				}
+<xd,xui>{xdinside}	{
+					ECHO;
+				}
+
+{xufailed}	{
+					/* throw back all but the initial u/U */
+					yyless(1);
 					ECHO;
 				}

--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@ -6,7 +6,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.79 2008/06/18 18:42:54 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.80 2008/10/29 08:04:53 petere Exp $
 *
 *	NOTES
 *		This is used both by the backend and by libpq, but should not be
@ -380,6 +380,7 @@ extern const char *GetDatabaseEncodingName(void);
 extern int	pg_valid_client_encoding(const char *name);
 extern int	pg_valid_server_encoding(const char *name);

+extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
 extern int	pg_utf_mblen(const unsigned char *);
 extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len,
 						  int src_encoding,
--- a/src/interfaces/ecpg/preproc/pgc.l
+++ b/src/interfaces/ecpg/preproc/pgc.l
@ -12,7 +12,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/interfaces/ecpg/preproc/pgc.l,v 1.166 2008/05/20 23:17:32 meskes Exp $
+ *	  $PostgreSQL: pgsql/src/interfaces/ecpg/preproc/pgc.l,v 1.167 2008/10/29 08:04:53 petere Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -103,6 +103,8 @@ static struct _if_value
 *	<xe> extended quoted strings (support backslash escape sequences)
 *	<xn> national character quoted strings
 *  <xdolq> $foo$ quoted strings
+ *  <xui> quoted identifier with Unicode escapes
+ *  <xus> quoted string with Unicode escapes
 */

 %x xb
@ -117,6 +119,8 @@ static struct _if_value
 %x xdolq
 %x xcond
 %x xskip
+%x xui
+%x xus

 /* Bit string
 */
@ -172,6 +176,18 @@ xdstop			{dquote}
 xddouble		{dquote}{dquote}
 xdinside		[^"]+

+/* Unicode escapes */
+/* (The ecpg scanner is not backup-free, so the fail rules in scan.l are not needed here, but could be added if desired.) */
+uescape			[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
+
+/* Quoted identifier with Unicode escapes */
+xuistart		[uU]&{dquote}
+xuistop			{dquote}({whitespace}*{uescape})?
+
+/* Quoted string with Unicode escapes */
+xusstart		[uU]&{quote}
+xusstop			{quote}({whitespace}*{uescape})?
+
 /* special stuff for C strings */
 xdcqq			\\\\
 xdcqdq			\\\"
@ -433,6 +449,13 @@ cppline			{space}*#(.*\\{space})*.*{newline}
 				BEGIN(xe);
 				startlit();
 			}
+<SQL>{xusstart}	{
+				token_start = yytext;
+				state_before = YYSTATE;
+				BEGIN(xus);
+				startlit();
+				addlit(yytext, yyleng);
+			}
 <xq,xqc>{quotestop} |
 <xq,xqc>{quotefail} {
 				yyless(1);
@ -454,22 +477,28 @@ cppline			{space}*#(.*\\{space})*.*{newline}
 				yylval.str = mm_strdup(literalbuf);
 				return NCONST;
 			}
-<xq,xe,xn>{xqdouble}	{ addlitchar('\''); }
+<xus>{xusstop} {
+				addlit(yytext, yyleng);
+				BEGIN(state_before);
+				yylval.str = mm_strdup(literalbuf);
+				return UCONST;
+			}
+<xq,xe,xn,xus>{xqdouble}	{ addlitchar('\''); }
 <xqc>{xqcquote}		{
 				addlitchar('\\');
 				addlitchar('\'');
 			}
-<xq,xqc,xn>{xqinside}	{ addlit(yytext, yyleng); }
+<xq,xqc,xn,xus>{xqinside}	{ addlit(yytext, yyleng); }
 <xe>{xeinside}		{ addlit(yytext, yyleng); }
 <xe>{xeescape}  	{ addlit(yytext, yyleng); }
 <xe>{xeoctesc}		{ addlit(yytext, yyleng); }
 <xe>{xehexesc}		{ addlit(yytext, yyleng); }
-<xq,xqc,xe,xn>{quotecontinue}	{ /* ignore */ }
+<xq,xqc,xe,xn,xus>{quotecontinue}	{ /* ignore */ }
 <xe>.                   {
 			   /* This is only needed for \ just before EOF */
 			   addlitchar(yytext[0]);
 			}
-<xq,xqc,xe,xn><<EOF>>	{ mmerror(PARSE_ERROR, ET_FATAL, "unterminated quoted string"); }
+<xq,xqc,xe,xn,xus><<EOF>>	{ mmerror(PARSE_ERROR, ET_FATAL, "unterminated quoted string"); }
 <SQL>{dolqfailed}	{
 				/* throw back all but the initial "$" */
 				yyless(1);
@ -515,6 +544,12 @@ cppline			{space}*#(.*\\{space})*.*{newline}
 						BEGIN(xd);
 						startlit();
 					}
+<SQL>{xuistart}		{
+						state_before = YYSTATE;
+						BEGIN(xui);
+						startlit();
+						addlit(yytext, yyleng);
+					}
 <xd>{xdstop}		{
 						BEGIN(state_before);
 						if (literallen == 0)
@ -528,9 +563,18 @@ cppline			{space}*#(.*\\{space})*.*{newline}
 						yylval.str = mm_strdup(literalbuf);
 						return CSTRING;
 					}
-<xd>{xddouble}		{ addlitchar('"'); }
-<xd>{xdinside}		{ addlit(yytext, yyleng); }
-<xd,xdc><<EOF>>		{ mmerror(PARSE_ERROR, ET_FATAL, "unterminated quoted identifier"); }
+<xui>{xuistop}		{
+						BEGIN(state_before);
+						if (literallen == 2) /* "U&" */
+							mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier");
+						/* The backend will truncate the identifier here. We do not, as it does not change the result. */
+						addlit(yytext, yyleng);
+						yylval.str = mm_strdup(literalbuf);
+						return UIDENT;
+					}
+<xd,xui>{xddouble}		{ addlitchar('"'); }
+<xd,xui>{xdinside}		{ addlit(yytext, yyleng); }
+<xd,xdc,xui><<EOF>>		{ mmerror(PARSE_ERROR, ET_FATAL, "unterminated quoted identifier"); }
 <C,SQL>{xdstart}	{
 						state_before = YYSTATE;
 						BEGIN(xdc);
--- a/src/interfaces/ecpg/preproc/preproc.y
+++ b/src/interfaces/ecpg/preproc/preproc.y
@ -1,4 +1,4 @@
-/* $PostgreSQL: pgsql/src/interfaces/ecpg/preproc/preproc.y,v 1.379 2008/10/28 14:09:45 petere Exp $ */
+/* $PostgreSQL: pgsql/src/interfaces/ecpg/preproc/preproc.y,v 1.380 2008/10/29 08:04:53 petere Exp $ */

 /* Copyright comment */
 %{
@ -509,7 +509,7 @@ add_typedef(char *name, char * dimension, char * length, enum ECPGttype type_enu

 /* Special token types, not actually keywords - see the "lex" file */
 %token <str>	IDENT SCONST Op CSTRING CVARIABLE CPP_LINE IP BCONST
-%token <str>	XCONST DOLCONST ECONST NCONST
+%token <str>	XCONST DOLCONST ECONST NCONST UCONST UIDENT
 %token <ival>	ICONST PARAM
 %token <dval>	FCONST

@ -4966,6 +4966,10 @@ Sconst:  SCONST
 			$$[strlen($1)+3]='\0';
 			free($1);
 		}
+	| UCONST
+		{
+			$$ = $1; 
+		}
 	| DOLCONST
 		{
 			$$ = $1; 
@ -7013,6 +7017,7 @@ cvariable:	CVARIABLE
 		;
 ident: IDENT				{ $$ = $1; }
 		| CSTRING		{ $$ = make3_str(make_str("\""), $1, make_str("\"")); }
+		| UIDENT		{ $$ = $1; }
 		;

 quoted_ident_stringvar: name
--- a/src/interfaces/ecpg/test/ecpg_schedule
+++ b/src/interfaces/ecpg/test/ecpg_schedule
@ -18,6 +18,7 @@ test: preproc/autoprep
 test: preproc/comment
 test: preproc/define
 test: preproc/init
+test: preproc/strings
 test: preproc/type
 test: preproc/variable
 test: preproc/whenever
--- a/src/interfaces/ecpg/test/ecpg_schedule_tcp
+++ b/src/interfaces/ecpg/test/ecpg_schedule_tcp
@ -18,6 +18,7 @@ test: preproc/autoprep
 test: preproc/comment
 test: preproc/define
 test: preproc/init
+test: preproc/strings
 test: preproc/type
 test: preproc/variable
 test: preproc/whenever
--- a/src/interfaces/ecpg/test/expected/preproc-strings.c
+++ b/src/interfaces/ecpg/test/expected/preproc-strings.c
@ -0,0 +1,62 @@
+/* Processed by ecpg (regression mode) */
+/* These include files are added by the preprocessor */
+#include <ecpglib.h>
+#include <ecpgerrno.h>
+#include <sqlca.h>
+/* End of automatic include section */
+#define ECPGdebug(X,Y) ECPGdebug((X)+100,(Y))
+
+#line 1 "strings.pgc"
+#include <stdlib.h>
+
+
+#line 1 "regression.h"
+
+
+
+
+
+
+#line 3 "strings.pgc"
+
+
+/* exec sql begin declare section */
+      
+
+#line 6 "strings.pgc"
+ char * s1    , * s2    , * s3    , * s4    , * s5    , * s6    ;
+/* exec sql end declare section */
+#line 7 "strings.pgc"
+
+
+int main(void)
+{
+  ECPGdebug(1, stderr);
+
+  { ECPGconnect(__LINE__, 0, "regress1" , NULL, NULL , NULL, 0); }
+#line 13 "strings.pgc"
+
+
+  { ECPGdo(__LINE__, 0, 1, NULL, 0, ECPGst_normal, "select  'abcdef' , N'abcdef' as foo , E'abc\\bdef' as \"foo\" , U&'d\\0061t\\0061' as U&\"foo\" , U&'d!+000061t!+000061' uescape '!' , $foo$abc$def$foo$     ", ECPGt_EOIT, 
+	ECPGt_char,&(s1),(long)0,(long)1,(1)*sizeof(char), 
+	ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, 
+	ECPGt_char,&(s2),(long)0,(long)1,(1)*sizeof(char), 
+	ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, 
+	ECPGt_char,&(s3),(long)0,(long)1,(1)*sizeof(char), 
+	ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, 
+	ECPGt_char,&(s4),(long)0,(long)1,(1)*sizeof(char), 
+	ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, 
+	ECPGt_char,&(s5),(long)0,(long)1,(1)*sizeof(char), 
+	ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, 
+	ECPGt_char,&(s6),(long)0,(long)1,(1)*sizeof(char), 
+	ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, ECPGt_EORT);}
+#line 21 "strings.pgc"
+
+
+  printf("%s %s %s %s %s %s\n", s1, s2, s3, s4, s5, s6);
+
+  { ECPGdisconnect(__LINE__, "CURRENT");}
+#line 25 "strings.pgc"
+
+  exit (0);
+}
--- a/src/interfaces/ecpg/test/expected/preproc-strings.stderr
+++ b/src/interfaces/ecpg/test/expected/preproc-strings.stderr
@ -0,0 +1,36 @@
+[NO_PID]: ECPGdebug: set to 1
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ECPGconnect: opening database regress1 on <DEFAULT> port <DEFAULT>  
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_execute on line 15: query: select  'abcdef' , N'abcdef' as foo , E'abc\bdef' as "foo" , U&'d\0061t\0061' as U&"foo" , U&'d!+000061t!+000061' uescape '!' , $foo$abc$def$foo$     ; with 0 parameter(s) on connection regress1
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_execute on line 15: using PQexec
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_execute on line 15: correctly got 1 tuples with 6 fields
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: abcdef offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: abcdef offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: abcdef offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: data offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: data offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: abc$def offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_finish: connection regress1 closed
+[NO_PID]: sqlca: code: 0, state: 00000
--- a/src/interfaces/ecpg/test/expected/preproc-strings.stdout
+++ b/src/interfaces/ecpg/test/expected/preproc-strings.stdout
@ -0,0 +1 @@
+abcdef abcdef abcdef data data abc$def
--- a/src/interfaces/ecpg/test/preproc/Makefile
+++ b/src/interfaces/ecpg/test/preproc/Makefile
@ -9,6 +9,7 @@ TESTS = array_of_struct array_of_struct.c \
 	comment comment.c \
 	define define.c \
 	init init.c \
+	strings strings.c \
 	type type.c \
 	variable variable.c \
 	whenever whenever.c
--- a/src/interfaces/ecpg/test/preproc/strings.pgc
+++ b/src/interfaces/ecpg/test/preproc/strings.pgc
@ -0,0 +1,27 @@
+#include <stdlib.h>
+
+exec sql include ../regression;
+
+exec sql begin declare section;
+char *s1, *s2, *s3, *s4, *s5, *s6;
+exec sql end declare section;
+
+int main(void)
+{
+  ECPGdebug(1, stderr);
+
+  exec sql connect to REGRESSDB1;
+
+  exec sql select 'abcdef',
+                  N'abcdef' AS foo,
+                  E'abc\bdef' AS "foo",
+                  U&'d\0061t\0061' AS U&"foo",
+                  U&'d!+000061t!+000061' uescape '!',
+                  $foo$abc$def$foo$
+                  into :s1, :s2, :s3, :s4, :s5, :s6;
+
+  printf("%s %s %s %s %s %s\n", s1, s2, s3, s4, s5, s6);
+
+  exec sql disconnect;
+  exit (0);
+}
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@ -21,6 +21,31 @@ SELECT 'first line'
 ERROR:  syntax error at or near "' - third line'"
 LINE 3: ' - third line'
        ^
+-- Unicode escapes
+SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061";
+ data 
+------
+ data
+(1 row)
+
+SELECT U&'d!0061t\+000061' UESCAPE '!' AS U&"d*0061t\+000061" UESCAPE '*';
+ dat\+000061 
+-------------
+ dat\+000061
+(1 row)
+
+SELECT U&'wrong: \061';
+ERROR:  invalid Unicode escape value at or near "\061'"
+LINE 1: SELECT U&'wrong: \061';
+                         ^
+SELECT U&'wrong: \+0061';
+ERROR:  invalid Unicode escape value at or near "\+0061'"
+LINE 1: SELECT U&'wrong: \+0061';
+                         ^
+SELECT U&'wrong: +0061' UESCAPE '+';
+ERROR:  invalid Unicode escape character at or near "+'"
+LINE 1: SELECT U&'wrong: +0061' UESCAPE '+';
+                                         ^
 --
 -- test conversions between various string types
 -- E021-10 implicit casting among the character data types
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@ -16,6 +16,14 @@ SELECT 'first line'
 ' - third line'
 	AS "Illegal comment within continuation";

+-- Unicode escapes
+SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061";
+SELECT U&'d!0061t\+000061' UESCAPE '!' AS U&"d*0061t\+000061" UESCAPE '*';
+
+SELECT U&'wrong: \061';
+SELECT U&'wrong: \+0061';
+SELECT U&'wrong: +0061' UESCAPE '+';
+
 --
 -- test conversions between various string types
 -- E021-10 implicit casting among the character data types