Plug some more holes in encoding conversion.

Various places assume that pg_do_encoding_conversion() and
pg_server_to_any() will ensure encoding validity of their results;
but they failed to do so in the case that the source encoding is SQL_ASCII
while the destination is not.  We cannot perform any actual "conversion"
in that scenario, but we should still validate the string according to the
destination encoding.  Per bug #9210 from Digoal Zhou.

Arguably this is a back-patchable bug fix, but on the other hand adding
more enforcing of encoding checks might break existing applications that
were being sloppy.  On balance there doesn't seem to be much enthusiasm
for a back-patch, so fix in HEAD only.

While at it, remove some apparently-no-longer-needed provisions for
letting pg_do_encoding_conversion() "work" outside a transaction ---
if you consider it "working" to silently fail to do the requested
conversion.

Also, make a few cosmetic improvements in mbutils.c, notably removing
some Asserts that are certainly dead code since the variables they
assert aren't null are never null, even at process start.  (I think
this wasn't true at one time, but it is now.)
This commit is contained in:
Tom Lane 2014-02-23 15:22:50 -05:00
parent 2c65856b7b
commit 49c817eab7
1 changed files with 117 additions and 88 deletions

View File

@ -1,10 +1,36 @@
/*
* This file contains public functions for conversion between
* client encoding and server (database) encoding.
/*-------------------------------------------------------------------------
*
* Tatsuo Ishii
* mbutils.c
* This file contains functions for encoding conversion.
*
* src/backend/utils/mb/mbutils.c
* The string-conversion functions in this file share some API quirks.
* Note the following:
*
* The functions return a palloc'd, null-terminated string if conversion
* is required. However, if no conversion is performed, the given source
* string pointer is returned as-is.
*
* Although the presence of a length argument means that callers can pass
* non-null-terminated strings, care is required because the same string
* will be passed back if no conversion occurs. Such callers *must* check
* whether result == src and handle that case differently.
*
* If the source and destination encodings are the same, the source string
* is returned without any verification; it's assumed to be valid data.
* If that might not be the case, the caller is responsible for validating
* the string using a separate call to pg_verify_mbstr(). Whenever the
* source and destination encodings are different, the functions ensure that
* the result is validly encoded according to the destination encoding.
*
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/utils/mb/mbutils.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
@ -290,7 +316,6 @@ InitializeClientEncoding(void)
int
pg_get_client_encoding(void)
{
Assert(ClientEncoding);
return ClientEncoding->encoding;
}
@ -300,29 +325,13 @@ pg_get_client_encoding(void)
const char *
pg_get_client_encoding_name(void)
{
Assert(ClientEncoding);
return ClientEncoding->name;
}
/*
* Apply encoding conversion on src and return it. The encoding
* conversion function is chosen from the pg_conversion system catalog
* marked as "default". If it is not found in the schema search path,
* it's taken from pg_catalog schema. If it even is not in the schema,
* warn and return src.
* Convert src string to another encoding (general case).
*
* If conversion occurs, a palloc'd null-terminated string is returned.
* In the case of no conversion, src is returned.
*
* CAUTION: although the presence of a length argument means that callers
* can pass non-null-terminated strings, care is required because the same
* string will be passed back if no conversion occurs. Such callers *must*
* check whether result == src and handle that case differently.
*
* Note: we try to avoid raising error, since that could get us into
* infinite recursion when this function is invoked during error message
* sending. It should be OK to raise error for overlength strings though,
* since the recursion will come with a shorter message.
* See the notes about string conversion functions at the top of this file.
*/
unsigned char *
pg_do_encoding_conversion(unsigned char *src, int len,
@ -331,39 +340,32 @@ pg_do_encoding_conversion(unsigned char *src, int len,
unsigned char *result;
Oid proc;
if (!IsTransactionState())
return src;
if (len <= 0)
return src; /* empty string is always valid */
if (src_encoding == dest_encoding)
return src;
return src; /* no conversion required, assume valid */
if (src_encoding == PG_SQL_ASCII || dest_encoding == PG_SQL_ASCII)
return src;
if (dest_encoding == PG_SQL_ASCII)
return src; /* any string is valid in SQL_ASCII */
if (len <= 0)
if (src_encoding == PG_SQL_ASCII)
{
/* No conversion is possible, but we must validate the result */
(void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
return src;
}
if (!IsTransactionState()) /* shouldn't happen */
elog(ERROR, "cannot perform encoding conversion outside a transaction");
proc = FindDefaultConversionProc(src_encoding, dest_encoding);
if (!OidIsValid(proc))
{
ereport(LOG,
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_FUNCTION),
errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
pg_encoding_to_char(src_encoding),
pg_encoding_to_char(dest_encoding))));
return src;
}
/*
* XXX we should avoid throwing errors in OidFunctionCall. Otherwise we
* are going into infinite loop! So we have to make sure that the
* function exists before calling OidFunctionCall.
*/
if (!SearchSysCacheExists1(PROCOID, ObjectIdGetDatum(proc)))
{
elog(LOG, "cache lookup failed for function %u", proc);
return src;
}
/*
* Allocate space for conversion result, being wary of integer overflow
@ -387,7 +389,7 @@ pg_do_encoding_conversion(unsigned char *src, int len,
}
/*
* Convert string using encoding_name. The source
* Convert string to encoding encoding_name. The source
* encoding is the DB encoding.
*
* BYTEA convert_to(TEXT string, NAME encoding_name) */
@ -412,7 +414,7 @@ pg_convert_to(PG_FUNCTION_ARGS)
}
/*
* Convert string using encoding_name. The destination
* Convert string from encoding encoding_name. The destination
* encoding is the DB encoding.
*
* TEXT convert_from(BYTEA string, NAME encoding_name) */
@ -439,7 +441,7 @@ pg_convert_from(PG_FUNCTION_ARGS)
}
/*
* Convert string using encoding_names.
* Convert string between two arbitrary encodings.
*
* BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
*/
@ -472,8 +474,13 @@ pg_convert(PG_FUNCTION_ARGS)
src_str = VARDATA_ANY(string);
pg_verify_mbstr_len(src_encoding, src_str, len, false);
dest_str = (char *) pg_do_encoding_conversion(
(unsigned char *) src_str, len, src_encoding, dest_encoding);
/* perform conversion */
dest_str = (char *) pg_do_encoding_conversion((unsigned char *) src_str,
len,
src_encoding,
dest_encoding);
/* update len if conversion actually happened */
if (dest_str != src_str)
len = strlen(dest_str);
@ -503,10 +510,11 @@ pg_convert(PG_FUNCTION_ARGS)
Datum
length_in_encoding(PG_FUNCTION_ARGS)
{
bytea *string = PG_GETARG_BYTEA_P(0);
bytea *string = PG_GETARG_BYTEA_PP(0);
char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
int src_encoding = pg_char_to_encoding(src_encoding_name);
int len = VARSIZE(string) - VARHDRSZ;
const char *src_str;
int len;
int retval;
if (src_encoding < 0)
@ -515,11 +523,19 @@ length_in_encoding(PG_FUNCTION_ARGS)
errmsg("invalid encoding name \"%s\"",
src_encoding_name)));
retval = pg_verify_mbstr_len(src_encoding, VARDATA(string), len, false);
PG_RETURN_INT32(retval);
len = VARSIZE_ANY_EXHDR(string);
src_str = VARDATA_ANY(string);
retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
PG_RETURN_INT32(retval);
}
/*
* Get maximum multibyte character length in the specified encoding.
*
* Note encoding is specified numerically, not by name as above.
*/
Datum
pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
{
@ -532,27 +548,31 @@ pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
}
/*
* convert client encoding to server encoding.
* Convert client encoding to server encoding.
*
* See the notes about string conversion functions at the top of this file.
*/
char *
pg_client_to_server(const char *s, int len)
{
Assert(ClientEncoding);
return pg_any_to_server(s, len, ClientEncoding->encoding);
}
/*
* convert any encoding to server encoding.
* Convert any encoding to server encoding.
*
* See the notes about string conversion functions at the top of this file.
*
* Unlike the other string conversion functions, this will apply validation
* even if encoding == DatabaseEncoding->encoding. This is because this is
* used to process data coming in from outside the database, and we never
* want to just assume validity.
*/
char *
pg_any_to_server(const char *s, int len, int encoding)
{
Assert(DatabaseEncoding);
Assert(ClientEncoding);
if (len <= 0)
return (char *) s;
return (char *) s; /* empty string is always valid */
if (encoding == DatabaseEncoding->encoding ||
encoding == PG_SQL_ASCII)
@ -594,46 +614,59 @@ pg_any_to_server(const char *s, int len, int encoding)
return (char *) s;
}
if (ClientEncoding->encoding == encoding)
/* Fast path if we can use cached conversion function */
if (encoding == ClientEncoding->encoding)
return perform_default_encoding_conversion(s, len, true);
else
return (char *) pg_do_encoding_conversion(
(unsigned char *) s, len, encoding, DatabaseEncoding->encoding);
/* General case ... will not work outside transactions */
return (char *) pg_do_encoding_conversion((unsigned char *) s,
len,
encoding,
DatabaseEncoding->encoding);
}
/*
* convert server encoding to client encoding.
* Convert server encoding to client encoding.
*
* See the notes about string conversion functions at the top of this file.
*/
char *
pg_server_to_client(const char *s, int len)
{
Assert(ClientEncoding);
return pg_server_to_any(s, len, ClientEncoding->encoding);
}
/*
* convert server encoding to any encoding.
* Convert server encoding to any encoding.
*
* See the notes about string conversion functions at the top of this file.
*/
char *
pg_server_to_any(const char *s, int len, int encoding)
{
Assert(DatabaseEncoding);
Assert(ClientEncoding);
if (len <= 0)
return (char *) s;
return (char *) s; /* empty string is always valid */
if (encoding == DatabaseEncoding->encoding ||
encoding == PG_SQL_ASCII ||
DatabaseEncoding->encoding == PG_SQL_ASCII)
encoding == PG_SQL_ASCII)
return (char *) s; /* assume data is valid */
if (ClientEncoding->encoding == encoding)
if (DatabaseEncoding->encoding == PG_SQL_ASCII)
{
/* No conversion is possible, but we must validate the result */
(void) pg_verify_mbstr(encoding, s, len, false);
return (char *) s;
}
/* Fast path if we can use cached conversion function */
if (encoding == ClientEncoding->encoding)
return perform_default_encoding_conversion(s, len, false);
else
return (char *) pg_do_encoding_conversion(
(unsigned char *) s, len, DatabaseEncoding->encoding, encoding);
/* General case ... will not work outside transactions */
return (char *) pg_do_encoding_conversion((unsigned char *) s,
len,
DatabaseEncoding->encoding,
encoding);
}
/*
@ -643,7 +676,8 @@ pg_server_to_any(const char *s, int len, int encoding)
* SetClientEncoding(), no conversion is performed.
*/
static char *
perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server)
perform_default_encoding_conversion(const char *src, int len,
bool is_client_to_server)
{
char *result;
int src_encoding,
@ -931,11 +965,11 @@ raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
* On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
* When that matches the database encoding, we don't need to do anything. In
* CREATE DATABASE, we enforce or trust that the locale's codeset matches the
* database encoding, except for the C locale. (On Windows, we also permit a
* database encoding, except for the C locale. (On Windows, we also permit a
* discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
* gettext to the right codeset.
*
* On Windows, gettext defaults to the Windows ANSI code page. This is a
* On Windows, gettext defaults to the Windows ANSI code page. This is a
* convenient departure for software that passes the strings to Windows ANSI
* APIs, but we don't do that. Compel gettext to use database encoding or,
* failing that, the LC_CTYPE encoding as it would on other platforms.
@ -980,28 +1014,24 @@ pg_bind_textdomain_codeset(const char *domainname)
int
GetDatabaseEncoding(void)
{
Assert(DatabaseEncoding);
return DatabaseEncoding->encoding;
}
const char *
GetDatabaseEncodingName(void)
{
Assert(DatabaseEncoding);
return DatabaseEncoding->name;
}
Datum
getdatabaseencoding(PG_FUNCTION_ARGS)
{
Assert(DatabaseEncoding);
return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
}
Datum
pg_client_encoding(PG_FUNCTION_ARGS)
{
Assert(ClientEncoding);
return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
}
@ -1014,7 +1044,6 @@ pg_client_encoding(PG_FUNCTION_ARGS)
int
GetMessageEncoding(void)
{
Assert(MessageEncoding);
return MessageEncoding->encoding;
}