Rationalize code placement between wchar.c, encnames.c, and mbutils.c.
Move all the backend-only code that'd crept into wchar.c and encnames.c into mbutils.c.

To remove the last few #ifdef dependencies from wchar.c and encnames.c, also make the following changes:

* Adjust get_encoding_name_for_icu to return NULL, not throw an error, for unsupported encodings. Its sole caller can perfectly well throw an error instead. (While at it, I also made this function and its sibling is_encoding_supported_by_icu proof against out-of-range encoding IDs.)

* Remove the overlength-name error condition from pg_char_to_encoding. It's completely silly not to treat that just like any other the-name-is-not-in-the-table case.

Also, get rid of pg_mic_mblen --- there's no obvious reason why conv.c shouldn't call pg_mule_mblen instead.

Other than that, this is just code movement and comment-polishing with no functional changes. Notably, I reordered declarations in pg_wchar.h to show which functions are frontend-accessible and which are not.

Discussion: https://postgr.es/m/CA+TgmoYO8oq-iy8E02rD8eX25T-9SmyxKWqqks5OMHxKvGXpXQ@mail.gmail.com
This commit is contained in:
parent
3d4cb5d6c1
commit
5afaa2e426
@ -1555,9 +1555,14 @@ init_icu_converter(void)
|
|||||||
UConverter *conv;
|
UConverter *conv;
|
||||||
|
|
||||||
if (icu_converter)
|
if (icu_converter)
|
||||||
return;
|
return; /* already done */
|
||||||
|
|
||||||
icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
|
icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
|
||||||
|
if (!icu_encoding_name)
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||||
|
errmsg("encoding \"%s\" not supported by ICU",
|
||||||
|
pg_encoding_to_char(GetDatabaseEncoding()))));
|
||||||
|
|
||||||
status = U_ZERO_ERROR;
|
status = U_ZERO_ERROR;
|
||||||
conv = ucnv_open(icu_encoding_name, &status);
|
conv = ucnv_open(icu_encoding_name, &status);
|
||||||
|
@ -115,7 +115,7 @@ mic2latin(const unsigned char *mic, unsigned char *p, int len,
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
int l = pg_mic_mblen(mic);
|
int l = pg_mule_mblen(mic);
|
||||||
|
|
||||||
if (len < l)
|
if (len < l)
|
||||||
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
|
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
|
||||||
@ -217,7 +217,7 @@ mic2latin_with_table(const unsigned char *mic,
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
int l = pg_mic_mblen(mic);
|
int l = pg_mule_mblen(mic);
|
||||||
|
|
||||||
if (len < l)
|
if (len < l)
|
||||||
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
|
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
|
||||||
|
@ -1066,6 +1066,23 @@ pg_client_encoding(PG_FUNCTION_ARGS)
|
|||||||
return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
|
return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Datum
|
||||||
|
PG_char_to_encoding(PG_FUNCTION_ARGS)
|
||||||
|
{
|
||||||
|
Name s = PG_GETARG_NAME(0);
|
||||||
|
|
||||||
|
PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
|
||||||
|
}
|
||||||
|
|
||||||
|
Datum
|
||||||
|
PG_encoding_to_char(PG_FUNCTION_ARGS)
|
||||||
|
{
|
||||||
|
int32 encoding = PG_GETARG_INT32(0);
|
||||||
|
const char *encoding_name = pg_encoding_to_char(encoding);
|
||||||
|
|
||||||
|
return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* gettext() returns messages in this encoding. This often matches the
|
* gettext() returns messages in this encoding. This often matches the
|
||||||
* database encoding, but it differs for SQL_ASCII databases, for processes
|
* database encoding, but it differs for SQL_ASCII databases, for processes
|
||||||
@ -1078,6 +1095,438 @@ GetMessageEncoding(void)
|
|||||||
return MessageEncoding->encoding;
|
return MessageEncoding->encoding;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Generic character incrementer function.
|
||||||
|
*
|
||||||
|
* Not knowing anything about the properties of the encoding in use, we just
|
||||||
|
* keep incrementing the last byte until we get a validly-encoded result,
|
||||||
|
* or we run out of values to try. We don't bother to try incrementing
|
||||||
|
* higher-order bytes, so there's no growth in runtime for wider characters.
|
||||||
|
* (If we did try to do that, we'd need to consider the likelihood that 255
|
||||||
|
* is not a valid final byte in the encoding.)
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
pg_generic_charinc(unsigned char *charptr, int len)
|
||||||
|
{
|
||||||
|
unsigned char *lastbyte = charptr + len - 1;
|
||||||
|
mbverifier mbverify;
|
||||||
|
|
||||||
|
/* We can just invoke the character verifier directly. */
|
||||||
|
mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
|
||||||
|
|
||||||
|
while (*lastbyte < (unsigned char) 255)
|
||||||
|
{
|
||||||
|
(*lastbyte)++;
|
||||||
|
if ((*mbverify) (charptr, len) == len)
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* UTF-8 character incrementer function.
|
||||||
|
*
|
||||||
|
* For a one-byte character less than 0x7F, we just increment the byte.
|
||||||
|
*
|
||||||
|
* For a multibyte character, every byte but the first must fall between 0x80
|
||||||
|
* and 0xBF; and the first byte must be between 0xC0 and 0xF4. We increment
|
||||||
|
* the last byte that's not already at its maximum value. If we can't find a
|
||||||
|
* byte that's less than the maximum allowable value, we simply fail. We also
|
||||||
|
* need some special-case logic to skip regions used for surrogate pair
|
||||||
|
* handling, as those should not occur in valid UTF-8.
|
||||||
|
*
|
||||||
|
* Note that we don't reset lower-order bytes back to their minimums, since
|
||||||
|
* we can't afford to make an exhaustive search (see make_greater_string).
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
pg_utf8_increment(unsigned char *charptr, int length)
|
||||||
|
{
|
||||||
|
unsigned char a;
|
||||||
|
unsigned char limit;
|
||||||
|
|
||||||
|
switch (length)
|
||||||
|
{
|
||||||
|
default:
|
||||||
|
/* reject lengths 5 and 6 for now */
|
||||||
|
return false;
|
||||||
|
case 4:
|
||||||
|
a = charptr[3];
|
||||||
|
if (a < 0xBF)
|
||||||
|
{
|
||||||
|
charptr[3]++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
/* FALL THRU */
|
||||||
|
case 3:
|
||||||
|
a = charptr[2];
|
||||||
|
if (a < 0xBF)
|
||||||
|
{
|
||||||
|
charptr[2]++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
/* FALL THRU */
|
||||||
|
case 2:
|
||||||
|
a = charptr[1];
|
||||||
|
switch (*charptr)
|
||||||
|
{
|
||||||
|
case 0xED:
|
||||||
|
limit = 0x9F;
|
||||||
|
break;
|
||||||
|
case 0xF4:
|
||||||
|
limit = 0x8F;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
limit = 0xBF;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (a < limit)
|
||||||
|
{
|
||||||
|
charptr[1]++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
/* FALL THRU */
|
||||||
|
case 1:
|
||||||
|
a = *charptr;
|
||||||
|
if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
|
||||||
|
return false;
|
||||||
|
charptr[0]++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* EUC-JP character incrementer function.
|
||||||
|
*
|
||||||
|
* If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
|
||||||
|
* representing JIS X 0201 characters with the second byte ranging between
|
||||||
|
* 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
|
||||||
|
* and otherwise rewrite the whole sequence to 0xa1 0xa1.
|
||||||
|
*
|
||||||
|
* If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
|
||||||
|
* in which the last two bytes range between 0xa1 and 0xfe. The last byte
|
||||||
|
* is incremented if possible, otherwise the second-to-last byte.
|
||||||
|
*
|
||||||
|
* If the sequence starts with a value other than the above and its MSB
|
||||||
|
* is set, it must be a two-byte sequence representing JIS X 0208 characters
|
||||||
|
* with both bytes ranging between 0xa1 and 0xfe. The last byte is
|
||||||
|
* incremented if possible, otherwise the second-to-last byte.
|
||||||
|
*
|
||||||
|
* Otherwise, the sequence is a single-byte ASCII character. It is
|
||||||
|
* incremented up to 0x7f.
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
pg_eucjp_increment(unsigned char *charptr, int length)
|
||||||
|
{
|
||||||
|
unsigned char c1,
|
||||||
|
c2;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
c1 = *charptr;
|
||||||
|
|
||||||
|
switch (c1)
|
||||||
|
{
|
||||||
|
case SS2: /* JIS X 0201 */
|
||||||
|
if (length != 2)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
c2 = charptr[1];
|
||||||
|
|
||||||
|
if (c2 >= 0xdf)
|
||||||
|
charptr[0] = charptr[1] = 0xa1;
|
||||||
|
else if (c2 < 0xa1)
|
||||||
|
charptr[1] = 0xa1;
|
||||||
|
else
|
||||||
|
charptr[1]++;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SS3: /* JIS X 0212 */
|
||||||
|
if (length != 3)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
for (i = 2; i > 0; i--)
|
||||||
|
{
|
||||||
|
c2 = charptr[i];
|
||||||
|
if (c2 < 0xa1)
|
||||||
|
{
|
||||||
|
charptr[i] = 0xa1;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else if (c2 < 0xfe)
|
||||||
|
{
|
||||||
|
charptr[i]++;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Out of 3-byte code region */
|
||||||
|
return false;
|
||||||
|
|
||||||
|
default:
|
||||||
|
if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
|
||||||
|
{
|
||||||
|
if (length != 2)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
for (i = 1; i >= 0; i--)
|
||||||
|
{
|
||||||
|
c2 = charptr[i];
|
||||||
|
if (c2 < 0xa1)
|
||||||
|
{
|
||||||
|
charptr[i] = 0xa1;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else if (c2 < 0xfe)
|
||||||
|
{
|
||||||
|
charptr[i]++;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Out of 2 byte code region */
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{ /* ASCII, single byte */
|
||||||
|
if (c1 > 0x7e)
|
||||||
|
return false;
|
||||||
|
(*charptr)++;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* get the character incrementer for the encoding for the current database
|
||||||
|
*/
|
||||||
|
mbcharacter_incrementer
|
||||||
|
pg_database_encoding_character_incrementer(void)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Eventually it might be best to add a field to pg_wchar_table[], but for
|
||||||
|
* now we just use a switch.
|
||||||
|
*/
|
||||||
|
switch (GetDatabaseEncoding())
|
||||||
|
{
|
||||||
|
case PG_UTF8:
|
||||||
|
return pg_utf8_increment;
|
||||||
|
|
||||||
|
case PG_EUC_JP:
|
||||||
|
return pg_eucjp_increment;
|
||||||
|
|
||||||
|
default:
|
||||||
|
return pg_generic_charinc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* fetch maximum length of the encoding for the current database
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
pg_database_encoding_max_length(void)
|
||||||
|
{
|
||||||
|
return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Verify mbstr to make sure that it is validly encoded in the current
|
||||||
|
* database encoding. Otherwise same as pg_verify_mbstr().
|
||||||
|
*/
|
||||||
|
bool
|
||||||
|
pg_verifymbstr(const char *mbstr, int len, bool noError)
|
||||||
|
{
|
||||||
|
return
|
||||||
|
pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Verify mbstr to make sure that it is validly encoded in the specified
|
||||||
|
* encoding.
|
||||||
|
*/
|
||||||
|
bool
|
||||||
|
pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
|
||||||
|
{
|
||||||
|
return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Verify mbstr to make sure that it is validly encoded in the specified
|
||||||
|
* encoding.
|
||||||
|
*
|
||||||
|
* mbstr is not necessarily zero terminated; length of mbstr is
|
||||||
|
* specified by len.
|
||||||
|
*
|
||||||
|
* If OK, return length of string in the encoding.
|
||||||
|
* If a problem is found, return -1 when noError is
|
||||||
|
* true; when noError is false, ereport() a descriptive message.
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
|
||||||
|
{
|
||||||
|
mbverifier mbverify;
|
||||||
|
int mb_len;
|
||||||
|
|
||||||
|
Assert(PG_VALID_ENCODING(encoding));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* In single-byte encodings, we need only reject nulls (\0).
|
||||||
|
*/
|
||||||
|
if (pg_encoding_max_length(encoding) <= 1)
|
||||||
|
{
|
||||||
|
const char *nullpos = memchr(mbstr, 0, len);
|
||||||
|
|
||||||
|
if (nullpos == NULL)
|
||||||
|
return len;
|
||||||
|
if (noError)
|
||||||
|
return -1;
|
||||||
|
report_invalid_encoding(encoding, nullpos, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* fetch function pointer just once */
|
||||||
|
mbverify = pg_wchar_table[encoding].mbverify;
|
||||||
|
|
||||||
|
mb_len = 0;
|
||||||
|
|
||||||
|
while (len > 0)
|
||||||
|
{
|
||||||
|
int l;
|
||||||
|
|
||||||
|
/* fast path for ASCII-subset characters */
|
||||||
|
if (!IS_HIGHBIT_SET(*mbstr))
|
||||||
|
{
|
||||||
|
if (*mbstr != '\0')
|
||||||
|
{
|
||||||
|
mb_len++;
|
||||||
|
mbstr++;
|
||||||
|
len--;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (noError)
|
||||||
|
return -1;
|
||||||
|
report_invalid_encoding(encoding, mbstr, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
l = (*mbverify) ((const unsigned char *) mbstr, len);
|
||||||
|
|
||||||
|
if (l < 0)
|
||||||
|
{
|
||||||
|
if (noError)
|
||||||
|
return -1;
|
||||||
|
report_invalid_encoding(encoding, mbstr, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
mbstr += l;
|
||||||
|
len -= l;
|
||||||
|
mb_len++;
|
||||||
|
}
|
||||||
|
return mb_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* check_encoding_conversion_args: check arguments of a conversion function
|
||||||
|
*
|
||||||
|
* "expected" arguments can be either an encoding ID or -1 to indicate that
|
||||||
|
* the caller will check whether it accepts the ID.
|
||||||
|
*
|
||||||
|
* Note: the errors here are not really user-facing, so elog instead of
|
||||||
|
* ereport seems sufficient. Also, we trust that the "expected" encoding
|
||||||
|
* arguments are valid encoding IDs, but we don't trust the actuals.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
check_encoding_conversion_args(int src_encoding,
|
||||||
|
int dest_encoding,
|
||||||
|
int len,
|
||||||
|
int expected_src_encoding,
|
||||||
|
int expected_dest_encoding)
|
||||||
|
{
|
||||||
|
if (!PG_VALID_ENCODING(src_encoding))
|
||||||
|
elog(ERROR, "invalid source encoding ID: %d", src_encoding);
|
||||||
|
if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
|
||||||
|
elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
|
||||||
|
pg_enc2name_tbl[expected_src_encoding].name,
|
||||||
|
pg_enc2name_tbl[src_encoding].name);
|
||||||
|
if (!PG_VALID_ENCODING(dest_encoding))
|
||||||
|
elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
|
||||||
|
if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
|
||||||
|
elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
|
||||||
|
pg_enc2name_tbl[expected_dest_encoding].name,
|
||||||
|
pg_enc2name_tbl[dest_encoding].name);
|
||||||
|
if (len < 0)
|
||||||
|
elog(ERROR, "encoding conversion length must not be negative");
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* report_invalid_encoding: complain about invalid multibyte character
|
||||||
|
*
|
||||||
|
* note: len is remaining length of string, not length of character;
|
||||||
|
* len must be greater than zero, as we always examine the first byte.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
report_invalid_encoding(int encoding, const char *mbstr, int len)
|
||||||
|
{
|
||||||
|
int l = pg_encoding_mblen(encoding, mbstr);
|
||||||
|
char buf[8 * 5 + 1];
|
||||||
|
char *p = buf;
|
||||||
|
int j,
|
||||||
|
jlimit;
|
||||||
|
|
||||||
|
jlimit = Min(l, len);
|
||||||
|
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
|
||||||
|
|
||||||
|
for (j = 0; j < jlimit; j++)
|
||||||
|
{
|
||||||
|
p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
|
||||||
|
if (j < jlimit - 1)
|
||||||
|
p += sprintf(p, " ");
|
||||||
|
}
|
||||||
|
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||||
|
errmsg("invalid byte sequence for encoding \"%s\": %s",
|
||||||
|
pg_enc2name_tbl[encoding].name,
|
||||||
|
buf)));
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* report_untranslatable_char: complain about untranslatable character
|
||||||
|
*
|
||||||
|
* note: len is remaining length of string, not length of character;
|
||||||
|
* len must be greater than zero, as we always examine the first byte.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
report_untranslatable_char(int src_encoding, int dest_encoding,
|
||||||
|
const char *mbstr, int len)
|
||||||
|
{
|
||||||
|
int l = pg_encoding_mblen(src_encoding, mbstr);
|
||||||
|
char buf[8 * 5 + 1];
|
||||||
|
char *p = buf;
|
||||||
|
int j,
|
||||||
|
jlimit;
|
||||||
|
|
||||||
|
jlimit = Min(l, len);
|
||||||
|
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
|
||||||
|
|
||||||
|
for (j = 0; j < jlimit; j++)
|
||||||
|
{
|
||||||
|
p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
|
||||||
|
if (j < jlimit - 1)
|
||||||
|
p += sprintf(p, " ");
|
||||||
|
}
|
||||||
|
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
|
||||||
|
errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
|
||||||
|
buf,
|
||||||
|
pg_enc2name_tbl[src_encoding].name,
|
||||||
|
pg_enc2name_tbl[dest_encoding].name)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#ifdef WIN32
|
#ifdef WIN32
|
||||||
/*
|
/*
|
||||||
* Convert from MessageEncoding to a palloc'ed, null-terminated utf16
|
* Convert from MessageEncoding to a palloc'ed, null-terminated utf16
|
||||||
@ -1149,4 +1598,4 @@ pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
|
|||||||
return utf16;
|
return utf16;
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif /* WIN32 */
|
||||||
|
@ -10,12 +10,7 @@
|
|||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
#ifdef FRONTEND
|
#include "c.h"
|
||||||
#include "postgres_fe.h"
|
|
||||||
#else
|
|
||||||
#include "postgres.h"
|
|
||||||
#include "utils/builtins.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
@ -310,6 +305,7 @@ static const pg_encname pg_encname_tbl[] =
|
|||||||
#else
|
#else
|
||||||
#define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
|
#define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const pg_enc2name pg_enc2name_tbl[] =
|
const pg_enc2name pg_enc2name_tbl[] =
|
||||||
{
|
{
|
||||||
DEF_ENC2NAME(SQL_ASCII, 0),
|
DEF_ENC2NAME(SQL_ASCII, 0),
|
||||||
@ -409,10 +405,8 @@ const pg_enc2gettext pg_enc2gettext_tbl[] =
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
#ifndef FRONTEND
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Table of encoding names for ICU
|
* Table of encoding names for ICU (currently covers backend encodings only)
|
||||||
*
|
*
|
||||||
* Reference: <https://ssl.icu-project.org/icu-bin/convexp>
|
* Reference: <https://ssl.icu-project.org/icu-bin/convexp>
|
||||||
*
|
*
|
||||||
@ -457,33 +451,32 @@ static const char *const pg_enc2icu_tbl[] =
|
|||||||
"KOI8-U", /* PG_KOI8U */
|
"KOI8-U", /* PG_KOI8U */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Is this encoding supported by ICU?
|
||||||
|
*/
|
||||||
bool
|
bool
|
||||||
is_encoding_supported_by_icu(int encoding)
|
is_encoding_supported_by_icu(int encoding)
|
||||||
{
|
{
|
||||||
|
if (!PG_VALID_BE_ENCODING(encoding))
|
||||||
|
return false;
|
||||||
return (pg_enc2icu_tbl[encoding] != NULL);
|
return (pg_enc2icu_tbl[encoding] != NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Returns ICU's name for encoding, or NULL if not supported
|
||||||
|
*/
|
||||||
const char *
|
const char *
|
||||||
get_encoding_name_for_icu(int encoding)
|
get_encoding_name_for_icu(int encoding)
|
||||||
{
|
{
|
||||||
const char *icu_encoding_name;
|
|
||||||
|
|
||||||
StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1,
|
StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1,
|
||||||
"pg_enc2icu_tbl incomplete");
|
"pg_enc2icu_tbl incomplete");
|
||||||
|
|
||||||
icu_encoding_name = pg_enc2icu_tbl[encoding];
|
if (!PG_VALID_BE_ENCODING(encoding))
|
||||||
|
return NULL;
|
||||||
if (!icu_encoding_name)
|
return pg_enc2icu_tbl[encoding];
|
||||||
ereport(ERROR,
|
|
||||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
||||||
errmsg("encoding \"%s\" not supported by ICU",
|
|
||||||
pg_encoding_to_char(encoding))));
|
|
||||||
|
|
||||||
return icu_encoding_name;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* not FRONTEND */
|
|
||||||
|
|
||||||
|
|
||||||
/* ----------
|
/* ----------
|
||||||
* Encoding checks, for error returns -1 else encoding id
|
* Encoding checks, for error returns -1 else encoding id
|
||||||
@ -523,9 +516,10 @@ pg_valid_server_encoding_id(int encoding)
|
|||||||
return PG_VALID_BE_ENCODING(encoding);
|
return PG_VALID_BE_ENCODING(encoding);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------
|
/*
|
||||||
* Remove irrelevant chars from encoding name
|
* Remove irrelevant chars from encoding name, store at *newkey
|
||||||
* ----------
|
*
|
||||||
|
* (Caller's responsibility to provide a large enough buffer)
|
||||||
*/
|
*/
|
||||||
static char *
|
static char *
|
||||||
clean_encoding_name(const char *key, char *newkey)
|
clean_encoding_name(const char *key, char *newkey)
|
||||||
@ -547,11 +541,10 @@ clean_encoding_name(const char *key, char *newkey)
|
|||||||
return newkey;
|
return newkey;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------
|
/*
|
||||||
* Search encoding by encoding name
|
* Search encoding by encoding name
|
||||||
*
|
*
|
||||||
* Returns encoding ID, or -1 for error
|
* Returns encoding ID, or -1 if not recognized
|
||||||
* ----------
|
|
||||||
*/
|
*/
|
||||||
int
|
int
|
||||||
pg_char_to_encoding(const char *name)
|
pg_char_to_encoding(const char *name)
|
||||||
@ -568,16 +561,8 @@ pg_char_to_encoding(const char *name)
|
|||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
if (strlen(name) >= NAMEDATALEN)
|
if (strlen(name) >= NAMEDATALEN)
|
||||||
{
|
return -1; /* it's certainly not in the table */
|
||||||
#ifdef FRONTEND
|
|
||||||
fprintf(stderr, "encoding name too long\n");
|
|
||||||
return -1;
|
|
||||||
#else
|
|
||||||
ereport(ERROR,
|
|
||||||
(errcode(ERRCODE_NAME_TOO_LONG),
|
|
||||||
errmsg("encoding name too long")));
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
key = clean_encoding_name(name, buff);
|
key = clean_encoding_name(name, buff);
|
||||||
|
|
||||||
while (last >= base)
|
while (last >= base)
|
||||||
@ -599,16 +584,6 @@ pg_char_to_encoding(const char *name)
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef FRONTEND
|
|
||||||
Datum
|
|
||||||
PG_char_to_encoding(PG_FUNCTION_ARGS)
|
|
||||||
{
|
|
||||||
Name s = PG_GETARG_NAME(0);
|
|
||||||
|
|
||||||
PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
const char *
|
const char *
|
||||||
pg_encoding_to_char(int encoding)
|
pg_encoding_to_char(int encoding)
|
||||||
{
|
{
|
||||||
@ -621,15 +596,3 @@ pg_encoding_to_char(int encoding)
|
|||||||
}
|
}
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef FRONTEND
|
|
||||||
Datum
|
|
||||||
PG_encoding_to_char(PG_FUNCTION_ARGS)
|
|
||||||
{
|
|
||||||
int32 encoding = PG_GETARG_INT32(0);
|
|
||||||
const char *encoding_name = pg_encoding_to_char(encoding);
|
|
||||||
|
|
||||||
return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
@ -10,11 +10,7 @@
|
|||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
#ifdef FRONTEND
|
#include "c.h"
|
||||||
#include "postgres_fe.h"
|
|
||||||
#else
|
|
||||||
#include "postgres.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "mb/pg_wchar.h"
|
#include "mb/pg_wchar.h"
|
||||||
|
|
||||||
@ -838,6 +834,7 @@ pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
|
|||||||
return cnt;
|
return cnt;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* exported for direct use by conv.c */
|
||||||
int
|
int
|
||||||
pg_mule_mblen(const unsigned char *s)
|
pg_mule_mblen(const unsigned char *s)
|
||||||
{
|
{
|
||||||
@ -1498,214 +1495,6 @@ pg_utf8_islegal(const unsigned char *source, int length)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef FRONTEND
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Generic character incrementer function.
|
|
||||||
*
|
|
||||||
* Not knowing anything about the properties of the encoding in use, we just
|
|
||||||
* keep incrementing the last byte until we get a validly-encoded result,
|
|
||||||
* or we run out of values to try. We don't bother to try incrementing
|
|
||||||
* higher-order bytes, so there's no growth in runtime for wider characters.
|
|
||||||
* (If we did try to do that, we'd need to consider the likelihood that 255
|
|
||||||
* is not a valid final byte in the encoding.)
|
|
||||||
*/
|
|
||||||
static bool
|
|
||||||
pg_generic_charinc(unsigned char *charptr, int len)
|
|
||||||
{
|
|
||||||
unsigned char *lastbyte = charptr + len - 1;
|
|
||||||
mbverifier mbverify;
|
|
||||||
|
|
||||||
/* We can just invoke the character verifier directly. */
|
|
||||||
mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
|
|
||||||
|
|
||||||
while (*lastbyte < (unsigned char) 255)
|
|
||||||
{
|
|
||||||
(*lastbyte)++;
|
|
||||||
if ((*mbverify) (charptr, len) == len)
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* UTF-8 character incrementer function.
|
|
||||||
*
|
|
||||||
* For a one-byte character less than 0x7F, we just increment the byte.
|
|
||||||
*
|
|
||||||
* For a multibyte character, every byte but the first must fall between 0x80
|
|
||||||
* and 0xBF; and the first byte must be between 0xC0 and 0xF4. We increment
|
|
||||||
* the last byte that's not already at its maximum value. If we can't find a
|
|
||||||
* byte that's less than the maximum allowable value, we simply fail. We also
|
|
||||||
* need some special-case logic to skip regions used for surrogate pair
|
|
||||||
* handling, as those should not occur in valid UTF-8.
|
|
||||||
*
|
|
||||||
* Note that we don't reset lower-order bytes back to their minimums, since
|
|
||||||
* we can't afford to make an exhaustive search (see make_greater_string).
|
|
||||||
*/
|
|
||||||
static bool
|
|
||||||
pg_utf8_increment(unsigned char *charptr, int length)
|
|
||||||
{
|
|
||||||
unsigned char a;
|
|
||||||
unsigned char limit;
|
|
||||||
|
|
||||||
switch (length)
|
|
||||||
{
|
|
||||||
default:
|
|
||||||
/* reject lengths 5 and 6 for now */
|
|
||||||
return false;
|
|
||||||
case 4:
|
|
||||||
a = charptr[3];
|
|
||||||
if (a < 0xBF)
|
|
||||||
{
|
|
||||||
charptr[3]++;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
/* FALL THRU */
|
|
||||||
case 3:
|
|
||||||
a = charptr[2];
|
|
||||||
if (a < 0xBF)
|
|
||||||
{
|
|
||||||
charptr[2]++;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
/* FALL THRU */
|
|
||||||
case 2:
|
|
||||||
a = charptr[1];
|
|
||||||
switch (*charptr)
|
|
||||||
{
|
|
||||||
case 0xED:
|
|
||||||
limit = 0x9F;
|
|
||||||
break;
|
|
||||||
case 0xF4:
|
|
||||||
limit = 0x8F;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
limit = 0xBF;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (a < limit)
|
|
||||||
{
|
|
||||||
charptr[1]++;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
/* FALL THRU */
|
|
||||||
case 1:
|
|
||||||
a = *charptr;
|
|
||||||
if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
|
|
||||||
return false;
|
|
||||||
charptr[0]++;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* EUC-JP character incrementer function.
|
|
||||||
*
|
|
||||||
* If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
|
|
||||||
* representing JIS X 0201 characters with the second byte ranging between
|
|
||||||
* 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
|
|
||||||
* and otherwise rewrite the whole sequence to 0xa1 0xa1.
|
|
||||||
*
|
|
||||||
* If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
|
|
||||||
* in which the last two bytes range between 0xa1 and 0xfe. The last byte
|
|
||||||
* is incremented if possible, otherwise the second-to-last byte.
|
|
||||||
*
|
|
||||||
* If the sequence starts with a value other than the above and its MSB
|
|
||||||
* is set, it must be a two-byte sequence representing JIS X 0208 characters
|
|
||||||
* with both bytes ranging between 0xa1 and 0xfe. The last byte is
|
|
||||||
* incremented if possible, otherwise the second-to-last byte.
|
|
||||||
*
|
|
||||||
* Otherwise, the sequence is a single-byte ASCII character. It is
|
|
||||||
* incremented up to 0x7f.
|
|
||||||
*/
|
|
||||||
static bool
|
|
||||||
pg_eucjp_increment(unsigned char *charptr, int length)
|
|
||||||
{
|
|
||||||
unsigned char c1,
|
|
||||||
c2;
|
|
||||||
int i;
|
|
||||||
|
|
||||||
c1 = *charptr;
|
|
||||||
|
|
||||||
switch (c1)
|
|
||||||
{
|
|
||||||
case SS2: /* JIS X 0201 */
|
|
||||||
if (length != 2)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
c2 = charptr[1];
|
|
||||||
|
|
||||||
if (c2 >= 0xdf)
|
|
||||||
charptr[0] = charptr[1] = 0xa1;
|
|
||||||
else if (c2 < 0xa1)
|
|
||||||
charptr[1] = 0xa1;
|
|
||||||
else
|
|
||||||
charptr[1]++;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case SS3: /* JIS X 0212 */
|
|
||||||
if (length != 3)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
for (i = 2; i > 0; i--)
|
|
||||||
{
|
|
||||||
c2 = charptr[i];
|
|
||||||
if (c2 < 0xa1)
|
|
||||||
{
|
|
||||||
charptr[i] = 0xa1;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
else if (c2 < 0xfe)
|
|
||||||
{
|
|
||||||
charptr[i]++;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Out of 3-byte code region */
|
|
||||||
return false;
|
|
||||||
|
|
||||||
default:
|
|
||||||
if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
|
|
||||||
{
|
|
||||||
if (length != 2)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
for (i = 1; i >= 0; i--)
|
|
||||||
{
|
|
||||||
c2 = charptr[i];
|
|
||||||
if (c2 < 0xa1)
|
|
||||||
{
|
|
||||||
charptr[i] = 0xa1;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
else if (c2 < 0xfe)
|
|
||||||
{
|
|
||||||
charptr[i]++;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Out of 2 byte code region */
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{ /* ASCII, single byte */
|
|
||||||
if (c1 > 0x7e)
|
|
||||||
return false;
|
|
||||||
(*charptr)++;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
#endif /* !FRONTEND */
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
*-------------------------------------------------------------------
|
*-------------------------------------------------------------------
|
||||||
@ -1758,13 +1547,6 @@ const pg_wchar_tbl pg_wchar_table[] = {
|
|||||||
{0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */
|
{0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
 * pg_mic_mblen
 *
 * Returns the byte length of a word for mule internal code.  All of the
 * work is delegated to pg_mule_mblen().
 */
int
pg_mic_mblen(const unsigned char *mbstr)
{
	int			nbytes = pg_mule_mblen(mbstr);

	return nbytes;
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Returns the byte length of a multibyte character.
|
* Returns the byte length of a multibyte character.
|
||||||
*/
|
*/
|
||||||
@ -1810,232 +1592,3 @@ pg_encoding_max_length(int encoding)
|
|||||||
|
|
||||||
return pg_wchar_table[encoding].maxmblen;
|
return pg_wchar_table[encoding].maxmblen;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef FRONTEND
|
|
||||||
|
|
||||||
/*
|
|
||||||
* fetch maximum length of the encoding for the current database
|
|
||||||
*/
|
|
||||||
int
|
|
||||||
pg_database_encoding_max_length(void)
|
|
||||||
{
|
|
||||||
return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* get the character incrementer for the encoding for the current database
|
|
||||||
*/
|
|
||||||
mbcharacter_incrementer
|
|
||||||
pg_database_encoding_character_incrementer(void)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* Eventually it might be best to add a field to pg_wchar_table[], but for
|
|
||||||
* now we just use a switch.
|
|
||||||
*/
|
|
||||||
switch (GetDatabaseEncoding())
|
|
||||||
{
|
|
||||||
case PG_UTF8:
|
|
||||||
return pg_utf8_increment;
|
|
||||||
|
|
||||||
case PG_EUC_JP:
|
|
||||||
return pg_eucjp_increment;
|
|
||||||
|
|
||||||
default:
|
|
||||||
return pg_generic_charinc;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * pg_verifymbstr
 *
 * Check that mbstr (len bytes) is validly encoded in the current database
 * encoding.  Apart from taking the encoding from GetDatabaseEncoding(),
 * this behaves like pg_verify_mbstr(): the result is true exactly when
 * pg_verify_mbstr_len() returns a non-negative length.
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	int			nchars;

	nchars = pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError);

	return nchars >= 0;
}
|
|
||||||
|
|
||||||
/*
 * pg_verify_mbstr
 *
 * Check that mbstr (len bytes) is validly encoded in the specified
 * encoding.  The result is true exactly when pg_verify_mbstr_len() returns
 * a non-negative length; noError is passed through to it unchanged.
 */
bool
pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
{
	int			nchars = pg_verify_mbstr_len(encoding, mbstr, len, noError);

	return nchars >= 0;
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Verify mbstr to make sure that it is validly encoded in the specified
|
|
||||||
* encoding.
|
|
||||||
*
|
|
||||||
* mbstr is not necessarily zero terminated; length of mbstr is
|
|
||||||
* specified by len.
|
|
||||||
*
|
|
||||||
* If OK, return length of string in the encoding.
|
|
||||||
* If a problem is found, return -1 when noError is
|
|
||||||
* true; when noError is false, ereport() a descriptive message.
|
|
||||||
*/
|
|
||||||
int
|
|
||||||
pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
|
|
||||||
{
|
|
||||||
mbverifier mbverify;
|
|
||||||
int mb_len;
|
|
||||||
|
|
||||||
Assert(PG_VALID_ENCODING(encoding));
|
|
||||||
|
|
||||||
/*
|
|
||||||
* In single-byte encodings, we need only reject nulls (\0).
|
|
||||||
*/
|
|
||||||
if (pg_encoding_max_length(encoding) <= 1)
|
|
||||||
{
|
|
||||||
const char *nullpos = memchr(mbstr, 0, len);
|
|
||||||
|
|
||||||
if (nullpos == NULL)
|
|
||||||
return len;
|
|
||||||
if (noError)
|
|
||||||
return -1;
|
|
||||||
report_invalid_encoding(encoding, nullpos, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* fetch function pointer just once */
|
|
||||||
mbverify = pg_wchar_table[encoding].mbverify;
|
|
||||||
|
|
||||||
mb_len = 0;
|
|
||||||
|
|
||||||
while (len > 0)
|
|
||||||
{
|
|
||||||
int l;
|
|
||||||
|
|
||||||
/* fast path for ASCII-subset characters */
|
|
||||||
if (!IS_HIGHBIT_SET(*mbstr))
|
|
||||||
{
|
|
||||||
if (*mbstr != '\0')
|
|
||||||
{
|
|
||||||
mb_len++;
|
|
||||||
mbstr++;
|
|
||||||
len--;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (noError)
|
|
||||||
return -1;
|
|
||||||
report_invalid_encoding(encoding, mbstr, len);
|
|
||||||
}
|
|
||||||
|
|
||||||
l = (*mbverify) ((const unsigned char *) mbstr, len);
|
|
||||||
|
|
||||||
if (l < 0)
|
|
||||||
{
|
|
||||||
if (noError)
|
|
||||||
return -1;
|
|
||||||
report_invalid_encoding(encoding, mbstr, len);
|
|
||||||
}
|
|
||||||
|
|
||||||
mbstr += l;
|
|
||||||
len -= l;
|
|
||||||
mb_len++;
|
|
||||||
}
|
|
||||||
return mb_len;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* check_encoding_conversion_args: check arguments of a conversion function
|
|
||||||
*
|
|
||||||
* "expected" arguments can be either an encoding ID or -1 to indicate that
|
|
||||||
* the caller will check whether it accepts the ID.
|
|
||||||
*
|
|
||||||
* Note: the errors here are not really user-facing, so elog instead of
|
|
||||||
* ereport seems sufficient. Also, we trust that the "expected" encoding
|
|
||||||
* arguments are valid encoding IDs, but we don't trust the actuals.
|
|
||||||
*/
|
|
||||||
void
|
|
||||||
check_encoding_conversion_args(int src_encoding,
|
|
||||||
int dest_encoding,
|
|
||||||
int len,
|
|
||||||
int expected_src_encoding,
|
|
||||||
int expected_dest_encoding)
|
|
||||||
{
|
|
||||||
if (!PG_VALID_ENCODING(src_encoding))
|
|
||||||
elog(ERROR, "invalid source encoding ID: %d", src_encoding);
|
|
||||||
if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
|
|
||||||
elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
|
|
||||||
pg_enc2name_tbl[expected_src_encoding].name,
|
|
||||||
pg_enc2name_tbl[src_encoding].name);
|
|
||||||
if (!PG_VALID_ENCODING(dest_encoding))
|
|
||||||
elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
|
|
||||||
if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
|
|
||||||
elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
|
|
||||||
pg_enc2name_tbl[expected_dest_encoding].name,
|
|
||||||
pg_enc2name_tbl[dest_encoding].name);
|
|
||||||
if (len < 0)
|
|
||||||
elog(ERROR, "encoding conversion length must not be negative");
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* report_invalid_encoding: complain about invalid multibyte character
|
|
||||||
*
|
|
||||||
* note: len is remaining length of string, not length of character;
|
|
||||||
* len must be greater than zero, as we always examine the first byte.
|
|
||||||
*/
|
|
||||||
void
|
|
||||||
report_invalid_encoding(int encoding, const char *mbstr, int len)
|
|
||||||
{
|
|
||||||
int l = pg_encoding_mblen(encoding, mbstr);
|
|
||||||
char buf[8 * 5 + 1];
|
|
||||||
char *p = buf;
|
|
||||||
int j,
|
|
||||||
jlimit;
|
|
||||||
|
|
||||||
jlimit = Min(l, len);
|
|
||||||
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
|
|
||||||
|
|
||||||
for (j = 0; j < jlimit; j++)
|
|
||||||
{
|
|
||||||
p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
|
|
||||||
if (j < jlimit - 1)
|
|
||||||
p += sprintf(p, " ");
|
|
||||||
}
|
|
||||||
|
|
||||||
ereport(ERROR,
|
|
||||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
|
||||||
errmsg("invalid byte sequence for encoding \"%s\": %s",
|
|
||||||
pg_enc2name_tbl[encoding].name,
|
|
||||||
buf)));
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* report_untranslatable_char: complain about untranslatable character
|
|
||||||
*
|
|
||||||
* note: len is remaining length of string, not length of character;
|
|
||||||
* len must be greater than zero, as we always examine the first byte.
|
|
||||||
*/
|
|
||||||
void
|
|
||||||
report_untranslatable_char(int src_encoding, int dest_encoding,
|
|
||||||
const char *mbstr, int len)
|
|
||||||
{
|
|
||||||
int l = pg_encoding_mblen(src_encoding, mbstr);
|
|
||||||
char buf[8 * 5 + 1];
|
|
||||||
char *p = buf;
|
|
||||||
int j,
|
|
||||||
jlimit;
|
|
||||||
|
|
||||||
jlimit = Min(l, len);
|
|
||||||
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
|
|
||||||
|
|
||||||
for (j = 0; j < jlimit; j++)
|
|
||||||
{
|
|
||||||
p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
|
|
||||||
if (j < jlimit - 1)
|
|
||||||
p += sprintf(p, " ");
|
|
||||||
}
|
|
||||||
|
|
||||||
ereport(ERROR,
|
|
||||||
(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
|
|
||||||
errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
|
|
||||||
buf,
|
|
||||||
pg_enc2name_tbl[src_encoding].name,
|
|
||||||
pg_enc2name_tbl[dest_encoding].name)));
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif /* !FRONTEND */
|
|
||||||
|
@ -9,7 +9,7 @@
|
|||||||
* src/include/mb/pg_wchar.h
|
* src/include/mb/pg_wchar.h
|
||||||
*
|
*
|
||||||
* NOTES
|
* NOTES
|
||||||
* This is used both by the backend and by libpq, but should not be
|
* This is used both by the backend and by frontends, but should not be
|
||||||
* included by libpq client programs. In particular, a libpq client
|
* included by libpq client programs. In particular, a libpq client
|
||||||
* should not assume that the encoding IDs used by the version of libpq
|
* should not assume that the encoding IDs used by the version of libpq
|
||||||
* it's linked to match up with the IDs declared here.
|
* it's linked to match up with the IDs declared here.
|
||||||
@ -345,12 +345,6 @@ typedef struct pg_enc2gettext
|
|||||||
|
|
||||||
extern const pg_enc2gettext pg_enc2gettext_tbl[];
|
extern const pg_enc2gettext pg_enc2gettext_tbl[];
|
||||||
|
|
||||||
/*
|
|
||||||
* Encoding names for ICU
|
|
||||||
*/
|
|
||||||
extern bool is_encoding_supported_by_icu(int encoding);
|
|
||||||
extern const char *get_encoding_name_for_icu(int encoding);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* pg_wchar stuff
|
* pg_wchar stuff
|
||||||
*/
|
*/
|
||||||
@ -539,8 +533,27 @@ extern const char *pg_encoding_to_char(int encoding);
|
|||||||
extern int pg_valid_server_encoding_id(int encoding);
|
extern int pg_valid_server_encoding_id(int encoding);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Remaining functions are not considered part of libpq's API, though many
|
* These functions are available to frontend code that links with libpgcommon
|
||||||
* of them do exist inside libpq.
|
* (in addition to the ones just above). The constant tables declared
|
||||||
|
* earlier in this file are also available from libpgcommon.
|
||||||
|
*/
|
||||||
|
extern int pg_encoding_mblen(int encoding, const char *mbstr);
|
||||||
|
extern int pg_encoding_dsplen(int encoding, const char *mbstr);
|
||||||
|
extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len);
|
||||||
|
extern int pg_encoding_max_length(int encoding);
|
||||||
|
extern int pg_valid_client_encoding(const char *name);
|
||||||
|
extern int pg_valid_server_encoding(const char *name);
|
||||||
|
extern bool is_encoding_supported_by_icu(int encoding);
|
||||||
|
extern const char *get_encoding_name_for_icu(int encoding);
|
||||||
|
|
||||||
|
extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
|
||||||
|
extern pg_wchar utf8_to_unicode(const unsigned char *c);
|
||||||
|
extern bool pg_utf8_islegal(const unsigned char *source, int length);
|
||||||
|
extern int pg_utf_mblen(const unsigned char *s);
|
||||||
|
extern int pg_mule_mblen(const unsigned char *s);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The remaining functions are backend-only.
|
||||||
*/
|
*/
|
||||||
extern int pg_mb2wchar(const char *from, pg_wchar *to);
|
extern int pg_mb2wchar(const char *from, pg_wchar *to);
|
||||||
extern int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len);
|
extern int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len);
|
||||||
@ -556,18 +569,12 @@ extern int pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t
|
|||||||
extern size_t pg_wchar_strlen(const pg_wchar *wstr);
|
extern size_t pg_wchar_strlen(const pg_wchar *wstr);
|
||||||
extern int pg_mblen(const char *mbstr);
|
extern int pg_mblen(const char *mbstr);
|
||||||
extern int pg_dsplen(const char *mbstr);
|
extern int pg_dsplen(const char *mbstr);
|
||||||
extern int pg_encoding_mblen(int encoding, const char *mbstr);
|
|
||||||
extern int pg_encoding_dsplen(int encoding, const char *mbstr);
|
|
||||||
extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len);
|
|
||||||
extern int pg_mule_mblen(const unsigned char *mbstr);
|
|
||||||
extern int pg_mic_mblen(const unsigned char *mbstr);
|
|
||||||
extern int pg_mbstrlen(const char *mbstr);
|
extern int pg_mbstrlen(const char *mbstr);
|
||||||
extern int pg_mbstrlen_with_len(const char *mbstr, int len);
|
extern int pg_mbstrlen_with_len(const char *mbstr, int len);
|
||||||
extern int pg_mbcliplen(const char *mbstr, int len, int limit);
|
extern int pg_mbcliplen(const char *mbstr, int len, int limit);
|
||||||
extern int pg_encoding_mbcliplen(int encoding, const char *mbstr,
|
extern int pg_encoding_mbcliplen(int encoding, const char *mbstr,
|
||||||
int len, int limit);
|
int len, int limit);
|
||||||
extern int pg_mbcharcliplen(const char *mbstr, int len, int limit);
|
extern int pg_mbcharcliplen(const char *mbstr, int len, int limit);
|
||||||
extern int pg_encoding_max_length(int encoding);
|
|
||||||
extern int pg_database_encoding_max_length(void);
|
extern int pg_database_encoding_max_length(void);
|
||||||
extern mbcharacter_incrementer pg_database_encoding_character_incrementer(void);
|
extern mbcharacter_incrementer pg_database_encoding_character_incrementer(void);
|
||||||
|
|
||||||
@ -587,12 +594,6 @@ extern int GetMessageEncoding(void);
|
|||||||
extern int pg_bind_textdomain_codeset(const char *domainname);
|
extern int pg_bind_textdomain_codeset(const char *domainname);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
extern int pg_valid_client_encoding(const char *name);
|
|
||||||
extern int pg_valid_server_encoding(const char *name);
|
|
||||||
|
|
||||||
extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
|
|
||||||
extern pg_wchar utf8_to_unicode(const unsigned char *c);
|
|
||||||
extern int pg_utf_mblen(const unsigned char *);
|
|
||||||
extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len,
|
extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len,
|
||||||
int src_encoding,
|
int src_encoding,
|
||||||
int dest_encoding);
|
int dest_encoding);
|
||||||
@ -647,8 +648,6 @@ extern void mic2latin_with_table(const unsigned char *mic, unsigned char *p,
|
|||||||
int len, int lc, int encoding,
|
int len, int lc, int encoding,
|
||||||
const unsigned char *tab);
|
const unsigned char *tab);
|
||||||
|
|
||||||
extern bool pg_utf8_islegal(const unsigned char *source, int length);
|
|
||||||
|
|
||||||
#ifdef WIN32
|
#ifdef WIN32
|
||||||
extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
|
extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
x
Reference in New Issue
Block a user