Rationalize code placement between wchar.c, encnames.c, and mbutils.c.
Move all the backend-only code that'd crept into wchar.c and encnames.c
into mbutils.c.

To remove the last few #ifdef dependencies from wchar.c and encnames.c,
also make the following changes:

* Adjust get_encoding_name_for_icu to return NULL, not throw an error,
  for unsupported encodings. Its sole caller can perfectly well throw an
  error instead. (While at it, I also made this function and its sibling
  is_encoding_supported_by_icu proof against out-of-range encoding IDs.)

* Remove the overlength-name error condition from pg_char_to_encoding.
  It's completely silly not to treat that just like any other
  the-name-is-not-in-the-table case.

Also, get rid of pg_mic_mblen --- there's no obvious reason why conv.c
shouldn't call pg_mule_mblen instead.

Other than that, this is just code movement and comment-polishing with
no functional changes. Notably, I reordered declarations in pg_wchar.h
to show which functions are frontend-accessible and which are not.

Discussion: https://postgr.es/m/CA+TgmoYO8oq-iy8E02rD8eX25T-9SmyxKWqqks5OMHxKvGXpXQ@mail.gmail.com
This commit is contained in:
parent 3d4cb5d6c1
commit 5afaa2e426
@ -1555,9 +1555,14 @@ init_icu_converter(void)
|
||||
UConverter *conv;
|
||||
|
||||
if (icu_converter)
|
||||
return;
|
||||
return; /* already done */
|
||||
|
||||
icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
|
||||
if (!icu_encoding_name)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||
errmsg("encoding \"%s\" not supported by ICU",
|
||||
pg_encoding_to_char(GetDatabaseEncoding()))));
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
conv = ucnv_open(icu_encoding_name, &status);
|
||||
|
@ -115,7 +115,7 @@ mic2latin(const unsigned char *mic, unsigned char *p, int len,
|
||||
}
|
||||
else
|
||||
{
|
||||
int l = pg_mic_mblen(mic);
|
||||
int l = pg_mule_mblen(mic);
|
||||
|
||||
if (len < l)
|
||||
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
|
||||
@ -217,7 +217,7 @@ mic2latin_with_table(const unsigned char *mic,
|
||||
}
|
||||
else
|
||||
{
|
||||
int l = pg_mic_mblen(mic);
|
||||
int l = pg_mule_mblen(mic);
|
||||
|
||||
if (len < l)
|
||||
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
|
||||
|
@ -1066,6 +1066,23 @@ pg_client_encoding(PG_FUNCTION_ARGS)
|
||||
return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
|
||||
}
|
||||
|
||||
Datum
|
||||
PG_char_to_encoding(PG_FUNCTION_ARGS)
|
||||
{
|
||||
Name s = PG_GETARG_NAME(0);
|
||||
|
||||
PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
|
||||
}
|
||||
|
||||
Datum
|
||||
PG_encoding_to_char(PG_FUNCTION_ARGS)
|
||||
{
|
||||
int32 encoding = PG_GETARG_INT32(0);
|
||||
const char *encoding_name = pg_encoding_to_char(encoding);
|
||||
|
||||
return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
|
||||
}
|
||||
|
||||
/*
|
||||
* gettext() returns messages in this encoding. This often matches the
|
||||
* database encoding, but it differs for SQL_ASCII databases, for processes
|
||||
@ -1078,6 +1095,438 @@ GetMessageEncoding(void)
|
||||
return MessageEncoding->encoding;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Generic character incrementer function.
|
||||
*
|
||||
* Not knowing anything about the properties of the encoding in use, we just
|
||||
* keep incrementing the last byte until we get a validly-encoded result,
|
||||
* or we run out of values to try. We don't bother to try incrementing
|
||||
* higher-order bytes, so there's no growth in runtime for wider characters.
|
||||
* (If we did try to do that, we'd need to consider the likelihood that 255
|
||||
* is not a valid final byte in the encoding.)
|
||||
*/
|
||||
static bool
|
||||
pg_generic_charinc(unsigned char *charptr, int len)
|
||||
{
|
||||
unsigned char *lastbyte = charptr + len - 1;
|
||||
mbverifier mbverify;
|
||||
|
||||
/* We can just invoke the character verifier directly. */
|
||||
mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
|
||||
|
||||
while (*lastbyte < (unsigned char) 255)
|
||||
{
|
||||
(*lastbyte)++;
|
||||
if ((*mbverify) (charptr, len) == len)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* UTF-8 character incrementer function.
|
||||
*
|
||||
* For a one-byte character less than 0x7F, we just increment the byte.
|
||||
*
|
||||
* For a multibyte character, every byte but the first must fall between 0x80
|
||||
* and 0xBF; and the first byte must be between 0xC0 and 0xF4. We increment
|
||||
* the last byte that's not already at its maximum value. If we can't find a
|
||||
* byte that's less than the maximum allowable value, we simply fail. We also
|
||||
* need some special-case logic to skip regions used for surrogate pair
|
||||
* handling, as those should not occur in valid UTF-8.
|
||||
*
|
||||
* Note that we don't reset lower-order bytes back to their minimums, since
|
||||
* we can't afford to make an exhaustive search (see make_greater_string).
|
||||
*/
|
||||
static bool
|
||||
pg_utf8_increment(unsigned char *charptr, int length)
|
||||
{
|
||||
unsigned char a;
|
||||
unsigned char limit;
|
||||
|
||||
switch (length)
|
||||
{
|
||||
default:
|
||||
/* reject lengths 5 and 6 for now */
|
||||
return false;
|
||||
case 4:
|
||||
a = charptr[3];
|
||||
if (a < 0xBF)
|
||||
{
|
||||
charptr[3]++;
|
||||
break;
|
||||
}
|
||||
/* FALL THRU */
|
||||
case 3:
|
||||
a = charptr[2];
|
||||
if (a < 0xBF)
|
||||
{
|
||||
charptr[2]++;
|
||||
break;
|
||||
}
|
||||
/* FALL THRU */
|
||||
case 2:
|
||||
a = charptr[1];
|
||||
switch (*charptr)
|
||||
{
|
||||
case 0xED:
|
||||
limit = 0x9F;
|
||||
break;
|
||||
case 0xF4:
|
||||
limit = 0x8F;
|
||||
break;
|
||||
default:
|
||||
limit = 0xBF;
|
||||
break;
|
||||
}
|
||||
if (a < limit)
|
||||
{
|
||||
charptr[1]++;
|
||||
break;
|
||||
}
|
||||
/* FALL THRU */
|
||||
case 1:
|
||||
a = *charptr;
|
||||
if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
|
||||
return false;
|
||||
charptr[0]++;
|
||||
break;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* EUC-JP character incrementer function.
|
||||
*
|
||||
* If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
|
||||
* representing JIS X 0201 characters with the second byte ranging between
|
||||
* 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
|
||||
* and otherwise rewrite the whole sequence to 0xa1 0xa1.
|
||||
*
|
||||
* If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
|
||||
* in which the last two bytes range between 0xa1 and 0xfe. The last byte
|
||||
* is incremented if possible, otherwise the second-to-last byte.
|
||||
*
|
||||
* If the sequence starts with a value other than the above and its MSB
|
||||
* is set, it must be a two-byte sequence representing JIS X 0208 characters
|
||||
* with both bytes ranging between 0xa1 and 0xfe. The last byte is
|
||||
* incremented if possible, otherwise the second-to-last byte.
|
||||
*
|
||||
* Otherwise, the sequence is a single-byte ASCII character. It is
|
||||
* incremented up to 0x7f.
|
||||
*/
|
||||
static bool
|
||||
pg_eucjp_increment(unsigned char *charptr, int length)
|
||||
{
|
||||
unsigned char c1,
|
||||
c2;
|
||||
int i;
|
||||
|
||||
c1 = *charptr;
|
||||
|
||||
switch (c1)
|
||||
{
|
||||
case SS2: /* JIS X 0201 */
|
||||
if (length != 2)
|
||||
return false;
|
||||
|
||||
c2 = charptr[1];
|
||||
|
||||
if (c2 >= 0xdf)
|
||||
charptr[0] = charptr[1] = 0xa1;
|
||||
else if (c2 < 0xa1)
|
||||
charptr[1] = 0xa1;
|
||||
else
|
||||
charptr[1]++;
|
||||
break;
|
||||
|
||||
case SS3: /* JIS X 0212 */
|
||||
if (length != 3)
|
||||
return false;
|
||||
|
||||
for (i = 2; i > 0; i--)
|
||||
{
|
||||
c2 = charptr[i];
|
||||
if (c2 < 0xa1)
|
||||
{
|
||||
charptr[i] = 0xa1;
|
||||
return true;
|
||||
}
|
||||
else if (c2 < 0xfe)
|
||||
{
|
||||
charptr[i]++;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/* Out of 3-byte code region */
|
||||
return false;
|
||||
|
||||
default:
|
||||
if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
|
||||
{
|
||||
if (length != 2)
|
||||
return false;
|
||||
|
||||
for (i = 1; i >= 0; i--)
|
||||
{
|
||||
c2 = charptr[i];
|
||||
if (c2 < 0xa1)
|
||||
{
|
||||
charptr[i] = 0xa1;
|
||||
return true;
|
||||
}
|
||||
else if (c2 < 0xfe)
|
||||
{
|
||||
charptr[i]++;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/* Out of 2 byte code region */
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{ /* ASCII, single byte */
|
||||
if (c1 > 0x7e)
|
||||
return false;
|
||||
(*charptr)++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* get the character incrementer for the encoding for the current database
|
||||
*/
|
||||
mbcharacter_incrementer
|
||||
pg_database_encoding_character_incrementer(void)
|
||||
{
|
||||
/*
|
||||
* Eventually it might be best to add a field to pg_wchar_table[], but for
|
||||
* now we just use a switch.
|
||||
*/
|
||||
switch (GetDatabaseEncoding())
|
||||
{
|
||||
case PG_UTF8:
|
||||
return pg_utf8_increment;
|
||||
|
||||
case PG_EUC_JP:
|
||||
return pg_eucjp_increment;
|
||||
|
||||
default:
|
||||
return pg_generic_charinc;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* fetch maximum length of the encoding for the current database
|
||||
*/
|
||||
int
|
||||
pg_database_encoding_max_length(void)
|
||||
{
|
||||
return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
|
||||
}
|
||||
|
||||
/*
|
||||
* Verify mbstr to make sure that it is validly encoded in the current
|
||||
* database encoding. Otherwise same as pg_verify_mbstr().
|
||||
*/
|
||||
bool
|
||||
pg_verifymbstr(const char *mbstr, int len, bool noError)
|
||||
{
|
||||
return
|
||||
pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Verify mbstr to make sure that it is validly encoded in the specified
|
||||
* encoding.
|
||||
*/
|
||||
bool
|
||||
pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
|
||||
{
|
||||
return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Verify mbstr to make sure that it is validly encoded in the specified
|
||||
* encoding.
|
||||
*
|
||||
* mbstr is not necessarily zero terminated; length of mbstr is
|
||||
* specified by len.
|
||||
*
|
||||
* If OK, return length of string in the encoding.
|
||||
* If a problem is found, return -1 when noError is
|
||||
* true; when noError is false, ereport() a descriptive message.
|
||||
*/
|
||||
int
|
||||
pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
|
||||
{
|
||||
mbverifier mbverify;
|
||||
int mb_len;
|
||||
|
||||
Assert(PG_VALID_ENCODING(encoding));
|
||||
|
||||
/*
|
||||
* In single-byte encodings, we need only reject nulls (\0).
|
||||
*/
|
||||
if (pg_encoding_max_length(encoding) <= 1)
|
||||
{
|
||||
const char *nullpos = memchr(mbstr, 0, len);
|
||||
|
||||
if (nullpos == NULL)
|
||||
return len;
|
||||
if (noError)
|
||||
return -1;
|
||||
report_invalid_encoding(encoding, nullpos, 1);
|
||||
}
|
||||
|
||||
/* fetch function pointer just once */
|
||||
mbverify = pg_wchar_table[encoding].mbverify;
|
||||
|
||||
mb_len = 0;
|
||||
|
||||
while (len > 0)
|
||||
{
|
||||
int l;
|
||||
|
||||
/* fast path for ASCII-subset characters */
|
||||
if (!IS_HIGHBIT_SET(*mbstr))
|
||||
{
|
||||
if (*mbstr != '\0')
|
||||
{
|
||||
mb_len++;
|
||||
mbstr++;
|
||||
len--;
|
||||
continue;
|
||||
}
|
||||
if (noError)
|
||||
return -1;
|
||||
report_invalid_encoding(encoding, mbstr, len);
|
||||
}
|
||||
|
||||
l = (*mbverify) ((const unsigned char *) mbstr, len);
|
||||
|
||||
if (l < 0)
|
||||
{
|
||||
if (noError)
|
||||
return -1;
|
||||
report_invalid_encoding(encoding, mbstr, len);
|
||||
}
|
||||
|
||||
mbstr += l;
|
||||
len -= l;
|
||||
mb_len++;
|
||||
}
|
||||
return mb_len;
|
||||
}
|
||||
|
||||
/*
|
||||
* check_encoding_conversion_args: check arguments of a conversion function
|
||||
*
|
||||
* "expected" arguments can be either an encoding ID or -1 to indicate that
|
||||
* the caller will check whether it accepts the ID.
|
||||
*
|
||||
* Note: the errors here are not really user-facing, so elog instead of
|
||||
* ereport seems sufficient. Also, we trust that the "expected" encoding
|
||||
* arguments are valid encoding IDs, but we don't trust the actuals.
|
||||
*/
|
||||
void
|
||||
check_encoding_conversion_args(int src_encoding,
|
||||
int dest_encoding,
|
||||
int len,
|
||||
int expected_src_encoding,
|
||||
int expected_dest_encoding)
|
||||
{
|
||||
if (!PG_VALID_ENCODING(src_encoding))
|
||||
elog(ERROR, "invalid source encoding ID: %d", src_encoding);
|
||||
if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
|
||||
elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
|
||||
pg_enc2name_tbl[expected_src_encoding].name,
|
||||
pg_enc2name_tbl[src_encoding].name);
|
||||
if (!PG_VALID_ENCODING(dest_encoding))
|
||||
elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
|
||||
if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
|
||||
elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
|
||||
pg_enc2name_tbl[expected_dest_encoding].name,
|
||||
pg_enc2name_tbl[dest_encoding].name);
|
||||
if (len < 0)
|
||||
elog(ERROR, "encoding conversion length must not be negative");
|
||||
}
|
||||
|
||||
/*
|
||||
* report_invalid_encoding: complain about invalid multibyte character
|
||||
*
|
||||
* note: len is remaining length of string, not length of character;
|
||||
* len must be greater than zero, as we always examine the first byte.
|
||||
*/
|
||||
void
|
||||
report_invalid_encoding(int encoding, const char *mbstr, int len)
|
||||
{
|
||||
int l = pg_encoding_mblen(encoding, mbstr);
|
||||
char buf[8 * 5 + 1];
|
||||
char *p = buf;
|
||||
int j,
|
||||
jlimit;
|
||||
|
||||
jlimit = Min(l, len);
|
||||
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
|
||||
|
||||
for (j = 0; j < jlimit; j++)
|
||||
{
|
||||
p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
|
||||
if (j < jlimit - 1)
|
||||
p += sprintf(p, " ");
|
||||
}
|
||||
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("invalid byte sequence for encoding \"%s\": %s",
|
||||
pg_enc2name_tbl[encoding].name,
|
||||
buf)));
|
||||
}
|
||||
|
||||
/*
|
||||
* report_untranslatable_char: complain about untranslatable character
|
||||
*
|
||||
* note: len is remaining length of string, not length of character;
|
||||
* len must be greater than zero, as we always examine the first byte.
|
||||
*/
|
||||
void
|
||||
report_untranslatable_char(int src_encoding, int dest_encoding,
|
||||
const char *mbstr, int len)
|
||||
{
|
||||
int l = pg_encoding_mblen(src_encoding, mbstr);
|
||||
char buf[8 * 5 + 1];
|
||||
char *p = buf;
|
||||
int j,
|
||||
jlimit;
|
||||
|
||||
jlimit = Min(l, len);
|
||||
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
|
||||
|
||||
for (j = 0; j < jlimit; j++)
|
||||
{
|
||||
p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
|
||||
if (j < jlimit - 1)
|
||||
p += sprintf(p, " ");
|
||||
}
|
||||
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
|
||||
errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
|
||||
buf,
|
||||
pg_enc2name_tbl[src_encoding].name,
|
||||
pg_enc2name_tbl[dest_encoding].name)));
|
||||
}
|
||||
|
||||
|
||||
#ifdef WIN32
|
||||
/*
|
||||
* Convert from MessageEncoding to a palloc'ed, null-terminated utf16
|
||||
@ -1149,4 +1598,4 @@ pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
|
||||
return utf16;
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif /* WIN32 */
|
||||
|
@ -10,12 +10,7 @@
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#ifdef FRONTEND
|
||||
#include "postgres_fe.h"
|
||||
#else
|
||||
#include "postgres.h"
|
||||
#include "utils/builtins.h"
|
||||
#endif
|
||||
#include "c.h"
|
||||
|
||||
#include <ctype.h>
|
||||
#include <unistd.h>
|
||||
@ -310,6 +305,7 @@ static const pg_encname pg_encname_tbl[] =
|
||||
#else
|
||||
#define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
|
||||
#endif
|
||||
|
||||
const pg_enc2name pg_enc2name_tbl[] =
|
||||
{
|
||||
DEF_ENC2NAME(SQL_ASCII, 0),
|
||||
@ -409,10 +405,8 @@ const pg_enc2gettext pg_enc2gettext_tbl[] =
|
||||
};
|
||||
|
||||
|
||||
#ifndef FRONTEND
|
||||
|
||||
/*
|
||||
* Table of encoding names for ICU
|
||||
* Table of encoding names for ICU (currently covers backend encodings only)
|
||||
*
|
||||
* Reference: <https://ssl.icu-project.org/icu-bin/convexp>
|
||||
*
|
||||
@ -457,33 +451,32 @@ static const char *const pg_enc2icu_tbl[] =
|
||||
"KOI8-U", /* PG_KOI8U */
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Is this encoding supported by ICU?
|
||||
*/
|
||||
bool
|
||||
is_encoding_supported_by_icu(int encoding)
|
||||
{
|
||||
if (!PG_VALID_BE_ENCODING(encoding))
|
||||
return false;
|
||||
return (pg_enc2icu_tbl[encoding] != NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns ICU's name for encoding, or NULL if not supported
|
||||
*/
|
||||
const char *
|
||||
get_encoding_name_for_icu(int encoding)
|
||||
{
|
||||
const char *icu_encoding_name;
|
||||
|
||||
StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1,
|
||||
"pg_enc2icu_tbl incomplete");
|
||||
|
||||
icu_encoding_name = pg_enc2icu_tbl[encoding];
|
||||
|
||||
if (!icu_encoding_name)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||
errmsg("encoding \"%s\" not supported by ICU",
|
||||
pg_encoding_to_char(encoding))));
|
||||
|
||||
return icu_encoding_name;
|
||||
if (!PG_VALID_BE_ENCODING(encoding))
|
||||
return NULL;
|
||||
return pg_enc2icu_tbl[encoding];
|
||||
}
|
||||
|
||||
#endif /* not FRONTEND */
|
||||
|
||||
|
||||
/* ----------
|
||||
* Encoding checks, for error returns -1 else encoding id
|
||||
@ -523,9 +516,10 @@ pg_valid_server_encoding_id(int encoding)
|
||||
return PG_VALID_BE_ENCODING(encoding);
|
||||
}
|
||||
|
||||
/* ----------
|
||||
* Remove irrelevant chars from encoding name
|
||||
* ----------
|
||||
/*
|
||||
* Remove irrelevant chars from encoding name, store at *newkey
|
||||
*
|
||||
* (Caller's responsibility to provide a large enough buffer)
|
||||
*/
|
||||
static char *
|
||||
clean_encoding_name(const char *key, char *newkey)
|
||||
@ -547,11 +541,10 @@ clean_encoding_name(const char *key, char *newkey)
|
||||
return newkey;
|
||||
}
|
||||
|
||||
/* ----------
|
||||
/*
|
||||
* Search encoding by encoding name
|
||||
*
|
||||
* Returns encoding ID, or -1 for error
|
||||
* ----------
|
||||
* Returns encoding ID, or -1 if not recognized
|
||||
*/
|
||||
int
|
||||
pg_char_to_encoding(const char *name)
|
||||
@ -568,16 +561,8 @@ pg_char_to_encoding(const char *name)
|
||||
return -1;
|
||||
|
||||
if (strlen(name) >= NAMEDATALEN)
|
||||
{
|
||||
#ifdef FRONTEND
|
||||
fprintf(stderr, "encoding name too long\n");
|
||||
return -1;
|
||||
#else
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_NAME_TOO_LONG),
|
||||
errmsg("encoding name too long")));
|
||||
#endif
|
||||
}
|
||||
return -1; /* it's certainly not in the table */
|
||||
|
||||
key = clean_encoding_name(name, buff);
|
||||
|
||||
while (last >= base)
|
||||
@ -599,16 +584,6 @@ pg_char_to_encoding(const char *name)
|
||||
return -1;
|
||||
}
|
||||
|
||||
#ifndef FRONTEND
|
||||
Datum
|
||||
PG_char_to_encoding(PG_FUNCTION_ARGS)
|
||||
{
|
||||
Name s = PG_GETARG_NAME(0);
|
||||
|
||||
PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
|
||||
}
|
||||
#endif
|
||||
|
||||
const char *
|
||||
pg_encoding_to_char(int encoding)
|
||||
{
|
||||
@ -621,15 +596,3 @@ pg_encoding_to_char(int encoding)
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
#ifndef FRONTEND
|
||||
Datum
|
||||
PG_encoding_to_char(PG_FUNCTION_ARGS)
|
||||
{
|
||||
int32 encoding = PG_GETARG_INT32(0);
|
||||
const char *encoding_name = pg_encoding_to_char(encoding);
|
||||
|
||||
return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -10,11 +10,7 @@
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#ifdef FRONTEND
|
||||
#include "postgres_fe.h"
|
||||
#else
|
||||
#include "postgres.h"
|
||||
#endif
|
||||
#include "c.h"
|
||||
|
||||
#include "mb/pg_wchar.h"
|
||||
|
||||
@ -838,6 +834,7 @@ pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
|
||||
return cnt;
|
||||
}
|
||||
|
||||
/* exported for direct use by conv.c */
|
||||
int
|
||||
pg_mule_mblen(const unsigned char *s)
|
||||
{
|
||||
@ -1498,214 +1495,6 @@ pg_utf8_islegal(const unsigned char *source, int length)
|
||||
return true;
|
||||
}
|
||||
|
||||
#ifndef FRONTEND
|
||||
|
||||
/*
|
||||
* Generic character incrementer function.
|
||||
*
|
||||
* Not knowing anything about the properties of the encoding in use, we just
|
||||
* keep incrementing the last byte until we get a validly-encoded result,
|
||||
* or we run out of values to try. We don't bother to try incrementing
|
||||
* higher-order bytes, so there's no growth in runtime for wider characters.
|
||||
* (If we did try to do that, we'd need to consider the likelihood that 255
|
||||
* is not a valid final byte in the encoding.)
|
||||
*/
|
||||
static bool
|
||||
pg_generic_charinc(unsigned char *charptr, int len)
|
||||
{
|
||||
unsigned char *lastbyte = charptr + len - 1;
|
||||
mbverifier mbverify;
|
||||
|
||||
/* We can just invoke the character verifier directly. */
|
||||
mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
|
||||
|
||||
while (*lastbyte < (unsigned char) 255)
|
||||
{
|
||||
(*lastbyte)++;
|
||||
if ((*mbverify) (charptr, len) == len)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* UTF-8 character incrementer function.
|
||||
*
|
||||
* For a one-byte character less than 0x7F, we just increment the byte.
|
||||
*
|
||||
* For a multibyte character, every byte but the first must fall between 0x80
|
||||
* and 0xBF; and the first byte must be between 0xC0 and 0xF4. We increment
|
||||
* the last byte that's not already at its maximum value. If we can't find a
|
||||
* byte that's less than the maximum allowable value, we simply fail. We also
|
||||
* need some special-case logic to skip regions used for surrogate pair
|
||||
* handling, as those should not occur in valid UTF-8.
|
||||
*
|
||||
* Note that we don't reset lower-order bytes back to their minimums, since
|
||||
* we can't afford to make an exhaustive search (see make_greater_string).
|
||||
*/
|
||||
static bool
|
||||
pg_utf8_increment(unsigned char *charptr, int length)
|
||||
{
|
||||
unsigned char a;
|
||||
unsigned char limit;
|
||||
|
||||
switch (length)
|
||||
{
|
||||
default:
|
||||
/* reject lengths 5 and 6 for now */
|
||||
return false;
|
||||
case 4:
|
||||
a = charptr[3];
|
||||
if (a < 0xBF)
|
||||
{
|
||||
charptr[3]++;
|
||||
break;
|
||||
}
|
||||
/* FALL THRU */
|
||||
case 3:
|
||||
a = charptr[2];
|
||||
if (a < 0xBF)
|
||||
{
|
||||
charptr[2]++;
|
||||
break;
|
||||
}
|
||||
/* FALL THRU */
|
||||
case 2:
|
||||
a = charptr[1];
|
||||
switch (*charptr)
|
||||
{
|
||||
case 0xED:
|
||||
limit = 0x9F;
|
||||
break;
|
||||
case 0xF4:
|
||||
limit = 0x8F;
|
||||
break;
|
||||
default:
|
||||
limit = 0xBF;
|
||||
break;
|
||||
}
|
||||
if (a < limit)
|
||||
{
|
||||
charptr[1]++;
|
||||
break;
|
||||
}
|
||||
/* FALL THRU */
|
||||
case 1:
|
||||
a = *charptr;
|
||||
if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
|
||||
return false;
|
||||
charptr[0]++;
|
||||
break;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* EUC-JP character incrementer function.
|
||||
*
|
||||
* If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
|
||||
* representing JIS X 0201 characters with the second byte ranging between
|
||||
* 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
|
||||
* and otherwise rewrite the whole sequence to 0xa1 0xa1.
|
||||
*
|
||||
* If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
|
||||
* in which the last two bytes range between 0xa1 and 0xfe. The last byte
|
||||
* is incremented if possible, otherwise the second-to-last byte.
|
||||
*
|
||||
* If the sequence starts with a value other than the above and its MSB
|
||||
* is set, it must be a two-byte sequence representing JIS X 0208 characters
|
||||
* with both bytes ranging between 0xa1 and 0xfe. The last byte is
|
||||
* incremented if possible, otherwise the second-to-last byte.
|
||||
*
|
||||
* Otherwise, the sequence is a single-byte ASCII character. It is
|
||||
* incremented up to 0x7f.
|
||||
*/
|
||||
static bool
|
||||
pg_eucjp_increment(unsigned char *charptr, int length)
|
||||
{
|
||||
unsigned char c1,
|
||||
c2;
|
||||
int i;
|
||||
|
||||
c1 = *charptr;
|
||||
|
||||
switch (c1)
|
||||
{
|
||||
case SS2: /* JIS X 0201 */
|
||||
if (length != 2)
|
||||
return false;
|
||||
|
||||
c2 = charptr[1];
|
||||
|
||||
if (c2 >= 0xdf)
|
||||
charptr[0] = charptr[1] = 0xa1;
|
||||
else if (c2 < 0xa1)
|
||||
charptr[1] = 0xa1;
|
||||
else
|
||||
charptr[1]++;
|
||||
break;
|
||||
|
||||
case SS3: /* JIS X 0212 */
|
||||
if (length != 3)
|
||||
return false;
|
||||
|
||||
for (i = 2; i > 0; i--)
|
||||
{
|
||||
c2 = charptr[i];
|
||||
if (c2 < 0xa1)
|
||||
{
|
||||
charptr[i] = 0xa1;
|
||||
return true;
|
||||
}
|
||||
else if (c2 < 0xfe)
|
||||
{
|
||||
charptr[i]++;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/* Out of 3-byte code region */
|
||||
return false;
|
||||
|
||||
default:
|
||||
if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
|
||||
{
|
||||
if (length != 2)
|
||||
return false;
|
||||
|
||||
for (i = 1; i >= 0; i--)
|
||||
{
|
||||
c2 = charptr[i];
|
||||
if (c2 < 0xa1)
|
||||
{
|
||||
charptr[i] = 0xa1;
|
||||
return true;
|
||||
}
|
||||
else if (c2 < 0xfe)
|
||||
{
|
||||
charptr[i]++;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/* Out of 2 byte code region */
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{ /* ASCII, single byte */
|
||||
if (c1 > 0x7e)
|
||||
return false;
|
||||
(*charptr)++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
#endif /* !FRONTEND */
|
||||
|
||||
|
||||
/*
|
||||
*-------------------------------------------------------------------
|
||||
@ -1758,13 +1547,6 @@ const pg_wchar_tbl pg_wchar_table[] = {
|
||||
{0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */
|
||||
};
|
||||
|
||||
/* returns the byte length of a word for mule internal code */
|
||||
int
|
||||
pg_mic_mblen(const unsigned char *mbstr)
|
||||
{
|
||||
return pg_mule_mblen(mbstr);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the byte length of a multibyte character.
|
||||
*/
|
||||
@ -1810,232 +1592,3 @@ pg_encoding_max_length(int encoding)
|
||||
|
||||
return pg_wchar_table[encoding].maxmblen;
|
||||
}
|
||||
|
||||
#ifndef FRONTEND
|
||||
|
||||
/*
|
||||
* fetch maximum length of the encoding for the current database
|
||||
*/
|
||||
int
|
||||
pg_database_encoding_max_length(void)
|
||||
{
|
||||
return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
|
||||
}
|
||||
|
||||
/*
|
||||
* get the character incrementer for the encoding for the current database
|
||||
*/
|
||||
mbcharacter_incrementer
|
||||
pg_database_encoding_character_incrementer(void)
|
||||
{
|
||||
/*
|
||||
* Eventually it might be best to add a field to pg_wchar_table[], but for
|
||||
* now we just use a switch.
|
||||
*/
|
||||
switch (GetDatabaseEncoding())
|
||||
{
|
||||
case PG_UTF8:
|
||||
return pg_utf8_increment;
|
||||
|
||||
case PG_EUC_JP:
|
||||
return pg_eucjp_increment;
|
||||
|
||||
default:
|
||||
return pg_generic_charinc;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Verify mbstr to make sure that it is validly encoded in the current
|
||||
* database encoding. Otherwise same as pg_verify_mbstr().
|
||||
*/
|
||||
bool
|
||||
pg_verifymbstr(const char *mbstr, int len, bool noError)
|
||||
{
|
||||
return
|
||||
pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Verify mbstr to make sure that it is validly encoded in the specified
|
||||
* encoding.
|
||||
*/
|
||||
bool
|
||||
pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
|
||||
{
|
||||
return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Verify mbstr to make sure that it is validly encoded in the specified
|
||||
* encoding.
|
||||
*
|
||||
* mbstr is not necessarily zero terminated; length of mbstr is
|
||||
* specified by len.
|
||||
*
|
||||
* If OK, return length of string in the encoding.
|
||||
* If a problem is found, return -1 when noError is
|
||||
* true; when noError is false, ereport() a descriptive message.
|
||||
*/
|
||||
int
|
||||
pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
|
||||
{
|
||||
mbverifier mbverify;
|
||||
int mb_len;
|
||||
|
||||
Assert(PG_VALID_ENCODING(encoding));
|
||||
|
||||
/*
|
||||
* In single-byte encodings, we need only reject nulls (\0).
|
||||
*/
|
||||
if (pg_encoding_max_length(encoding) <= 1)
|
||||
{
|
||||
const char *nullpos = memchr(mbstr, 0, len);
|
||||
|
||||
if (nullpos == NULL)
|
||||
return len;
|
||||
if (noError)
|
||||
return -1;
|
||||
report_invalid_encoding(encoding, nullpos, 1);
|
||||
}
|
||||
|
||||
/* fetch function pointer just once */
|
||||
mbverify = pg_wchar_table[encoding].mbverify;
|
||||
|
||||
mb_len = 0;
|
||||
|
||||
while (len > 0)
|
||||
{
|
||||
int l;
|
||||
|
||||
/* fast path for ASCII-subset characters */
|
||||
if (!IS_HIGHBIT_SET(*mbstr))
|
||||
{
|
||||
if (*mbstr != '\0')
|
||||
{
|
||||
mb_len++;
|
||||
mbstr++;
|
||||
len--;
|
||||
continue;
|
||||
}
|
||||
if (noError)
|
||||
return -1;
|
||||
report_invalid_encoding(encoding, mbstr, len);
|
||||
}
|
||||
|
||||
l = (*mbverify) ((const unsigned char *) mbstr, len);
|
||||
|
||||
if (l < 0)
|
||||
{
|
||||
if (noError)
|
||||
return -1;
|
||||
report_invalid_encoding(encoding, mbstr, len);
|
||||
}
|
||||
|
||||
mbstr += l;
|
||||
len -= l;
|
||||
mb_len++;
|
||||
}
|
||||
return mb_len;
|
||||
}
|
||||
|
||||
/*
|
||||
* check_encoding_conversion_args: check arguments of a conversion function
|
||||
*
|
||||
* "expected" arguments can be either an encoding ID or -1 to indicate that
|
||||
* the caller will check whether it accepts the ID.
|
||||
*
|
||||
* Note: the errors here are not really user-facing, so elog instead of
|
||||
* ereport seems sufficient. Also, we trust that the "expected" encoding
|
||||
* arguments are valid encoding IDs, but we don't trust the actuals.
|
||||
*/
|
||||
void
|
||||
check_encoding_conversion_args(int src_encoding,
|
||||
int dest_encoding,
|
||||
int len,
|
||||
int expected_src_encoding,
|
||||
int expected_dest_encoding)
|
||||
{
|
||||
if (!PG_VALID_ENCODING(src_encoding))
|
||||
elog(ERROR, "invalid source encoding ID: %d", src_encoding);
|
||||
if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
|
||||
elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
|
||||
pg_enc2name_tbl[expected_src_encoding].name,
|
||||
pg_enc2name_tbl[src_encoding].name);
|
||||
if (!PG_VALID_ENCODING(dest_encoding))
|
||||
elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
|
||||
if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
|
||||
elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
|
||||
pg_enc2name_tbl[expected_dest_encoding].name,
|
||||
pg_enc2name_tbl[dest_encoding].name);
|
||||
if (len < 0)
|
||||
elog(ERROR, "encoding conversion length must not be negative");
|
||||
}
|
||||
|
||||
/*
|
||||
* report_invalid_encoding: complain about invalid multibyte character
|
||||
*
|
||||
* note: len is remaining length of string, not length of character;
|
||||
* len must be greater than zero, as we always examine the first byte.
|
||||
*/
|
||||
void
|
||||
report_invalid_encoding(int encoding, const char *mbstr, int len)
|
||||
{
|
||||
int l = pg_encoding_mblen(encoding, mbstr);
|
||||
char buf[8 * 5 + 1];
|
||||
char *p = buf;
|
||||
int j,
|
||||
jlimit;
|
||||
|
||||
jlimit = Min(l, len);
|
||||
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
|
||||
|
||||
for (j = 0; j < jlimit; j++)
|
||||
{
|
||||
p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
|
||||
if (j < jlimit - 1)
|
||||
p += sprintf(p, " ");
|
||||
}
|
||||
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("invalid byte sequence for encoding \"%s\": %s",
|
||||
pg_enc2name_tbl[encoding].name,
|
||||
buf)));
|
||||
}
|
||||
|
||||
/*
|
||||
* report_untranslatable_char: complain about untranslatable character
|
||||
*
|
||||
* note: len is remaining length of string, not length of character;
|
||||
* len must be greater than zero, as we always examine the first byte.
|
||||
*/
|
||||
void
|
||||
report_untranslatable_char(int src_encoding, int dest_encoding,
|
||||
const char *mbstr, int len)
|
||||
{
|
||||
int l = pg_encoding_mblen(src_encoding, mbstr);
|
||||
char buf[8 * 5 + 1];
|
||||
char *p = buf;
|
||||
int j,
|
||||
jlimit;
|
||||
|
||||
jlimit = Min(l, len);
|
||||
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
|
||||
|
||||
for (j = 0; j < jlimit; j++)
|
||||
{
|
||||
p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
|
||||
if (j < jlimit - 1)
|
||||
p += sprintf(p, " ");
|
||||
}
|
||||
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
|
||||
errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
|
||||
buf,
|
||||
pg_enc2name_tbl[src_encoding].name,
|
||||
pg_enc2name_tbl[dest_encoding].name)));
|
||||
}
|
||||
|
||||
#endif /* !FRONTEND */
|
||||
|
@ -9,7 +9,7 @@
|
||||
* src/include/mb/pg_wchar.h
|
||||
*
|
||||
* NOTES
|
||||
* This is used both by the backend and by libpq, but should not be
|
||||
* This is used both by the backend and by frontends, but should not be
|
||||
* included by libpq client programs. In particular, a libpq client
|
||||
* should not assume that the encoding IDs used by the version of libpq
|
||||
* it's linked to match up with the IDs declared here.
|
||||
@ -345,12 +345,6 @@ typedef struct pg_enc2gettext
|
||||
|
||||
extern const pg_enc2gettext pg_enc2gettext_tbl[];
|
||||
|
||||
/*
|
||||
* Encoding names for ICU
|
||||
*/
|
||||
extern bool is_encoding_supported_by_icu(int encoding);
|
||||
extern const char *get_encoding_name_for_icu(int encoding);
|
||||
|
||||
/*
|
||||
* pg_wchar stuff
|
||||
*/
|
||||
@ -539,8 +533,27 @@ extern const char *pg_encoding_to_char(int encoding);
|
||||
extern int pg_valid_server_encoding_id(int encoding);
|
||||
|
||||
/*
|
||||
* Remaining functions are not considered part of libpq's API, though many
|
||||
* of them do exist inside libpq.
|
||||
* These functions are available to frontend code that links with libpgcommon
|
||||
* (in addition to the ones just above). The constant tables declared
|
||||
* earlier in this file are also available from libpgcommon.
|
||||
*/
|
||||
extern int pg_encoding_mblen(int encoding, const char *mbstr);
|
||||
extern int pg_encoding_dsplen(int encoding, const char *mbstr);
|
||||
extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len);
|
||||
extern int pg_encoding_max_length(int encoding);
|
||||
extern int pg_valid_client_encoding(const char *name);
|
||||
extern int pg_valid_server_encoding(const char *name);
|
||||
extern bool is_encoding_supported_by_icu(int encoding);
|
||||
extern const char *get_encoding_name_for_icu(int encoding);
|
||||
|
||||
extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
|
||||
extern pg_wchar utf8_to_unicode(const unsigned char *c);
|
||||
extern bool pg_utf8_islegal(const unsigned char *source, int length);
|
||||
extern int pg_utf_mblen(const unsigned char *s);
|
||||
extern int pg_mule_mblen(const unsigned char *s);
|
||||
|
||||
/*
|
||||
* The remaining functions are backend-only.
|
||||
*/
|
||||
extern int pg_mb2wchar(const char *from, pg_wchar *to);
|
||||
extern int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len);
|
||||
@ -556,18 +569,12 @@ extern int pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t
|
||||
extern size_t pg_wchar_strlen(const pg_wchar *wstr);
|
||||
extern int pg_mblen(const char *mbstr);
|
||||
extern int pg_dsplen(const char *mbstr);
|
||||
extern int pg_encoding_mblen(int encoding, const char *mbstr);
|
||||
extern int pg_encoding_dsplen(int encoding, const char *mbstr);
|
||||
extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len);
|
||||
extern int pg_mule_mblen(const unsigned char *mbstr);
|
||||
extern int pg_mic_mblen(const unsigned char *mbstr);
|
||||
extern int pg_mbstrlen(const char *mbstr);
|
||||
extern int pg_mbstrlen_with_len(const char *mbstr, int len);
|
||||
extern int pg_mbcliplen(const char *mbstr, int len, int limit);
|
||||
extern int pg_encoding_mbcliplen(int encoding, const char *mbstr,
|
||||
int len, int limit);
|
||||
extern int pg_mbcharcliplen(const char *mbstr, int len, int limit);
|
||||
extern int pg_encoding_max_length(int encoding);
|
||||
extern int pg_database_encoding_max_length(void);
|
||||
extern mbcharacter_incrementer pg_database_encoding_character_incrementer(void);
|
||||
|
||||
@ -587,12 +594,6 @@ extern int GetMessageEncoding(void);
|
||||
extern int pg_bind_textdomain_codeset(const char *domainname);
|
||||
#endif
|
||||
|
||||
extern int pg_valid_client_encoding(const char *name);
|
||||
extern int pg_valid_server_encoding(const char *name);
|
||||
|
||||
extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
|
||||
extern pg_wchar utf8_to_unicode(const unsigned char *c);
|
||||
extern int pg_utf_mblen(const unsigned char *);
|
||||
extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len,
|
||||
int src_encoding,
|
||||
int dest_encoding);
|
||||
@ -647,8 +648,6 @@ extern void mic2latin_with_table(const unsigned char *mic, unsigned char *p,
|
||||
int len, int lc, int encoding,
|
||||
const unsigned char *tab);
|
||||
|
||||
extern bool pg_utf8_islegal(const unsigned char *source, int length);
|
||||
|
||||
#ifdef WIN32
|
||||
extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user