Support C.UTF-8 locale in the new builtin collation provider.

The builtin C.UTF-8 locale has similar semantics to the libc locale of
the same name. That is, code point sort order (fast, memcmp-based)
combined with Unicode semantics for character operations such as
pattern matching, regular expressions, and
LOWER()/INITCAP()/UPPER(). The character semantics are based on
Unicode simple case mappings.

The builtin provider's C.UTF-8 offers several important advantages
over libc:

 * faster sorting -- benefits from additional optimizations such as
   abbreviated keys and varstrfastcmp_c
 * faster case conversion, e.g. LOWER(), at least compared with some
   libc implementations
 * available on all platforms with identical semantics, and the
   semantics are stable, testable, and documentable within a given
   Postgres major version

Being based on memcmp, the builtin C.UTF-8 locale does not offer
natural language sort order. But it is an improvement for most use
cases that might otherwise use libc's "C.UTF-8" locale, as well as
many use cases that use libc's "C" locale.

Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com
Reviewed-by: Daniel Vérité, Peter Eisentraut, Jeremy Schneider
This commit is contained in:
Jeff Davis 2024-03-19 15:24:41 -07:00
parent fd0398fcb0
commit f69319f2f1
17 changed files with 494 additions and 26 deletions

View File

@ -377,13 +377,21 @@ initdb --locale-provider=icu --icu-locale=en
<listitem>
<para>
The <literal>builtin</literal> provider uses built-in operations. Only
the <literal>C</literal> locale is supported for this provider.
the <literal>C</literal> and <literal>C.UTF-8</literal> locales are
supported for this provider.
</para>
<para>
The <literal>C</literal> locale behavior is identical to the
<literal>C</literal> locale in the libc provider. When using this
locale, the behavior may depend on the database encoding.
</para>
<para>
The <literal>C.UTF-8</literal> locale is available only when the
database encoding is <literal>UTF-8</literal>, and the behavior is
based on Unicode. The collation uses the code point values only. The
regular expression character classes are based on the "POSIX
Compatible" semantics, and the case mapping is the "simple" variant.
</para>
</listitem>
</varlistentry>
@ -878,6 +886,23 @@ SELECT * FROM test1 ORDER BY a || b COLLATE "fr_FR";
</listitem>
</varlistentry>
<varlistentry>
<term><literal>pg_c_utf8</literal></term>
<listitem>
<para>
This collation sorts by Unicode code point values rather than natural
language order. For the functions <function>lower</function>,
<function>initcap</function>, and <function>upper</function>, it uses
Unicode simple case mapping. For pattern matching (including regular
expressions), it uses the POSIX Compatible variant of Unicode <ulink
url="https://www.unicode.org/reports/tr18/#Compatibility_Properties">Compatibility
Properties</ulink>. Behavior is efficient and stable within a
<productname>PostgreSQL</productname> major version. This collation is
only available for encoding <literal>UTF8</literal>.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><literal>C</literal> (equivalent to <literal>POSIX</literal>)</term>
<listitem>

View File

@ -99,7 +99,7 @@ CREATE COLLATION [ IF NOT EXISTS ] <replaceable>name</replaceable> FROM <replace
<para>
If <replaceable>provider</replaceable> is <literal>builtin</literal>,
then <replaceable>locale</replaceable> must be specified and set to
<literal>C</literal>.
either <literal>C</literal> or <literal>C.UTF-8</literal>.
</para>
</listitem>
</varlistentry>

View File

@ -166,8 +166,9 @@ CREATE DATABASE <replaceable class="parameter">name</replaceable>
</para>
<para>
If <xref linkend="create-database-locale-provider"/> is
<literal>builtin</literal>, then <replaceable>locale</replaceable>
must be specified and set to <literal>C</literal>.
<literal>builtin</literal>, then <replaceable>locale</replaceable> or
<replaceable>builtin_locale</replaceable> must be specified and set to
either <literal>C</literal> or <literal>C.UTF-8</literal>.
</para>
<tip>
<para>
@ -228,9 +229,11 @@ CREATE DATABASE <replaceable class="parameter">name</replaceable>
linkend="create-database-locale-provider">locale provider</link> must
be <literal>builtin</literal>. The default is the setting of <xref
linkend="create-database-locale"/> if specified; otherwise the same
setting as the template database. Currently, the only available
locale for the <literal>builtin</literal> provider is
<literal>C</literal>.
setting as the template database.
</para>
<para>
The locales available for the <literal>builtin</literal> provider are
<literal>C</literal> and <literal>C.UTF-8</literal>.
</para>
</listitem>
</varlistentry>

View File

@ -288,8 +288,9 @@ PostgreSQL documentation
</para>
<para>
If <option>--locale-provider</option> is <literal>builtin</literal>,
<option>--locale</option> must be specified and set to
<literal>C</literal>.
<option>--locale</option> or <option>--builtin-locale</option> must be
specified and set to <literal>C</literal> or
<literal>C.UTF-8</literal>.
</para>
</listitem>
</varlistentry>

View File

@ -16,6 +16,8 @@
*/
#include "catalog/pg_collation.h"
#include "common/unicode_case.h"
#include "common/unicode_category.h"
#include "utils/pg_locale.h"
/*
@ -64,6 +66,7 @@
typedef enum
{
PG_REGEX_LOCALE_C, /* C locale (encoding independent) */
PG_REGEX_BUILTIN, /* built-in Unicode semantics */
PG_REGEX_LOCALE_WIDE, /* Use <wctype.h> functions */
PG_REGEX_LOCALE_1BYTE, /* Use <ctype.h> functions */
PG_REGEX_LOCALE_WIDE_L, /* Use locale_t <wctype.h> functions */
@ -266,7 +269,12 @@ pg_set_regex_collation(Oid collation)
if (GetDatabaseEncoding() == PG_UTF8)
{
if (pg_regex_locale)
pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L;
{
if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
pg_regex_strategy = PG_REGEX_BUILTIN;
else
pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L;
}
else
pg_regex_strategy = PG_REGEX_LOCALE_WIDE;
}
@ -290,6 +298,8 @@ pg_wc_isdigit(pg_wchar c)
case PG_REGEX_LOCALE_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISDIGIT));
case PG_REGEX_BUILTIN:
return pg_u_isdigit(c, true);
case PG_REGEX_LOCALE_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswdigit((wint_t) c);
@ -322,6 +332,8 @@ pg_wc_isalpha(pg_wchar c)
case PG_REGEX_LOCALE_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISALPHA));
case PG_REGEX_BUILTIN:
return pg_u_isalpha(c);
case PG_REGEX_LOCALE_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswalpha((wint_t) c);
@ -354,6 +366,8 @@ pg_wc_isalnum(pg_wchar c)
case PG_REGEX_LOCALE_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISALNUM));
case PG_REGEX_BUILTIN:
return pg_u_isalnum(c, true);
case PG_REGEX_LOCALE_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswalnum((wint_t) c);
@ -395,6 +409,8 @@ pg_wc_isupper(pg_wchar c)
case PG_REGEX_LOCALE_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISUPPER));
case PG_REGEX_BUILTIN:
return pg_u_isupper(c);
case PG_REGEX_LOCALE_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswupper((wint_t) c);
@ -427,6 +443,8 @@ pg_wc_islower(pg_wchar c)
case PG_REGEX_LOCALE_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISLOWER));
case PG_REGEX_BUILTIN:
return pg_u_islower(c);
case PG_REGEX_LOCALE_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswlower((wint_t) c);
@ -459,6 +477,8 @@ pg_wc_isgraph(pg_wchar c)
case PG_REGEX_LOCALE_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISGRAPH));
case PG_REGEX_BUILTIN:
return pg_u_isgraph(c);
case PG_REGEX_LOCALE_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswgraph((wint_t) c);
@ -491,6 +511,8 @@ pg_wc_isprint(pg_wchar c)
case PG_REGEX_LOCALE_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISPRINT));
case PG_REGEX_BUILTIN:
return pg_u_isprint(c);
case PG_REGEX_LOCALE_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswprint((wint_t) c);
@ -523,6 +545,8 @@ pg_wc_ispunct(pg_wchar c)
case PG_REGEX_LOCALE_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISPUNCT));
case PG_REGEX_BUILTIN:
return pg_u_ispunct(c, true);
case PG_REGEX_LOCALE_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswpunct((wint_t) c);
@ -555,6 +579,8 @@ pg_wc_isspace(pg_wchar c)
case PG_REGEX_LOCALE_C:
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISSPACE));
case PG_REGEX_BUILTIN:
return pg_u_isspace(c);
case PG_REGEX_LOCALE_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswspace((wint_t) c);
@ -588,6 +614,8 @@ pg_wc_toupper(pg_wchar c)
if (c <= (pg_wchar) 127)
return pg_ascii_toupper((unsigned char) c);
return c;
case PG_REGEX_BUILTIN:
return unicode_uppercase_simple(c);
case PG_REGEX_LOCALE_WIDE:
/* force C behavior for ASCII characters, per comments above */
if (c <= (pg_wchar) 127)
@ -628,6 +656,8 @@ pg_wc_tolower(pg_wchar c)
if (c <= (pg_wchar) 127)
return pg_ascii_tolower((unsigned char) c);
return c;
case PG_REGEX_BUILTIN:
return unicode_lowercase_simple(c);
case PG_REGEX_LOCALE_WIDE:
/* force C behavior for ASCII characters, per comments above */
if (c <= (pg_wchar) 127)
@ -792,6 +822,9 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
#endif
break;
case PG_REGEX_BUILTIN:
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
break;
case PG_REGEX_LOCALE_WIDE:
case PG_REGEX_LOCALE_WIDE_L:
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
@ -809,6 +842,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
break;
default:
Assert(false);
max_chr = 0; /* can't get here, but keep compiler quiet */
break;
}

View File

@ -77,6 +77,8 @@
#include "catalog/pg_collation.h"
#include "catalog/pg_type.h"
#include "common/unicode_case.h"
#include "common/unicode_category.h"
#include "mb/pg_wchar.h"
#include "nodes/miscnodes.h"
#include "parser/scansup.h"
@ -1679,6 +1681,34 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
}
else
#endif
if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
{
const char *src = buff;
size_t srclen = nbytes;
size_t dstsize;
char *dst;
size_t needed;
Assert(GetDatabaseEncoding() == PG_UTF8);
/* first try buffer of equal size plus terminating NUL */
dstsize = srclen + 1;
dst = palloc(dstsize);
needed = unicode_strlower(dst, dstsize, src, srclen);
if (needed + 1 > dstsize)
{
/* grow buffer if needed and retry */
dstsize = needed + 1;
dst = repalloc(dst, dstsize);
needed = unicode_strlower(dst, dstsize, src, srclen);
Assert(needed + 1 == dstsize);
}
Assert(dst[needed] == '\0');
result = dst;
}
else
{
Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC);
@ -1799,6 +1829,34 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
}
else
#endif
if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
{
const char *src = buff;
size_t srclen = nbytes;
size_t dstsize;
char *dst;
size_t needed;
Assert(GetDatabaseEncoding() == PG_UTF8);
/* first try buffer of equal size plus terminating NUL */
dstsize = srclen + 1;
dst = palloc(dstsize);
needed = unicode_strupper(dst, dstsize, src, srclen);
if (needed + 1 > dstsize)
{
/* grow buffer if needed and retry */
dstsize = needed + 1;
dst = repalloc(dst, dstsize);
needed = unicode_strupper(dst, dstsize, src, srclen);
Assert(needed + 1 == dstsize);
}
Assert(dst[needed] == '\0');
result = dst;
}
else
{
Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC);
@ -1920,6 +1978,60 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
}
else
#endif
if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
{
const unsigned char *src = (unsigned char *) buff;
size_t srclen = nbytes;
unsigned char *dst;
size_t dstsize;
int srcoff = 0;
int dstoff = 0;
Assert(GetDatabaseEncoding() == PG_UTF8);
/* overflow paranoia */
if ((srclen + 1) > (INT_MAX / MAX_MULTIBYTE_CHAR_LEN))
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
/* result is at most srclen codepoints plus terminating NUL */
dstsize = srclen * MAX_MULTIBYTE_CHAR_LEN + 1;
dst = (unsigned char *) palloc(dstsize);
while (srcoff < nbytes)
{
pg_wchar u1 = utf8_to_unicode(src + srcoff);
pg_wchar u2;
int u1len = unicode_utf8len(u1);
int u2len;
if (wasalnum)
u2 = unicode_lowercase_simple(u1);
else
u2 = unicode_uppercase_simple(u1);
u2len = unicode_utf8len(u2);
Assert(dstoff + u2len + 1 <= dstsize);
wasalnum = pg_u_isalnum(u2, true);
unicode_to_utf8(u2, dst + dstoff);
srcoff += u1len;
dstoff += u2len;
}
Assert(dstoff + 1 <= dstsize);
*(dst + dstoff) = '\0';
dstoff++;
/* allocate result buffer of the right size and free workspace */
result = palloc(dstoff);
memcpy(result, dst, dstoff);
pfree(dst);
}
else
{
Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC);

View File

@ -1270,8 +1270,14 @@ lookup_collation_cache(Oid collation, bool set_flags)
if (collform->collprovider == COLLPROVIDER_BUILTIN)
{
Datum datum;
const char *colllocale;
datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
colllocale = TextDatumGetCString(datum);
cache_entry->collate_is_c = true;
cache_entry->ctype_is_c = true;
cache_entry->ctype_is_c = (strcmp(colllocale, "C") == 0);
}
else if (collform->collprovider == COLLPROVIDER_LIBC)
{
@ -1670,7 +1676,6 @@ pg_newlocale_from_collation(Oid collid)
collversionstr = TextDatumGetCString(datum);
Assert(collform->collprovider != COLLPROVIDER_BUILTIN);
if (collform->collprovider == COLLPROVIDER_LIBC)
datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
else
@ -1725,7 +1730,13 @@ get_collation_actual_version(char collprovider, const char *collcollate)
{
char *collversion = NULL;
/* the builtin collation provider is not versioned */
/*
* The only two supported locales (C and C.UTF-8) are both based on memcmp
* and are not expected to change.
*
* Note that the character semantics may change for some locales, but the
* collation version only tracks changes to sort order.
*/
if (collprovider == COLLPROVIDER_BUILTIN)
return NULL;
@ -2505,13 +2516,17 @@ pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
int
builtin_locale_encoding(const char *locale)
{
if (strcmp(locale, "C") != 0)
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("invalid locale name \"%s\" for builtin provider",
locale)));
if (strcmp(locale, "C") == 0)
return -1;
if (strcmp(locale, "C.UTF-8") == 0)
return PG_UTF8;
return -1;
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("invalid locale name \"%s\" for builtin provider",
locale)));
return 0; /* keep compiler quiet */
}
@ -2525,13 +2540,28 @@ builtin_locale_encoding(const char *locale)
const char *
builtin_validate_locale(int encoding, const char *locale)
{
if (strcmp(locale, "C") != 0)
const char *canonical_name = NULL;
int required_encoding;
if (strcmp(locale, "C") == 0)
canonical_name = "C";
else if (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0)
canonical_name = "C.UTF-8";
if (!canonical_name)
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("invalid locale name \"%s\" for builtin provider",
locale)));
return "C";
required_encoding = builtin_locale_encoding(canonical_name);
if (required_encoding >= 0 && encoding != required_encoding)
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("encoding \"%s\" does not match locale \"%s\"",
pg_encoding_to_char(encoding), locale)));
return canonical_name;
}

View File

@ -2403,9 +2403,16 @@ setlocales(void)
if (locale_provider == COLLPROVIDER_BUILTIN)
{
if (strcmp(datlocale, "C") != 0)
if (strcmp(datlocale, "C") == 0)
canonname = "C";
else if (strcmp(datlocale, "C.UTF-8") == 0 ||
strcmp(datlocale, "C.UTF8") == 0)
canonname = "C.UTF-8";
else
pg_fatal("invalid locale name \"%s\" for builtin provider",
datlocale);
datlocale = canonname;
}
else if (locale_provider == COLLPROVIDER_ICU)
{
@ -2695,6 +2702,13 @@ setup_locale_encoding(void)
!check_locale_encoding(lc_collate, encodingid))
exit(1); /* check_locale_encoding printed the error */
if (locale_provider == COLLPROVIDER_BUILTIN)
{
if (strcmp(datlocale, "C.UTF-8") == 0 && encodingid != PG_UTF8)
pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"",
datlocale, "UTF-8");
}
if (locale_provider == COLLPROVIDER_ICU &&
!check_icu_locale_encoding(encodingid))
exit(1);

View File

@ -196,6 +196,23 @@ command_ok(
],
'locale provider builtin with --locale');
command_ok(
[
'initdb', '--no-sync',
'--locale-provider=builtin', '-E UTF-8',
'--builtin-locale=C.UTF-8', "$tempdir/data8"
],
'locale provider builtin with -E UTF-8 --builtin-locale=C.UTF-8');
command_fails(
[
'initdb', '--no-sync',
'--locale-provider=builtin', '-E SQL_ASCII',
'--builtin-locale=C.UTF-8', "$tempdir/data9"
],
'locale provider builtin with --builtin-locale=C.UTF-8 fails for SQL_ASCII'
);
command_ok(
[
'initdb', '--no-sync',

View File

@ -140,7 +140,7 @@ if ($oldnode->pg_version >= '17devel')
{
$original_enc_name = "UTF-8";
$original_provider = "b";
$original_datlocale = "C";
$original_datlocale = "C.UTF-8";
}
elsif ($oldnode->pg_version >= 15 && $ENV{with_icu} eq 'yes')
{

View File

@ -139,6 +139,24 @@ $node->command_ok(
],
'create database with provider "builtin" and LC_CTYPE=C');
$node->command_ok(
[
'createdb', '-T',
'template0', '--locale-provider=builtin',
'-E UTF-8', '--builtin-locale=C.UTF8',
'tbuiltin5'
],
'create database with --builtin-locale C.UTF-8 and -E UTF-8');
$node->command_fails(
[
'createdb', '-T',
'template0', '--locale-provider=builtin',
'-E LATIN1', '--builtin-locale=C.UTF-8',
'tbuiltin6'
],
'create database with --builtin-locale C.UTF-8 and -E LATIN1');
$node->command_fails(
[
'createdb', '-T',

View File

@ -57,6 +57,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 202403191
#define CATALOG_VERSION_NO 202403192
#endif

View File

@ -30,5 +30,8 @@
descr => 'sorts using the Unicode Collation Algorithm with default settings',
collname => 'unicode', collprovider => 'i', collencoding => '-1',
colllocale => 'und' },
{ oid => '811', descr => 'sorts by Unicode code point; Unicode and POSIX character semantics',
collname => 'pg_c_utf8', collprovider => 'b', collencoding => '6',
colllocale => 'C.UTF-8' },
]

View File

@ -0,0 +1,136 @@
/*
* This test is for collations and character operations when using the
* builtin provider with the C.UTF-8 locale.
*/
/* skip test if not UTF8 server encoding */
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
\if :skip_test
\quit
\endif
SET client_encoding TO UTF8;
--
-- Test PG_C_UTF8
--
CREATE COLLATION regress_pg_c_utf8 (
provider = builtin, locale = 'C_UTF8'); -- fails
ERROR: invalid locale name "C_UTF8" for builtin provider
CREATE COLLATION regress_pg_c_utf8 (
provider = builtin, locale = 'C.UTF8');
DROP COLLATION regress_pg_c_utf8;
CREATE COLLATION regress_pg_c_utf8 (
provider = builtin, locale = 'C.UTF-8');
CREATE TABLE test_pg_c_utf8 (
t TEXT COLLATE PG_C_UTF8
);
INSERT INTO test_pg_c_utf8 VALUES
('abc DEF 123abc'),
('ábc sßs ßss DÉF'),
('DŽxxDŽ džxxDž Džxxdž'),
('ȺȺȺ'),
('ⱥⱥⱥ'),
('ⱥȺ');
SELECT
t, lower(t), initcap(t), upper(t),
length(convert_to(t, 'UTF8')) AS t_bytes,
length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
FROM test_pg_c_utf8;
t | lower | initcap | upper | t_bytes | lower_t_bytes | initcap_t_bytes | upper_t_bytes
-----------------+-----------------+-----------------+-----------------+---------+---------------+-----------------+---------------
abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14
ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs ßss Déf | ÁBC SßS ßSS DÉF | 19 | 19 | 19 | 19
DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | DŽxxdž DŽxxdž DŽxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20
ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6
ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6
ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4
(6 rows)
DROP TABLE test_pg_c_utf8;
-- negative test: Final_Sigma not used for builtin locale C.UTF-8
SELECT lower('ΑΣ' COLLATE PG_C_UTF8);
lower
-------
ασ
(1 row)
SELECT lower('ΑͺΣͺ' COLLATE PG_C_UTF8);
lower
-------
αͺσͺ
(1 row)
SELECT lower('Α΄Σ΄' COLLATE PG_C_UTF8);
lower
-------
α΄σ΄
(1 row)
-- properties
SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8;
?column?
----------
t
(1 row)
SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8;
?column?
----------
t
(1 row)
SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8;
?column?
----------
t
(1 row)
SELECT '=' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix
?column?
----------
t
(1 row)
SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8;
?column?
----------
t
(1 row)
SELECT '൧' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix
?column?
----------
t
(1 row)
-- case mapping
SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8;
?column?
----------
t
(1 row)
SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8;
?column?
----------
t
(1 row)
SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8;
?column?
----------
t
(1 row)
SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8;
?column?
----------
t
(1 row)
SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
?column?
----------
t
(1 row)

View File

@ -0,0 +1,8 @@
/*
* This test is for collations and character operations when using the
* builtin provider with the C.UTF-8 locale.
*/
/* skip test if not UTF8 server encoding */
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
\if :skip_test
\quit

View File

@ -78,9 +78,9 @@ test: brin_bloom brin_multi
# psql depends on create_am
# amutils depends on geometry, create_index_spgist, hash_index, brin
# ----------
test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.icu.utf8 incremental_sort create_role without_overlaps
test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.utf8 collate.icu.utf8 incremental_sort create_role without_overlaps
# collate.*.utf8 tests cannot be run in parallel with each other
# collate.linux.utf8 and collate.icu.utf8 tests cannot be run in parallel with each other
test: rules psql psql_crosstab amutils stats_ext collate.linux.utf8 collate.windows.win1252
# ----------

View File

@ -0,0 +1,67 @@
/*
* This test is for collations and character operations when using the
* builtin provider with the C.UTF-8 locale.
*/
/* skip test if not UTF8 server encoding */
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
\if :skip_test
\quit
\endif
SET client_encoding TO UTF8;
--
-- Test PG_C_UTF8
--
CREATE COLLATION regress_pg_c_utf8 (
provider = builtin, locale = 'C_UTF8'); -- fails
CREATE COLLATION regress_pg_c_utf8 (
provider = builtin, locale = 'C.UTF8');
DROP COLLATION regress_pg_c_utf8;
CREATE COLLATION regress_pg_c_utf8 (
provider = builtin, locale = 'C.UTF-8');
CREATE TABLE test_pg_c_utf8 (
t TEXT COLLATE PG_C_UTF8
);
INSERT INTO test_pg_c_utf8 VALUES
('abc DEF 123abc'),
('ábc sßs ßss DÉF'),
('DŽxxDŽ džxxDž Džxxdž'),
('ȺȺȺ'),
('ⱥⱥⱥ'),
('ⱥȺ');
SELECT
t, lower(t), initcap(t), upper(t),
length(convert_to(t, 'UTF8')) AS t_bytes,
length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
FROM test_pg_c_utf8;
DROP TABLE test_pg_c_utf8;
-- negative test: Final_Sigma not used for builtin locale C.UTF-8
SELECT lower('ΑΣ' COLLATE PG_C_UTF8);
SELECT lower('ΑͺΣͺ' COLLATE PG_C_UTF8);
SELECT lower('Α΄Σ΄' COLLATE PG_C_UTF8);
-- properties
SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8;
SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8;
SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8;
SELECT '=' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix
SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8;
SELECT '൧' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix
-- case mapping
SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8;
SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8;
SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8;
SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8;
SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed