Support C.UTF-8 locale in the new builtin collation provider.

The builtin C.UTF-8 locale has similar semantics to the libc locale of
the same name. That is, code point sort order (fast, memcmp-based)
combined with Unicode semantics for character operations such as
pattern matching, regular expressions, and
LOWER()/INITCAP()/UPPER(). The character semantics are based on
Unicode simple case mappings.

The builtin provider's C.UTF-8 offers several important advantages
over libc:

 * faster sorting -- benefits from additional optimizations such as
   abbreviated keys and varstrfastcmp_c
 * faster case conversion, e.g. LOWER(), at least compared with some
   libc implementations
 * available on all platforms with identical semantics, and the
   semantics are stable, testable, and documentable within a given
   Postgres major version

Being based on memcmp, the builtin C.UTF-8 locale does not offer
natural language sort order. But it is an improvement for most use
cases that might otherwise use libc's "C.UTF-8" locale, as well as
many use cases that use libc's "C" locale.

Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com
Reviewed-by: Daniel Vérité, Peter Eisentraut, Jeremy Schneider
2024-03-19 15:24:41 -07:00 · 2024-03-19 15:24:41 -07:00 · f69319f2f1
parent fd0398fcb0
commit f69319f2f1
17 changed files with 494 additions and 26 deletions
--- a/doc/src/sgml/charset.sgml
+++ b/doc/src/sgml/charset.sgml
@ -377,13 +377,21 @@ initdb --locale-provider=icu --icu-locale=en
     <listitem>
      <para>
       The <literal>builtin</literal> provider uses built-in operations. Only
-       the <literal>C</literal> locale is supported for this provider.
+       the <literal>C</literal> and <literal>C.UTF-8</literal> locales are
       supported for this provider.
      </para>
      <para>
       The <literal>C</literal> locale behavior is identical to the
       <literal>C</literal> locale in the libc provider. When using this
       locale, the behavior may depend on the database encoding.
      </para>
      <para>
       The <literal>C.UTF-8</literal> locale is available only when the
       database encoding is <literal>UTF-8</literal>, and the behavior is
       based on Unicode. The collation uses the code point values only. The
       regular expression character classes are based on the "POSIX
       Compatible" semantics, and the case mapping is the "simple" variant.
      </para>
     </listitem>
    </varlistentry>
@ -878,6 +886,23 @@ SELECT * FROM test1 ORDER BY a || b COLLATE "fr_FR";
      </listitem>
     </varlistentry>
     <varlistentry>
      <term><literal>pg_c_utf8</literal></term>
      <listitem>
       <para>
        This collation sorts by Unicode code point values rather than natural
        language order.  For the functions <function>lower</function>,
        <function>initcap</function>, and <function>upper</function>, it uses
        Unicode simple case mapping.  For pattern matching (including regular
        expressions), it uses the POSIX Compatible variant of Unicode <ulink
        url="https://www.unicode.org/reports/tr18/#Compatibility_Properties">Compatibility
        Properties</ulink>.  Behavior is efficient and stable within a
        <productname>PostgreSQL</productname> major version.  This collation is
        only available for encoding <literal>UTF8</literal>.
       </para>
      </listitem>
     </varlistentry>
     <varlistentry>
      <term><literal>C</literal> (equivalent to <literal>POSIX</literal>)</term>
      <listitem>
--- a/doc/src/sgml/ref/create_collation.sgml
+++ b/doc/src/sgml/ref/create_collation.sgml
@ -99,7 +99,7 @@ CREATE COLLATION [ IF NOT EXISTS ] <replaceable>name</replaceable> FROM <replace
      <para>
       If <replaceable>provider</replaceable> is <literal>builtin</literal>,
       then <replaceable>locale</replaceable> must be specified and set to
-       <literal>C</literal>.
+       either <literal>C</literal> or <literal>C.UTF-8</literal>.
      </para>
     </listitem>
    </varlistentry>
--- a/doc/src/sgml/ref/create_database.sgml
+++ b/doc/src/sgml/ref/create_database.sgml
@ -166,8 +166,9 @@ CREATE DATABASE <replaceable class="parameter">name</replaceable>
       </para>
       <para>
        If <xref linkend="create-database-locale-provider"/> is
-        <literal>builtin</literal>, then <replaceable>locale</replaceable>
+        <literal>builtin</literal>, then <replaceable>locale</replaceable> or
-        must be specified and set to <literal>C</literal>.
+        <replaceable>builtin_locale</replaceable> must be specified and set to
        either <literal>C</literal> or <literal>C.UTF-8</literal>.
       </para>
       <tip>
        <para>
@ -228,9 +229,11 @@ CREATE DATABASE <replaceable class="parameter">name</replaceable>
        linkend="create-database-locale-provider">locale provider</link> must
        be <literal>builtin</literal>.  The default is the setting of <xref
        linkend="create-database-locale"/> if specified; otherwise the same
-        setting as the template database.  Currently, the only available
+        setting as the template database.
-        locale for the <literal>builtin</literal> provider is
+       </para>
-        <literal>C</literal>.
+       <para>
        The locales available for the <literal>builtin</literal> provider are
        <literal>C</literal> and <literal>C.UTF-8</literal>.
       </para>
      </listitem>
     </varlistentry>
--- a/doc/src/sgml/ref/initdb.sgml
+++ b/doc/src/sgml/ref/initdb.sgml
@ -288,8 +288,9 @@ PostgreSQL documentation
       </para>
       <para>
        If <option>--locale-provider</option> is <literal>builtin</literal>,
-        <option>--locale</option> must be specified and set to
+        <option>--locale</option> or <option>--builtin-locale</option> must be
-        <literal>C</literal>.
+        specified and set to <literal>C</literal> or
        <literal>C.UTF-8</literal>.
       </para>
      </listitem>
     </varlistentry>
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@ -16,6 +16,8 @@
 */
 #include "catalog/pg_collation.h"
 #include "common/unicode_case.h"
 #include "common/unicode_category.h"
 #include "utils/pg_locale.h"
 /*
@ -64,6 +66,7 @@
 typedef enum
 {
 	PG_REGEX_LOCALE_C,			/* C locale (encoding independent) */
 	PG_REGEX_BUILTIN,			/* built-in Unicode semantics */
 	PG_REGEX_LOCALE_WIDE,		/* Use <wctype.h> functions */
 	PG_REGEX_LOCALE_1BYTE,		/* Use <ctype.h> functions */
 	PG_REGEX_LOCALE_WIDE_L,		/* Use locale_t <wctype.h> functions */
@ -266,7 +269,12 @@ pg_set_regex_collation(Oid collation)
 		if (GetDatabaseEncoding() == PG_UTF8)
 		{
 			if (pg_regex_locale)
 			{
 				if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
 					pg_regex_strategy = PG_REGEX_BUILTIN;
 				else
 					pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L;
 			}
 			else
 				pg_regex_strategy = PG_REGEX_LOCALE_WIDE;
 		}
@ -290,6 +298,8 @@ pg_wc_isdigit(pg_wchar c)
 		case PG_REGEX_LOCALE_C:
 			return (c <= (pg_wchar) 127 &&
 					(pg_char_properties[c] & PG_ISDIGIT));
 		case PG_REGEX_BUILTIN:
 			return pg_u_isdigit(c, true);
 		case PG_REGEX_LOCALE_WIDE:
 			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
 				return iswdigit((wint_t) c);
@ -322,6 +332,8 @@ pg_wc_isalpha(pg_wchar c)
 		case PG_REGEX_LOCALE_C:
 			return (c <= (pg_wchar) 127 &&
 					(pg_char_properties[c] & PG_ISALPHA));
 		case PG_REGEX_BUILTIN:
 			return pg_u_isalpha(c);
 		case PG_REGEX_LOCALE_WIDE:
 			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
 				return iswalpha((wint_t) c);
@ -354,6 +366,8 @@ pg_wc_isalnum(pg_wchar c)
 		case PG_REGEX_LOCALE_C:
 			return (c <= (pg_wchar) 127 &&
 					(pg_char_properties[c] & PG_ISALNUM));
 		case PG_REGEX_BUILTIN:
 			return pg_u_isalnum(c, true);
 		case PG_REGEX_LOCALE_WIDE:
 			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
 				return iswalnum((wint_t) c);
@ -395,6 +409,8 @@ pg_wc_isupper(pg_wchar c)
 		case PG_REGEX_LOCALE_C:
 			return (c <= (pg_wchar) 127 &&
 					(pg_char_properties[c] & PG_ISUPPER));
 		case PG_REGEX_BUILTIN:
 			return pg_u_isupper(c);
 		case PG_REGEX_LOCALE_WIDE:
 			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
 				return iswupper((wint_t) c);
@ -427,6 +443,8 @@ pg_wc_islower(pg_wchar c)
 		case PG_REGEX_LOCALE_C:
 			return (c <= (pg_wchar) 127 &&
 					(pg_char_properties[c] & PG_ISLOWER));
 		case PG_REGEX_BUILTIN:
 			return pg_u_islower(c);
 		case PG_REGEX_LOCALE_WIDE:
 			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
 				return iswlower((wint_t) c);
@ -459,6 +477,8 @@ pg_wc_isgraph(pg_wchar c)
 		case PG_REGEX_LOCALE_C:
 			return (c <= (pg_wchar) 127 &&
 					(pg_char_properties[c] & PG_ISGRAPH));
 		case PG_REGEX_BUILTIN:
 			return pg_u_isgraph(c);
 		case PG_REGEX_LOCALE_WIDE:
 			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
 				return iswgraph((wint_t) c);
@ -491,6 +511,8 @@ pg_wc_isprint(pg_wchar c)
 		case PG_REGEX_LOCALE_C:
 			return (c <= (pg_wchar) 127 &&
 					(pg_char_properties[c] & PG_ISPRINT));
 		case PG_REGEX_BUILTIN:
 			return pg_u_isprint(c);
 		case PG_REGEX_LOCALE_WIDE:
 			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
 				return iswprint((wint_t) c);
@ -523,6 +545,8 @@ pg_wc_ispunct(pg_wchar c)
 		case PG_REGEX_LOCALE_C:
 			return (c <= (pg_wchar) 127 &&
 					(pg_char_properties[c] & PG_ISPUNCT));
 		case PG_REGEX_BUILTIN:
 			return pg_u_ispunct(c, true);
 		case PG_REGEX_LOCALE_WIDE:
 			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
 				return iswpunct((wint_t) c);
@ -555,6 +579,8 @@ pg_wc_isspace(pg_wchar c)
 		case PG_REGEX_LOCALE_C:
 			return (c <= (pg_wchar) 127 &&
 					(pg_char_properties[c] & PG_ISSPACE));
 		case PG_REGEX_BUILTIN:
 			return pg_u_isspace(c);
 		case PG_REGEX_LOCALE_WIDE:
 			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
 				return iswspace((wint_t) c);
@ -588,6 +614,8 @@ pg_wc_toupper(pg_wchar c)
 			if (c <= (pg_wchar) 127)
 				return pg_ascii_toupper((unsigned char) c);
 			return c;
 		case PG_REGEX_BUILTIN:
 			return unicode_uppercase_simple(c);
 		case PG_REGEX_LOCALE_WIDE:
 			/* force C behavior for ASCII characters, per comments above */
 			if (c <= (pg_wchar) 127)
@ -628,6 +656,8 @@ pg_wc_tolower(pg_wchar c)
 			if (c <= (pg_wchar) 127)
 				return pg_ascii_tolower((unsigned char) c);
 			return c;
 		case PG_REGEX_BUILTIN:
 			return unicode_lowercase_simple(c);
 		case PG_REGEX_LOCALE_WIDE:
 			/* force C behavior for ASCII characters, per comments above */
 			if (c <= (pg_wchar) 127)
@ -792,6 +822,9 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
 			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
 #endif
 			break;
 		case PG_REGEX_BUILTIN:
 			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
 			break;
 		case PG_REGEX_LOCALE_WIDE:
 		case PG_REGEX_LOCALE_WIDE_L:
 			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
@ -809,6 +842,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
 			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
 			break;
 		default:
 			Assert(false);
 			max_chr = 0;		/* can't get here, but keep compiler quiet */
 			break;
 	}
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@ -77,6 +77,8 @@
 #include "catalog/pg_collation.h"
 #include "catalog/pg_type.h"
 #include "common/unicode_case.h"
 #include "common/unicode_category.h"
 #include "mb/pg_wchar.h"
 #include "nodes/miscnodes.h"
 #include "parser/scansup.h"
@ -1679,6 +1681,34 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
 		}
 		else
 #endif
 		if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
 		{
 			const char *src = buff;
 			size_t		srclen = nbytes;
 			size_t		dstsize;
 			char	   *dst;
 			size_t		needed;
 			Assert(GetDatabaseEncoding() == PG_UTF8);
 			/* first try buffer of equal size plus terminating NUL */
 			dstsize = srclen + 1;
 			dst = palloc(dstsize);
 			needed = unicode_strlower(dst, dstsize, src, srclen);
 			if (needed + 1 > dstsize)
 			{
 				/* grow buffer if needed and retry */
 				dstsize = needed + 1;
 				dst = repalloc(dst, dstsize);
 				needed = unicode_strlower(dst, dstsize, src, srclen);
 				Assert(needed + 1 == dstsize);
 			}
 			Assert(dst[needed] == '\0');
 			result = dst;
 		}
 		else
 		{
 			Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC);
@ -1799,6 +1829,34 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
 		}
 		else
 #endif
 		if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
 		{
 			const char *src = buff;
 			size_t		srclen = nbytes;
 			size_t		dstsize;
 			char	   *dst;
 			size_t		needed;
 			Assert(GetDatabaseEncoding() == PG_UTF8);
 			/* first try buffer of equal size plus terminating NUL */
 			dstsize = srclen + 1;
 			dst = palloc(dstsize);
 			needed = unicode_strupper(dst, dstsize, src, srclen);
 			if (needed + 1 > dstsize)
 			{
 				/* grow buffer if needed and retry */
 				dstsize = needed + 1;
 				dst = repalloc(dst, dstsize);
 				needed = unicode_strupper(dst, dstsize, src, srclen);
 				Assert(needed + 1 == dstsize);
 			}
 			Assert(dst[needed] == '\0');
 			result = dst;
 		}
 		else
 		{
 			Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC);
@ -1920,6 +1978,60 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 		}
 		else
 #endif
 		if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
 		{
 			const unsigned char *src = (unsigned char *) buff;
 			size_t		srclen = nbytes;
 			unsigned char *dst;
 			size_t		dstsize;
 			int			srcoff = 0;
 			int			dstoff = 0;
 			Assert(GetDatabaseEncoding() == PG_UTF8);
 			/* overflow paranoia */
 			if ((srclen + 1) > (INT_MAX / MAX_MULTIBYTE_CHAR_LEN))
 				ereport(ERROR,
 						(errcode(ERRCODE_OUT_OF_MEMORY),
 						 errmsg("out of memory")));
 			/* result is at most srclen codepoints plus terminating NUL */
 			dstsize = srclen * MAX_MULTIBYTE_CHAR_LEN + 1;
 			dst = (unsigned char *) palloc(dstsize);
 			while (srcoff < nbytes)
 			{
 				pg_wchar	u1 = utf8_to_unicode(src + srcoff);
 				pg_wchar	u2;
 				int			u1len = unicode_utf8len(u1);
 				int			u2len;
 				if (wasalnum)
 					u2 = unicode_lowercase_simple(u1);
 				else
 					u2 = unicode_uppercase_simple(u1);
 				u2len = unicode_utf8len(u2);
 				Assert(dstoff + u2len + 1 <= dstsize);
 				wasalnum = pg_u_isalnum(u2, true);
 				unicode_to_utf8(u2, dst + dstoff);
 				srcoff += u1len;
 				dstoff += u2len;
 			}
 			Assert(dstoff + 1 <= dstsize);
 			*(dst + dstoff) = '\0';
 			dstoff++;
 			/* allocate result buffer of the right size and free workspace */
 			result = palloc(dstoff);
 			memcpy(result, dst, dstoff);
 			pfree(dst);
 		}
 		else
 		{
 			Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC);
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@ -1270,8 +1270,14 @@ lookup_collation_cache(Oid collation, bool set_flags)
 		if (collform->collprovider == COLLPROVIDER_BUILTIN)
 		{
 			Datum		datum;
 			const char *colllocale;
 			datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
 			colllocale = TextDatumGetCString(datum);
 			cache_entry->collate_is_c = true;
-			cache_entry->ctype_is_c = true;
+			cache_entry->ctype_is_c = (strcmp(colllocale, "C") == 0);
 		}
 		else if (collform->collprovider == COLLPROVIDER_LIBC)
 		{
@ -1670,7 +1676,6 @@ pg_newlocale_from_collation(Oid collid)
 			collversionstr = TextDatumGetCString(datum);
 			Assert(collform->collprovider != COLLPROVIDER_BUILTIN);
 			if (collform->collprovider == COLLPROVIDER_LIBC)
 				datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
 			else
@ -1725,7 +1730,13 @@ get_collation_actual_version(char collprovider, const char *collcollate)
 {
 	char	   *collversion = NULL;
-	/* the builtin collation provider is not versioned */
+	/*
 	 * The only two supported locales (C and C.UTF-8) are both based on memcmp
 	 * and are not expected to change.
 	 *
 	 * Note that the character semantics may change for some locales, but the
 	 * collation version only tracks changes to sort order.
 	 */
 	if (collprovider == COLLPROVIDER_BUILTIN)
 		return NULL;
@ -2505,13 +2516,17 @@ pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
 int
 builtin_locale_encoding(const char *locale)
 {
-	if (strcmp(locale, "C") != 0)
+	if (strcmp(locale, "C") == 0)
 		return -1;
 	if (strcmp(locale, "C.UTF-8") == 0)
 		return PG_UTF8;
 	ereport(ERROR,
 			(errcode(ERRCODE_WRONG_OBJECT_TYPE),
 			 errmsg("invalid locale name \"%s\" for builtin provider",
 					locale)));
-	return -1;
+	return 0;					/* keep compiler quiet */
 }
@ -2525,13 +2540,28 @@ builtin_locale_encoding(const char *locale)
 const char *
 builtin_validate_locale(int encoding, const char *locale)
 {
-	if (strcmp(locale, "C") != 0)
+	const char *canonical_name = NULL;
 	int			required_encoding;
 	if (strcmp(locale, "C") == 0)
 		canonical_name = "C";
 	else if (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0)
 		canonical_name = "C.UTF-8";
 	if (!canonical_name)
 		ereport(ERROR,
 				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
 				 errmsg("invalid locale name \"%s\" for builtin provider",
 						locale)));
-	return "C";
+	required_encoding = builtin_locale_encoding(canonical_name);
 	if (required_encoding >= 0 && encoding != required_encoding)
 		ereport(ERROR,
 				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
 				 errmsg("encoding \"%s\" does not match locale \"%s\"",
 						pg_encoding_to_char(encoding), locale)));
 	return canonical_name;
 }
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@ -2403,9 +2403,16 @@ setlocales(void)
 	if (locale_provider == COLLPROVIDER_BUILTIN)
 	{
-		if (strcmp(datlocale, "C") != 0)
+		if (strcmp(datlocale, "C") == 0)
 			canonname = "C";
 		else if (strcmp(datlocale, "C.UTF-8") == 0 ||
 				 strcmp(datlocale, "C.UTF8") == 0)
 			canonname = "C.UTF-8";
 		else
 			pg_fatal("invalid locale name \"%s\" for builtin provider",
 					 datlocale);
 		datlocale = canonname;
 	}
 	else if (locale_provider == COLLPROVIDER_ICU)
 	{
@ -2695,6 +2702,13 @@ setup_locale_encoding(void)
 		!check_locale_encoding(lc_collate, encodingid))
 		exit(1);				/* check_locale_encoding printed the error */
 	if (locale_provider == COLLPROVIDER_BUILTIN)
 	{
 		if (strcmp(datlocale, "C.UTF-8") == 0 && encodingid != PG_UTF8)
 			pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"",
 					 datlocale, "UTF-8");
 	}
 	if (locale_provider == COLLPROVIDER_ICU &&
 		!check_icu_locale_encoding(encodingid))
 		exit(1);
--- a/src/bin/initdb/t/001_initdb.pl
+++ b/src/bin/initdb/t/001_initdb.pl
@ -196,6 +196,23 @@ command_ok(
 	],
 	'locale provider builtin with --locale');
 command_ok(
 	[
 		'initdb', '--no-sync',
 		'--locale-provider=builtin', '-E UTF-8',
 		'--builtin-locale=C.UTF-8', "$tempdir/data8"
 	],
 	'locale provider builtin with -E UTF-8 --builtin-locale=C.UTF-8');
 command_fails(
 	[
 		'initdb', '--no-sync',
 		'--locale-provider=builtin', '-E SQL_ASCII',
 		'--builtin-locale=C.UTF-8', "$tempdir/data9"
 	],
 	'locale provider builtin with --builtin-locale=C.UTF-8 fails for SQL_ASCII'
 );
 command_ok(
 	[
 		'initdb', '--no-sync',
--- a/src/bin/pg_upgrade/t/002_pg_upgrade.pl
+++ b/src/bin/pg_upgrade/t/002_pg_upgrade.pl
@ -140,7 +140,7 @@ if ($oldnode->pg_version >= '17devel')
 {
 	$original_enc_name = "UTF-8";
 	$original_provider = "b";
-	$original_datlocale = "C";
+	$original_datlocale = "C.UTF-8";
 }
 elsif ($oldnode->pg_version >= 15 && $ENV{with_icu} eq 'yes')
 {
--- a/src/bin/scripts/t/020_createdb.pl
+++ b/src/bin/scripts/t/020_createdb.pl
@ -139,6 +139,24 @@ $node->command_ok(
 	],
 	'create database with provider "builtin" and LC_CTYPE=C');
 $node->command_ok(
 	[
 		'createdb', '-T',
 		'template0', '--locale-provider=builtin',
 		'-E UTF-8', '--builtin-locale=C.UTF8',
 		'tbuiltin5'
 	],
 	'create database with --builtin-locale C.UTF-8 and -E UTF-8');
 $node->command_fails(
 	[
 		'createdb', '-T',
 		'template0', '--locale-provider=builtin',
 		'-E LATIN1', '--builtin-locale=C.UTF-8',
 		'tbuiltin6'
 	],
 	'create database with --builtin-locale C.UTF-8 and -E LATIN1');
 $node->command_fails(
 	[
 		'createdb', '-T',
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@ -57,6 +57,6 @@
 */
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	202403191
+#define CATALOG_VERSION_NO	202403192
 #endif
--- a/src/include/catalog/pg_collation.dat
+++ b/src/include/catalog/pg_collation.dat
@ -30,5 +30,8 @@
  descr => 'sorts using the Unicode Collation Algorithm with default settings',
  collname => 'unicode', collprovider => 'i', collencoding => '-1',
  colllocale => 'und' },
 { oid => '811', descr => 'sorts by Unicode code point; Unicode and POSIX character semantics',
  collname => 'pg_c_utf8', collprovider => 'b', collencoding => '6',
  colllocale => 'C.UTF-8' },
 ]
--- a/src/test/regress/expected/collate.utf8.out
+++ b/src/test/regress/expected/collate.utf8.out
@ -0,0 +1,136 @@
 /*
 * This test is for collations and character operations when using the
 * builtin provider with the C.UTF-8 locale.
 */
 /* skip test if not UTF8 server encoding */
 SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
 \if :skip_test
 \quit
 \endif
 SET client_encoding TO UTF8;
 --
 -- Test PG_C_UTF8
 --
 CREATE COLLATION regress_pg_c_utf8 (
  provider = builtin, locale = 'C_UTF8'); -- fails
 ERROR:  invalid locale name "C_UTF8" for builtin provider
 CREATE COLLATION regress_pg_c_utf8 (
  provider = builtin, locale = 'C.UTF8');
 DROP COLLATION regress_pg_c_utf8;
 CREATE COLLATION regress_pg_c_utf8 (
  provider = builtin, locale = 'C.UTF-8');
 CREATE TABLE test_pg_c_utf8 (
  t TEXT COLLATE PG_C_UTF8
 );
 INSERT INTO test_pg_c_utf8 VALUES
  ('abc DEF 123abc'),
  ('ábc sßs ßss DÉF'),
  ('ǄxxǄ ǆxxǅ ǅxxǆ'),
  ('ȺȺȺ'),
  ('ⱥⱥⱥ'),
  ('ⱥȺ');
 SELECT
    t, lower(t), initcap(t), upper(t),
    length(convert_to(t, 'UTF8')) AS t_bytes,
    length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
    length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
    length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
  FROM test_pg_c_utf8;
        t        |      lower      |     initcap     |      upper      | t_bytes | lower_t_bytes | initcap_t_bytes | upper_t_bytes 
 -----------------+-----------------+-----------------+-----------------+---------+---------------+-----------------+---------------
 abc DEF 123abc  | abc def 123abc  | Abc Def 123abc  | ABC DEF 123ABC  |      14 |            14 |              14 |            14
 ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs ßss Déf | ÁBC SßS ßSS DÉF |      19 |            19 |              19 |            19
 ǄxxǄ ǆxxǅ ǅxxǆ  | ǆxxǆ ǆxxǆ ǆxxǆ  | Ǆxxǆ Ǆxxǆ Ǆxxǆ  | ǄXXǄ ǄXXǄ ǄXXǄ  |      20 |            20 |              20 |            20
 ȺȺȺ             | ⱥⱥⱥ             | Ⱥⱥⱥ             | ȺȺȺ             |       6 |             9 |               8 |             6
 ⱥⱥⱥ             | ⱥⱥⱥ             | Ⱥⱥⱥ             | ȺȺȺ             |       9 |             9 |               8 |             6
 ⱥȺ              | ⱥⱥ              | Ⱥⱥ              | ȺȺ              |       5 |             6 |               5 |             4
 (6 rows)
 DROP TABLE test_pg_c_utf8;
 -- negative test: Final_Sigma not used for builtin locale C.UTF-8
 SELECT lower('ΑΣ' COLLATE PG_C_UTF8);
 lower 
 -------
 ασ
 (1 row)
 SELECT lower('ΑͺΣͺ' COLLATE PG_C_UTF8);
 lower 
 -------
 αͺσͺ
 (1 row)
 SELECT lower('Α΄Σ΄' COLLATE PG_C_UTF8);
 lower 
 -------
 α΄σ΄
 (1 row)
 -- properties
 SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8;
 ?column? 
 ----------
 t
 (1 row)
 SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8;
 ?column? 
 ----------
 t
 (1 row)
 SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8;
 ?column? 
 ----------
 t
 (1 row)
 SELECT '=' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix
 ?column? 
 ----------
 t
 (1 row)
 SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8;
 ?column? 
 ----------
 t
 (1 row)
 SELECT '൧' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix
 ?column? 
 ----------
 t
 (1 row)
 -- case mapping
 SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8;
 ?column? 
 ----------
 t
 (1 row)
 SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8;
 ?column? 
 ----------
 t
 (1 row)
 SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8;
 ?column? 
 ----------
 t
 (1 row)
 SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8;
 ?column? 
 ----------
 t
 (1 row)
 SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
 ?column? 
 ----------
 t
 (1 row)
--- a/src/test/regress/expected/collate.utf8_1.out
+++ b/src/test/regress/expected/collate.utf8_1.out
@ -0,0 +1,8 @@
 /*
 * This test is for collations and character operations when using the
 * builtin provider with the C.UTF-8 locale.
 */
 /* skip test if not UTF8 server encoding */
 SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
 \if :skip_test
 \quit
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@ -78,9 +78,9 @@ test: brin_bloom brin_multi
 # psql depends on create_am
 # amutils depends on geometry, create_index_spgist, hash_index, brin
 # ----------
-test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.icu.utf8 incremental_sort create_role without_overlaps
+test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.utf8 collate.icu.utf8 incremental_sort create_role without_overlaps
-# collate.*.utf8 tests cannot be run in parallel with each other
+# collate.linux.utf8 and collate.icu.utf8 tests cannot be run in parallel with each other
 test: rules psql psql_crosstab amutils stats_ext collate.linux.utf8 collate.windows.win1252
 # ----------
--- a/src/test/regress/sql/collate.utf8.sql
+++ b/src/test/regress/sql/collate.utf8.sql
@ -0,0 +1,67 @@
 /*
 * This test is for collations and character operations when using the
 * builtin provider with the C.UTF-8 locale.
 */
 /* skip test if not UTF8 server encoding */
 SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
 \if :skip_test
 \quit
 \endif
 SET client_encoding TO UTF8;
 --
 -- Test PG_C_UTF8
 --
 CREATE COLLATION regress_pg_c_utf8 (
  provider = builtin, locale = 'C_UTF8'); -- fails
 CREATE COLLATION regress_pg_c_utf8 (
  provider = builtin, locale = 'C.UTF8');
 DROP COLLATION regress_pg_c_utf8;
 CREATE COLLATION regress_pg_c_utf8 (
  provider = builtin, locale = 'C.UTF-8');
 CREATE TABLE test_pg_c_utf8 (
  t TEXT COLLATE PG_C_UTF8
 );
 INSERT INTO test_pg_c_utf8 VALUES
  ('abc DEF 123abc'),
  ('ábc sßs ßss DÉF'),
  ('ǄxxǄ ǆxxǅ ǅxxǆ'),
  ('ȺȺȺ'),
  ('ⱥⱥⱥ'),
  ('ⱥȺ');
 SELECT
    t, lower(t), initcap(t), upper(t),
    length(convert_to(t, 'UTF8')) AS t_bytes,
    length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
    length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
    length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
  FROM test_pg_c_utf8;
 DROP TABLE test_pg_c_utf8;
 -- negative test: Final_Sigma not used for builtin locale C.UTF-8
 SELECT lower('ΑΣ' COLLATE PG_C_UTF8);
 SELECT lower('ΑͺΣͺ' COLLATE PG_C_UTF8);
 SELECT lower('Α΄Σ΄' COLLATE PG_C_UTF8);
 -- properties
 SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8;
 SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8;
 SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8;
 SELECT '=' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix
 SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8;
 SELECT '൧' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix
 -- case mapping
 SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8;
 SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8;
 SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8;
 SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8;
 SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed