diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 12fabb7372..6dd95b8966 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -893,7 +893,7 @@ CREATE COLLATION german (provider = libc, locale = 'de_DE'); The first example selects the ICU locale using a language tag per BCP 47. The second example uses the traditional ICU-specific locale syntax. The first style is preferred going - forward, but it is not supported by older ICU versions. + forward, and is used internally to store locales. Note that you can name the collation objects in the SQL environment diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index 45de78352c..c91fe66d9b 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -165,6 +165,11 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e else colliculocale = NULL; + /* + * When the ICU locale comes from an existing collation, do not + * canonicalize to a language tag. + */ + datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull); if (!isnull) collicurules = TextDatumGetCString(datum); @@ -259,6 +264,25 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("parameter \"locale\" must be specified"))); + /* + * During binary upgrade, preserve the locale string. Otherwise, + * canonicalize to a language tag. + */ + if (!IsBinaryUpgrade) + { + char *langtag = icu_language_tag(colliculocale, + icu_validation_level); + + if (langtag && strcmp(colliculocale, langtag) != 0) + { + ereport(NOTICE, + (errmsg("using standard form \"%s\" for locale \"%s\"", + langtag, colliculocale))); + + colliculocale = langtag; + } + } + icu_validate_locale(colliculocale); } @@ -569,26 +593,6 @@ cmpaliases(const void *a, const void *b) #ifdef USE_ICU -/* - * Get the ICU language tag for a locale name. 
- * The result is a palloc'd string. - */ -static char * -get_icu_language_tag(const char *localename) -{ - char buf[ULOC_FULLNAME_CAPACITY]; - UErrorCode status; - - status = U_ZERO_ERROR; - uloc_toLanguageTag(localename, buf, sizeof(buf), true, &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("could not convert locale name \"%s\" to language tag: %s", - localename, u_errorName(status)))); - - return pstrdup(buf); -} - /* * Get a comment (specifically, the display name) for an ICU locale. * The result is a palloc'd string, or NULL if we can't get a comment @@ -950,7 +954,7 @@ pg_import_system_collations(PG_FUNCTION_ARGS) else name = uloc_getAvailable(i); - langtag = get_icu_language_tag(name); + langtag = icu_language_tag(name, ERROR); /* * Be paranoid about not allowing any non-ASCII strings into diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 24bcc5adfe..2e242eeff2 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1058,6 +1058,26 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("ICU locale must be specified"))); + /* + * During binary upgrade, or when the locale came from the template + * database, preserve locale string. Otherwise, canonicalize to a + * language tag. 
+		 */
+		if (!IsBinaryUpgrade && dbiculocale != src_iculocale)
+		{
+			char	   *langtag = icu_language_tag(dbiculocale,
+												   icu_validation_level);
+
+			if (langtag && strcmp(dbiculocale, langtag) != 0)
+			{
+				ereport(NOTICE,
+						(errmsg("using standard form \"%s\" for locale \"%s\"",
+								langtag, dbiculocale)));
+
+				dbiculocale = langtag;
+			}
+		}
+
 		icu_validate_locale(dbiculocale);
 	}
 	else
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 9497c20d12..06e73aa012 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -2826,6 +2826,96 @@ icu_set_collation_attributes(UCollator *collator, const char *loc,
 
 #endif
 
+/*
+ * Return the BCP47 language tag representation of the requested locale.
+ *
+ * This function should be called before passing the string to ucol_open(),
+ * because conversion to a language tag also performs "level 2
+ * canonicalization". In addition to producing a consistent format, level 2
+ * canonicalization is able to more accurately interpret different input
+ * locale string formats, such as POSIX and .NET IDs.
+ *
+ * The result is a palloc'd string. If the conversion fails, the failure is
+ * reported at "elevel" (nothing is reported when elevel <= 0) and NULL is
+ * returned, provided that the report itself returns control.
+ */
+char *
+icu_language_tag(const char *loc_str, int elevel)
+{
+#ifdef USE_ICU
+	UErrorCode	status;
+	char		lang[ULOC_LANG_CAPACITY];
+	char	   *langtag;
+	size_t		buflen = 32;	/* arbitrary starting buffer size */
+	const bool	strict = true;
+
+	status = U_ZERO_ERROR;
+	uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
+	if (U_FAILURE(status))
+	{
+		if (elevel > 0)
+			ereport(elevel,
+					(errmsg("could not get language from locale \"%s\": %s",
+							loc_str, u_errorName(status))));
+		return NULL;
+	}
+
+	/* C/POSIX locales aren't handled by uloc_toLanguageTag() */
+	if (strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
+		return pstrdup("en-US-u-va-posix");
+
+	/*
+	 * A BCP47 language tag doesn't have a clearly-defined upper limit
+	 * (cf. RFC5646 section 4.4). Additionally, in older ICU versions,
+	 * uloc_toLanguageTag() doesn't always return the ultimate length on the
+	 * first call, necessitating a loop.
+	 */
+	langtag = palloc(buflen);
+	while (true)
+	{
+		int32_t		len;
+
+		status = U_ZERO_ERROR;
+		len = uloc_toLanguageTag(loc_str, langtag, buflen, strict, &status);
+
+		/*
+		 * If the result fits in the buffer exactly (len == buflen),
+		 * uloc_toLanguageTag() will return success without nul-terminating
+		 * the result. Check for either U_BUFFER_OVERFLOW_ERROR or len >=
+		 * buflen and try again.
+		 */
+		if ((status == U_BUFFER_OVERFLOW_ERROR ||
+			 (U_SUCCESS(status) && len >= buflen)) &&
+			buflen < MaxAllocSize)
+		{
+			buflen = Min(buflen * 2, MaxAllocSize);
+			langtag = repalloc(langtag, buflen);
+			continue;
+		}
+
+		break;
+	}
+
+	if (U_FAILURE(status))
+	{
+		pfree(langtag);
+
+		if (elevel > 0)
+			ereport(elevel,
+					(errmsg("could not convert locale name \"%s\" to language tag: %s",
+							loc_str, u_errorName(status))));
+		return NULL;
+	}
+
+	return langtag;
+#else							/* not USE_ICU */
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("ICU is not supported in this build")));
+	return NULL;				/* keep compiler quiet */
+#endif							/* not USE_ICU */
+}
+
 /*
  * Perform best-effort check that the locale is a valid one.
  */
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 208ddc9b30..4814c1c405 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -2229,6 +2229,78 @@ check_icu_locale_encoding(int user_enc)
 	return true;
 }
 
+/*
+ * Convert to canonical BCP47 language tag. Must be consistent with
+ * icu_language_tag().
+ */
+static char *
+icu_language_tag(const char *loc_str)
+{
+#ifdef USE_ICU
+	UErrorCode	status;
+	char		lang[ULOC_LANG_CAPACITY];
+	char	   *langtag;
+	size_t		buflen = 32;	/* arbitrary starting buffer size */
+	const bool	strict = true;
+
+	status = U_ZERO_ERROR;
+	uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
+	if (U_FAILURE(status))
+	{
+		pg_fatal("could not get language from locale \"%s\": %s",
+				 loc_str, u_errorName(status));
+		return NULL;			/* presumably not reached: pg_fatal() is expected to exit */
+	}
+
+	/* C/POSIX locales aren't handled by uloc_toLanguageTag() */
+	if (strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
+		return pstrdup("en-US-u-va-posix");
+
+	/*
+	 * A BCP47 language tag doesn't have a clearly-defined upper limit
+	 * (cf. RFC5646 section 4.4). Additionally, in older ICU versions,
+	 * uloc_toLanguageTag() doesn't always return the ultimate length on the
+	 * first call, necessitating a loop.
+	 */
+	langtag = pg_malloc(buflen);
+	while (true)
+	{
+		int32_t		len;
+
+		status = U_ZERO_ERROR;
+		len = uloc_toLanguageTag(loc_str, langtag, buflen, strict, &status);
+
+		/*
+		 * If the result fits in the buffer exactly (len == buflen),
+		 * uloc_toLanguageTag() will return success without nul-terminating
+		 * the result. Check for either U_BUFFER_OVERFLOW_ERROR or len >=
+		 * buflen and try again.
+		 */
+		if (status == U_BUFFER_OVERFLOW_ERROR ||
+			(U_SUCCESS(status) && len >= buflen))
+		{
+			buflen = buflen * 2;	/* NOTE(review): growth is uncapped here, unlike the MaxAllocSize cap in pg_locale.c */
+			langtag = pg_realloc(langtag, buflen);
+			continue;
+		}
+
+		break;
+	}
+
+	if (U_FAILURE(status))
+	{
+		pg_free(langtag);
+
+		pg_fatal("could not convert locale name \"%s\" to language tag: %s",
+				 loc_str, u_errorName(status));
+	}
+
+	return langtag;
+#else
+	pg_fatal("ICU is not supported in this build");
+#endif
+}
+
 /*
  * Perform best-effort check that the locale is a valid one.
Should be * consistent with pg_locale.c, except that it doesn't need to open the @@ -2376,6 +2448,8 @@ setlocales(void) if (locale_provider == COLLPROVIDER_ICU) { + char *langtag; + /* acquire default locale from the environment, if not specified */ if (icu_locale == NULL) { @@ -2383,6 +2457,13 @@ setlocales(void) printf(_("Using default ICU locale \"%s\".\n"), icu_locale); } + /* canonicalize to a language tag */ + langtag = icu_language_tag(icu_locale); + printf(_("Using language tag \"%s\" for ICU locale \"%s\".\n"), + langtag, icu_locale); + pg_free(icu_locale); + icu_locale = langtag; + icu_validate_locale(icu_locale); /* diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl index db7995fe28..17a444d80c 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -144,7 +144,7 @@ if ($ENV{with_icu} eq 'yes') '--locale-provider=icu', '--icu-locale=@colNumeric=lower', "$tempdir/dataX" ], - qr/could not open collator for locale "\@colNumeric=lower": U_ILLEGAL_ARGUMENT_ERROR/, + qr/could not open collator for locale "und-u-kn-lower": U_ILLEGAL_ARGUMENT_ERROR/, 'fails for invalid collation argument'); } else diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index 42215f82f7..df26ba42d6 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -1860,9 +1860,9 @@ my %tests = ( 'CREATE COLLATION icu_collation' => { create_order => 76, - create_sql => "CREATE COLLATION icu_collation (PROVIDER = icu, LOCALE = 'C');", + create_sql => "CREATE COLLATION icu_collation (PROVIDER = icu, LOCALE = 'en-US-u-va-posix');", regexp => - qr/CREATE COLLATION public.icu_collation \(provider = icu, locale = 'C'(, version = '[^']*')?\);/m, + qr/CREATE COLLATION public.icu_collation \(provider = icu, locale = 'en-US-u-va-posix'(, version = '[^']*')?\);/m, icu => 1, like => { %full_runs, section_pre_data => 1, }, }, diff --git a/src/include/utils/pg_locale.h 
b/src/include/utils/pg_locale.h index c275427976..8c095abc52 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -120,6 +120,7 @@ extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale); extern void icu_validate_locale(const char *loc_str); +extern char *icu_language_tag(const char *loc_str, int elevel); #ifdef USE_ICU extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes); diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index 5e480d45cd..b5a221b030 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1019,6 +1019,7 @@ reset enable_seqscan; CREATE ROLE regress_test_role; CREATE SCHEMA test_schema; -- We need to do this this way to cope with varying names for encodings: +SET client_min_messages TO WARNING; do $$ BEGIN EXECUTE 'CREATE COLLATION test0 (provider = icu, locale = ' || @@ -1033,12 +1034,17 @@ BEGIN quote_literal(current_setting('lc_collate')) || ');'; END $$; +RESET client_min_messages; CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale" ERROR: parameter "locale" must be specified CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); -- fails ERROR: ICU locale "nonsense-nowhere" has unknown language "nonsense" HINT: To disable ICU locale validation, set parameter icu_validation_level to DISABLED. 
+CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); -- fails +ERROR: could not convert locale name "@colStrength=primary;nonsense=yes" to language tag: U_ILLEGAL_ARGUMENT_ERROR SET icu_validation_level = WARNING; +CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); DROP COLLATION testx; +WARNING: could not convert locale name "@colStrength=primary;nonsense=yes" to language tag: U_ILLEGAL_ARGUMENT_ERROR CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); DROP COLLATION testx; WARNING: ICU locale "nonsense-nowhere" has unknown language "nonsense" HINT: To disable ICU locale validation, set parameter icu_validation_level to DISABLED. @@ -1169,14 +1175,18 @@ SELECT * FROM collate_test2 ORDER BY b COLLATE UNICODE; -- test ICU collation customization -- test the attributes handled by icu_set_collation_attributes() +SET client_min_messages=WARNING; CREATE COLLATION testcoll_ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes'); +RESET client_min_messages; SELECT 'aaá' > 'AAA' COLLATE "und-x-icu", 'aaá' < 'AAA' COLLATE testcoll_ignore_accents; ?column? | ?column? ----------+---------- t | t (1 row) +SET client_min_messages=WARNING; CREATE COLLATION testcoll_backwards (provider = icu, locale = '@colBackwards=yes'); +RESET client_min_messages; SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE testcoll_backwards; ?column? | ?column? 
----------+---------- @@ -1184,7 +1194,9 @@ SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE testcoll (1 row) CREATE COLLATION testcoll_lower_first (provider = icu, locale = '@colCaseFirst=lower'); +NOTICE: using standard form "und-u-kf-lower" for locale "@colCaseFirst=lower" CREATE COLLATION testcoll_upper_first (provider = icu, locale = '@colCaseFirst=upper'); +NOTICE: using standard form "und-u-kf-upper" for locale "@colCaseFirst=upper" SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE testcoll_upper_first; ?column? | ?column? ----------+---------- @@ -1192,13 +1204,16 @@ SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE testcol (1 row) CREATE COLLATION testcoll_shifted (provider = icu, locale = '@colAlternate=shifted'); +NOTICE: using standard form "und-u-ka-shifted" for locale "@colAlternate=shifted" SELECT 'de-luge' < 'deanza' COLLATE "und-x-icu", 'de-luge' > 'deanza' COLLATE testcoll_shifted; ?column? | ?column? ----------+---------- t | t (1 row) +SET client_min_messages=WARNING; CREATE COLLATION testcoll_numeric (provider = icu, locale = '@colNumeric=yes'); +RESET client_min_messages; SELECT 'A-21' > 'A-123' COLLATE "und-x-icu", 'A-21' < 'A-123' COLLATE testcoll_numeric; ?column? | ?column? 
----------+---------- @@ -1206,10 +1221,12 @@ SELECT 'A-21' > 'A-123' COLLATE "und-x-icu", 'A-21' < 'A-123' COLLATE testcoll_n (1 row) CREATE COLLATION testcoll_error1 (provider = icu, locale = '@colNumeric=lower'); -ERROR: could not open collator for locale "@colNumeric=lower": U_ILLEGAL_ARGUMENT_ERROR +NOTICE: using standard form "und-u-kn-lower" for locale "@colNumeric=lower" +ERROR: could not open collator for locale "und-u-kn-lower": U_ILLEGAL_ARGUMENT_ERROR -- test that attributes not handled by icu_set_collation_attributes() -- (handled by ucol_open() directly) also work CREATE COLLATION testcoll_de_phonebook (provider = icu, locale = 'de@collation=phonebook'); +NOTICE: using standard form "de-u-co-phonebk" for locale "de@collation=phonebook" SELECT 'Goldmann' < 'Götz' COLLATE "de-x-icu", 'Goldmann' > 'Götz' COLLATE testcoll_de_phonebook; ?column? | ?column? ----------+---------- @@ -1218,6 +1235,7 @@ SELECT 'Goldmann' < 'Götz' COLLATE "de-x-icu", 'Goldmann' > 'Götz' COLLATE tes -- rules CREATE COLLATION testcoll_rules1 (provider = icu, locale = '', rules = '&a < g'); +NOTICE: using standard form "und" for locale "" CREATE TABLE test7 (a text); -- example from https://unicode-org.github.io/icu/userguide/collation/customization/#syntax INSERT INTO test7 VALUES ('Abernathy'), ('apple'), ('bird'), ('Boston'), ('Graham'), ('green'); @@ -1245,10 +1263,13 @@ SELECT * FROM test7 ORDER BY a COLLATE testcoll_rules1; DROP TABLE test7; CREATE COLLATION testcoll_rulesx (provider = icu, locale = '', rules = '!!wrong!!'); -ERROR: could not open collator for locale "" with rules "!!wrong!!": U_INVALID_FORMAT_ERROR +NOTICE: using standard form "und" for locale "" +ERROR: could not open collator for locale "und" with rules "!!wrong!!": U_INVALID_FORMAT_ERROR -- nondeterministic collations CREATE COLLATION ctest_det (provider = icu, locale = '', deterministic = true); +NOTICE: using standard form "und" for locale "" CREATE COLLATION ctest_nondet (provider = icu, locale = '', 
deterministic = false); +NOTICE: using standard form "und" for locale "" CREATE TABLE test6 (a int, b text); -- same string in different normal forms INSERT INTO test6 VALUES (1, U&'\00E4bc'); @@ -1298,7 +1319,9 @@ SELECT * FROM test6a WHERE b = ARRAY['äbc'] COLLATE ctest_nondet; (2 rows) CREATE COLLATION case_sensitive (provider = icu, locale = ''); +NOTICE: using standard form "und" for locale "" CREATE COLLATION case_insensitive (provider = icu, locale = '@colStrength=secondary', deterministic = false); +NOTICE: using standard form "und-u-ks-level2" for locale "@colStrength=secondary" SELECT 'abc' <= 'ABC' COLLATE case_sensitive, 'abc' >= 'ABC' COLLATE case_sensitive; ?column? | ?column? ----------+---------- @@ -1780,7 +1803,9 @@ SELECT * FROM outer_text WHERE (f1, f2) NOT IN (SELECT * FROM inner_text); (2 rows) -- accents +SET client_min_messages=WARNING; CREATE COLLATION ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes', deterministic = false); +RESET client_min_messages; CREATE TABLE test4 (a int, b text); INSERT INTO test4 VALUES (1, 'cote'), (2, 'côte'), (3, 'coté'), (4, 'côté'); SELECT * FROM test4 WHERE b = 'cote'; diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index 95d96f2eb8..85e26951b6 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -357,6 +357,8 @@ CREATE ROLE regress_test_role; CREATE SCHEMA test_schema; -- We need to do this this way to cope with varying names for encodings: +SET client_min_messages TO WARNING; + do $$ BEGIN EXECUTE 'CREATE COLLATION test0 (provider = icu, locale = ' || @@ -370,9 +372,14 @@ BEGIN quote_literal(current_setting('lc_collate')) || ');'; END $$; + +RESET client_min_messages; + CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale" CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); -- fails +CREATE COLLATION testx (provider = 
icu, locale = '@colStrength=primary;nonsense=yes'); -- fails SET icu_validation_level = WARNING; +CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); DROP COLLATION testx; CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); DROP COLLATION testx; RESET icu_validation_level; @@ -457,10 +464,14 @@ SELECT * FROM collate_test2 ORDER BY b COLLATE UNICODE; -- test the attributes handled by icu_set_collation_attributes() +SET client_min_messages=WARNING; CREATE COLLATION testcoll_ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes'); +RESET client_min_messages; SELECT 'aaá' > 'AAA' COLLATE "und-x-icu", 'aaá' < 'AAA' COLLATE testcoll_ignore_accents; +SET client_min_messages=WARNING; CREATE COLLATION testcoll_backwards (provider = icu, locale = '@colBackwards=yes'); +RESET client_min_messages; SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE testcoll_backwards; CREATE COLLATION testcoll_lower_first (provider = icu, locale = '@colCaseFirst=lower'); @@ -470,7 +481,9 @@ SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE testcol CREATE COLLATION testcoll_shifted (provider = icu, locale = '@colAlternate=shifted'); SELECT 'de-luge' < 'deanza' COLLATE "und-x-icu", 'de-luge' > 'deanza' COLLATE testcoll_shifted; +SET client_min_messages=WARNING; CREATE COLLATION testcoll_numeric (provider = icu, locale = '@colNumeric=yes'); +RESET client_min_messages; SELECT 'A-21' > 'A-123' COLLATE "und-x-icu", 'A-21' < 'A-123' COLLATE testcoll_numeric; CREATE COLLATION testcoll_error1 (provider = icu, locale = '@colNumeric=lower'); @@ -659,7 +672,9 @@ INSERT INTO inner_text VALUES ('a', NULL); SELECT * FROM outer_text WHERE (f1, f2) NOT IN (SELECT * FROM inner_text); -- accents +SET client_min_messages=WARNING; CREATE COLLATION ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes', deterministic = false); +RESET client_min_messages; CREATE TABLE test4 
(a int, b text); INSERT INTO test4 VALUES (1, 'cote'), (2, 'côte'), (3, 'coté'), (4, 'côté');