Use ICU by default at initdb time.

If the ICU locale is not specified, initialize the default collator
and retrieve the locale name from that.

Discussion: https://postgr.es/m/510d284759f6e943ce15096167760b2edcb2e700.camel@j-davis.com
Reviewed-by: Peter Eisentraut
This commit is contained in:
Jeff Davis 2023-03-09 10:52:41 -08:00
parent a7e584a7d6
commit 27b62377b4
18 changed files with 147 additions and 42 deletions

View File

@ -1,9 +1,16 @@
/* /*
* This test must be run in a database with UTF-8 encoding * This test must be run in a database with UTF-8 encoding
* and a Unicode-aware locale. * and a Unicode-aware locale.
*
* Also disable this file for ICU, because the test for the
* Turkish dotted I is not correct for many ICU locales. citext always
* uses the default collation, so it's not easy to restrict the test
* to the "tr-TR-x-icu" collation where it will succeed.
*/ */
SELECT getdatabaseencoding() <> 'UTF8' OR SELECT getdatabaseencoding() <> 'UTF8' OR
current_setting('lc_ctype') = 'C' current_setting('lc_ctype') = 'C' OR
(SELECT datlocprovider='i' FROM pg_database
WHERE datname=current_database())
AS skip_test \gset AS skip_test \gset
\if :skip_test \if :skip_test
\quit \quit

View File

@ -1,9 +1,16 @@
/* /*
* This test must be run in a database with UTF-8 encoding * This test must be run in a database with UTF-8 encoding
* and a Unicode-aware locale. * and a Unicode-aware locale.
*
* Also disable this file for ICU, because the test for the
* Turkish dotted I is not correct for many ICU locales. citext always
* uses the default collation, so it's not easy to restrict the test
* to the "tr-TR-x-icu" collation where it will succeed.
*/ */
SELECT getdatabaseencoding() <> 'UTF8' OR SELECT getdatabaseencoding() <> 'UTF8' OR
current_setting('lc_ctype') = 'C' current_setting('lc_ctype') = 'C' OR
(SELECT datlocprovider='i' FROM pg_database
WHERE datname=current_database())
AS skip_test \gset AS skip_test \gset
\if :skip_test \if :skip_test
\quit \quit

View File

@ -1,10 +1,17 @@
/* /*
* This test must be run in a database with UTF-8 encoding * This test must be run in a database with UTF-8 encoding
* and a Unicode-aware locale. * and a Unicode-aware locale.
*
* Also disable this file for ICU, because the test for the
* Turkish dotted I is not correct for many ICU locales. citext always
* uses the default collation, so it's not easy to restrict the test
* to the "tr-TR-x-icu" collation where it will succeed.
*/ */
SELECT getdatabaseencoding() <> 'UTF8' OR SELECT getdatabaseencoding() <> 'UTF8' OR
current_setting('lc_ctype') = 'C' current_setting('lc_ctype') = 'C' OR
(SELECT datlocprovider='i' FROM pg_database
WHERE datname=current_database())
AS skip_test \gset AS skip_test \gset
\if :skip_test \if :skip_test
\quit \quit

View File

@ -1,3 +1,12 @@
-- unaccent is broken if the default collation is provided by ICU and
-- LC_CTYPE=C
SELECT current_setting('lc_ctype') = 'C' AND
(SELECT datlocprovider='i' FROM pg_database
WHERE datname=current_database())
AS skip_test \gset
\if :skip_test
\quit
\endif
CREATE EXTENSION unaccent; CREATE EXTENSION unaccent;
-- must have a UTF8 database -- must have a UTF8 database
SELECT getdatabaseencoding(); SELECT getdatabaseencoding();

View File

@ -0,0 +1,8 @@
-- unaccent is broken if the default collation is provided by ICU and
-- LC_CTYPE=C
SELECT current_setting('lc_ctype') = 'C' AND
(SELECT datlocprovider='i' FROM pg_database
WHERE datname=current_database())
AS skip_test \gset
\if :skip_test
\quit

View File

@ -1,3 +1,14 @@
-- unaccent is broken if the default collation is provided by ICU and
-- LC_CTYPE=C
SELECT current_setting('lc_ctype') = 'C' AND
(SELECT datlocprovider='i' FROM pg_database
WHERE datname=current_database())
AS skip_test \gset
\if :skip_test
\quit
\endif
CREATE EXTENSION unaccent; CREATE EXTENSION unaccent;
-- must have a UTF8 database -- must have a UTF8 database

View File

@ -89,10 +89,28 @@ PostgreSQL documentation
and character set encoding. These can also be set separately for each and character set encoding. These can also be set separately for each
database when it is created. <command>initdb</command> determines those database when it is created. <command>initdb</command> determines those
settings for the template databases, which will serve as the default for settings for the template databases, which will serve as the default for
all other databases. By default, <command>initdb</command> uses the all other databases.
locale provider <literal>libc</literal>, takes the locale settings from </para>
the environment, and determines the encoding from the locale settings.
This is almost always sufficient, unless there are special requirements. <para>
By default, <command>initdb</command> uses the ICU library to provide
locale services if the server was built with ICU support; otherwise it uses
the <literal>libc</literal> locale provider (see <xref
linkend="locale-providers"/>). To choose the specific ICU locale ID to
apply, use the option <option>--icu-locale</option>. Note that for
implementation reasons and to support legacy code,
<command>initdb</command> will still select and initialize libc locale
settings when the ICU locale provider is used.
</para>
<para>
Alternatively, <command>initdb</command> can use the locale provider
<literal>libc</literal>. To select this option, specify
<literal>--locale-provider=libc</literal>, or build the server without ICU
support. The <literal>libc</literal> locale provider takes the locale
settings from the environment, and determines the encoding from the locale
settings. This is almost always sufficient, unless there are special
requirements.
</para> </para>
<para> <para>
@ -103,17 +121,6 @@ PostgreSQL documentation
categories can give nonsensical results, so this should be used with care. categories can give nonsensical results, so this should be used with care.
</para> </para>
<para>
Alternatively, the ICU library can be used to provide locale services.
(Again, this only sets the default for subsequently created databases.) To
select this option, specify <literal>--locale-provider=icu</literal>.
To choose the specific ICU locale ID to apply, use the option
<option>--icu-locale</option>. Note that
for implementation reasons and to support legacy code,
<command>initdb</command> will still select and initialize libc locale
settings when the ICU locale provider is used.
</para>
<para> <para>
When <command>initdb</command> runs, it will print out the locale settings When <command>initdb</command> runs, it will print out the locale settings
it has chosen. If you have complex requirements or specified multiple it has chosen. If you have complex requirements or specified multiple
@ -234,7 +241,13 @@ PostgreSQL documentation
<term><option>--icu-locale=<replaceable>locale</replaceable></option></term> <term><option>--icu-locale=<replaceable>locale</replaceable></option></term>
<listitem> <listitem>
<para> <para>
Specifies the ICU locale ID, if the ICU locale provider is used. Specifies the ICU locale when the ICU provider is used. Locale support
is described in <xref linkend="locale"/>.
</para>
<para>
If this option is not specified, the locale is inherited from the
environment in which <command>initdb</command> runs. The environment's
locale is matched to a similar ICU locale name, if possible.
</para> </para>
</listitem> </listitem>
</varlistentry> </varlistentry>
@ -307,10 +320,12 @@ PostgreSQL documentation
<term><option>--locale-provider={<literal>libc</literal>|<literal>icu</literal>}</option></term> <term><option>--locale-provider={<literal>libc</literal>|<literal>icu</literal>}</option></term>
<listitem> <listitem>
<para> <para>
This option sets the locale provider for databases created in the This option sets the locale provider for databases created in the new
new cluster. It can be overridden in the <command>CREATE cluster. It can be overridden in the <command>CREATE
DATABASE</command> command when new databases are subsequently DATABASE</command> command when new databases are subsequently
created. The default is <literal>libc</literal>. created. The default is <literal>icu</literal> if the server was
built with ICU support; otherwise the default is
<literal>libc</literal> (see <xref linkend="locale-providers"/>).
</para> </para>
</listitem> </listitem>
</varlistentry> </varlistentry>

View File

@ -16,7 +16,7 @@ subdir = src/bin/initdb
top_builddir = ../../.. top_builddir = ../../..
include $(top_builddir)/src/Makefile.global include $(top_builddir)/src/Makefile.global
override CPPFLAGS := -I$(libpq_srcdir) -I$(top_srcdir)/src/timezone $(CPPFLAGS) override CPPFLAGS := -I$(libpq_srcdir) -I$(top_srcdir)/src/timezone $(ICU_CFLAGS) $(CPPFLAGS)
# Note: it's important that we link to encnames.o from libpgcommon, not # Note: it's important that we link to encnames.o from libpgcommon, not
# from libpq, else we have risks of version skew if we run with a libpq # from libpq, else we have risks of version skew if we run with a libpq
@ -24,7 +24,7 @@ override CPPFLAGS := -I$(libpq_srcdir) -I$(top_srcdir)/src/timezone $(CPPFLAGS)
# should ensure that that happens. # should ensure that that happens.
# #
# We need libpq only because fe_utils does. # We need libpq only because fe_utils does.
LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) $(ICU_LIBS)
# use system timezone data? # use system timezone data?
ifneq (,$(with_system_tzdata)) ifneq (,$(with_system_tzdata))

View File

@ -53,6 +53,9 @@
#include <netdb.h> #include <netdb.h>
#include <sys/socket.h> #include <sys/socket.h>
#include <sys/stat.h> #include <sys/stat.h>
#ifdef USE_ICU
#include <unicode/ucol.h>
#endif
#include <unistd.h> #include <unistd.h>
#include <signal.h> #include <signal.h>
#include <time.h> #include <time.h>
@ -133,7 +136,11 @@ static char *lc_monetary = NULL;
static char *lc_numeric = NULL; static char *lc_numeric = NULL;
static char *lc_time = NULL; static char *lc_time = NULL;
static char *lc_messages = NULL; static char *lc_messages = NULL;
#ifdef USE_ICU
static char locale_provider = COLLPROVIDER_ICU;
#else
static char locale_provider = COLLPROVIDER_LIBC; static char locale_provider = COLLPROVIDER_LIBC;
#endif
static char *icu_locale = NULL; static char *icu_locale = NULL;
static char *icu_rules = NULL; static char *icu_rules = NULL;
static const char *default_text_search_config = NULL; static const char *default_text_search_config = NULL;
@ -2028,6 +2035,50 @@ check_icu_locale_encoding(int user_enc)
return true; return true;
} }
/*
 * Validate the ICU locale that initdb is going to use.
 *
 * A collator is opened for icu_locale; a NULL icu_locale asks ICU for its
 * default collator.  If the collator cannot be opened, initdb exits with a
 * fatal error.  When no locale was given, the valid locale name reported by
 * the default collator is copied into icu_locale so that later steps have an
 * explicit name to work with.
 *
 * In builds without ICU support this function does nothing.
 */
static void
check_icu_locale(void)
{
#ifdef USE_ICU
	UErrorCode	err = U_ZERO_ERROR;
	UCollator  *coll = ucol_open(icu_locale, &err);
	const char *valid_name;

	if (U_FAILURE(err))
	{
		if (icu_locale == NULL)
			pg_fatal("could not open collator for default locale: %s",
					 u_errorName(err));
		else
			pg_fatal("could not open collator for locale \"%s\": %s",
					 icu_locale, u_errorName(err));
	}

	/* An explicitly specified locale only needed to be openable. */
	if (icu_locale != NULL)
	{
		ucol_close(coll);
		return;
	}

	/* Nothing was specified: take the locale name from the default collator. */
	err = U_ZERO_ERROR;
	valid_name = ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &err);
	if (U_FAILURE(err))
	{
		/* release the collator before bailing out */
		ucol_close(coll);
		pg_fatal("could not determine default ICU locale");
	}

	/* copy the name before closing the collator it was obtained from */
	icu_locale = pg_strdup(valid_name);
	ucol_close(coll);
#endif
}
/* /*
* set up the locale variables * set up the locale variables
* *
@ -2081,8 +2132,7 @@ setlocales(void)
if (locale_provider == COLLPROVIDER_ICU) if (locale_provider == COLLPROVIDER_ICU)
{ {
if (!icu_locale) check_icu_locale();
pg_fatal("ICU locale must be specified");
/* /*
* In supported builds, the ICU locale ID will be checked by the * In supported builds, the ICU locale ID will be checked by the

View File

@ -97,11 +97,6 @@ SKIP:
if ($ENV{with_icu} eq 'yes') if ($ENV{with_icu} eq 'yes')
{ {
command_fails_like(
[ 'initdb', '--no-sync', '--locale-provider=icu', "$tempdir/data2" ],
qr/initdb: error: ICU locale must be specified/,
'locale provider ICU requires --icu-locale');
command_ok( command_ok(
[ [
'initdb', '--no-sync', 'initdb', '--no-sync',
@ -116,7 +111,7 @@ if ($ENV{with_icu} eq 'yes')
'--locale-provider=icu', '--icu-locale=@colNumeric=lower', '--locale-provider=icu', '--icu-locale=@colNumeric=lower',
"$tempdir/dataX" "$tempdir/dataX"
], ],
qr/FATAL: could not open collator for locale/, qr/error: could not open collator for locale/,
'fails for invalid ICU locale'); 'fails for invalid ICU locale');
command_fails_like( command_fails_like(

View File

@ -1758,7 +1758,7 @@ my %tests = (
create_sql => create_sql =>
"CREATE DATABASE dump_test2 LOCALE = 'C' TEMPLATE = template0;", "CREATE DATABASE dump_test2 LOCALE = 'C' TEMPLATE = template0;",
regexp => qr/^ regexp => qr/^
\QCREATE DATABASE dump_test2 \E.*\QLOCALE = 'C';\E \QCREATE DATABASE dump_test2 \E.*\QLOCALE = 'C'\E
/xm, /xm,
like => { pg_dumpall_dbprivs => 1, }, like => { pg_dumpall_dbprivs => 1, },
}, },

View File

@ -13,7 +13,7 @@ program_version_ok('createdb');
program_options_handling_ok('createdb'); program_options_handling_ok('createdb');
my $node = PostgreSQL::Test::Cluster->new('main'); my $node = PostgreSQL::Test::Cluster->new('main');
$node->init; $node->init(extra => ['--locale-provider=libc']);
$node->start; $node->start;
$node->issues_sql_like( $node->issues_sql_like(

View File

@ -14,9 +14,6 @@ override CPPFLAGS := \
'-DSHELLPROG="$(SHELL)"' \ '-DSHELLPROG="$(SHELL)"' \
$(CPPFLAGS) $(CPPFLAGS)
# default encoding for regression tests
ENCODING = SQL_ASCII
ifneq ($(build_os),mingw32) ifneq ($(build_os),mingw32)
abs_builddir := $(shell pwd) abs_builddir := $(shell pwd)
else else

View File

@ -55,7 +55,7 @@ exec sql end declare section;
exec sql connect to 'unix:postgresql://localhost/ecpg2_regression' as main user :user USING "connectpw"; exec sql connect to 'unix:postgresql://localhost/ecpg2_regression' as main user :user USING "connectpw";
exec sql disconnect main; exec sql disconnect main;
exec sql connect to unix:postgresql://localhost/ecpg2_regression?connect_timeout=180&client_encoding=latin1 as main user regress_ecpg_user1/connectpw; exec sql connect to unix:postgresql://localhost/ecpg2_regression?connect_timeout=180&client_encoding=sql_ascii as main user regress_ecpg_user1/connectpw;
exec sql disconnect main; exec sql disconnect main;
exec sql connect to "unix:postgresql://200.46.204.71/ecpg2_regression" as main user regress_ecpg_user1/connectpw; exec sql connect to "unix:postgresql://200.46.204.71/ecpg2_regression" as main user regress_ecpg_user1/connectpw;

View File

@ -117,7 +117,7 @@ main(void)
#line 56 "test5.pgc" #line 56 "test5.pgc"
{ ECPGconnect(__LINE__, 0, "unix:postgresql://localhost/ecpg2_regression?connect_timeout=180 & client_encoding=latin1" , "regress_ecpg_user1" , "connectpw" , "main", 0); } { ECPGconnect(__LINE__, 0, "unix:postgresql://localhost/ecpg2_regression?connect_timeout=180 & client_encoding=sql_ascii" , "regress_ecpg_user1" , "connectpw" , "main", 0); }
#line 58 "test5.pgc" #line 58 "test5.pgc"
{ ECPGdisconnect(__LINE__, "main");} { ECPGdisconnect(__LINE__, "main");}

View File

@ -50,7 +50,7 @@
[NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: sqlca: code: 0, state: 00000
[NO_PID]: ecpg_finish: connection main closed [NO_PID]: ecpg_finish: connection main closed
[NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: sqlca: code: 0, state: 00000
[NO_PID]: ECPGconnect: opening database ecpg2_regression on <DEFAULT> port <DEFAULT> with options connect_timeout=180 & client_encoding=latin1 for user regress_ecpg_user1 [NO_PID]: ECPGconnect: opening database ecpg2_regression on <DEFAULT> port <DEFAULT> with options connect_timeout=180 & client_encoding=sql_ascii for user regress_ecpg_user1
[NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: sqlca: code: 0, state: 00000
[NO_PID]: ecpg_finish: connection main closed [NO_PID]: ecpg_finish: connection main closed
[NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: sqlca: code: 0, state: 00000

View File

@ -69,7 +69,6 @@ ecpg_test_files = files(
ecpg_regress_args = [ ecpg_regress_args = [
'--dbname=ecpg1_regression,ecpg2_regression', '--dbname=ecpg1_regression,ecpg2_regression',
'--create-role=regress_ecpg_user1,regress_ecpg_user2', '--create-role=regress_ecpg_user1,regress_ecpg_user2',
'--encoding=SQL_ASCII',
] ]
tests += { tests += {

View File

@ -12,7 +12,7 @@ if ($ENV{with_icu} ne 'yes')
} }
my $node1 = PostgreSQL::Test::Cluster->new('node1'); my $node1 = PostgreSQL::Test::Cluster->new('node1');
$node1->init; $node1->init(extra => ['--locale-provider=libc']);
$node1->start; $node1->start;
$node1->safe_psql('postgres', $node1->safe_psql('postgres',