diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index edb09d4e35..bb416c8674 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -79,6 +79,7 @@ OBJS = \ orderedsetaggs.o \ partitionfuncs.o \ pg_locale.o \ + pg_locale_icu.o \ pg_lsn.o \ pg_upgrade_support.o \ pgstatfuncs.o \ diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build index 8c6fc80c37..19a27465a2 100644 --- a/src/backend/utils/adt/meson.build +++ b/src/backend/utils/adt/meson.build @@ -66,6 +66,7 @@ backend_sources += files( 'orderedsetaggs.c', 'partitionfuncs.c', 'pg_locale.c', + 'pg_locale_icu.c', 'pg_lsn.c', 'pg_upgrade_support.c', 'pgstatfuncs.c', diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index b4954959f9..313200009b 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -69,11 +69,6 @@ #include "utils/pg_locale.h" #include "utils/syscache.h" -#ifdef USE_ICU -#include -#include -#endif - #ifdef __GLIBC__ #include #endif @@ -94,6 +89,21 @@ #define MAX_L10N_DATA 80 +/* pg_locale_icu.c */ +#ifdef USE_ICU +extern UCollator *pg_ucol_open(const char *loc_str); +extern UCollator *make_icu_collator(const char *iculocstr, + const char *icurules); +extern int strncoll_icu(const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, + pg_locale_t locale); +extern size_t strnxfrm_icu(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); +extern size_t strnxfrm_prefix_icu(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); +#endif /* GUC settings */ char *locale_messages; @@ -163,25 +173,6 @@ static pg_locale_t last_collation_cache_locale = NULL; static char *IsoLocaleName(const char *); #endif -#ifdef USE_ICU -/* - * Converter object for converting between ICU's UChar strings and C strings - * in database encoding. Since the database encoding doesn't change, we only - * need one of these per session. - */ -static UConverter *icu_converter = NULL; - -static UCollator *pg_ucol_open(const char *loc_str); -static void init_icu_converter(void); -static size_t uchar_length(UConverter *converter, - const char *str, int32_t len); -static int32_t uchar_convert(UConverter *converter, - UChar *dest, int32_t destlen, - const char *src, int32_t srclen); -static void icu_set_collation_attributes(UCollator *collator, const char *loc, - UErrorCode *status); -#endif - /* * POSIX doesn't define _l-variants of these functions, but several systems * have them. We provide our own replacements here. @@ -1391,76 +1382,6 @@ make_libc_collator(const char *collate, const char *ctype) return loc; } -/* - * Create a UCollator with the given locale string and rules. - * - * Ensure that no path leaks a UCollator. - */ -#ifdef USE_ICU -static UCollator * -make_icu_collator(const char *iculocstr, const char *icurules) -{ - if (!icurules) - { - /* simple case without rules */ - return pg_ucol_open(iculocstr); - } - else - { - UCollator *collator_std_rules; - UCollator *collator_all_rules; - const UChar *std_rules; - UChar *my_rules; - UChar *all_rules; - int32_t length; - int32_t total; - UErrorCode status; - - /* - * If rules are specified, we extract the rules of the standard - * collation, add our own rules, and make a new collator with the - * combined rules. - */ - icu_to_uchar(&my_rules, icurules, strlen(icurules)); - - collator_std_rules = pg_ucol_open(iculocstr); - - std_rules = ucol_getRules(collator_std_rules, &length); - - total = u_strlen(std_rules) + u_strlen(my_rules) + 1; - - /* avoid leaking collator on OOM */ - all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM); - if (!all_rules) - { - ucol_close(collator_std_rules); - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - } - - u_strcpy(all_rules, std_rules); - u_strcat(all_rules, my_rules); - - ucol_close(collator_std_rules); - - status = U_ZERO_ERROR; - collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules), - UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH, - NULL, &status); - if (U_FAILURE(status)) - { - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s", - iculocstr, icurules, u_errorName(status)))); - } - - return collator_all_rules; - } -} -#endif /* not USE_ICU */ - /* * Initialize default_locale with database locale settings. */ @@ -1969,104 +1890,6 @@ strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, return result; } -#ifdef USE_ICU - -/* - * strncoll_icu_no_utf8 - * - * Convert the arguments from the database encoding to UChar strings, then - * call ucol_strcoll(). An argument length of -1 means that the string is - * NUL-terminated. - * - * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(), - * caller should call that instead. - */ -static int -strncoll_icu_no_utf8(const char *arg1, ssize_t len1, - const char *arg2, ssize_t len2, pg_locale_t locale) -{ - char sbuf[TEXTBUFLEN]; - char *buf = sbuf; - int32_t ulen1; - int32_t ulen2; - size_t bufsize1; - size_t bufsize2; - UChar *uchar1, - *uchar2; - int result; - - Assert(locale->provider == COLLPROVIDER_ICU); -#ifdef HAVE_UCOL_STRCOLLUTF8 - Assert(GetDatabaseEncoding() != PG_UTF8); -#endif - - init_icu_converter(); - - ulen1 = uchar_length(icu_converter, arg1, len1); - ulen2 = uchar_length(icu_converter, arg2, len2); - - bufsize1 = (ulen1 + 1) * sizeof(UChar); - bufsize2 = (ulen2 + 1) * sizeof(UChar); - - if (bufsize1 + bufsize2 > TEXTBUFLEN) - buf = palloc(bufsize1 + bufsize2); - - uchar1 = (UChar *) buf; - uchar2 = (UChar *) (buf + bufsize1); - - ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1); - ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2); - - result = ucol_strcoll(locale->info.icu.ucol, - uchar1, ulen1, - uchar2, ulen2); - - if (buf != sbuf) - pfree(buf); - - return result; -} - -/* - * strncoll_icu - * - * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given - * database encoding. An argument length of -1 means the string is - * NUL-terminated. - */ -static int -strncoll_icu(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, - pg_locale_t locale) -{ - int result; - - Assert(locale->provider == COLLPROVIDER_ICU); - -#ifdef HAVE_UCOL_STRCOLLUTF8 - if (GetDatabaseEncoding() == PG_UTF8) - { - UErrorCode status; - - status = U_ZERO_ERROR; - result = ucol_strcollUTF8(locale->info.icu.ucol, - arg1, len1, - arg2, len2, - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("collation failed: %s", u_errorName(status)))); - } - else -#endif - { - result = strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale); - } - - return result; -} - -#endif /* USE_ICU */ - /* * pg_strcoll * @@ -2162,143 +1985,6 @@ strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, return result; } -#ifdef USE_ICU - -/* 'srclen' of -1 means the strings are NUL-terminated */ -static size_t -strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, - pg_locale_t locale) -{ - char sbuf[TEXTBUFLEN]; - char *buf = sbuf; - UChar *uchar; - int32_t ulen; - size_t uchar_bsize; - Size result_bsize; - - Assert(locale->provider == COLLPROVIDER_ICU); - - init_icu_converter(); - - ulen = uchar_length(icu_converter, src, srclen); - - uchar_bsize = (ulen + 1) * sizeof(UChar); - - if (uchar_bsize > TEXTBUFLEN) - buf = palloc(uchar_bsize); - - uchar = (UChar *) buf; - - ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen); - - result_bsize = ucol_getSortKey(locale->info.icu.ucol, - uchar, ulen, - (uint8_t *) dest, destsize); - - /* - * ucol_getSortKey() counts the nul-terminator in the result length, but - * this function should not. - */ - Assert(result_bsize > 0); - result_bsize--; - - if (buf != sbuf) - pfree(buf); - - /* if dest is defined, it should be nul-terminated */ - Assert(result_bsize >= destsize || dest[result_bsize] == '\0'); - - return result_bsize; -} - -/* 'srclen' of -1 means the strings are NUL-terminated */ -static size_t -strnxfrm_prefix_icu_no_utf8(char *dest, size_t destsize, - const char *src, ssize_t srclen, - pg_locale_t locale) -{ - char sbuf[TEXTBUFLEN]; - char *buf = sbuf; - UCharIterator iter; - uint32_t state[2]; - UErrorCode status; - int32_t ulen = -1; - UChar *uchar = NULL; - size_t uchar_bsize; - Size result_bsize; - - Assert(locale->provider == COLLPROVIDER_ICU); - Assert(GetDatabaseEncoding() != PG_UTF8); - - init_icu_converter(); - - ulen = uchar_length(icu_converter, src, srclen); - - uchar_bsize = (ulen + 1) * sizeof(UChar); - - if (uchar_bsize > TEXTBUFLEN) - buf = palloc(uchar_bsize); - - uchar = (UChar *) buf; - - ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen); - - uiter_setString(&iter, uchar, ulen); - state[0] = state[1] = 0; /* won't need that again */ - status = U_ZERO_ERROR; - result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol, - &iter, - state, - (uint8_t *) dest, - destsize, - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("sort key generation failed: %s", - u_errorName(status)))); - - return result_bsize; -} - -/* 'srclen' of -1 means the strings are NUL-terminated */ -static size_t -strnxfrm_prefix_icu(char *dest, size_t destsize, - const char *src, ssize_t srclen, - pg_locale_t locale) -{ - size_t result; - - Assert(locale->provider == COLLPROVIDER_ICU); - - if (GetDatabaseEncoding() == PG_UTF8) - { - UCharIterator iter; - uint32_t state[2]; - UErrorCode status; - - uiter_setUTF8(&iter, src, srclen); - state[0] = state[1] = 0; /* won't need that again */ - status = U_ZERO_ERROR; - result = ucol_nextSortKeyPart(locale->info.icu.ucol, - &iter, - state, - (uint8_t *) dest, - destsize, - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("sort key generation failed: %s", - u_errorName(status)))); - } - else - result = strnxfrm_prefix_icu_no_utf8(dest, destsize, src, srclen, - locale); - - return result; -} - -#endif - /* * Return true if the collation provider supports pg_strxfrm() and * pg_strnxfrm(); otherwise false. @@ -2509,354 +2195,6 @@ builtin_validate_locale(int encoding, const char *locale) } -#ifdef USE_ICU - -/* - * Wrapper around ucol_open() to handle API differences for older ICU - * versions. - * - * Ensure that no path leaks a UCollator. - */ -static UCollator * -pg_ucol_open(const char *loc_str) -{ - UCollator *collator; - UErrorCode status; - const char *orig_str = loc_str; - char *fixed_str = NULL; - - /* - * Must never open default collator, because it depends on the environment - * and may change at any time. Should not happen, but check here to catch - * bugs that might be hard to catch otherwise. - * - * NB: the default collator is not the same as the collator for the root - * locale. The root locale may be specified as the empty string, "und", or - * "root". The default collator is opened by passing NULL to ucol_open(). - */ - if (loc_str == NULL) - elog(ERROR, "opening default collator is not supported"); - - /* - * In ICU versions 54 and earlier, "und" is not a recognized spelling of - * the root locale. If the first component of the locale is "und", replace - * with "root" before opening. - */ - if (U_ICU_VERSION_MAJOR_NUM < 55) - { - char lang[ULOC_LANG_CAPACITY]; - - status = U_ZERO_ERROR; - uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status); - if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) - { - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("could not get language from locale \"%s\": %s", - loc_str, u_errorName(status)))); - } - - if (strcmp(lang, "und") == 0) - { - const char *remainder = loc_str + strlen("und"); - - fixed_str = palloc(strlen("root") + strlen(remainder) + 1); - strcpy(fixed_str, "root"); - strcat(fixed_str, remainder); - - loc_str = fixed_str; - } - } - - status = U_ZERO_ERROR; - collator = ucol_open(loc_str, &status); - if (U_FAILURE(status)) - ereport(ERROR, - /* use original string for error report */ - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("could not open collator for locale \"%s\": %s", - orig_str, u_errorName(status)))); - - if (U_ICU_VERSION_MAJOR_NUM < 54) - { - status = U_ZERO_ERROR; - icu_set_collation_attributes(collator, loc_str, &status); - - /* - * Pretend the error came from ucol_open(), for consistent error - * message across ICU versions. - */ - if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) - { - ucol_close(collator); - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("could not open collator for locale \"%s\": %s", - orig_str, u_errorName(status)))); - } - } - - if (fixed_str != NULL) - pfree(fixed_str); - - return collator; -} - -static void -init_icu_converter(void) -{ - const char *icu_encoding_name; - UErrorCode status; - UConverter *conv; - - if (icu_converter) - return; /* already done */ - - icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding()); - if (!icu_encoding_name) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("encoding \"%s\" not supported by ICU", - pg_encoding_to_char(GetDatabaseEncoding())))); - - status = U_ZERO_ERROR; - conv = ucnv_open(icu_encoding_name, &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("could not open ICU converter for encoding \"%s\": %s", - icu_encoding_name, u_errorName(status)))); - - icu_converter = conv; -} - -/* - * Find length, in UChars, of given string if converted to UChar string. - * - * A length of -1 indicates that the input string is NUL-terminated. - */ -static size_t -uchar_length(UConverter *converter, const char *str, int32_t len) -{ - UErrorCode status = U_ZERO_ERROR; - int32_t ulen; - - ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status); - if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) - ereport(ERROR, - (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status)))); - return ulen; -} - -/* - * Convert the given source string into a UChar string, stored in dest, and - * return the length (in UChars). - * - * A srclen of -1 indicates that the input string is NUL-terminated. - */ -static int32_t -uchar_convert(UConverter *converter, UChar *dest, int32_t destlen, - const char *src, int32_t srclen) -{ - UErrorCode status = U_ZERO_ERROR; - int32_t ulen; - - status = U_ZERO_ERROR; - ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status)))); - return ulen; -} - -/* - * Convert a string in the database encoding into a string of UChars. - * - * The source string at buff is of length nbytes - * (it needn't be nul-terminated) - * - * *buff_uchar receives a pointer to the palloc'd result string, and - * the function's result is the number of UChars generated. - * - * The result string is nul-terminated, though most callers rely on the - * result length instead. - */ -int32_t -icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes) -{ - int32_t len_uchar; - - init_icu_converter(); - - len_uchar = uchar_length(icu_converter, buff, nbytes); - - *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar)); - len_uchar = uchar_convert(icu_converter, - *buff_uchar, len_uchar + 1, buff, nbytes); - - return len_uchar; -} - -/* - * Convert a string of UChars into the database encoding. - * - * The source string at buff_uchar is of length len_uchar - * (it needn't be nul-terminated) - * - * *result receives a pointer to the palloc'd result string, and the - * function's result is the number of bytes generated (not counting nul). - * - * The result string is nul-terminated. - */ -int32_t -icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar) -{ - UErrorCode status; - int32_t len_result; - - init_icu_converter(); - - status = U_ZERO_ERROR; - len_result = ucnv_fromUChars(icu_converter, NULL, 0, - buff_uchar, len_uchar, &status); - if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) - ereport(ERROR, - (errmsg("%s failed: %s", "ucnv_fromUChars", - u_errorName(status)))); - - *result = palloc(len_result + 1); - - status = U_ZERO_ERROR; - len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1, - buff_uchar, len_uchar, &status); - if (U_FAILURE(status) || - status == U_STRING_NOT_TERMINATED_WARNING) - ereport(ERROR, - (errmsg("%s failed: %s", "ucnv_fromUChars", - u_errorName(status)))); - - return len_result; -} - -/* - * Parse collation attributes from the given locale string and apply them to - * the open collator. - * - * First, the locale string is canonicalized to an ICU format locale ID such - * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies - * the key-value arguments. - * - * Starting with ICU version 54, the attributes are processed automatically by - * ucol_open(), so this is only necessary for emulating this behavior on older - * versions. - */ -pg_attribute_unused() -static void -icu_set_collation_attributes(UCollator *collator, const char *loc, - UErrorCode *status) -{ - int32_t len; - char *icu_locale_id; - char *lower_str; - char *str; - char *token; - - /* - * The input locale may be a BCP 47 language tag, e.g. - * "und-u-kc-ks-level1", which expresses the same attributes in a - * different form. It will be converted to the equivalent ICU format - * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by - * uloc_canonicalize(). - */ - *status = U_ZERO_ERROR; - len = uloc_canonicalize(loc, NULL, 0, status); - icu_locale_id = palloc(len + 1); - *status = U_ZERO_ERROR; - len = uloc_canonicalize(loc, icu_locale_id, len + 1, status); - if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING) - return; - - lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id)); - - pfree(icu_locale_id); - - str = strchr(lower_str, '@'); - if (!str) - return; - str++; - - while ((token = strsep(&str, ";"))) - { - char *e = strchr(token, '='); - - if (e) - { - char *name; - char *value; - UColAttribute uattr; - UColAttributeValue uvalue; - - *status = U_ZERO_ERROR; - - *e = '\0'; - name = token; - value = e + 1; - - /* - * See attribute name and value lists in ICU i18n/coll.cpp - */ - if (strcmp(name, "colstrength") == 0) - uattr = UCOL_STRENGTH; - else if (strcmp(name, "colbackwards") == 0) - uattr = UCOL_FRENCH_COLLATION; - else if (strcmp(name, "colcaselevel") == 0) - uattr = UCOL_CASE_LEVEL; - else if (strcmp(name, "colcasefirst") == 0) - uattr = UCOL_CASE_FIRST; - else if (strcmp(name, "colalternate") == 0) - uattr = UCOL_ALTERNATE_HANDLING; - else if (strcmp(name, "colnormalization") == 0) - uattr = UCOL_NORMALIZATION_MODE; - else if (strcmp(name, "colnumeric") == 0) - uattr = UCOL_NUMERIC_COLLATION; - else - /* ignore if unknown */ - continue; - - if (strcmp(value, "primary") == 0) - uvalue = UCOL_PRIMARY; - else if (strcmp(value, "secondary") == 0) - uvalue = UCOL_SECONDARY; - else if (strcmp(value, "tertiary") == 0) - uvalue = UCOL_TERTIARY; - else if (strcmp(value, "quaternary") == 0) - uvalue = UCOL_QUATERNARY; - else if (strcmp(value, "identical") == 0) - uvalue = UCOL_IDENTICAL; - else if (strcmp(value, "no") == 0) - uvalue = UCOL_OFF; - else if (strcmp(value, "yes") == 0) - uvalue = UCOL_ON; - else if (strcmp(value, "shifted") == 0) - uvalue = UCOL_SHIFTED; - else if (strcmp(value, "non-ignorable") == 0) - uvalue = UCOL_NON_IGNORABLE; - else if (strcmp(value, "lower") == 0) - uvalue = UCOL_LOWER_FIRST; - else if (strcmp(value, "upper") == 0) - uvalue = UCOL_UPPER_FIRST; - else - { - *status = U_ILLEGAL_ARGUMENT_ERROR; - break; - } - - ucol_setAttribute(collator, uattr, uvalue, status); - } - } - - pfree(lower_str); -} -#endif /* * Return the BCP47 language tag representation of the requested locale. diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c new file mode 100644 index 0000000000..2a87e25dfb --- /dev/null +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -0,0 +1,708 @@ +/*----------------------------------------------------------------------- + * + * PostgreSQL locale utilities for ICU + * + * Portions Copyright (c) 2002-2024, PostgreSQL Global Development Group + * + * src/backend/utils/adt/pg_locale_icu.c + * + *----------------------------------------------------------------------- + */ + +#include "postgres.h" + +#ifdef USE_ICU + +#include +#include + +#include "catalog/pg_collation.h" +#include "mb/pg_wchar.h" +#include "utils/formatting.h" +#include "utils/pg_locale.h" + +/* + * Size of stack buffer to use for string transformations, used to avoid heap + * allocations in typical cases. This should be large enough that most strings + * will fit, but small enough that we feel comfortable putting it on the + * stack. + */ +#define TEXTBUFLEN 1024 + +extern UCollator *pg_ucol_open(const char *loc_str); +extern UCollator *make_icu_collator(const char *iculocstr, + const char *icurules); +extern int strncoll_icu(const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, + pg_locale_t locale); +extern size_t strnxfrm_icu(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); +extern size_t strnxfrm_prefix_icu(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + +/* + * Converter object for converting between ICU's UChar strings and C strings + * in database encoding. Since the database encoding doesn't change, we only + * need one of these per session. + */ +static UConverter *icu_converter = NULL; + +static int strncoll_icu_no_utf8(const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, + pg_locale_t locale); +static size_t strnxfrm_prefix_icu_no_utf8(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); +static void init_icu_converter(void); +static size_t uchar_length(UConverter *converter, + const char *str, int32_t len); +static int32_t uchar_convert(UConverter *converter, + UChar *dest, int32_t destlen, + const char *src, int32_t srclen); +static void icu_set_collation_attributes(UCollator *collator, const char *loc, + UErrorCode *status); + +/* + * Wrapper around ucol_open() to handle API differences for older ICU + * versions. + * + * Ensure that no path leaks a UCollator. + */ +UCollator * +pg_ucol_open(const char *loc_str) +{ + UCollator *collator; + UErrorCode status; + const char *orig_str = loc_str; + char *fixed_str = NULL; + + /* + * Must never open default collator, because it depends on the environment + * and may change at any time. Should not happen, but check here to catch + * bugs that might be hard to catch otherwise. + * + * NB: the default collator is not the same as the collator for the root + * locale. The root locale may be specified as the empty string, "und", or + * "root". The default collator is opened by passing NULL to ucol_open(). + */ + if (loc_str == NULL) + elog(ERROR, "opening default collator is not supported"); + + /* + * In ICU versions 54 and earlier, "und" is not a recognized spelling of + * the root locale. If the first component of the locale is "und", replace + * with "root" before opening. + */ + if (U_ICU_VERSION_MAJOR_NUM < 55) + { + char lang[ULOC_LANG_CAPACITY]; + + status = U_ZERO_ERROR; + uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status); + if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not get language from locale \"%s\": %s", + loc_str, u_errorName(status)))); + } + + if (strcmp(lang, "und") == 0) + { + const char *remainder = loc_str + strlen("und"); + + fixed_str = palloc(strlen("root") + strlen(remainder) + 1); + strcpy(fixed_str, "root"); + strcat(fixed_str, remainder); + + loc_str = fixed_str; + } + } + + status = U_ZERO_ERROR; + collator = ucol_open(loc_str, &status); + if (U_FAILURE(status)) + ereport(ERROR, + /* use original string for error report */ + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not open collator for locale \"%s\": %s", + orig_str, u_errorName(status)))); + + if (U_ICU_VERSION_MAJOR_NUM < 54) + { + status = U_ZERO_ERROR; + icu_set_collation_attributes(collator, loc_str, &status); + + /* + * Pretend the error came from ucol_open(), for consistent error + * message across ICU versions. + */ + if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) + { + ucol_close(collator); + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not open collator for locale \"%s\": %s", + orig_str, u_errorName(status)))); + } + } + + if (fixed_str != NULL) + pfree(fixed_str); + + return collator; +} + +/* + * Create a UCollator with the given locale string and rules. + * + * Ensure that no path leaks a UCollator. + */ +UCollator * +make_icu_collator(const char *iculocstr, const char *icurules) +{ + if (!icurules) + { + /* simple case without rules */ + return pg_ucol_open(iculocstr); + } + else + { + UCollator *collator_std_rules; + UCollator *collator_all_rules; + const UChar *std_rules; + UChar *my_rules; + UChar *all_rules; + int32_t length; + int32_t total; + UErrorCode status; + + /* + * If rules are specified, we extract the rules of the standard + * collation, add our own rules, and make a new collator with the + * combined rules. + */ + icu_to_uchar(&my_rules, icurules, strlen(icurules)); + + collator_std_rules = pg_ucol_open(iculocstr); + + std_rules = ucol_getRules(collator_std_rules, &length); + + total = u_strlen(std_rules) + u_strlen(my_rules) + 1; + + /* avoid leaking collator on OOM */ + all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM); + if (!all_rules) + { + ucol_close(collator_std_rules); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + u_strcpy(all_rules, std_rules); + u_strcat(all_rules, my_rules); + + ucol_close(collator_std_rules); + + status = U_ZERO_ERROR; + collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules), + UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH, + NULL, &status); + if (U_FAILURE(status)) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s", + iculocstr, icurules, u_errorName(status)))); + } + + return collator_all_rules; + } +} + +/* + * strncoll_icu + * + * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given + * database encoding. An argument length of -1 means the string is + * NUL-terminated. + */ +int +strncoll_icu(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, + pg_locale_t locale) +{ + int result; + + Assert(locale->provider == COLLPROVIDER_ICU); + +#ifdef HAVE_UCOL_STRCOLLUTF8 + if (GetDatabaseEncoding() == PG_UTF8) + { + UErrorCode status; + + status = U_ZERO_ERROR; + result = ucol_strcollUTF8(locale->info.icu.ucol, + arg1, len1, + arg2, len2, + &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("collation failed: %s", u_errorName(status)))); + } + else +#endif + { + result = strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale); + } + + return result; +} + +/* 'srclen' of -1 means the strings are NUL-terminated */ +size_t +strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + char sbuf[TEXTBUFLEN]; + char *buf = sbuf; + UChar *uchar; + int32_t ulen; + size_t uchar_bsize; + Size result_bsize; + + Assert(locale->provider == COLLPROVIDER_ICU); + + init_icu_converter(); + + ulen = uchar_length(icu_converter, src, srclen); + + uchar_bsize = (ulen + 1) * sizeof(UChar); + + if (uchar_bsize > TEXTBUFLEN) + buf = palloc(uchar_bsize); + + uchar = (UChar *) buf; + + ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen); + + result_bsize = ucol_getSortKey(locale->info.icu.ucol, + uchar, ulen, + (uint8_t *) dest, destsize); + + /* + * ucol_getSortKey() counts the nul-terminator in the result length, but + * this function should not. + */ + Assert(result_bsize > 0); + result_bsize--; + + if (buf != sbuf) + pfree(buf); + + /* if dest is defined, it should be nul-terminated */ + Assert(result_bsize >= destsize || dest[result_bsize] == '\0'); + + return result_bsize; +} + +/* 'srclen' of -1 means the strings are NUL-terminated */ +size_t +strnxfrm_prefix_icu(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale) +{ + size_t result; + + Assert(locale->provider == COLLPROVIDER_ICU); + + if (GetDatabaseEncoding() == PG_UTF8) + { + UCharIterator iter; + uint32_t state[2]; + UErrorCode status; + + uiter_setUTF8(&iter, src, srclen); + state[0] = state[1] = 0; /* won't need that again */ + status = U_ZERO_ERROR; + result = ucol_nextSortKeyPart(locale->info.icu.ucol, + &iter, + state, + (uint8_t *) dest, + destsize, + &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("sort key generation failed: %s", + u_errorName(status)))); + } + else + result = strnxfrm_prefix_icu_no_utf8(dest, destsize, src, srclen, + locale); + + return result; +} + +/* + * Convert a string in the database encoding into a string of UChars. + * + * The source string at buff is of length nbytes + * (it needn't be nul-terminated) + * + * *buff_uchar receives a pointer to the palloc'd result string, and + * the function's result is the number of UChars generated. + * + * The result string is nul-terminated, though most callers rely on the + * result length instead. + */ +int32_t +icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes) +{ + int32_t len_uchar; + + init_icu_converter(); + + len_uchar = uchar_length(icu_converter, buff, nbytes); + + *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar)); + len_uchar = uchar_convert(icu_converter, + *buff_uchar, len_uchar + 1, buff, nbytes); + + return len_uchar; +} + +/* + * Convert a string of UChars into the database encoding. + * + * The source string at buff_uchar is of length len_uchar + * (it needn't be nul-terminated) + * + * *result receives a pointer to the palloc'd result string, and the + * function's result is the number of bytes generated (not counting nul). + * + * The result string is nul-terminated. + */ +int32_t +icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar) +{ + UErrorCode status; + int32_t len_result; + + init_icu_converter(); + + status = U_ZERO_ERROR; + len_result = ucnv_fromUChars(icu_converter, NULL, 0, + buff_uchar, len_uchar, &status); + if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) + ereport(ERROR, + (errmsg("%s failed: %s", "ucnv_fromUChars", + u_errorName(status)))); + + *result = palloc(len_result + 1); + + status = U_ZERO_ERROR; + len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1, + buff_uchar, len_uchar, &status); + if (U_FAILURE(status) || + status == U_STRING_NOT_TERMINATED_WARNING) + ereport(ERROR, + (errmsg("%s failed: %s", "ucnv_fromUChars", + u_errorName(status)))); + + return len_result; +} + +/* + * strncoll_icu_no_utf8 + * + * Convert the arguments from the database encoding to UChar strings, then + * call ucol_strcoll(). An argument length of -1 means that the string is + * NUL-terminated. + * + * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(), + * caller should call that instead. + */ +static int +strncoll_icu_no_utf8(const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, pg_locale_t locale) +{ + char sbuf[TEXTBUFLEN]; + char *buf = sbuf; + int32_t ulen1; + int32_t ulen2; + size_t bufsize1; + size_t bufsize2; + UChar *uchar1, + *uchar2; + int result; + + Assert(locale->provider == COLLPROVIDER_ICU); +#ifdef HAVE_UCOL_STRCOLLUTF8 + Assert(GetDatabaseEncoding() != PG_UTF8); +#endif + + init_icu_converter(); + + ulen1 = uchar_length(icu_converter, arg1, len1); + ulen2 = uchar_length(icu_converter, arg2, len2); + + bufsize1 = (ulen1 + 1) * sizeof(UChar); + bufsize2 = (ulen2 + 1) * sizeof(UChar); + + if (bufsize1 + bufsize2 > TEXTBUFLEN) + buf = palloc(bufsize1 + bufsize2); + + uchar1 = (UChar *) buf; + uchar2 = (UChar *) (buf + bufsize1); + + ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1); + ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2); + + result = ucol_strcoll(locale->info.icu.ucol, + uchar1, ulen1, + uchar2, ulen2); + + if (buf != sbuf) + pfree(buf); + + return result; +} + +/* 'srclen' of -1 means the strings are NUL-terminated */ +static size_t +strnxfrm_prefix_icu_no_utf8(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale) +{ + char sbuf[TEXTBUFLEN]; + char *buf = sbuf; + UCharIterator iter; + uint32_t state[2]; + UErrorCode status; + int32_t ulen = -1; + UChar *uchar = NULL; + size_t uchar_bsize; + Size result_bsize; + + Assert(locale->provider == COLLPROVIDER_ICU); + Assert(GetDatabaseEncoding() != PG_UTF8); + + init_icu_converter(); + + ulen = uchar_length(icu_converter, src, srclen); + + uchar_bsize = (ulen + 1) * sizeof(UChar); + + if (uchar_bsize > TEXTBUFLEN) + buf = palloc(uchar_bsize); + + uchar = (UChar *) buf; + + ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen); + + uiter_setString(&iter, uchar, ulen); + state[0] = state[1] = 0; /* won't need that again */ + status = U_ZERO_ERROR; + result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol, + &iter, + state, + (uint8_t *) dest, + destsize, + &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("sort key generation failed: %s", + u_errorName(status)))); + + return result_bsize; +} + +static void +init_icu_converter(void) +{ + const char *icu_encoding_name; + UErrorCode status; + UConverter *conv; + + if (icu_converter) + return; /* already done */ + + icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding()); + if (!icu_encoding_name) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("encoding \"%s\" not supported by ICU", + pg_encoding_to_char(GetDatabaseEncoding())))); + + status = U_ZERO_ERROR; + conv = ucnv_open(icu_encoding_name, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("could not open ICU converter for encoding \"%s\": %s", + icu_encoding_name, u_errorName(status)))); + + icu_converter = conv; +} + +/* + * Find length, in UChars, of given string if converted to UChar string. + * + * A length of -1 indicates that the input string is NUL-terminated. + */ +static size_t +uchar_length(UConverter *converter, const char *str, int32_t len) +{ + UErrorCode status = U_ZERO_ERROR; + int32_t ulen; + + ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status); + if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) + ereport(ERROR, + (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status)))); + return ulen; +} + +/* + * Convert the given source string into a UChar string, stored in dest, and + * return the length (in UChars). + * + * A srclen of -1 indicates that the input string is NUL-terminated. + */ +static int32_t +uchar_convert(UConverter *converter, UChar *dest, int32_t destlen, + const char *src, int32_t srclen) +{ + UErrorCode status = U_ZERO_ERROR; + int32_t ulen; + + status = U_ZERO_ERROR; + ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status)))); + return ulen; +} + +/* + * Parse collation attributes from the given locale string and apply them to + * the open collator. + * + * First, the locale string is canonicalized to an ICU format locale ID such + * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies + * the key-value arguments. + * + * Starting with ICU version 54, the attributes are processed automatically by + * ucol_open(), so this is only necessary for emulating this behavior on older + * versions. + */ +pg_attribute_unused() +static void +icu_set_collation_attributes(UCollator *collator, const char *loc, + UErrorCode *status) +{ + int32_t len; + char *icu_locale_id; + char *lower_str; + char *str; + char *token; + + /* + * The input locale may be a BCP 47 language tag, e.g. + * "und-u-kc-ks-level1", which expresses the same attributes in a + * different form. It will be converted to the equivalent ICU format + * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by + * uloc_canonicalize(). + */ + *status = U_ZERO_ERROR; + len = uloc_canonicalize(loc, NULL, 0, status); + icu_locale_id = palloc(len + 1); + *status = U_ZERO_ERROR; + len = uloc_canonicalize(loc, icu_locale_id, len + 1, status); + if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING) + return; + + lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id)); + + pfree(icu_locale_id); + + str = strchr(lower_str, '@'); + if (!str) + return; + str++; + + while ((token = strsep(&str, ";"))) + { + char *e = strchr(token, '='); + + if (e) + { + char *name; + char *value; + UColAttribute uattr; + UColAttributeValue uvalue; + + *status = U_ZERO_ERROR; + + *e = '\0'; + name = token; + value = e + 1; + + /* + * See attribute name and value lists in ICU i18n/coll.cpp + */ + if (strcmp(name, "colstrength") == 0) + uattr = UCOL_STRENGTH; + else if (strcmp(name, "colbackwards") == 0) + uattr = UCOL_FRENCH_COLLATION; + else if (strcmp(name, "colcaselevel") == 0) + uattr = UCOL_CASE_LEVEL; + else if (strcmp(name, "colcasefirst") == 0) + uattr = UCOL_CASE_FIRST; + else if (strcmp(name, "colalternate") == 0) + uattr = UCOL_ALTERNATE_HANDLING; + else if (strcmp(name, "colnormalization") == 0) + uattr = UCOL_NORMALIZATION_MODE; + else if (strcmp(name, "colnumeric") == 0) + uattr = UCOL_NUMERIC_COLLATION; + else + /* ignore if unknown */ + continue; + + if (strcmp(value, "primary") == 0) + uvalue = UCOL_PRIMARY; + else if (strcmp(value, "secondary") == 0) + uvalue = UCOL_SECONDARY; + else if (strcmp(value, "tertiary") == 0) + uvalue = UCOL_TERTIARY; + else if (strcmp(value, "quaternary") == 0) + uvalue = UCOL_QUATERNARY; + else if (strcmp(value, "identical") == 0) + uvalue = UCOL_IDENTICAL; + else if (strcmp(value, "no") == 0) + uvalue = UCOL_OFF; + else if (strcmp(value, "yes") == 0) + uvalue = UCOL_ON; + else if (strcmp(value, "shifted") == 0) + uvalue = UCOL_SHIFTED; + else if (strcmp(value, "non-ignorable") == 0) + uvalue = UCOL_NON_IGNORABLE; + else if (strcmp(value, "lower") == 0) + uvalue = UCOL_LOWER_FIRST; + else if (strcmp(value, "upper") == 0) + uvalue = UCOL_UPPER_FIRST; + else + { + *status = U_ILLEGAL_ARGUMENT_ERROR; + break; + } + + ucol_setAttribute(collator, uattr, uvalue, status); + } + } + + pfree(lower_str); +} + +#endif /* USE_ICU */