From 58274728fb8e087049df67c0eee903d9743fdeda Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 30 Jan 2014 19:07:06 -0500 Subject: [PATCH] Be forgiving of variant spellings of locale names in pg_upgrade. Even though the server tries to canonicalize stored locale names, the platform often doesn't cooperate, so it's entirely possible that one DB thinks its locale is, say, "en_US.UTF-8" while the other has "en_US.utf8". Rather than failing, we should try to allow this where it's clearly OK. There is already pretty robust encoding lookup in encnames.c, so make use of that to compare the encoding parts of the names. The locale identifier parts are just compared case-insensitively, which we were already doing. The major problem known to exist in the field is variant encoding-name spellings, so hopefully this will be Good Enough. If not, we can try being even laxer. Pavel Raiskup, reviewed by Rushabh Lathia --- contrib/pg_upgrade/check.c | 82 ++++++++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 13 deletions(-) diff --git a/contrib/pg_upgrade/check.c b/contrib/pg_upgrade/check.c index e395c7c7f6..d52051a6e9 100644 --- a/contrib/pg_upgrade/check.c +++ b/contrib/pg_upgrade/check.c @@ -9,6 +9,7 @@ #include "postgres_fe.h" +#include "mb/pg_wchar.h" #include "pg_upgrade.h" @@ -16,6 +17,8 @@ static void set_locale_and_encoding(ClusterInfo *cluster); static void check_new_cluster_is_empty(void); static void check_locale_and_encoding(ControlData *oldctrl, ControlData *newctrl); +static bool equivalent_locale(const char *loca, const char *locb); +static bool equivalent_encoding(const char *chara, const char *charb); static void check_is_super_user(ClusterInfo *cluster); static void check_for_prepared_transactions(ClusterInfo *cluster); static void check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster); @@ -397,27 +400,80 @@ set_locale_and_encoding(ClusterInfo *cluster) /* * check_locale_and_encoding() * - * locale is not in pg_controldata in 8.4 and later so - * we probably had to get via a database query. + * Check that old and new locale and encoding match. Even though the backend + * tries to canonicalize stored locale names, the platform often doesn't + * cooperate, so it's entirely possible that one DB thinks its locale is + * "en_US.UTF-8" while the other says "en_US.utf8". Try to be forgiving. */ static void check_locale_and_encoding(ControlData *oldctrl, ControlData *newctrl) { - /* - * These are often defined with inconsistent case, so use pg_strcasecmp(). - * They also often use inconsistent hyphenation, which we cannot fix, e.g. - * UTF-8 vs. UTF8, so at least we display the mismatching values. - */ - if (pg_strcasecmp(oldctrl->lc_collate, newctrl->lc_collate) != 0) + if (!equivalent_locale(oldctrl->lc_collate, newctrl->lc_collate)) pg_fatal("lc_collate cluster values do not match: old \"%s\", new \"%s\"\n", - oldctrl->lc_collate, newctrl->lc_collate); - if (pg_strcasecmp(oldctrl->lc_ctype, newctrl->lc_ctype) != 0) + oldctrl->lc_collate, newctrl->lc_collate); + if (!equivalent_locale(oldctrl->lc_ctype, newctrl->lc_ctype)) pg_fatal("lc_ctype cluster values do not match: old \"%s\", new \"%s\"\n", - oldctrl->lc_ctype, newctrl->lc_ctype); - if (pg_strcasecmp(oldctrl->encoding, newctrl->encoding) != 0) + oldctrl->lc_ctype, newctrl->lc_ctype); + if (!equivalent_encoding(oldctrl->encoding, newctrl->encoding)) pg_fatal("encoding cluster values do not match: old \"%s\", new \"%s\"\n", - oldctrl->encoding, newctrl->encoding); + oldctrl->encoding, newctrl->encoding); +} + +/* + * equivalent_locale() + * + * Best effort locale-name comparison. Return false if we are not 100% sure + * the locales are equivalent. + */ +static bool +equivalent_locale(const char *loca, const char *locb) +{ + const char *chara = strrchr(loca, '.'); + const char *charb = strrchr(locb, '.'); + int lencmp; + + /* If they don't both contain an encoding part, just do strcasecmp(). */ + if (!chara || !charb) + return (pg_strcasecmp(loca, locb) == 0); + + /* Compare the encoding parts. */ + if (!equivalent_encoding(chara + 1, charb + 1)) + return false; + + /* + * OK, compare the locale identifiers (e.g. en_US part of en_US.utf8). + * + * It's tempting to ignore non-alphanumeric chars here, but for now it's + * not clear that that's necessary; just do case-insensitive comparison. + */ + lencmp = chara - loca; + if (lencmp != charb - locb) + return false; + + return (pg_strncasecmp(loca, locb, lencmp) == 0); +} + +/* + * equivalent_encoding() + * + * Best effort encoding-name comparison. Return true only if the encodings + * are valid server-side encodings and known equivalent. + * + * Because the lookup in pg_valid_server_encoding() does case folding and + * ignores non-alphanumeric characters, this will recognize many popular + * variant spellings as equivalent, eg "utf8" and "UTF-8" will match. + */ +static bool +equivalent_encoding(const char *chara, const char *charb) +{ + int enca = pg_valid_server_encoding(chara); + int encb = pg_valid_server_encoding(charb); + + if (enca < 0 || encb < 0) + return false; + + return (enca == encb); }