Refactor to add pg_strcoll(), pg_strxfrm(), and variants.

Offers a generally better separation of responsibilities for collation code. Also, a step towards multi-lib ICU, which should be based on a clean separation of the routines required for collation providers. Callers with NUL-terminated strings should call pg_strcoll() or pg_strxfrm(); callers with strings and their length should call the variants pg_strncoll() or pg_strnxfrm(). Reviewed-by: Peter Eisentraut, Peter Geoghegan Discussion: https://postgr.es/m/a581136455c940d7bd0ff482d3a2bd51af25a94f.camel%40j-davis.com
2023-02-23 10:55:20 -08:00 · 2023-02-23 10:55:20 -08:00 · d87d548cd0
commit d87d548cd0
parent e9960732a9
5 changed files with 870 additions and 390 deletions
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@ -292,21 +292,24 @@ hashtext(PG_FUNCTION_ARGS)
 #ifdef USE_ICU
 		if (mylocale->provider == COLLPROVIDER_ICU)
 		{
-			int32_t		ulen = -1;
+			Size		bsize, rsize;
-			UChar	   *uchar = NULL;
+			char	   *buf;
-			Size		bsize;
+			const char *keydata = VARDATA_ANY(key);
-			uint8_t    *buf;
+			size_t		keylen = VARSIZE_ANY_EXHDR(key);
-			ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key));
+			bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale);
 			buf = palloc(bsize + 1);
-			bsize = ucol_getSortKey(mylocale->info.icu.ucol,
+			rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale);
-									uchar, ulen, NULL, 0);
+			if (rsize != bsize)
-			buf = palloc(bsize);
+				elog(ERROR, "pg_strnxfrm() returned unexpected result");
 			ucol_getSortKey(mylocale->info.icu.ucol,
 							uchar, ulen, buf, bsize);
 			pfree(uchar);
-			result = hash_any(buf, bsize);
+			/*
 			 * In principle, there's no reason to include the terminating NUL
 			 * character in the hash, but it was done before and the behavior
 			 * must be preserved.
 			 */
 			result = hash_any((uint8_t *) buf, bsize + 1);
 			pfree(buf);
 		}
@ -350,21 +353,25 @@ hashtextextended(PG_FUNCTION_ARGS)
 #ifdef USE_ICU
 		if (mylocale->provider == COLLPROVIDER_ICU)
 		{
-			int32_t		ulen = -1;
+			Size		bsize, rsize;
-			UChar	   *uchar = NULL;
+			char	   *buf;
-			Size		bsize;
+			const char *keydata = VARDATA_ANY(key);
-			uint8_t    *buf;
+			size_t		keylen = VARSIZE_ANY_EXHDR(key);
-			ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key));
+			bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale);
 			buf = palloc(bsize + 1);
-			bsize = ucol_getSortKey(mylocale->info.icu.ucol,
+			rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale);
-									uchar, ulen, NULL, 0);
+			if (rsize != bsize)
-			buf = palloc(bsize);
+				elog(ERROR, "pg_strnxfrm() returned unexpected result");
 			ucol_getSortKey(mylocale->info.icu.ucol,
 							uchar, ulen, buf, bsize);
 			pfree(uchar);
-			result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1));
+			/*
 			 * In principle, there's no reason to include the terminating NUL
 			 * character in the hash, but it was done before and the behavior
 			 * must be preserved.
 			 */
 			result = hash_any_extended((uint8_t *) buf, bsize + 1,
 									   PG_GETARG_INT64(1));
 			pfree(buf);
 		}
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@ -79,6 +79,12 @@
 #include <shlwapi.h>
 #endif
 /*
 * This should be large enough that most strings will fit, but small enough
 * that we feel comfortable putting it on the stack
 */
 #define		TEXTBUFLEN			1024
 #define		MAX_L10N_DATA		80
@ -123,6 +129,19 @@ static char *IsoLocaleName(const char *);
 #endif
 #ifdef USE_ICU
 /*
 * Converter object for converting between ICU's UChar strings and C strings
 * in database encoding.  Since the database encoding doesn't change, we only
 * need one of these per session.
 */
 static UConverter *icu_converter = NULL;
 static void init_icu_converter(void);
 static size_t uchar_length(UConverter *converter,
 						   const char *str, int32_t len);
 static int32_t uchar_convert(UConverter *converter,
 							 UChar *dest, int32_t destlen,
 							 const char *str, int32_t srclen);
 static void icu_set_collation_attributes(UCollator *collator, const char *loc);
 #endif
@ -1731,15 +1750,705 @@ get_collation_actual_version(char collprovider, const char *collcollate)
 	return collversion;
 }
 /*
 * pg_strncoll_libc_win32_utf8
 *
 * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
 * invoke wcscoll() or wcscoll_l().
 */
 #ifdef WIN32
 static int
 pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
 							size_t len2, pg_locale_t locale)
 {
 	char		sbuf[TEXTBUFLEN];
 	char	   *buf = sbuf;
 	char	   *a1p,
 			   *a2p;
 	/* byte sizes: room for len + 1 two-byte wide characters per argument */
 	int			a1len = len1 * 2 + 2;
 	int			a2len = len2 * 2 + 2;
 	int			r;
 	int			result;
 	Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
 	Assert(GetDatabaseEncoding() == PG_UTF8);
 	/* both wide strings share one buffer; palloc only if sbuf is too small */
 	if (a1len + a2len > TEXTBUFLEN)
 		buf = palloc(a1len + a2len);
 	a1p = buf;
 	a2p = buf + a1len;
 	/* API does not work for zero-length input */
 	if (len1 == 0)
 		r = 0;
 	else
 	{
 		r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
 								(LPWSTR) a1p, a1len / 2);
 		if (!r)
 			ereport(ERROR,
 					(errmsg("could not convert string to UTF-16: error code %lu",
 							GetLastError())));
 	}
 	/* r is the number of wide characters written; terminate after them */
 	((LPWSTR) a1p)[r] = 0;
 	if (len2 == 0)
 		r = 0;
 	else
 	{
 		r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
 								(LPWSTR) a2p, a2len / 2);
 		if (!r)
 			ereport(ERROR,
 					(errmsg("could not convert string to UTF-16: error code %lu",
 							GetLastError())));
 	}
 	((LPWSTR) a2p)[r] = 0;
 	/* clear errno so that %m below reports only a failure of the comparison */
 	errno = 0;
 #ifdef HAVE_LOCALE_T
 	if (locale)
 		result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
 	else
 #endif
 		result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
 	if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw
 								 * headers */
 		ereport(ERROR,
 				(errmsg("could not compare Unicode strings: %m")));
 	if (buf != sbuf)
 		pfree(buf);
 	return result;
 }
 #endif							/* WIN32 */
 /*
  * pg_strcoll_libc
  *
  * Compare two nul-terminated strings, both in the database encoding, using
  * the C library.  Depending on platform, encoding and locale this ends up in
  * strcoll(), strcoll_l(), wcscoll() or wcscoll_l().  A NULL locale selects
  * the database collation.
  */
 static int
 pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale)
 {
 	Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
 #ifdef WIN32
 	/* UTF-8 on Windows goes through the wide-character comparison path */
 	if (GetDatabaseEncoding() == PG_UTF8)
 		return pg_strncoll_libc_win32_utf8(arg1, strlen(arg1),
 										   arg2, strlen(arg2), locale);
 #endif							/* WIN32 */
 	/* no explicit locale: plain strcoll() under the database collation */
 	if (locale == NULL)
 		return strcoll(arg1, arg2);
 #ifdef HAVE_LOCALE_T
 	return strcoll_l(arg1, arg2, locale->info.lt);
 #else
 	/* shouldn't happen */
 	elog(ERROR, "unsupported collprovider: %c", locale->provider);
 	return 0;					/* keep compiler quiet */
 #endif
 }
 /*
  * pg_strncoll_libc
  *
  * Length-counted front end for pg_strcoll_libc(): copy both inputs into
  * scratch space, append nul terminators, and compare the copies.
  */
 static int
 pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
 				 pg_locale_t locale)
 {
 	char		stackbuf[TEXTBUFLEN];
 	char	   *scratch = stackbuf;
 	size_t		need1 = len1 + 1;
 	size_t		need2 = len2 + 1;
 	char	   *copy1;
 	char	   *copy2;
 	int			cmp;
 	Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
 #ifdef WIN32
 	/* this path takes counted strings directly, so skip the copying */
 	if (GetDatabaseEncoding() == PG_UTF8)
 		return pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
 #endif							/* WIN32 */
 	/* go to the heap only when the stack buffer cannot hold both copies */
 	if (need1 + need2 > TEXTBUFLEN)
 		scratch = palloc(need1 + need2);
 	copy1 = scratch;
 	copy2 = scratch + need1;
 	memcpy(copy1, arg1, len1);
 	copy1[len1] = '\0';
 	memcpy(copy2, arg2, len2);
 	copy2[len2] = '\0';
 	cmp = pg_strcoll_libc(copy1, copy2, locale);
 	if (scratch != stackbuf)
 		pfree(scratch);
 	return cmp;
 }
 #ifdef USE_ICU
 /*
 * Converter object for converting between ICU's UChar strings and C strings
 * in database encoding.  Since the database encoding doesn't change, we only
 * need one of these per session.
 */
 static UConverter *icu_converter = NULL;
 /*
  * pg_strncoll_icu_no_utf8
  *
  * Compare two strings given in the database encoding by first converting
  * each to a UChar string and then calling ucol_strcoll().  A length of -1
  * means the corresponding argument is NUL-terminated.
  *
  * If the database encoding is UTF-8 and ICU provides ucol_strcollUTF8(),
  * callers should use that instead of this routine.
  */
 static int
 pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1,
 						const char *arg2, int32_t len2, pg_locale_t locale)
 {
 	char		stackbuf[TEXTBUFLEN];
 	char	   *scratch = stackbuf;
 	int32_t		nuchars1;
 	int32_t		nuchars2;
 	size_t		nbytes1;
 	size_t		nbytes2;
 	UChar	   *ustr1;
 	UChar	   *ustr2;
 	int			cmp;
 	Assert(locale->provider == COLLPROVIDER_ICU);
 #ifdef HAVE_UCOL_STRCOLLUTF8
 	Assert(GetDatabaseEncoding() != PG_UTF8);
 #endif
 	init_icu_converter();
 	/* first pass: measure both strings so one buffer can hold them */
 	nuchars1 = uchar_length(icu_converter, arg1, len1);
 	nuchars2 = uchar_length(icu_converter, arg2, len2);
 	nbytes1 = (nuchars1 + 1) * sizeof(UChar);
 	nbytes2 = (nuchars2 + 1) * sizeof(UChar);
 	if (nbytes1 + nbytes2 > TEXTBUFLEN)
 		scratch = palloc(nbytes1 + nbytes2);
 	ustr1 = (UChar *) scratch;
 	ustr2 = (UChar *) (scratch + nbytes1);
 	/* second pass: do the actual conversions */
 	nuchars1 = uchar_convert(icu_converter, ustr1, nuchars1 + 1, arg1, len1);
 	nuchars2 = uchar_convert(icu_converter, ustr2, nuchars2 + 1, arg2, len2);
 	cmp = ucol_strcoll(locale->info.icu.ucol,
 					   ustr1, nuchars1,
 					   ustr2, nuchars2);
 	if (scratch != stackbuf)
 		pfree(scratch);
 	return cmp;
 }
 /*
  * pg_strncoll_icu
  *
  * ICU comparison of two strings in the database encoding.  Uses
  * ucol_strcollUTF8() when it is available and the database encoding is
  * UTF-8; otherwise goes through pg_strncoll_icu_no_utf8().  A length of -1
  * means the corresponding argument is NUL-terminated.
  */
 static int
 pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
 				pg_locale_t locale)
 {
 	Assert(locale->provider == COLLPROVIDER_ICU);
 #ifdef HAVE_UCOL_STRCOLLUTF8
 	if (GetDatabaseEncoding() == PG_UTF8)
 	{
 		UErrorCode	err = U_ZERO_ERROR;
 		int			cmp;
 		cmp = ucol_strcollUTF8(locale->info.icu.ucol,
 							   arg1, len1,
 							   arg2, len2,
 							   &err);
 		if (U_FAILURE(err))
 			ereport(ERROR,
 					(errmsg("collation failed: %s", u_errorName(err))));
 		return cmp;
 	}
 #endif
 	return pg_strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale);
 }
 #endif							/* USE_ICU */
 /*
  * pg_strcoll
  *
  * Compare two nul-terminated strings, both in the database encoding, by
  * dispatching to the libc or ICU implementation according to the locale's
  * provider.  A NULL locale means the database collation, handled by libc.
  *
  * Ties are not broken here: for a deterministic collation the caller must
  * do that itself.  This keeps the result consistent with pg_strxfrm(), which
  * cannot easily account for deterministic collations.
  */
 int
 pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
 {
 	if (locale == NULL || locale->provider == COLLPROVIDER_LIBC)
 		return pg_strcoll_libc(arg1, arg2, locale);
 #ifdef USE_ICU
 	/* -1 lengths tell the ICU routine the strings are NUL-terminated */
 	if (locale->provider == COLLPROVIDER_ICU)
 		return pg_strncoll_icu(arg1, -1, arg2, -1, locale);
 #endif
 	/* shouldn't happen */
 	elog(ERROR, "unsupported collprovider: %c", locale->provider);
 	return 0;					/* keep compiler quiet */
 }
 /*
  * pg_strncoll
  *
  * Length-counted counterpart of pg_strcoll(): compare two strings in the
  * database encoding by dispatching to the libc or ICU implementation
  * according to the locale's provider.  A NULL locale means the database
  * collation, handled by libc.
  *
  * The libc path may have to make nul-terminated copies of the arguments, so
  * callers that already hold nul-terminated strings should prefer
  * pg_strcoll().
  *
  * Ties are not broken here: for a deterministic collation the caller must
  * do that itself.  This keeps the result consistent with pg_strnxfrm(),
  * which cannot easily account for deterministic collations.
  */
 int
 pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2,
 			pg_locale_t locale)
 {
 	if (locale == NULL || locale->provider == COLLPROVIDER_LIBC)
 		return pg_strncoll_libc(arg1, len1, arg2, len2, locale);
 #ifdef USE_ICU
 	if (locale->provider == COLLPROVIDER_ICU)
 		return pg_strncoll_icu(arg1, len1, arg2, len2, locale);
 #endif
 	/* shouldn't happen */
 	elog(ERROR, "unsupported collprovider: %c", locale->provider);
 	return 0;					/* keep compiler quiet */
 }
 /*
  * pg_strxfrm_libc
  *
  * Transform the nul-terminated 'src' with strxfrm_l() (when a locale is
  * given and HAVE_LOCALE_T is defined) or strxfrm(), returning whatever the
  * C library returns.  A NULL locale means the database collation.
  *
  * In builds without TRUST_STRXFRM this always raises an error; callers are
  * expected to consult pg_strxfrm_enabled() first.
  */
 static size_t
 pg_strxfrm_libc(char *dest, const char *src, size_t destsize,
 				pg_locale_t locale)
 {
 	Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
 #ifdef TRUST_STRXFRM
 #ifdef HAVE_LOCALE_T
 	if (locale)
 		return strxfrm_l(dest, src, destsize, locale->info.lt);
 	else
 #endif
 		return strxfrm(dest, src, destsize);
 #else
 	/*
 	 * shouldn't happen.  'locale' is NULL for the database collation, so
 	 * report the libc provider rather than dereferencing a NULL pointer.
 	 */
 	elog(ERROR, "unsupported collprovider: %c",
 		 locale ? locale->provider : COLLPROVIDER_LIBC);
 	return 0;					/* keep compiler quiet */
 #endif
 }
 /*
  * pg_strnxfrm_libc
  *
  * Length-counted front end for pg_strxfrm_libc(): make a nul-terminated
  * copy of 'src' and transform that copy into 'dest'.
  */
 static size_t
 pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
 				 pg_locale_t locale)
 {
 	char		stackbuf[TEXTBUFLEN];
 	char	   *srccopy = stackbuf;
 	size_t		needed = srclen + 1;
 	size_t		xfrmlen;
 	Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
 	/* go to the heap only when the stack buffer cannot hold the copy */
 	if (needed > TEXTBUFLEN)
 		srccopy = palloc(needed);
 	memcpy(srccopy, src, srclen);
 	srccopy[srclen] = '\0';
 	xfrmlen = pg_strxfrm_libc(dest, srccopy, destsize, locale);
 	if (srccopy != stackbuf)
 		pfree(srccopy);
 	/* whenever the result fit into dest, it must be nul-terminated */
 	Assert(xfrmlen >= destsize || dest[xfrmlen] == '\0');
 	return xfrmlen;
 }
 #ifdef USE_ICU
 /*
  * pg_strnxfrm_icu
  *
  * Convert 'src' to a UChar string and let ucol_getSortKey() write its sort
  * key into 'dest'.  A 'srclen' of -1 means 'src' is NUL-terminated.  Returns
  * the sort key length, not counting the terminating nul byte.
  */
 static size_t
 pg_strnxfrm_icu(char *dest, const char *src, int32_t srclen, int32_t destsize,
 				pg_locale_t locale)
 {
 	char		stackbuf[TEXTBUFLEN];
 	char	   *scratch = stackbuf;
 	UChar	   *ustr;
 	int32_t		nuchars;
 	size_t		ustr_bytes;
 	Size		keylen;
 	Assert(locale->provider == COLLPROVIDER_ICU);
 	init_icu_converter();
 	/* measure first, then convert into a buffer of the right size */
 	nuchars = uchar_length(icu_converter, src, srclen);
 	ustr_bytes = (nuchars + 1) * sizeof(UChar);
 	if (ustr_bytes > TEXTBUFLEN)
 		scratch = palloc(ustr_bytes);
 	ustr = (UChar *) scratch;
 	nuchars = uchar_convert(icu_converter, ustr, nuchars + 1, src, srclen);
 	keylen = ucol_getSortKey(locale->info.icu.ucol,
 							 ustr, nuchars,
 							 (uint8_t *) dest, destsize);
 	if (scratch != stackbuf)
 		pfree(scratch);
 	/*
 	 * The length reported by ucol_getSortKey() includes the nul-terminator;
 	 * the value returned from here must not.
 	 */
 	Assert(keylen > 0);
 	keylen--;
 	/* whenever the key fit into dest, it must be nul-terminated */
 	Assert(keylen >= destsize || dest[keylen] == '\0');
 	return keylen;
 }
 /*
  * pg_strnxfrm_prefix_icu_no_utf8
  *
  * Convert 'src' from the (non-UTF-8) database encoding to a UChar string and
  * store up to 'destsize' leading bytes of its ICU sort key in 'dest', using
  * ucol_nextSortKeyPart().  A 'srclen' of -1 means 'src' is NUL-terminated.
  * Returns the value reported by ucol_nextSortKeyPart().
  */
 static size_t
 pg_strnxfrm_prefix_icu_no_utf8(char *dest, const char *src, int32_t srclen,
 							   int32_t destsize, pg_locale_t locale)
 {
 	char			 sbuf[TEXTBUFLEN];
 	char			*buf   = sbuf;
 	UCharIterator	 iter;
 	uint32_t		 state[2];
 	UErrorCode		 status;
 	int32_t			 ulen  = -1;
 	UChar			*uchar = NULL;
 	size_t			 uchar_bsize;
 	Size			 result_bsize;
 	Assert(locale->provider == COLLPROVIDER_ICU);
 	Assert(GetDatabaseEncoding() != PG_UTF8);
 	init_icu_converter();
 	/* measure first, then convert into a buffer of the right size */
 	ulen = uchar_length(icu_converter, src, srclen);
 	uchar_bsize = (ulen + 1) * sizeof(UChar);
 	if (uchar_bsize > TEXTBUFLEN)
 		buf = palloc(uchar_bsize);
 	uchar = (UChar *) buf;
 	ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
 	uiter_setString(&iter, uchar, ulen);
 	state[0] = state[1] = 0;	/* won't need that again */
 	status = U_ZERO_ERROR;
 	result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
 										&iter,
 										state,
 										(uint8_t *) dest,
 										destsize,
 										&status);
 	/*
 	 * The iterator is not used past this point, so the converted string can
 	 * be released now; previously a palloc'd buffer was never freed here.
 	 */
 	if (buf != sbuf)
 		pfree(buf);
 	if (U_FAILURE(status))
 		ereport(ERROR,
 				(errmsg("sort key generation failed: %s",
 						u_errorName(status))));
 	return result_bsize;
 }
 /*
  * pg_strnxfrm_prefix_icu
  *
  * Store up to 'destsize' leading bytes of the ICU sort key of 'src' in
  * 'dest'.  With a UTF-8 database encoding the input is iterated directly;
  * any other encoding goes through pg_strnxfrm_prefix_icu_no_utf8().  A
  * 'srclen' of -1 means 'src' is NUL-terminated.
  */
 static size_t
 pg_strnxfrm_prefix_icu(char *dest, const char *src, int32_t srclen,
 					   int32_t destsize, pg_locale_t locale)
 {
 	UCharIterator iter;
 	uint32_t	state[2] = {0, 0};	/* won't need that again */
 	UErrorCode	err = U_ZERO_ERROR;
 	size_t		nbytes;
 	Assert(locale->provider == COLLPROVIDER_ICU);
 	if (GetDatabaseEncoding() != PG_UTF8)
 		return pg_strnxfrm_prefix_icu_no_utf8(dest, src, srclen, destsize,
 											  locale);
 	uiter_setUTF8(&iter, src, srclen);
 	nbytes = ucol_nextSortKeyPart(locale->info.icu.ucol,
 								  &iter,
 								  state,
 								  (uint8_t *) dest,
 								  destsize,
 								  &err);
 	if (U_FAILURE(err))
 		ereport(ERROR,
 				(errmsg("sort key generation failed: %s",
 						u_errorName(err))));
 	return nbytes;
 }
 #endif
 /*
  * pg_strxfrm_enabled
  *
  * Report whether pg_strxfrm() and pg_strnxfrm() may be used with the given
  * collation provider.
  *
  * For libc the answer is "no" unless the build defines TRUST_STRXFRM.
  * strxfrm() for non-C collations appears to be broken on many common
  * platforms: testing of multiple glibc versions showed strcoll() and
  * strxfrm() giving inconsistent results for many locales.  Apart from
  * Cygwin, no other libc has so far been shown to have the problem, but the
  * conservative choice for now is to disable this categorically.  Users who
  * are certain their system is unaffected can define TRUST_STRXFRM.
  *
  * No similar problem is known for the ICU provider.
  */
 bool
 pg_strxfrm_enabled(pg_locale_t locale)
 {
 	if (locale != NULL && locale->provider != COLLPROVIDER_LIBC)
 	{
 		if (locale->provider == COLLPROVIDER_ICU)
 			return true;
 		/* shouldn't happen */
 		elog(ERROR, "unsupported collprovider: %c", locale->provider);
 	}
 	/* libc provider, or database collation */
 #ifdef TRUST_STRXFRM
 	return true;
 #else
 	return false;
 #endif
 }
 /*
  * pg_strxfrm
  *
  * Produce in 'dest' a nul-terminated transformed version of 'src', such
  * that plain strcmp() on two transformed strings orders them the same way
  * pg_strcoll() orders the untransformed strings.
  *
  * 'src' must be nul-terminated.  'dest' may be NULL when 'destsize' is zero.
  *
  * The return value is the number of bytes the transformed string needs, not
  * counting the terminating nul byte.  When it is 'destsize' or more, the
  * contents of 'dest' are undefined.
  */
 size_t
 pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
 {
 	if (locale == NULL || locale->provider == COLLPROVIDER_LIBC)
 		return pg_strxfrm_libc(dest, src, destsize, locale);
 #ifdef USE_ICU
 	/* a source length of -1 tells the ICU routine 'src' is NUL-terminated */
 	if (locale->provider == COLLPROVIDER_ICU)
 		return pg_strnxfrm_icu(dest, src, -1, destsize, locale);
 #endif
 	/* shouldn't happen */
 	elog(ERROR, "unsupported collprovider: %c", locale->provider);
 	return 0;					/* keep compiler quiet */
 }
 /*
  * pg_strnxfrm
  *
  * Length-counted counterpart of pg_strxfrm(): produce in 'dest' a
  * nul-terminated transformed version of 'src', such that plain strcmp() on
  * two transformed strings orders them the same way pg_strcoll() orders the
  * untransformed strings.
  *
  * 'src' need not be nul-terminated.  'dest' may be NULL when 'destsize' is
  * zero.
  *
  * The return value is the number of bytes the transformed string needs, not
  * counting the terminating nul byte.  When it is 'destsize' or more, the
  * contents of 'dest' are undefined.
  *
  * The libc path may have to make a nul-terminated copy of 'src', so callers
  * that already hold a nul-terminated string should prefer pg_strxfrm().
  */
 size_t
 pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen,
 			pg_locale_t locale)
 {
 	if (locale == NULL || locale->provider == COLLPROVIDER_LIBC)
 		return pg_strnxfrm_libc(dest, src, srclen, destsize, locale);
 #ifdef USE_ICU
 	if (locale->provider == COLLPROVIDER_ICU)
 		return pg_strnxfrm_icu(dest, src, srclen, destsize, locale);
 #endif
 	/* shouldn't happen */
 	elog(ERROR, "unsupported collprovider: %c", locale->provider);
 	return 0;					/* keep compiler quiet */
 }
 /*
  * pg_strxfrm_prefix_enabled
  *
  * Report whether pg_strxfrm_prefix() and pg_strnxfrm_prefix() may be used
  * with the given collation provider: true for ICU, false for libc and for
  * the database collation (NULL locale).
  */
 bool
 pg_strxfrm_prefix_enabled(pg_locale_t locale)
 {
 	if (locale == NULL || locale->provider == COLLPROVIDER_LIBC)
 		return false;
 	if (locale->provider == COLLPROVIDER_ICU)
 		return true;
 	/* shouldn't happen */
 	elog(ERROR, "unsupported collprovider: %c", locale->provider);
 	return false;				/* keep compiler quiet */
 }
 /*
  * pg_strxfrm_prefix
  *
  * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
  * memcmp() on the byte sequence is equivalent to pg_strcoll() on
  * untransformed strings. The result is not nul-terminated.
  *
  * The provided 'src' must be nul-terminated.
  *
  * If destsize is not large enough to hold the resulting byte sequence, stores
  * only the first destsize bytes in 'dest'. Returns the number of bytes
  * actually copied to 'dest'.
  *
  * Raises an error for the libc provider and for a NULL locale (database
  * collation); check pg_strxfrm_prefix_enabled() before calling.
  */
 size_t
 pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
 				  pg_locale_t locale)
 {
 	size_t		result = 0;		/* keep compiler quiet */
 	if (!locale || locale->provider == COLLPROVIDER_LIBC)
 		/* locale may be NULL here, so it must not be dereferenced blindly */
 		elog(ERROR, "collprovider '%c' does not support pg_strxfrm_prefix()",
 			 locale ? locale->provider : COLLPROVIDER_LIBC);
 #ifdef USE_ICU
 	else if (locale->provider == COLLPROVIDER_ICU)
 		/* a source length of -1 means 'src' is NUL-terminated */
 		result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
 #endif
 	else
 		/* shouldn't happen */
 		elog(ERROR, "unsupported collprovider: %c", locale->provider);
 	return result;
 }
 /*
  * pg_strnxfrm_prefix
  *
  * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
  * memcmp() on the byte sequence is equivalent to pg_strcoll() on
  * untransformed strings. The result is not nul-terminated.
  *
  * 'src' does not need to be nul-terminated; exactly 'srclen' bytes of it
  * are used.
  *
  * If destsize is not large enough to hold the resulting byte sequence, stores
  * only the first destsize bytes in 'dest'. Returns the number of bytes
  * actually copied to 'dest'.
  *
  * Raises an error for the libc provider and for a NULL locale (database
  * collation); check pg_strxfrm_prefix_enabled() before calling.
  *
  * Callers that already have a nul-terminated string may call
  * pg_strxfrm_prefix() instead.
  */
 size_t
 pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
 				   size_t srclen, pg_locale_t locale)
 {
 	size_t		result = 0;		/* keep compiler quiet */
 	if (!locale || locale->provider == COLLPROVIDER_LIBC)
 		/* locale may be NULL here, so it must not be dereferenced blindly */
 		elog(ERROR, "collprovider '%c' does not support pg_strnxfrm_prefix()",
 			 locale ? locale->provider : COLLPROVIDER_LIBC);
 #ifdef USE_ICU
 	else if (locale->provider == COLLPROVIDER_ICU)
 		/*
 		 * Pass the caller's length through.  Passing -1 would make the ICU
 		 * routine treat 'src' as NUL-terminated and read beyond 'srclen'.
 		 */
 		result = pg_strnxfrm_prefix_icu(dest, src, srclen, destsize, locale);
 #endif
 	else
 		/* shouldn't happen */
 		elog(ERROR, "unsupported collprovider: %c", locale->provider);
 	return result;
 }
 #ifdef USE_ICU
 static void
 init_icu_converter(void)
 {
@ -1767,6 +2476,39 @@ init_icu_converter(void)
 	icu_converter = conv;
 }
 /*
  * Report how many UChars the given string would occupy once converted to a
  * UChar string.  Nothing is actually stored; only the length is computed.
  */
 static size_t
 uchar_length(UConverter *converter, const char *str, int32_t len)
 {
 	UErrorCode	err = U_ZERO_ERROR;
 	int32_t		needed = ucnv_toUChars(converter, NULL, 0, str, len, &err);
 	/* a buffer-overflow status is tolerated: no destination was supplied */
 	if (err != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(err))
 		ereport(ERROR,
 				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(err))));
 	return needed;
 }
 /*
  * Convert 'src' into a UChar string written to 'dest', which has room for
  * 'destlen' UChars, and return the resulting length in UChars.  Any failure
  * status from the conversion is raised as an error.
  */
 static int32_t
 uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
 			  const char *src, int32_t srclen)
 {
 	UErrorCode	err = U_ZERO_ERROR;
 	int32_t		nuchars;
 	nuchars = ucnv_toUChars(converter, dest, destlen, src, srclen, &err);
 	if (U_FAILURE(err))
 		ereport(ERROR,
 				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(err))));
 	return nuchars;
 }
 /*
 * Convert a string in the database encoding into a string of UChars.
 *
@ -1782,26 +2524,15 @@ init_icu_converter(void)
 int32_t
 icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
 {
 	UErrorCode	status;
 	int32_t len_uchar;
 	init_icu_converter();
-	status = U_ZERO_ERROR;
+	len_uchar = uchar_length(icu_converter, buff, nbytes);
 	len_uchar = ucnv_toUChars(icu_converter, NULL, 0,
 							  buff, nbytes, &status);
 	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
 		ereport(ERROR,
 				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
 	*buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
-
+	len_uchar = uchar_convert(icu_converter,
-	status = U_ZERO_ERROR;
+							  *buff_uchar, len_uchar + 1, buff, nbytes);
 	len_uchar = ucnv_toUChars(icu_converter, *buff_uchar, len_uchar + 1,
 							  buff, nbytes, &status);
 	if (U_FAILURE(status))
 		ereport(ERROR,
 				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
 	return len_uchar;
 }
--- a/src/backend/utils/adt/varchar.c
+++ b/src/backend/utils/adt/varchar.c
@ -1024,21 +1024,22 @@ hashbpchar(PG_FUNCTION_ARGS)
 #ifdef USE_ICU
 		if (mylocale->provider == COLLPROVIDER_ICU)
 		{
-			int32_t		ulen = -1;
+			Size		bsize, rsize;
-			UChar	   *uchar = NULL;
+			char	   *buf;
 			Size		bsize;
 			uint8_t    *buf;
-			ulen = icu_to_uchar(&uchar, keydata, keylen);
+			bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale);
 			buf = palloc(bsize + 1);
-			bsize = ucol_getSortKey(mylocale->info.icu.ucol,
+			rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale);
-									uchar, ulen, NULL, 0);
+			if (rsize != bsize)
-			buf = palloc(bsize);
+				elog(ERROR, "pg_strnxfrm() returned unexpected result");
 			ucol_getSortKey(mylocale->info.icu.ucol,
 							uchar, ulen, buf, bsize);
 			pfree(uchar);
-			result = hash_any(buf, bsize);
+			/*
 			 * In principle, there's no reason to include the terminating NUL
 			 * character in the hash, but it was done before and the behavior
 			 * must be preserved.
 			 */
 			result = hash_any((uint8_t *) buf, bsize + 1);
 			pfree(buf);
 		}
@ -1086,21 +1087,23 @@ hashbpcharextended(PG_FUNCTION_ARGS)
 #ifdef USE_ICU
 		if (mylocale->provider == COLLPROVIDER_ICU)
 		{
-			int32_t		ulen = -1;
+			Size		bsize, rsize;
-			UChar	   *uchar = NULL;
+			char	   *buf;
 			Size		bsize;
 			uint8_t    *buf;
-			ulen = icu_to_uchar(&uchar, keydata, keylen);
+			bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale);
 			buf = palloc(bsize + 1);
-			bsize = ucol_getSortKey(mylocale->info.icu.ucol,
+			rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale);
-									uchar, ulen, NULL, 0);
+			if (rsize != bsize)
-			buf = palloc(bsize);
+				elog(ERROR, "pg_strnxfrm() returned unexpected result");
 			ucol_getSortKey(mylocale->info.icu.ucol,
 							uchar, ulen, buf, bsize);
 			pfree(uchar);
-			result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1));
+			/*
 			 * In principle, there's no reason to include the terminating NUL
 			 * character in the hash, but it was done before and the behavior
 			 * must be preserved.
 			 */
 			result = hash_any_extended((uint8_t *) buf, bsize + 1,
 									   PG_GETARG_INT64(1));
 			pfree(buf);
 		}
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@ -1553,10 +1553,6 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
 	}
 	else
 	{
 		char		a1buf[TEXTBUFLEN];
 		char		a2buf[TEXTBUFLEN];
 		char	   *a1p,
 				   *a2p;
 		pg_locale_t mylocale;
 		mylocale = pg_newlocale_from_collation(collid);
@ -1573,74 +1569,7 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
 		if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
 			return 0;
-#ifdef WIN32
+		result = pg_strncoll(arg1, len1, arg2, len2, mylocale);
 		/* Win32 does not have UTF-8, so we need to map to UTF-16 */
 		if (GetDatabaseEncoding() == PG_UTF8
 			&& (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
 		{
 			int			a1len;
 			int			a2len;
 			int			r;
 			if (len1 >= TEXTBUFLEN / 2)
 			{
 				a1len = len1 * 2 + 2;
 				a1p = palloc(a1len);
 			}
 			else
 			{
 				a1len = TEXTBUFLEN;
 				a1p = a1buf;
 			}
 			if (len2 >= TEXTBUFLEN / 2)
 			{
 				a2len = len2 * 2 + 2;
 				a2p = palloc(a2len);
 			}
 			else
 			{
 				a2len = TEXTBUFLEN;
 				a2p = a2buf;
 			}
 			/* stupid Microsloth API does not work for zero-length input */
 			if (len1 == 0)
 				r = 0;
 			else
 			{
 				r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
 										(LPWSTR) a1p, a1len / 2);
 				if (!r)
 					ereport(ERROR,
 							(errmsg("could not convert string to UTF-16: error code %lu",
 									GetLastError())));
 			}
 			((LPWSTR) a1p)[r] = 0;
 			if (len2 == 0)
 				r = 0;
 			else
 			{
 				r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
 										(LPWSTR) a2p, a2len / 2);
 				if (!r)
 					ereport(ERROR,
 							(errmsg("could not convert string to UTF-16: error code %lu",
 									GetLastError())));
 			}
 			((LPWSTR) a2p)[r] = 0;
 			errno = 0;
 #ifdef HAVE_LOCALE_T
 			if (mylocale)
 				result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
 			else
 #endif
 				result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
 			if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw
 										 * headers */
 				ereport(ERROR,
 						(errmsg("could not compare Unicode strings: %m")));
 		/* Break tie if necessary. */
 		if (result == 0 &&
@ -1650,94 +1579,6 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
 			if ((result == 0) && (len1 != len2))
 				result = (len1 < len2) ? -1 : 1;
 		}
 			if (a1p != a1buf)
 				pfree(a1p);
 			if (a2p != a2buf)
 				pfree(a2p);
 			return result;
 		}
 #endif							/* WIN32 */
 		if (len1 >= TEXTBUFLEN)
 			a1p = (char *) palloc(len1 + 1);
 		else
 			a1p = a1buf;
 		if (len2 >= TEXTBUFLEN)
 			a2p = (char *) palloc(len2 + 1);
 		else
 			a2p = a2buf;
 		memcpy(a1p, arg1, len1);
 		a1p[len1] = '\0';
 		memcpy(a2p, arg2, len2);
 		a2p[len2] = '\0';
 		if (mylocale)
 		{
 			if (mylocale->provider == COLLPROVIDER_ICU)
 			{
 #ifdef USE_ICU
 #ifdef HAVE_UCOL_STRCOLLUTF8
 				if (GetDatabaseEncoding() == PG_UTF8)
 				{
 					UErrorCode	status;
 					status = U_ZERO_ERROR;
 					result = ucol_strcollUTF8(mylocale->info.icu.ucol,
 											  arg1, len1,
 											  arg2, len2,
 											  &status);
 					if (U_FAILURE(status))
 						ereport(ERROR,
 								(errmsg("collation failed: %s", u_errorName(status))));
 				}
 				else
 #endif
 				{
 					int32_t		ulen1,
 								ulen2;
 					UChar	   *uchar1,
 							   *uchar2;
 					ulen1 = icu_to_uchar(&uchar1, arg1, len1);
 					ulen2 = icu_to_uchar(&uchar2, arg2, len2);
 					result = ucol_strcoll(mylocale->info.icu.ucol,
 										  uchar1, ulen1,
 										  uchar2, ulen2);
 					pfree(uchar1);
 					pfree(uchar2);
 				}
 #else							/* not USE_ICU */
 				/* shouldn't happen */
 				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
 #endif							/* not USE_ICU */
 			}
 			else
 			{
 #ifdef HAVE_LOCALE_T
 				result = strcoll_l(a1p, a2p, mylocale->info.lt);
 #else
 				/* shouldn't happen */
 				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
 #endif
 			}
 		}
 		else
 			result = strcoll(a1p, a2p);
 		/* Break tie if necessary. */
 		if (result == 0 &&
 			(!mylocale || mylocale->deterministic))
 			result = strcmp(a1p, a2p);
 		if (a1p != a1buf)
 			pfree(a1p);
 		if (a2p != a2buf)
 			pfree(a2p);
 	}
 	return result;
@ -2073,20 +1914,6 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
 		 */
 		locale = pg_newlocale_from_collation(collid);
 		/*
 		 * There is a further exception on Windows.  When the database
 		 * encoding is UTF-8 and we are not using the C collation, complex
 		 * hacks are required.  We don't currently have a comparator that
 		 * handles that case, so we fall back on the slow method of having the
 		 * sort code invoke bttextcmp() (in the case of text) via the fmgr
 		 * trampoline.  ICU locales work just the same on Windows, however.
 		 */
 #ifdef WIN32
 		if (GetDatabaseEncoding() == PG_UTF8 &&
 			!(locale && locale->provider == COLLPROVIDER_ICU))
 			return;
 #endif
 		/*
 		 * We use varlenafastcmp_locale except for type NAME.
 		 */
@ -2102,13 +1929,7 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
 	/*
 	 * Unfortunately, it seems that abbreviation for non-C collations is
-	 * broken on many common platforms; testing of multiple versions of glibc
+	 * broken on many common platforms; see pg_strxfrm_enabled().
 	 * reveals that, for many locales, strcoll() and strxfrm() do not return
 	 * consistent results, which is fatal to this optimization.  While no
 	 * other libc other than Cygwin has so far been shown to have a problem,
 	 * we take the conservative course of action for right now and disable
 	 * this categorically.  (Users who are certain this isn't a problem on
 	 * their system can define TRUST_STRXFRM.)
 	 *
 	 * Even apart from the risk of broken locales, it's possible that there
 	 * are platforms where the use of abbreviated keys should be disabled at
@ -2121,10 +1942,8 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
 	 * categorically, we may still want or need to disable it for particular
 	 * platforms.
 	 */
-#ifndef TRUST_STRXFRM
+	if (!collate_c && !pg_strxfrm_enabled(locale))
 	if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
 		abbreviate = false;
 #endif
 	/*
 	 * If we're using abbreviated keys, or if we're using a locale-aware
@ -2395,60 +2214,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
 		return sss->last_returned;
 	}
-	if (sss->locale)
+	result = pg_strcoll(sss->buf1, sss->buf2, sss->locale);
 	{
 		if (sss->locale->provider == COLLPROVIDER_ICU)
 		{
 #ifdef USE_ICU
 #ifdef HAVE_UCOL_STRCOLLUTF8
 			if (GetDatabaseEncoding() == PG_UTF8)
 			{
 				UErrorCode	status;
 				status = U_ZERO_ERROR;
 				result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
 										  a1p, len1,
 										  a2p, len2,
 										  &status);
 				if (U_FAILURE(status))
 					ereport(ERROR,
 							(errmsg("collation failed: %s", u_errorName(status))));
 			}
 			else
 #endif
 			{
 				int32_t		ulen1,
 							ulen2;
 				UChar	   *uchar1,
 						   *uchar2;
 				ulen1 = icu_to_uchar(&uchar1, a1p, len1);
 				ulen2 = icu_to_uchar(&uchar2, a2p, len2);
 				result = ucol_strcoll(sss->locale->info.icu.ucol,
 									  uchar1, ulen1,
 									  uchar2, ulen2);
 				pfree(uchar1);
 				pfree(uchar2);
 			}
 #else							/* not USE_ICU */
 			/* shouldn't happen */
 			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
 #endif							/* not USE_ICU */
 		}
 		else
 		{
 #ifdef HAVE_LOCALE_T
 			result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
 #else
 			/* shouldn't happen */
 			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
 #endif
 		}
 	}
 	else
 		result = strcoll(sss->buf1, sss->buf2);
 	/* Break tie if necessary. */
 	if (result == 0 &&
@ -2471,6 +2237,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
 static Datum
 varstr_abbrev_convert(Datum original, SortSupport ssup)
 {
 	const size_t max_prefix_bytes = sizeof(Datum);
 	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
 	VarString  *authoritative = DatumGetVarStringPP(original);
 	char	   *authoritative_data = VARDATA_ANY(authoritative);
@ -2483,7 +2250,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
 	pres = (char *) &res;
 	/* memset(), so any non-overwritten bytes are NUL */
-	memset(pres, 0, sizeof(Datum));
+	memset(pres, 0, max_prefix_bytes);
 	len = VARSIZE_ANY_EXHDR(authoritative);
 	/* Get number of bytes, ignoring trailing spaces */
@ -2518,14 +2285,10 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
 	 * thing: explicitly consider string length.
 	 */
 	if (sss->collate_c)
-		memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
+		memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
 	else
 	{
 		Size		bsize;
 #ifdef USE_ICU
 		int32_t		ulen = -1;
 		UChar	   *uchar = NULL;
 #endif
 		/*
 		 * We're not using the C collation, so fall back on strxfrm or ICU
@ -2543,7 +2306,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
 		if (sss->last_len1 == len && sss->cache_blob &&
 			memcmp(sss->buf1, authoritative_data, len) == 0)
 		{
-			memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
+			memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2));
 			/* No change affecting cardinality, so no hashing required */
 			goto done;
 		}
@ -2551,70 +2314,37 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
 		memcpy(sss->buf1, authoritative_data, len);
 		/*
-		 * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
+		 * pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated
-		 * necessary for ICU, but doesn't hurt.
+		 * strings.
 		 */
 		sss->buf1[len] = '\0';
 		sss->last_len1 = len;
-#ifdef USE_ICU
+		if (pg_strxfrm_prefix_enabled(sss->locale))
-		/* When using ICU and not UTF8, convert string to UChar. */
+		{
-		if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
+			if (sss->buflen2 < max_prefix_bytes)
-			GetDatabaseEncoding() != PG_UTF8)
+			{
-			ulen = icu_to_uchar(&uchar, sss->buf1, len);
+				sss->buflen2 = Max(max_prefix_bytes,
-#endif
+								   Min(sss->buflen2 * 2, MaxAllocSize));
 				sss->buf2 = repalloc(sss->buf2, sss->buflen2);
 			}
 			bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1,
 									  max_prefix_bytes, sss->locale);
 		}
 		else
 		{
 			/*
-		 * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
+			 * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try
-		 * and try again.  Both of these functions have the result buffer
+			 * again.  The pg_strxfrm() function leaves the result buffer
-		 * content undefined if the result did not fit, so we need to retry
+			 * content undefined if the result did not fit, so we need to
-		 * until everything fits, even though we only need the first few bytes
+			 * retry until everything fits, even though we only need the first
-		 * in the end.  When using ucol_nextSortKeyPart(), however, we only
+			 * few bytes in the end.
 		 * ask for as many bytes as we actually need.
 			 */
 			for (;;)
 			{
-#ifdef USE_ICU
+				bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2,
-			if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
+								   sss->locale);
 			{
 				/*
 				 * When using UTF8, use the iteration interface so we only
 				 * need to produce as many bytes as we actually need.
 				 */
 				if (GetDatabaseEncoding() == PG_UTF8)
 				{
 					UCharIterator iter;
 					uint32_t	state[2];
 					UErrorCode	status;
 					uiter_setUTF8(&iter, sss->buf1, len);
 					state[0] = state[1] = 0;	/* won't need that again */
 					status = U_ZERO_ERROR;
 					bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
 												 &iter,
 												 state,
 												 (uint8_t *) sss->buf2,
 												 Min(sizeof(Datum), sss->buflen2),
 												 &status);
 					if (U_FAILURE(status))
 						ereport(ERROR,
 								(errmsg("sort key generation failed: %s",
 										u_errorName(status))));
 				}
 				else
 					bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
 											uchar, ulen,
 											(uint8_t *) sss->buf2, sss->buflen2);
 			}
 			else
 #endif
 #ifdef HAVE_LOCALE_T
 			if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
 				bsize = strxfrm_l(sss->buf2, sss->buf1,
 								  sss->buflen2, sss->locale->info.lt);
 			else
 #endif
 				bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
 				sss->last_len2 = bsize;
 				if (bsize < sss->buflen2)
@ -2627,6 +2357,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
 								   Min(sss->buflen2 * 2, MaxAllocSize));
 				sss->buf2 = repalloc(sss->buf2, sss->buflen2);
 			}
 		}
 		/*
 		 * Every Datum byte is always compared.  This is safe because the
@ -2637,12 +2368,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
 		 * (Actually, even if there were NUL bytes in the blob it would be
 		 * okay.  See remarks on bytea case above.)
 		 */
-		memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
+		memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
 #ifdef USE_ICU
 		if (uchar)
 			pfree(uchar);
 #endif
 	}
 	/*
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@ -100,6 +100,19 @@ extern void make_icu_collator(const char *iculocstr,
 extern pg_locale_t pg_newlocale_from_collation(Oid collid);
 extern char *get_collation_actual_version(char collprovider, const char *collcollate);
 /*
  * Provider-independent collation and string-transformation routines.
  *
  * Callers holding NUL-terminated strings use pg_strcoll(), pg_strxfrm() and
  * pg_strxfrm_prefix(); callers holding a pointer plus an explicit byte
  * length use the "n" variants pg_strncoll(), pg_strnxfrm() and
  * pg_strnxfrm_prefix().
  *
  * Note that the argument order differs between the two families: the
  * NUL-terminated transform functions take (dest, src, destsize, locale),
  * while the length-based ones take (dest, destsize, src, srclen, locale).
  */

 /*
  * Compare two strings under the given locale.  Callers treat a result of 0
  * as "equal according to the collation" and may then apply their own
  * tie-breaker.  NOTE(review): presumably negative/positive results follow
  * the strcoll() convention -- confirm against the implementation.
  */
 extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale);
 extern int pg_strncoll(const char *arg1, size_t len1,
 					   const char *arg2, size_t len2, pg_locale_t locale);
 /*
  * pg_strxfrm_enabled() must be consulted before relying on pg_strxfrm() or
  * pg_strnxfrm() for a non-C collation; sort support disables abbreviated
  * keys when it returns false.
  *
  * The transform functions return the size of the transformed string.
  * Calling with dest = NULL and destsize = 0 is used to obtain the required
  * size up front; callers then allocate one extra byte for the terminating
  * NUL.  If the result does not fit in destsize, the contents of dest are
  * undefined and the caller must enlarge the buffer and retry.
  */
 extern bool pg_strxfrm_enabled(pg_locale_t locale);
 extern size_t pg_strxfrm(char *dest, const char *src, size_t destsize,
 						 pg_locale_t locale);
 extern size_t pg_strnxfrm(char *dest, size_t destsize, const char *src,
 						  size_t srclen, pg_locale_t locale);
 /*
  * Prefix variants: callers check pg_strxfrm_prefix_enabled() first and fall
  * back to the full pg_strxfrm() retry loop when it returns false.  They are
  * called with destsize set to the number of leading bytes wanted.
  * NOTE(review): presumably only the first destsize bytes of the transformed
  * string are produced, without a terminating NUL -- confirm against the
  * implementation.
  */
 extern bool pg_strxfrm_prefix_enabled(pg_locale_t locale);
 extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
 								pg_locale_t locale);
 extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
 								 size_t srclen, pg_locale_t locale);
 #ifdef USE_ICU
 extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes);