Additional unicode primitive functions.

Introduce unicode_version(), icu_unicode_version(), and unicode_assigned(). The latter requires introducing a new lookup table for the Unicode General Category, which is generated along with the other Unicode lookup tables. Discussion: https://postgr.es/m/CA+TgmoYzYR-yhU6k1XFCADeyj=Oyz2PkVsa3iKv+keM8wp-F_A@mail.gmail.com Reviewed-by: Peter Eisentraut
2023-11-01 22:47:06 -07:00 · 2023-11-01 22:47:06 -07:00 · a02b37fc08
commit a02b37fc08
parent 7021d3b176
18 changed files with 4924 additions and 22 deletions
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@ -2859,6 +2859,22 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
       </para></entry>
      </row>
      <row>
       <entry role="func_table_entry"><para role="func_signature">
        <indexterm>
         <primary>unicode_assigned</primary>
        </indexterm>
        <function>unicode_assigned</function> ( <type>text</type> )
        <returnvalue>boolean</returnvalue>
       </para>
       <para>
        Returns <literal>true</literal> if all characters in the string are
        assigned Unicode codepoints; <literal>false</literal> otherwise. This
        function can only be used when the server encoding is
        <literal>UTF8</literal>.
       </para></entry>
      </row>
      <row>
       <entry role="func_table_entry"><para role="func_signature">
        <indexterm>
@ -23427,25 +23443,6 @@ SELECT * FROM pg_ls_dir('.') WITH ORDINALITY AS t(ls,n);
        This is equivalent to <function>current_user</function>.
       </para></entry>
      </row>
      <row>
       <entry role="func_table_entry"><para role="func_signature">
        <indexterm>
         <primary>version</primary>
        </indexterm>
        <function>version</function> ()
        <returnvalue>text</returnvalue>
       </para>
       <para>
        Returns a string describing the <productname>PostgreSQL</productname>
        server's version.  You can also get this information from
        <xref linkend="guc-server-version"/>, or for a machine-readable
        version use <xref linkend="guc-server-version-num"/>.  Software
        developers should use <varname>server_version_num</varname> (available
        since 8.2) or <xref linkend="libpq-PQserverVersion"/> instead of
        parsing the text version.
       </para></entry>
      </row>
     </tbody>
    </tgroup>
   </table>
@ -26332,6 +26329,80 @@ SELECT collation for ('foo' COLLATE "de_DE");
  </sect2>
  <sect2 id="functions-info-version">
   <title>Version Information Functions</title>
   <para>
    The functions shown in <xref linkend="functions-version"/>
    print version information.
   </para>
   <table id="functions-version">
    <title>Version Information Functions</title>
    <tgroup cols="1">
     <thead>
      <row>
       <entry role="func_table_entry"><para role="func_signature">
        Function
       </para>
       <para>
        Description
       </para></entry>
      </row>
     </thead>
     <tbody>
      <row>
       <entry role="func_table_entry"><para role="func_signature">
        <indexterm>
         <primary>version</primary>
        </indexterm>
        <function>version</function> ()
        <returnvalue>text</returnvalue>
       </para>
       <para>
        Returns a string describing the <productname>PostgreSQL</productname>
        server's version.  You can also get this information from
        <xref linkend="guc-server-version"/>, or for a machine-readable
        version use <xref linkend="guc-server-version-num"/>.  Software
        developers should use <varname>server_version_num</varname> (available
        since 8.2) or <xref linkend="libpq-PQserverVersion"/> instead of
        parsing the text version.
       </para></entry>
      </row>
      <row>
       <entry role="func_table_entry"><para role="func_signature">
        <indexterm>
         <primary>unicode_version</primary>
        </indexterm>
        <function>unicode_version</function> ()
        <returnvalue>text</returnvalue>
       </para>
       <para>
        Returns a string representing the version of Unicode used by
        <productname>PostgreSQL</productname>.
       </para></entry>
      </row>
      <row>
       <entry role="func_table_entry"><para role="func_signature">
        <indexterm>
         <primary>icu_unicode_version</primary>
        </indexterm>
        <function>icu_unicode_version</function> ()
        <returnvalue>text</returnvalue>
       </para>
       <para>
        Returns a string representing the version of Unicode used by ICU, if
        the server was built with ICU support; otherwise returns
        <literal>NULL</literal>.
       </para></entry>
      </row>
     </tbody>
    </tgroup>
   </table>
  </sect2>
  </sect1>
  <sect1 id="functions-admin">
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@ -23,7 +23,9 @@
 #include "catalog/pg_type.h"
 #include "common/hashfn.h"
 #include "common/int.h"
 #include "common/unicode_category.h"
 #include "common/unicode_norm.h"
 #include "common/unicode_version.h"
 #include "funcapi.h"
 #include "lib/hyperloglog.h"
 #include "libpq/pqformat.h"
@ -6237,6 +6239,65 @@ unicode_norm_form_from_string(const char *formstr)
 	return form;
 }
 /*
 * Returns version of Unicode used by Postgres in "major.minor" format (the
 * same format as the Unicode version reported by ICU). The third component
 * ("update version") never involves additions to the character repertiore and
 * is unimportant for most purposes.
 *
 * See: https://unicode.org/versions/
 */
 Datum
 unicode_version(PG_FUNCTION_ARGS)
 {
 	PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION));
 }
 /*
 * SQL-callable: report the version of Unicode used by ICU
 * (U_UNICODE_VERSION) as text. In a build without ICU support (USE_ICU not
 * defined) there is no such version, so the result is SQL NULL.
 */
 Datum
 icu_unicode_version(PG_FUNCTION_ARGS)
 {
 #ifndef USE_ICU
 	PG_RETURN_NULL();
 #else
 	text	   *icu_version = cstring_to_text(U_UNICODE_VERSION);

 	PG_RETURN_TEXT_P(icu_version);
 #endif
 }
 /*
 * SQL-callable: report whether every character of the input string is an
 * assigned Unicode code point, that is, none of them has general category
 * PG_U_UNASSIGNED. An empty string yields true.
 *
 * Raises an error unless the database encoding is UTF-8.
 */
 Datum
 unicode_assigned(PG_FUNCTION_ARGS)
 {
 	text	   *str = PG_GETARG_TEXT_PP(0);
 	unsigned char *cursor;
 	int			nchars;

 	if (GetDatabaseEncoding() != PG_UTF8)
 		ereport(ERROR,
 				(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));

 	/* count characters (not bytes), then visit them one at a time */
 	nchars = pg_mbstrlen_with_len(VARDATA_ANY(str), VARSIZE_ANY_EXHDR(str));
 	cursor = (unsigned char *) VARDATA_ANY(str);

 	while (nchars-- > 0)
 	{
 		/* decode the UTF-8 sequence at the cursor into a code point */
 		pg_wchar	codepoint = utf8_to_unicode(cursor);

 		if (unicode_category(codepoint) == PG_U_UNASSIGNED)
 			PG_RETURN_BOOL(false);

 		/* step over the bytes of the character just examined */
 		cursor += pg_utf_mblen(cursor);
 	}

 	PG_RETURN_BOOL(true);
 }
 Datum
 unicode_normalize_func(PG_FUNCTION_ARGS)
 {
--- a/src/common/Makefile
+++ b/src/common/Makefile
@ -78,6 +78,7 @@ OBJS_COMMON = \
 	scram-common.o \
 	string.o \
 	stringinfo.o \
 	unicode_category.o \
 	unicode_norm.o \
 	username.o \
 	wait_error.o \
--- a/src/common/meson.build
+++ b/src/common/meson.build
@ -30,6 +30,7 @@ common_sources = files(
  'scram-common.c',
  'string.c',
  'stringinfo.c',
  'unicode_category.c',
  'unicode_norm.c',
  'username.c',
  'wait_error.c',
--- a/src/common/unicode/Makefile
+++ b/src/common/unicode/Makefile
@ -15,11 +15,15 @@ include $(top_builddir)/src/Makefile.global
 override CPPFLAGS := -DFRONTEND -I. $(CPPFLAGS)
 LIBS += $(PTHREAD_LIBS)
 LDFLAGS_INTERNAL += $(ICU_LIBS)
 CPPFLAGS += $(ICU_CFLAGS)
 # By default, do nothing.
 all:
-update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
+update-unicode: unicode_category_table.h unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h unicode_version.h
 	mv $^ $(top_srcdir)/src/include/common/
 	$(MAKE) category-check
 	$(MAKE) normalization-check
 # These files are part of the Unicode Character Database. Download
@ -28,6 +32,12 @@ update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asi
 UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
 	$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
 unicode_version.h: generate-unicode_version.pl
 	$(PERL) $< --version $(UNICODE_VERSION)
 unicode_category_table.h: generate-unicode_category_table.pl UnicodeData.txt
 	$(PERL) $<
 # Generation of conversion tables used for string normalization with
 # UTF-8 strings.
 unicode_norm_hashfunc.h: unicode_norm_table.h
@ -45,9 +55,14 @@ unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizat
 	$(PERL) $^ >$@
 # Test suite
 category-check: category_test
 	./category_test
 normalization-check: norm_test
 	./norm_test
 category_test: category_test.o ../unicode_category.o | submake-common
 norm_test: norm_test.o ../unicode_norm.o | submake-common
 norm_test.o: norm_test_table.h
@ -64,7 +79,7 @@ norm_test_table.h: generate-norm_test_table.pl NormalizationTest.txt
 clean:
-	rm -f $(OBJS) norm_test norm_test.o
+	rm -f $(OBJS) category_test category_test.o norm_test norm_test.o
 distclean: clean
 	rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
--- a/src/common/unicode/category_test.c
+++ b/src/common/unicode/category_test.c
@ -0,0 +1,108 @@
 /*-------------------------------------------------------------------------
 * category_test.c
 *		Program to test Unicode general category functions.
 *
 * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/common/unicode/category_test.c
 *
 *-------------------------------------------------------------------------
 */
 #include "postgres_fe.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #ifdef USE_ICU
 #include <unicode/uchar.h>
 #endif
 #include "common/unicode_category.h"
 #include "common/unicode_version.h"
 /*
 * Parse version into integer for easy comparison.
 */
 #ifdef USE_ICU
 /*
 * Converts a "major.minor" version string (any further components are
 * ignored) into the integer major * 100 + minor.
 *
 * The input is validated with explicit runtime checks rather than Assert():
 * in a build without assertions an Assert-only check would let a failed
 * sscanf() fall through and compute the result from uninitialized variables.
 * On a malformed version string, or a minor version that does not fit in two
 * decimal digits, an error is printed to stderr and the program exits with
 * status 1.
 */
 static int
 parse_unicode_version(const char *version)
 {
 	int			major = 0;
 	int			minor = 0;

 	if (sscanf(version, "%d.%d", &major, &minor) != 2 ||
 		major < 0 || minor < 0 || minor >= 100)
 	{
 		fprintf(stderr, "could not parse Unicode version \"%s\"\n", version);
 		exit(1);
 	}

 	return major * 100 + minor;
 }
 #endif
 /*
 * Test driver: walk every codepoint from 0 through 0x10ffff and verify that
 * unicode_category() reports the same general category as ICU's
 * u_charType().
 *
 * Exits with status 1 (after printing both categories) at the first real
 * disagreement, and with status 0 otherwise. Without ICU support there is
 * nothing to compare against, so the test is skipped and reported as
 * successful.
 */
 int
 main(int argc, char **argv)
 {
 #ifndef USE_ICU
 	printf("ICU support required for test; skipping.\n");
 	exit(0);
 #else
 	int			pg_ver = parse_unicode_version(PG_UNICODE_VERSION);
 	int			icu_ver = parse_unicode_version(U_UNICODE_VERSION);
 	int			n_pg_unassigned = 0;
 	int			n_icu_unassigned = 0;
 	UChar32		cp;

 	printf("Postgres Unicode Version:\t%s\n", PG_UNICODE_VERSION);
 	printf("ICU Unicode Version:\t\t%s\n", U_UNICODE_VERSION);

 	for (cp = 0; cp <= 0x10ffff; cp++)
 	{
 		uint8_t		pg_cat = unicode_category(cp);
 		uint8_t		icu_cat = u_charType(cp);

 		/* the common case: both sides agree */
 		if (pg_cat == icu_cat)
 			continue;

 		/*
 		 * When the two Unicode versions differ, a codepoint assigned in the
 		 * newer version may still be unassigned in the older one. Such a
 		 * disagreement is tolerated and merely counted, at the price that
 		 * the test is no longer exhaustive for those codepoints.
 		 */
 		if (pg_cat == PG_U_UNASSIGNED && pg_ver < icu_ver)
 		{
 			n_pg_unassigned++;
 			continue;
 		}
 		if (icu_cat == PG_U_UNASSIGNED && icu_ver < pg_ver)
 		{
 			n_icu_unassigned++;
 			continue;
 		}

 		/* any other disagreement is a genuine failure */
 		printf("FAILURE for codepoint %06x\n", cp);
 		printf("Postgres category:	%02d %s %s\n", pg_cat,
 			   unicode_category_abbrev(pg_cat),
 			   unicode_category_string(pg_cat));
 		printf("ICU category:		%02d %s %s\n", icu_cat,
 			   unicode_category_abbrev(icu_cat),
 			   unicode_category_string(icu_cat));
 		printf("\n");
 		exit(1);
 	}

 	if (n_pg_unassigned > 0)
 		printf("Skipped %d codepoints unassigned in Postgres due to Unicode version mismatch.\n",
 			   n_pg_unassigned);
 	if (n_icu_unassigned > 0)
 		printf("Skipped %d codepoints unassigned in ICU due to Unicode version mismatch.\n",
 			   n_icu_unassigned);

 	printf("category_test: All tests successful!\n");
 	exit(0);
 #endif
 }
--- a/src/common/unicode/generate-unicode_category_table.pl
+++ b/src/common/unicode/generate-unicode_category_table.pl
@ -0,0 +1,204 @@
 #!/usr/bin/perl
 #
 # Generate a code point category table and its lookup utilities, using
 # Unicode data files as input.
 #
 # Input: UnicodeData.txt
 # Output: unicode_category_table.h
 #
 # Copyright (c) 2000-2023, PostgreSQL Global Development Group
 use strict;
 use warnings;
 use Getopt::Long;
 use FindBin;
 use lib "$FindBin::RealBin/../../tools/";
 # General category assigned to every code point that UnicodeData.txt does
 # not list (neither directly nor via a First/Last pair).
 my $CATEGORY_UNASSIGNED = 'Cn';
 # Directory holding both the input (UnicodeData.txt) and the generated
 # header; overridable with --outdir.
 my $output_path = '.';
 GetOptions('outdir:s' => \$output_path);
 my $output_table_file = "$output_path/unicode_category_table.h";
 my $FH;
 # Read entries from UnicodeData.txt into a list of codepoint ranges
 # and their general category.
 #
 # Each element of @category_ranges is a hash with keys start, end and
 # category. $range_start/$range_end/$range_category describe the range
 # currently being accumulated, which has not been pushed yet.
 my @category_ranges = ();
 my $range_start = undef;
 my $range_end = undef;
 my $range_category = undef;
 # If between a "<..., First>" entry and a "<..., Last>" entry, the gap in
 # codepoints represents a range, and $gap_category is equal to the
 # category for both (which must match). Otherwise, the gap represents
 # unassigned code points.
 my $gap_category = undef;
 open($FH, '<', "$output_path/UnicodeData.txt")
  or die "Could not open $output_path/UnicodeData.txt: $!.";
 while (my $line = <$FH>)
 {
 	# Fields are semicolon-separated; only the first three are used:
 	# code point (hexadecimal), name, and general category.
 	my @elts = split(';', $line);
 	my $code = hex($elts[0]);
 	my $name = $elts[1];
 	my $category = $elts[2];
 	die "codepoint out of range" if $code > 0x10FFFF;
 	die "unassigned codepoint in UnicodeData.txt" if $category eq $CATEGORY_UNASSIGNED;
 	# First entry of the file: sanity-check it and use it to seed the
 	# current range.
 	if (!defined($range_start)) {
 		my $code_str = sprintf "0x%06x", $code;
 		die if defined($range_end) || defined($range_category) || defined($gap_category);
 		die "unexpected first entry <..., Last>" if ($name =~ /Last>/);
 		die "expected 0x000000 for first entry, got $code_str" if $code != 0x000000;
 		# initialize
 		$range_start = $code;
 		$range_end = $code;
 		$range_category = $category;
 		if ($name =~ /<.*, First>$/) {
 			$gap_category = $category;
 		} else {
 			$gap_category = $CATEGORY_UNASSIGNED;
 		}
 		next;
 	}
 	# Gap in codepoints detected. If it's a different category than
 	# the current range, emit the current range and initialize a new
 	# range representing the gap.
 	# (If the gap has the same category as the current range, nothing is
 	# emitted here and the gap is absorbed when $range_end is advanced
 	# at the bottom of the loop.)
 	if ($range_end + 1 != $code && $range_category ne $gap_category) {
 		push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category});
 		$range_start = $range_end + 1;
 		$range_end = $code - 1;
 		$range_category = $gap_category;
 	}
 	# different category; new range
 	if ($range_category ne $category) {
 		push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category});
 		$range_start = $code;
 		$range_end = $code;
 		$range_category = $category;
 	}
 	# Track whether we are inside a First/Last pair, which determines the
 	# category of the gap preceding the next entry.
 	if ($name =~ /<.*, First>$/) {
 		die "<..., First> entry unexpectedly follows another <..., First> entry"
 		  if $gap_category ne $CATEGORY_UNASSIGNED;
 		$gap_category = $category;
 	}
 	elsif ($name =~ /<.*, Last>$/) {
 		die "<..., First> and <..., Last> entries have mismatching general category"
 		  if $gap_category ne $category;
 		$gap_category = $CATEGORY_UNASSIGNED;
 	}
 	else {
 		die "unexpected entry found between <..., First> and <..., Last>"
 		  if $gap_category ne $CATEGORY_UNASSIGNED;
 	}
 	# extend the current range up to and including this code point
 	$range_end = $code;
 }
 close $FH;
 die "<..., First> entry with no corresponding <..., Last> entry"
  if $gap_category ne $CATEGORY_UNASSIGNED;
 # emit final range
 push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category});
 # emit range for any unassigned code points after last entry
 if ($range_end < 0x10FFFF) {
 	$range_start = $range_end + 1;
 	$range_end = 0x10FFFF;
 	$range_category = $CATEGORY_UNASSIGNED;
 	push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category});
 }
 # Number of ranges collected; used as the declared size of the C array.
 my $num_ranges = scalar @category_ranges;
 # Maps each two-letter General Category abbreviation to the name of the
 # corresponding C enum value written into the table.
 # See: https://www.unicode.org/reports/tr44/#General_Category_Values
 my $categories = {
 	Cn => 'PG_U_UNASSIGNED',
 	Lu => 'PG_U_UPPERCASE_LETTER',
 	Ll => 'PG_U_LOWERCASE_LETTER',
 	Lt => 'PG_U_TITLECASE_LETTER',
 	Lm => 'PG_U_MODIFIER_LETTER',
 	Lo => 'PG_U_OTHER_LETTER',
 	Mn => 'PG_U_NONSPACING_MARK',
 	Me => 'PG_U_ENCLOSING_MARK',
 	Mc => 'PG_U_SPACING_MARK',
 	Nd => 'PG_U_DECIMAL_NUMBER',
 	Nl => 'PG_U_LETTER_NUMBER',
 	No => 'PG_U_OTHER_NUMBER',
 	Zs => 'PG_U_SPACE_SEPARATOR',
 	Zl => 'PG_U_LINE_SEPARATOR',
 	Zp => 'PG_U_PARAGRAPH_SEPARATOR',
 	Cc => 'PG_U_CONTROL',
 	Cf => 'PG_U_FORMAT',
 	Co => 'PG_U_PRIVATE_USE',
 	Cs => 'PG_U_SURROGATE',
 	Pd => 'PG_U_DASH_PUNCTUATION',
 	Ps => 'PG_U_OPEN_PUNCTUATION',
 	Pe => 'PG_U_CLOSE_PUNCTUATION',
 	Pc => 'PG_U_CONNECTOR_PUNCTUATION',
 	Po => 'PG_U_OTHER_PUNCTUATION',
 	Sm => 'PG_U_MATH_SYMBOL',
 	Sc => 'PG_U_CURRENCY_SYMBOL',
 	Sk => 'PG_U_MODIFIER_SYMBOL',
 	So => 'PG_U_OTHER_SYMBOL',
 	Pi => 'PG_U_INITIAL_PUNCTUATION',
 	Pf => 'PG_U_FINAL_PUNCTUATION'
 };
 # Start writing out the output files
 open my $OT, '>', $output_table_file
  or die "Could not open output file $output_table_file: $!\n";
 print $OT <<HEADER;
 /*-------------------------------------------------------------------------
 *
 * unicode_category_table.h
 *	  Category table for Unicode character classification.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/common/unicode_category_table.h
 *
 *-------------------------------------------------------------------------
 */
 #include "common/unicode_category.h"
 /*
 * File auto-generated by src/common/unicode/generate-unicode_category_table.pl,
 * do not edit. There is deliberately not an #ifndef PG_UNICODE_CATEGORY_TABLE_H
 * here.
 */
 typedef struct
 {
 	uint32		first;			/* Unicode codepoint */
 	uint32		last;			/* Unicode codepoint */
 	uint8		category;		/* General Category */
 }			pg_category_range;
 /* table of Unicode codepoint ranges and their categories */
 static const pg_category_range unicode_categories[$num_ranges] =
 {
 HEADER
 # One initializer per range, in the order the ranges were collected,
 # separated by ",\n" (no trailing comma after the last one).
 my $firsttime = 1;
 foreach my $range (@category_ranges) {
 	print $OT ",\n" unless $firsttime;
 	$firsttime = 0;
 	my $category = $categories->{$range->{category}};
 	die "category missing: $range->{category}" unless $category;
 	printf $OT "\t{0x%06x, 0x%06x, %s}", $range->{start}, $range->{end}, $category;
 }
 print $OT "\n};\n";
 # Output is buffered, so a write failure (e.g. a full disk) may only be
 # reported when the handle is closed. Check it, so that a truncated header
 # is never mistaken for a successful run.
 close $OT
  or die "Could not close output file $output_table_file: $!\n";
--- a/src/common/unicode/generate-unicode_version.pl
+++ b/src/common/unicode/generate-unicode_version.pl
@ -0,0 +1,46 @@
 #!/usr/bin/perl
 #
 # Generate header file with Unicode version used by Postgres.
 #
 # Output: unicode_version.h
 #
 # Copyright (c) 2000-2023, PostgreSQL Global Development Group
 use strict;
 use warnings;
 use Getopt::Long;
 use FindBin;
 use lib "$FindBin::RealBin/../../tools/";
 # Command-line options:
 #   --version  Unicode version, MAJOR.MINOR with optional further
 #              components such as MAJOR.MINOR.UPDATE (required)
 #   --outdir   directory in which unicode_version.h is written (default '.')
 my $output_path = '.';
 my $version_str = undef;
 GetOptions('outdir:s' => \$output_path, 'version:s' => \$version_str);
 # Without a well-formed --version the sprintf below would quietly produce
 # "0.0" (after "uninitialized value" warnings) and a bogus header would be
 # generated, so fail loudly instead.
 die "usage: $0 --version MAJOR.MINOR[.UPDATE] [--outdir DIR]\n"
  unless defined($version_str) && $version_str =~ /^\d+\.\d+(?:\.\d+)*$/;
 # Only major and minor are kept; anything after them is dropped.
 my @version_parts = split /\./, $version_str;
 my $unicode_version_str = sprintf "%d.%d", $version_parts[0], $version_parts[1];
 my $output_file = "$output_path/unicode_version.h";
 # Start writing out the output files
 open my $OT, '>', $output_file
  or die "Could not open output file $output_file: $!\n";
 print $OT <<HEADER;
 /*-------------------------------------------------------------------------
 *
 * unicode_version.h
 *	  Unicode version used by Postgres.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/common/unicode_version.h
 *
 *-------------------------------------------------------------------------
 */
 #define PG_UNICODE_VERSION		"$unicode_version_str"
 HEADER
 # Output is buffered; check close so that a failed write is not mistaken
 # for success.
 close $OT
  or die "Could not close output file $output_file: $!\n";
--- a/src/common/unicode/meson.build
+++ b/src/common/unicode/meson.build
@ -24,6 +24,25 @@ endforeach
 update_unicode_targets = []
 update_unicode_targets += \
  custom_target('unicode_version.h',
    output: ['unicode_version.h'],
    command: [
      perl, files('generate-unicode_version.pl'),
      '--outdir', '@OUTDIR@', '--version', UNICODE_VERSION],
    build_by_default: false,
  )
 update_unicode_targets += \
  custom_target('unicode_category_table.h',
    input: [unicode_data['UnicodeData.txt']],
    output: ['unicode_category_table.h'],
    command: [
      perl, files('generate-unicode_category_table.pl'),
      '--outdir', '@OUTDIR@', '@INPUT@'],
    build_by_default: false,
  )
 update_unicode_targets += \
  custom_target('unicode_norm_table.h',
    input: [unicode_data['UnicodeData.txt'], unicode_data['CompositionExclusions.txt']],
@ -73,6 +92,17 @@ norm_test_table = custom_target('norm_test_table.h',
 inc = include_directories('.')
 category_test = executable('category_test',
  ['category_test.c'],
  dependencies: [frontend_port_code, icu],
  include_directories: inc,
  link_with: [common_static, pgport_static],
  build_by_default: false,
  kwargs: default_bin_args + {
    'install': false,
  }
 )
 norm_test = executable('norm_test',
  ['norm_test.c', norm_test_table],
  dependencies: [frontend_port_code],
@ -86,6 +116,16 @@ norm_test = executable('norm_test',
 update_unicode_dep = []
 if not meson.is_cross_build()
  update_unicode_dep += custom_target('category_test.run',
    output: 'category_test.run',
    input: update_unicode_targets,
    command: [category_test, UNICODE_VERSION],
    build_by_default: false,
    build_always_stale: true,
  )
 endif
 if not meson.is_cross_build()
  update_unicode_dep += custom_target('norm_test.run',
    output: 'norm_test.run',
--- a/src/common/unicode/norm_test.c
+++ b/src/common/unicode/norm_test.c
@ -81,6 +81,6 @@ main(int argc, char **argv)
 		}
 	}
-	printf("All tests successful!\n");
+	printf("norm_test: All tests successful!\n");
 	exit(0);
 }
--- a/src/common/unicode_category.c
+++ b/src/common/unicode_category.c
@ -0,0 +1,195 @@
 /*-------------------------------------------------------------------------
 * unicode_category.c
 *		Determine general category of Unicode characters.
 *
 * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/common/unicode_category.c
 *
 *-------------------------------------------------------------------------
 */
 #ifndef FRONTEND
 #include "postgres.h"
 #else
 #include "postgres_fe.h"
 #endif
 #include "common/unicode_category.h"
 #include "common/unicode_category_table.h"
 /*
 * Look up the Unicode general category of the given codepoint.
 *
 * Performs a binary search over unicode_categories[], an array of
 * [first, last] codepoint ranges, and returns the category of the range that
 * contains ucs. The codepoint is expected to lie between the start of the
 * first range and the end of the last one (asserted). If no range contains
 * it, the result is (pg_unicode_category) -1, after an assertion failure in
 * assert-enabled builds.
 */
 pg_unicode_category
 unicode_category(pg_wchar ucs)
 {
 	int			lo = 0;
 	int			hi = lengthof(unicode_categories) - 1;

 	Assert(ucs >= unicode_categories[0].first &&
 		   ucs <= unicode_categories[hi].last);

 	while (lo <= hi)
 	{
 		int			probe = (lo + hi) / 2;

 		if (ucs < unicode_categories[probe].first)
 			hi = probe - 1;		/* target lies in the lower half */
 		else if (ucs > unicode_categories[probe].last)
 			lo = probe + 1;		/* target lies in the upper half */
 		else
 			return unicode_categories[probe].category;
 	}

 	/* not covered by any range */
 	Assert(false);
 	return (pg_unicode_category) - 1;
 }
 /*
 * Long descriptive name of a Unicode general category, e.g.
 * "Uppercase_Letter" for PG_U_UPPERCASE_LETTER.
 *
 * For a value that is not a known category, the result is "Unrecognized"
 * (after an assertion failure in assert-enabled builds).
 */
 const char *
 unicode_category_string(pg_unicode_category category)
 {
 	/* indexed by pg_unicode_category value */
 	static const char *const long_names[] = {
 		[PG_U_UNASSIGNED] = "Unassigned",
 		[PG_U_UPPERCASE_LETTER] = "Uppercase_Letter",
 		[PG_U_LOWERCASE_LETTER] = "Lowercase_Letter",
 		[PG_U_TITLECASE_LETTER] = "Titlecase_Letter",
 		[PG_U_MODIFIER_LETTER] = "Modifier_Letter",
 		[PG_U_OTHER_LETTER] = "Other_Letter",
 		[PG_U_NONSPACING_MARK] = "Nonspacing_Mark",
 		[PG_U_ENCLOSING_MARK] = "Enclosing_Mark",
 		[PG_U_SPACING_MARK] = "Spacing_Mark",
 		[PG_U_DECIMAL_NUMBER] = "Decimal_Number",
 		[PG_U_LETTER_NUMBER] = "Letter_Number",
 		[PG_U_OTHER_NUMBER] = "Other_Number",
 		[PG_U_SPACE_SEPARATOR] = "Space_Separator",
 		[PG_U_LINE_SEPARATOR] = "Line_Separator",
 		[PG_U_PARAGRAPH_SEPARATOR] = "Paragraph_Separator",
 		[PG_U_CONTROL] = "Control",
 		[PG_U_FORMAT] = "Format",
 		[PG_U_PRIVATE_USE] = "Private_Use",
 		[PG_U_SURROGATE] = "Surrogate",
 		[PG_U_DASH_PUNCTUATION] = "Dash_Punctuation",
 		[PG_U_OPEN_PUNCTUATION] = "Open_Punctuation",
 		[PG_U_CLOSE_PUNCTUATION] = "Close_Punctuation",
 		[PG_U_CONNECTOR_PUNCTUATION] = "Connector_Punctuation",
 		[PG_U_OTHER_PUNCTUATION] = "Other_Punctuation",
 		[PG_U_MATH_SYMBOL] = "Math_Symbol",
 		[PG_U_CURRENCY_SYMBOL] = "Currency_Symbol",
 		[PG_U_MODIFIER_SYMBOL] = "Modifier_Symbol",
 		[PG_U_OTHER_SYMBOL] = "Other_Symbol",
 		[PG_U_INITIAL_PUNCTUATION] = "Initial_Punctuation",
 		[PG_U_FINAL_PUNCTUATION] = "Final_Punctuation",
 	};

 	/* the unsigned comparison also rejects negative values */
 	if ((unsigned int) category < sizeof(long_names) / sizeof(long_names[0]) &&
 		long_names[category] != NULL)
 		return long_names[category];

 	Assert(false);
 	return "Unrecognized";		/* keep compiler quiet */
 }
 /*
 * Two-letter abbreviation of a Unicode general category, e.g. "Lu" for
 * PG_U_UPPERCASE_LETTER.
 *
 * For a value that is not a known category, the result is "??" (after an
 * assertion failure in assert-enabled builds).
 */
 const char *
 unicode_category_abbrev(pg_unicode_category category)
 {
 	/* indexed by pg_unicode_category value */
 	static const char *const short_names[] = {
 		[PG_U_UNASSIGNED] = "Cn",
 		[PG_U_UPPERCASE_LETTER] = "Lu",
 		[PG_U_LOWERCASE_LETTER] = "Ll",
 		[PG_U_TITLECASE_LETTER] = "Lt",
 		[PG_U_MODIFIER_LETTER] = "Lm",
 		[PG_U_OTHER_LETTER] = "Lo",
 		[PG_U_NONSPACING_MARK] = "Mn",
 		[PG_U_ENCLOSING_MARK] = "Me",
 		[PG_U_SPACING_MARK] = "Mc",
 		[PG_U_DECIMAL_NUMBER] = "Nd",
 		[PG_U_LETTER_NUMBER] = "Nl",
 		[PG_U_OTHER_NUMBER] = "No",
 		[PG_U_SPACE_SEPARATOR] = "Zs",
 		[PG_U_LINE_SEPARATOR] = "Zl",
 		[PG_U_PARAGRAPH_SEPARATOR] = "Zp",
 		[PG_U_CONTROL] = "Cc",
 		[PG_U_FORMAT] = "Cf",
 		[PG_U_PRIVATE_USE] = "Co",
 		[PG_U_SURROGATE] = "Cs",
 		[PG_U_DASH_PUNCTUATION] = "Pd",
 		[PG_U_OPEN_PUNCTUATION] = "Ps",
 		[PG_U_CLOSE_PUNCTUATION] = "Pe",
 		[PG_U_CONNECTOR_PUNCTUATION] = "Pc",
 		[PG_U_OTHER_PUNCTUATION] = "Po",
 		[PG_U_MATH_SYMBOL] = "Sm",
 		[PG_U_CURRENCY_SYMBOL] = "Sc",
 		[PG_U_MODIFIER_SYMBOL] = "Sk",
 		[PG_U_OTHER_SYMBOL] = "So",
 		[PG_U_INITIAL_PUNCTUATION] = "Pi",
 		[PG_U_FINAL_PUNCTUATION] = "Pf",
 	};

 	/* the unsigned comparison also rejects negative values */
 	if ((unsigned int) category < sizeof(short_names) / sizeof(short_names[0]) &&
 		short_names[category] != NULL)
 		return short_names[category];

 	Assert(false);
 	return "??";				/* keep compiler quiet */
 }
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@ -12019,6 +12019,18 @@
  proname => 'pg_partition_root', prorettype => 'regclass',
  proargtypes => 'regclass', prosrc => 'pg_partition_root' },
 { oid => '4549', descr => 'Unicode version used by Postgres',
  proname => 'unicode_version', prorettype => 'text', proargtypes => '',
  prosrc => 'unicode_version' },
 { oid => '6099', descr => 'Unicode version used by ICU, if enabled',
  proname => 'icu_unicode_version', prorettype => 'text', proargtypes => '',
  prosrc => 'icu_unicode_version' },
 { oid => '6105', descr => 'check valid Unicode',
  proname => 'unicode_assigned', prorettype => 'bool', proargtypes => 'text',
  prosrc => 'unicode_assigned' },
 { oid => '4350', descr => 'Unicode normalization',
  proname => 'normalize', prorettype => 'text', proargtypes => 'text text',
  prosrc => 'unicode_normalize_func' },
--- a/src/include/common/unicode_category.h
+++ b/src/include/common/unicode_category.h
@ -0,0 +1,68 @@
 /*-------------------------------------------------------------------------
 *
 * unicode_category.h
 *	  Routines for determining the category of Unicode characters.
 *
 * These definitions can be used by both frontend and backend code.
 *
 * Copyright (c) 2017-2023, PostgreSQL Global Development Group
 *
 * src/include/common/unicode_category.h
 *
 *-------------------------------------------------------------------------
 */
 #ifndef UNICODE_CATEGORY_H
 #define UNICODE_CATEGORY_H
 #include "mb/pg_wchar.h"
 /*
 * Unicode General Category Values
 *
 * See: https://www.unicode.org/reports/tr44/#General_Category_Values
 *
 * The Unicode stability policy guarantees: "The enumeration of
 * General_Category property values is fixed. No new values will be
 * added". See: https://www.unicode.org/policies/stability_policy.html
 *
 * Numeric values chosen to match corresponding ICU UCharCategory.
 *
 * The comment on each value gives its two-letter abbreviation (as returned
 * by unicode_category_abbrev()) followed by its long name (as returned by
 * unicode_category_string()).
 */
 typedef enum pg_unicode_category
 {
 	PG_U_UNASSIGNED = 0,		/* Cn: Unassigned */
 	PG_U_UPPERCASE_LETTER = 1,	/* Lu: Uppercase_Letter */
 	PG_U_LOWERCASE_LETTER = 2,	/* Ll: Lowercase_Letter */
 	PG_U_TITLECASE_LETTER = 3,	/* Lt: Titlecase_Letter */
 	PG_U_MODIFIER_LETTER = 4,	/* Lm: Modifier_Letter */
 	PG_U_OTHER_LETTER = 5,		/* Lo: Other_Letter */
 	PG_U_NONSPACING_MARK = 6,	/* Mn: Nonspacing_Mark */
 	PG_U_ENCLOSING_MARK = 7,	/* Me: Enclosing_Mark */
 	PG_U_SPACING_MARK = 8,		/* Mc: Spacing_Mark */
 	PG_U_DECIMAL_NUMBER = 9,	/* Nd: Decimal_Number */
 	PG_U_LETTER_NUMBER = 10,	/* Nl: Letter_Number */
 	PG_U_OTHER_NUMBER = 11,		/* No: Other_Number */
 	PG_U_SPACE_SEPARATOR = 12,	/* Zs: Space_Separator */
 	PG_U_LINE_SEPARATOR = 13,	/* Zl: Line_Separator */
 	PG_U_PARAGRAPH_SEPARATOR = 14,	/* Zp: Paragraph_Separator */
 	PG_U_CONTROL = 15,			/* Cc: Control */
 	PG_U_FORMAT = 16,			/* Cf: Format */
 	PG_U_PRIVATE_USE = 17,		/* Co: Private_Use */
 	PG_U_SURROGATE = 18,		/* Cs: Surrogate */
 	PG_U_DASH_PUNCTUATION = 19, /* Pd: Dash_Punctuation */
 	PG_U_OPEN_PUNCTUATION = 20, /* Ps: Open_Punctuation */
 	PG_U_CLOSE_PUNCTUATION = 21,	/* Pe: Close_Punctuation */
 	PG_U_CONNECTOR_PUNCTUATION = 22,	/* Pc: Connector_Punctuation */
 	PG_U_OTHER_PUNCTUATION = 23,	/* Po: Other_Punctuation */
 	PG_U_MATH_SYMBOL = 24,		/* Sm: Math_Symbol */
 	PG_U_CURRENCY_SYMBOL = 25,	/* Sc: Currency_Symbol */
 	PG_U_MODIFIER_SYMBOL = 26,	/* Sk: Modifier_Symbol */
 	PG_U_OTHER_SYMBOL = 27,		/* So: Other_Symbol */
 	PG_U_INITIAL_PUNCTUATION = 28,	/* Pi: Initial_Punctuation */
 	PG_U_FINAL_PUNCTUATION = 29 /* Pf: Final_Punctuation */
 } pg_unicode_category;
 /*
 * unicode_category() returns the general category of a codepoint; the other
 * two functions return the long name and the two-letter abbreviation of a
 * category, respectively. All three prototypes are declared "extern" so that
 * they are consistent with one another.
 */
 extern pg_unicode_category unicode_category(pg_wchar ucs);
 extern const char *unicode_category_string(pg_unicode_category category);
 extern const char *unicode_category_abbrev(pg_unicode_category category);
 #endif							/* UNICODE_CATEGORY_H */
--- a/src/include/common/unicode_category_table.h
+++ b/src/include/common/unicode_category_table.h
--- a/src/include/common/unicode_version.h
+++ b/src/include/common/unicode_version.h
@ -0,0 +1,14 @@
 /*-------------------------------------------------------------------------
 *
 * unicode_version.h
 *	  Unicode version used by Postgres.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/common/unicode_version.h
 *
 *-------------------------------------------------------------------------
 */
 #define PG_UNICODE_VERSION		"15.1"
--- a/src/test/icu/t/010_database.pl
+++ b/src/test/icu/t/010_database.pl
@ -27,6 +27,10 @@ CREATE TABLE icu (def text, en text COLLATE "en-x-icu", upfirst text COLLATE upp
 INSERT INTO icu VALUES ('a', 'a', 'a'), ('b', 'b', 'b'), ('A', 'A', 'A'), ('B', 'B', 'B');
 });
 is( $node1->safe_psql('dbicu', q{SELECT icu_unicode_version() IS NOT NULL}),
 	qq(t),
 	'ICU unicode version defined');
 is( $node1->safe_psql('dbicu', q{SELECT def FROM icu ORDER BY def}),
 	qq(A
 a
--- a/src/test/regress/expected/unicode.out
+++ b/src/test/regress/expected/unicode.out
@ -8,6 +8,24 @@ SELECT U&'\0061\0308bc' <> U&'\00E4bc' COLLATE "C" AS sanity_check;
 t
 (1 row)
 SELECT unicode_version() IS NOT NULL;
 ?column? 
 ----------
 t
 (1 row)
 SELECT unicode_assigned(U&'abc');
 unicode_assigned 
 ------------------
 t
 (1 row)
 SELECT unicode_assigned(U&'abc\+10FFFF');
 unicode_assigned 
 ------------------
 f
 (1 row)
 SELECT normalize('');
 normalize 
 -----------
--- a/src/test/regress/sql/unicode.sql
+++ b/src/test/regress/sql/unicode.sql
@ -5,6 +5,10 @@ SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
 SELECT U&'\0061\0308bc' <> U&'\00E4bc' COLLATE "C" AS sanity_check;
 SELECT unicode_version() IS NOT NULL;
 SELECT unicode_assigned(U&'abc');
 SELECT unicode_assigned(U&'abc\+10FFFF');
 SELECT normalize('');
 SELECT normalize(U&'\0061\0308\24D1c') = U&'\00E4\24D1c' COLLATE "C" AS test_default;
 SELECT normalize(U&'\0061\0308\24D1c', NFC) = U&'\00E4\24D1c' COLLATE "C" AS test_nfc;