mirror of https://github.com/postgres/postgres
Add SQL functions for Unicode normalization
This adds SQL expressions NORMALIZE() and IS NORMALIZED to convert and check Unicode normal forms, per SQL standard. To support fast IS NORMALIZED tests, we pull in a new data file DerivedNormalizationProps.txt from Unicode and build a lookup table from that, using techniques similar to ones already used for other Unicode data. make update-unicode will keep it up to date. We only build and use these tables for the NFC and NFKC forms, because they are too big for NFD and NFKD and the improvement is not significant enough there. Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Andreas Karlsson <andreas@proxel.se> Discussion: https://www.postgresql.org/message-id/flat/c1909f27-c269-2ed9-12f8-3ab72c8caf7a@2ndquadrant.com
This commit is contained in:
parent
070c3d3937
commit
2991ac5fc9
|
@ -934,6 +934,16 @@ CREATE COLLATION ignore_accents (provider = icu, locale = 'und-u-ks-level1-kc-tr
|
|||
such as pattern matching operations. Therefore, they should be used
|
||||
only in cases where they are specifically wanted.
|
||||
</para>
|
||||
|
||||
<tip>
|
||||
<para>
|
||||
To deal with text in different Unicode normalization forms, it is also
|
||||
an option to use the functions/expressions
|
||||
<function>normalize</function> and <literal>is normalized</literal> to
|
||||
preprocess or check the strings, instead of using nondeterministic
|
||||
collations. There are different trade-offs for each approach.
|
||||
</para>
|
||||
</tip>
|
||||
</sect3>
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
|
|
@ -1560,6 +1560,30 @@
|
|||
<entry><literal>Value: 42</literal></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>
|
||||
<indexterm>
|
||||
<primary>normalized</primary>
|
||||
</indexterm>
|
||||
<indexterm>
|
||||
<primary>Unicode normalization</primary>
|
||||
</indexterm>
|
||||
<literal><parameter>string</parameter> is <optional>not</optional> <optional><parameter>form</parameter></optional> normalized</literal>
|
||||
</entry>
|
||||
<entry><type>boolean</type></entry>
|
||||
<entry>
|
||||
Checks whether the string is in the specified Unicode normalization
|
||||
form. The optional parameter specifies the form:
|
||||
<literal>NFC</literal> (default), <literal>NFD</literal>,
|
||||
<literal>NFKC</literal>, <literal>NFKD</literal>. This expression can
|
||||
only be used if the server encoding is <literal>UTF8</literal>. Note
|
||||
that checking for normalization using this expression is often faster
|
||||
than normalizing possibly already normalized strings.
|
||||
</entry>
|
||||
<entry><literal>U&'\0061\0308bc' IS NFD NORMALIZED</literal></entry>
|
||||
<entry><literal>true</literal></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>
|
||||
<indexterm>
|
||||
|
@ -1610,6 +1634,30 @@
|
|||
<entry><literal>tom</literal></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>
|
||||
<indexterm>
|
||||
<primary>normalize</primary>
|
||||
</indexterm>
|
||||
<indexterm>
|
||||
<primary>Unicode normalization</primary>
|
||||
</indexterm>
|
||||
<literal><function>normalize(<parameter>string</parameter> <type>text</type>
|
||||
<optional>, <parameter>form</parameter> </optional>)</function></literal>
|
||||
</entry>
|
||||
<entry><type>text</type></entry>
|
||||
<entry>
|
||||
Converts the string in the first argument to the specified Unicode
|
||||
normalization form. The optional second argument specifies the form
|
||||
as an identifier: <literal>NFC</literal> (default),
|
||||
<literal>NFD</literal>, <literal>NFKC</literal>,
|
||||
<literal>NFKD</literal>. This function can only be used if the server
|
||||
encoding is <literal>UTF8</literal>.
|
||||
</entry>
|
||||
<entry><literal>normalize(U&'\0061\0308bc', NFC)</literal></entry>
|
||||
<entry><literal>U&'\00E4bc'</literal></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>
|
||||
<indexterm>
|
||||
|
|
|
@ -257,7 +257,7 @@ F386 Set identity column generation clause YES
|
|||
F391 Long identifiers YES
|
||||
F392 Unicode escapes in identifiers YES
|
||||
F393 Unicode escapes in literals YES
|
||||
F394 Optional normal form specification NO
|
||||
F394 Optional normal form specification YES
|
||||
F401 Extended joined table YES
|
||||
F401 Extended joined table 01 NATURAL JOIN YES
|
||||
F401 Extended joined table 02 FULL OUTER JOIN YES
|
||||
|
|
|
@ -1400,6 +1400,21 @@ LANGUAGE INTERNAL
|
|||
STRICT STABLE PARALLEL SAFE
|
||||
AS 'jsonb_path_query_first_tz';
|
||||
|
||||
-- Per the SQL standard, NFC is the normalization form used when the
-- second argument is omitted; both functions below default to it.
CREATE OR REPLACE FUNCTION "normalize"(text, text DEFAULT 'NFC')
    RETURNS text
    LANGUAGE internal
    IMMUTABLE STRICT PARALLEL SAFE
AS 'unicode_normalize_func';

CREATE OR REPLACE FUNCTION is_normalized(text, text DEFAULT 'NFC')
    RETURNS boolean
    LANGUAGE internal
    IMMUTABLE STRICT PARALLEL SAFE
AS 'unicode_is_normalized';
|
||||
|
||||
--
|
||||
-- The default permissions for functions mean that anyone can execute them.
|
||||
-- A number of functions shouldn't be executable by just anyone, but rather
|
||||
|
|
|
@ -444,6 +444,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
|
|||
%type <list> substr_list trim_list
|
||||
%type <list> opt_interval interval_second
|
||||
%type <node> overlay_placing substr_from substr_for
|
||||
%type <str> unicode_normal_form
|
||||
|
||||
%type <boolean> opt_instead
|
||||
%type <boolean> opt_unique opt_concurrently opt_verbose opt_full
|
||||
|
@ -664,7 +665,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
|
|||
|
||||
MAPPING MATCH MATERIALIZED MAXVALUE METHOD MINUTE_P MINVALUE MODE MONTH_P MOVE
|
||||
|
||||
NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NONE
|
||||
NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NFC NFD NFKC NFKD NO NONE
|
||||
NORMALIZE NORMALIZED
|
||||
NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF
|
||||
NULLS_P NUMERIC
|
||||
|
||||
|
@ -13491,6 +13493,22 @@ a_expr: c_expr { $$ = $1; }
|
|||
list_make1($1), @2),
|
||||
@2);
|
||||
}
|
||||
| a_expr IS NORMALIZED %prec IS
|
||||
{
|
||||
$$ = (Node *) makeFuncCall(SystemFuncName("is_normalized"), list_make1($1), @2);
|
||||
}
|
||||
| a_expr IS unicode_normal_form NORMALIZED %prec IS
|
||||
{
|
||||
$$ = (Node *) makeFuncCall(SystemFuncName("is_normalized"), list_make2($1, makeStringConst($3, @3)), @2);
|
||||
}
|
||||
| a_expr IS NOT NORMALIZED %prec IS
|
||||
{
|
||||
$$ = makeNotExpr((Node *) makeFuncCall(SystemFuncName("is_normalized"), list_make1($1), @2), @2);
|
||||
}
|
||||
| a_expr IS NOT unicode_normal_form NORMALIZED %prec IS
|
||||
{
|
||||
$$ = makeNotExpr((Node *) makeFuncCall(SystemFuncName("is_normalized"), list_make2($1, makeStringConst($4, @4)), @2), @2);
|
||||
}
|
||||
| DEFAULT
|
||||
{
|
||||
/*
|
||||
|
@ -13934,6 +13952,14 @@ func_expr_common_subexpr:
|
|||
{
|
||||
$$ = (Node *) makeFuncCall(SystemFuncName("date_part"), $3, @1);
|
||||
}
|
||||
| NORMALIZE '(' a_expr ')'
|
||||
{
|
||||
$$ = (Node *) makeFuncCall(SystemFuncName("normalize"), list_make1($3), @1);
|
||||
}
|
||||
| NORMALIZE '(' a_expr ',' unicode_normal_form ')'
|
||||
{
|
||||
$$ = (Node *) makeFuncCall(SystemFuncName("normalize"), list_make2($3, makeStringConst($5, @5)), @1);
|
||||
}
|
||||
| OVERLAY '(' overlay_list ')'
|
||||
{
|
||||
/* overlay(A PLACING B FROM C FOR D) is converted to
|
||||
|
@ -14569,6 +14595,13 @@ extract_arg:
|
|||
| Sconst { $$ = $1; }
|
||||
;
|
||||
|
||||
/*
 * Unicode normalization form keyword, as accepted by NORMALIZE(x, form)
 * and x IS [NOT] form NORMALIZED.
 *
 * Each keyword is reduced to a lower-case C string.  The productions that
 * use this nonterminal wrap that string with makeStringConst() and pass it
 * as the second argument of normalize() / is_normalized().
 */
unicode_normal_form:
|
||||
NFC { $$ = "nfc"; }
|
||||
| NFD { $$ = "nfd"; }
|
||||
| NFKC { $$ = "nfkc"; }
|
||||
| NFKD { $$ = "nfkd"; }
|
||||
;
|
||||
|
||||
/* OVERLAY() arguments
|
||||
* SQL99 defines the OVERLAY() function:
|
||||
* o overlay(text placing text from int for int)
|
||||
|
@ -15315,7 +15348,12 @@ unreserved_keyword:
|
|||
| NAMES
|
||||
| NEW
|
||||
| NEXT
|
||||
| NFC
|
||||
| NFD
|
||||
| NFKC
|
||||
| NFKD
|
||||
| NO
|
||||
| NORMALIZED
|
||||
| NOTHING
|
||||
| NOTIFY
|
||||
| NOWAIT
|
||||
|
@ -15494,6 +15532,7 @@ col_name_keyword:
|
|||
| NATIONAL
|
||||
| NCHAR
|
||||
| NONE
|
||||
| NORMALIZE
|
||||
| NULLIF
|
||||
| NUMERIC
|
||||
| OUT_P
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include "catalog/pg_type.h"
|
||||
#include "common/hashfn.h"
|
||||
#include "common/int.h"
|
||||
#include "common/unicode_norm.h"
|
||||
#include "lib/hyperloglog.h"
|
||||
#include "libpq/pqformat.h"
|
||||
#include "miscadmin.h"
|
||||
|
@ -5976,3 +5977,152 @@ rest_of_char_same(const char *s1, const char *s2, int len)
|
|||
#include "levenshtein.c"
|
||||
#define LEVENSHTEIN_LESS_EQUAL
|
||||
#include "levenshtein.c"
|
||||
|
||||
|
||||
/*
|
||||
* Unicode support
|
||||
*/
|
||||
|
||||
static UnicodeNormalizationForm
|
||||
unicode_norm_form_from_string(const char *formstr)
|
||||
{
|
||||
UnicodeNormalizationForm form = -1;
|
||||
|
||||
/*
|
||||
* Might as well check this while we're here.
|
||||
*/
|
||||
if (GetDatabaseEncoding() != PG_UTF8)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
|
||||
|
||||
if (pg_strcasecmp(formstr, "NFC") == 0)
|
||||
form = UNICODE_NFC;
|
||||
else if (pg_strcasecmp(formstr, "NFD") == 0)
|
||||
form = UNICODE_NFD;
|
||||
else if (pg_strcasecmp(formstr, "NFKC") == 0)
|
||||
form = UNICODE_NFKC;
|
||||
else if (pg_strcasecmp(formstr, "NFKD") == 0)
|
||||
form = UNICODE_NFKD;
|
||||
else
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("invalid normalization form: %s", formstr)));
|
||||
|
||||
return form;
|
||||
}
|
||||
|
||||
Datum
|
||||
unicode_normalize_func(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *input = PG_GETARG_TEXT_PP(0);
|
||||
char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
|
||||
UnicodeNormalizationForm form;
|
||||
int size;
|
||||
pg_wchar *input_chars;
|
||||
pg_wchar *output_chars;
|
||||
unsigned char *p;
|
||||
text *result;
|
||||
int i;
|
||||
|
||||
form = unicode_norm_form_from_string(formstr);
|
||||
|
||||
/* convert to pg_wchar */
|
||||
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
|
||||
input_chars = palloc((size + 1) * sizeof(pg_wchar));
|
||||
p = (unsigned char *) VARDATA_ANY(input);
|
||||
for (i = 0; i < size; i++)
|
||||
{
|
||||
input_chars[i] = utf8_to_unicode(p);
|
||||
p += pg_utf_mblen(p);
|
||||
}
|
||||
input_chars[i] = (pg_wchar) '\0';
|
||||
Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
|
||||
|
||||
/* action */
|
||||
output_chars = unicode_normalize(form, input_chars);
|
||||
|
||||
/* convert back to UTF-8 string */
|
||||
size = 0;
|
||||
for (pg_wchar *wp = output_chars; *wp; wp++)
|
||||
{
|
||||
unsigned char buf[4];
|
||||
|
||||
unicode_to_utf8(*wp, buf);
|
||||
size += pg_utf_mblen(buf);
|
||||
}
|
||||
|
||||
result = palloc(size + VARHDRSZ);
|
||||
SET_VARSIZE(result, size + VARHDRSZ);
|
||||
|
||||
p = (unsigned char *) VARDATA_ANY(result);
|
||||
for (pg_wchar *wp = output_chars; *wp; wp++)
|
||||
{
|
||||
unicode_to_utf8(*wp, p);
|
||||
p += pg_utf_mblen(p);
|
||||
}
|
||||
Assert((char *) p == (char *) result + size + VARHDRSZ);
|
||||
|
||||
PG_RETURN_TEXT_P(result);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check whether the string is in the specified Unicode normalization form.
|
||||
*
|
||||
* This is done by converting the string to the specified normal form and then
|
||||
* comparing that to the original string. To speed that up, we also apply the
|
||||
* "quick check" algorithm specified in UAX #15, which can give a yes or no
|
||||
* answer for many strings by just scanning the string once.
|
||||
*
|
||||
* This function should generally be optimized for the case where the string
|
||||
* is in fact normalized. In that case, we'll end up looking at the entire
|
||||
* string, so it's probably not worth doing any incremental conversion etc.
|
||||
*/
|
||||
Datum
|
||||
unicode_is_normalized(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *input = PG_GETARG_TEXT_PP(0);
|
||||
char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
|
||||
UnicodeNormalizationForm form;
|
||||
int size;
|
||||
pg_wchar *input_chars;
|
||||
pg_wchar *output_chars;
|
||||
unsigned char *p;
|
||||
int i;
|
||||
UnicodeNormalizationQC quickcheck;
|
||||
int output_size;
|
||||
bool result;
|
||||
|
||||
form = unicode_norm_form_from_string(formstr);
|
||||
|
||||
/* convert to pg_wchar */
|
||||
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
|
||||
input_chars = palloc((size + 1) * sizeof(pg_wchar));
|
||||
p = (unsigned char *) VARDATA_ANY(input);
|
||||
for (i = 0; i < size; i++)
|
||||
{
|
||||
input_chars[i] = utf8_to_unicode(p);
|
||||
p += pg_utf_mblen(p);
|
||||
}
|
||||
input_chars[i] = (pg_wchar) '\0';
|
||||
Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
|
||||
|
||||
/* quick check (see UAX #15) */
|
||||
quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
|
||||
if (quickcheck == UNICODE_NORM_QC_YES)
|
||||
PG_RETURN_BOOL(true);
|
||||
else if (quickcheck == UNICODE_NORM_QC_NO)
|
||||
PG_RETURN_BOOL(false);
|
||||
|
||||
/* normalize and compare with original */
|
||||
output_chars = unicode_normalize(form, input_chars);
|
||||
|
||||
output_size = 0;
|
||||
for (pg_wchar *wp = output_chars; *wp; wp++)
|
||||
output_size++;
|
||||
|
||||
result = (size == output_size) &&
|
||||
(memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
|
||||
|
||||
PG_RETURN_BOOL(result);
|
||||
}
|
||||
|
|
|
@ -3,5 +3,6 @@
|
|||
|
||||
# Downloaded files
|
||||
/CompositionExclusions.txt
|
||||
/DerivedNormalizationProps.txt
|
||||
/NormalizationTest.txt
|
||||
/UnicodeData.txt
|
||||
|
|
|
@ -18,14 +18,14 @@ LIBS += $(PTHREAD_LIBS)
|
|||
# By default, do nothing.
|
||||
all:
|
||||
|
||||
update-unicode: unicode_norm_table.h unicode_combining_table.h
|
||||
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h
|
||||
$(MAKE) normalization-check
|
||||
mv unicode_norm_table.h unicode_combining_table.h ../../../src/include/common/
|
||||
mv $^ ../../../src/include/common/
|
||||
|
||||
# These files are part of the Unicode Character Database. Download
|
||||
# them on demand. The dependency on Makefile.global is for
|
||||
# UNICODE_VERSION.
|
||||
UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
|
||||
UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
|
||||
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
|
||||
|
||||
# Generation of conversion tables used for string normalization with
|
||||
|
@ -36,6 +36,9 @@ unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt Composition
|
|||
unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
|
||||
$(PERL) $^ >$@
|
||||
|
||||
unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt
|
||||
$(PERL) $^ >$@
|
||||
|
||||
# Test suite
|
||||
normalization-check: norm_test
|
||||
./norm_test
|
||||
|
|
|
@ -0,0 +1,86 @@
|
|||
#!/usr/bin/perl
#
# Generate table of Unicode normalization "quick check" properties
# (see UAX #15).  Pass DerivedNormalizationProps.txt as argument.  The
# output is on stdout.
#
# Copyright (c) 2020, PostgreSQL Global Development Group

use strict;
use warnings;

# $data{PROPERTY}{CODEPOINT} = quick check value as found in the input file
my %data;

# Quick check values we know how to emit, and the C symbol for each.
my %qc_symbol = (
	N => 'UNICODE_NORM_QC_NO',
	M => 'UNICODE_NORM_QC_MAYBE');

print "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n";

print <<EOS;
#include "common/unicode_norm.h"

/*
* We use a bit field here to save space.
*/
typedef struct
{
unsigned int codepoint:21;
signed int quickcheck:4; /* really UnicodeNormalizationQC */
} pg_unicode_normprops;
EOS

# Read the input one line at a time rather than slurping it into a list.
while (my $line = <ARGV>)
{
	chomp $line;

	# Strip comments; skip lines that are empty afterwards.
	$line =~ s/\s*#.*$//;
	next if $line eq '';

	# Fields are "codepoint-or-range ; property ; value".
	my ($codepoint, $prop, $value) = split /\s*;\s*/, $line;

	# Only the *_QC properties are of interest here.
	next if !defined $prop || $prop !~ /_QC/;

	my ($first, $last);
	if ($codepoint =~ /\.\./)
	{
		($first, $last) = split /\.\./, $codepoint;
	}
	else
	{
		$first = $last = $codepoint;
	}

	# Expand ranges so that every code point gets its own table entry.
	foreach my $cp (hex($first) .. hex($last))
	{
		$data{$prop}{$cp} = $value;
	}
}

# We create a separate array for each normalization form rather than,
# say, a two-dimensional array, because that array would be very
# sparse and would create unnecessary overhead especially for the NFC
# lookup.
foreach my $prop (sort keys %data)
{
	# Don't build the tables for the "D" forms because they are too
	# big.  See also unicode_is_normalized_quickcheck().
	next if $prop eq "NFD_QC" || $prop eq "NFKD_QC";

	print "\n";
	print "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";

	# Work through a reference; there is no need to copy the hash.
	my $subdata = $data{$prop};

	# Entries are emitted in ascending code point order.
	foreach my $cp (sort { $a <=> $b } keys %$subdata)
	{
		my $value = $subdata->{$cp};
		my $qc = defined $value ? $qc_symbol{$value} : undef;

		# Fail loudly, and say what was wrong, on anything unexpected.
		die sprintf(
			"unexpected %s value \"%s\" for code point U+%04X\n",
			$prop, defined $value ? $value : '(undef)', $cp)
		  unless defined $qc;

		printf "\t{0x%04X, %s},\n", $cp, $qc;
	}

	print "};\n";
}
|
|
@ -20,6 +20,9 @@
|
|||
|
||||
#include "common/unicode_norm.h"
|
||||
#include "common/unicode_norm_table.h"
|
||||
#ifndef FRONTEND
|
||||
#include "common/unicode_normprops_table.h"
|
||||
#endif
|
||||
|
||||
#ifndef FRONTEND
|
||||
#define ALLOC(size) palloc(size)
|
||||
|
@ -442,3 +445,110 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
|
|||
|
||||
return recomp_chars;
|
||||
}
|
||||
|
||||
/*
|
||||
* Normalization "quick check" algorithm; see
|
||||
* <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
|
||||
*/
|
||||
|
||||
/* We only need this in the backend. */
|
||||
#ifndef FRONTEND
|
||||
|
||||
static uint8
|
||||
get_canonical_class(pg_wchar ch)
|
||||
{
|
||||
pg_unicode_decomposition *entry = get_code_entry(ch);
|
||||
|
||||
if (!entry)
|
||||
return 0;
|
||||
else
|
||||
return entry->comb_class;
|
||||
}
|
||||
|
||||
static int
|
||||
qc_compare(const void *p1, const void *p2)
|
||||
{
|
||||
uint32 v1,
|
||||
v2;
|
||||
|
||||
v1 = ((const pg_unicode_normprops *) p1)->codepoint;
|
||||
v2 = ((const pg_unicode_normprops *) p2)->codepoint;
|
||||
return (v1 - v2);
|
||||
}
|
||||
|
||||
/*
|
||||
* Look up the normalization quick check character property
|
||||
*/
|
||||
static UnicodeNormalizationQC
|
||||
qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
|
||||
{
|
||||
pg_unicode_normprops key;
|
||||
pg_unicode_normprops *found = NULL;
|
||||
|
||||
key.codepoint = ch;
|
||||
|
||||
switch (form)
|
||||
{
|
||||
case UNICODE_NFC:
|
||||
found = bsearch(&key,
|
||||
UnicodeNormProps_NFC_QC,
|
||||
lengthof(UnicodeNormProps_NFC_QC),
|
||||
sizeof(pg_unicode_normprops),
|
||||
qc_compare);
|
||||
break;
|
||||
case UNICODE_NFKC:
|
||||
found = bsearch(&key,
|
||||
UnicodeNormProps_NFKC_QC,
|
||||
lengthof(UnicodeNormProps_NFKC_QC),
|
||||
sizeof(pg_unicode_normprops),
|
||||
qc_compare);
|
||||
break;
|
||||
default:
|
||||
Assert(false);
|
||||
break;
|
||||
}
|
||||
|
||||
if (found)
|
||||
return found->quickcheck;
|
||||
else
|
||||
return UNICODE_NORM_QC_YES;
|
||||
}
|
||||
|
||||
UnicodeNormalizationQC
|
||||
unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
|
||||
{
|
||||
uint8 lastCanonicalClass = 0;
|
||||
UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
|
||||
|
||||
/*
|
||||
* For the "D" forms, we don't run the quickcheck. We don't include the
|
||||
* lookup tables for those because they are huge, checking for these
|
||||
* particular forms is less common, and running the slow path is faster
|
||||
* for the "D" forms than the "C" forms because you don't need to
|
||||
* recompose, which is slow.
|
||||
*/
|
||||
if (form == UNICODE_NFD || form == UNICODE_NFKD)
|
||||
return UNICODE_NORM_QC_MAYBE;
|
||||
|
||||
for (const pg_wchar *p = input; *p; p++)
|
||||
{
|
||||
pg_wchar ch = *p;
|
||||
uint8 canonicalClass;
|
||||
UnicodeNormalizationQC check;
|
||||
|
||||
canonicalClass = get_canonical_class(ch);
|
||||
if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
|
||||
return UNICODE_NORM_QC_NO;
|
||||
|
||||
check = qc_is_allowed(form, ch);
|
||||
if (check == UNICODE_NORM_QC_NO)
|
||||
return UNICODE_NORM_QC_NO;
|
||||
else if (check == UNICODE_NORM_QC_MAYBE)
|
||||
result = UNICODE_NORM_QC_MAYBE;
|
||||
|
||||
lastCanonicalClass = canonicalClass;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
#endif /* !FRONTEND */
|
||||
|
|
|
@ -53,6 +53,6 @@
|
|||
*/
|
||||
|
||||
/* yyyymmddN */
|
||||
#define CATALOG_VERSION_NO 202004021
|
||||
#define CATALOG_VERSION_NO 202004022
|
||||
|
||||
#endif
|
||||
|
|
|
@ -10851,4 +10851,12 @@
|
|||
proname => 'pg_partition_root', prorettype => 'regclass',
|
||||
proargtypes => 'regclass', prosrc => 'pg_partition_root' },
|
||||
|
||||
{ oid => '4350', descr => 'Unicode normalization',
|
||||
proname => 'normalize', prorettype => 'text',
|
||||
proargtypes => 'text text', prosrc => 'unicode_normalize_func' },
|
||||
|
||||
{ oid => '4351', descr => 'check Unicode normalization',
|
||||
proname => 'is_normalized', prorettype => 'bool',
|
||||
proargtypes => 'text text', prosrc => 'unicode_is_normalized' },
|
||||
|
||||
]
|
||||
|
|
|
@ -24,6 +24,16 @@ typedef enum
|
|||
UNICODE_NFKD = 3,
|
||||
} UnicodeNormalizationForm;
|
||||
|
||||
/* see UAX #15 */
|
||||
/*
 * Result of the normalization "quick check", both for a single code point
 * and for a whole string (as returned by unicode_is_normalized_quickcheck()).
 * The generated lookup tables store these values in a signed 4-bit field.
 */
typedef enum
|
||||
{
|
||||
/* not in the requested form; is_normalized reports false */
UNICODE_NORM_QC_NO = 0,
|
||||
/* in the requested form; is_normalized reports true */
UNICODE_NORM_QC_YES = 1,
|
||||
/* undecided; the caller has to normalize and compare */
UNICODE_NORM_QC_MAYBE = -1,
|
||||
} UnicodeNormalizationQC;
|
||||
|
||||
extern pg_wchar *unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input);
|
||||
|
||||
extern UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input);
|
||||
|
||||
#endif /* UNICODE_NORM_H */
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -260,8 +260,14 @@ PG_KEYWORD("natural", NATURAL, TYPE_FUNC_NAME_KEYWORD)
|
|||
PG_KEYWORD("nchar", NCHAR, COL_NAME_KEYWORD)
|
||||
PG_KEYWORD("new", NEW, UNRESERVED_KEYWORD)
|
||||
PG_KEYWORD("next", NEXT, UNRESERVED_KEYWORD)
|
||||
PG_KEYWORD("nfc", NFC, UNRESERVED_KEYWORD)
|
||||
PG_KEYWORD("nfd", NFD, UNRESERVED_KEYWORD)
|
||||
PG_KEYWORD("nfkc", NFKC, UNRESERVED_KEYWORD)
|
||||
PG_KEYWORD("nfkd", NFKD, UNRESERVED_KEYWORD)
|
||||
PG_KEYWORD("no", NO, UNRESERVED_KEYWORD)
|
||||
PG_KEYWORD("none", NONE, COL_NAME_KEYWORD)
|
||||
PG_KEYWORD("normalize", NORMALIZE, COL_NAME_KEYWORD)
|
||||
PG_KEYWORD("normalized", NORMALIZED, UNRESERVED_KEYWORD)
|
||||
PG_KEYWORD("not", NOT, RESERVED_KEYWORD)
|
||||
PG_KEYWORD("nothing", NOTHING, UNRESERVED_KEYWORD)
|
||||
PG_KEYWORD("notify", NOTIFY, UNRESERVED_KEYWORD)
|
||||
|
|
|
@ -0,0 +1,81 @@
|
|||
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
|
||||
\if :skip_test
|
||||
\quit
|
||||
\endif
|
||||
SELECT U&'\0061\0308bc' <> U&'\00E4bc' COLLATE "C" AS sanity_check;
|
||||
sanity_check
|
||||
--------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT normalize(U&'\0061\0308\24D1c') = U&'\00E4\24D1c' COLLATE "C" AS test_default;
|
||||
test_default
|
||||
--------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT normalize(U&'\0061\0308\24D1c', NFC) = U&'\00E4\24D1c' COLLATE "C" AS test_nfc;
|
||||
test_nfc
|
||||
----------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT normalize(U&'\00E4bc', NFC) = U&'\00E4bc' COLLATE "C" AS test_nfc_idem;
|
||||
test_nfc_idem
|
||||
---------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT normalize(U&'\00E4\24D1c', NFD) = U&'\0061\0308\24D1c' COLLATE "C" AS test_nfd;
|
||||
test_nfd
|
||||
----------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT normalize(U&'\0061\0308\24D1c', NFKC) = U&'\00E4bc' COLLATE "C" AS test_nfkc;
|
||||
test_nfkc
|
||||
-----------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT normalize(U&'\00E4\24D1c', NFKD) = U&'\0061\0308bc' COLLATE "C" AS test_nfkd;
|
||||
test_nfkd
|
||||
-----------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT "normalize"('abc', 'def'); -- run-time error
|
||||
ERROR: invalid normalization form: def
|
||||
SELECT U&'\00E4\24D1c' IS NORMALIZED AS test_default;
|
||||
test_default
|
||||
--------------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT U&'\00E4\24D1c' IS NFC NORMALIZED AS test_nfc;
|
||||
test_nfc
|
||||
----------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT num, val,
|
||||
val IS NFC NORMALIZED AS NFC,
|
||||
val IS NFD NORMALIZED AS NFD,
|
||||
val IS NFKC NORMALIZED AS NFKC,
|
||||
val IS NFKD NORMALIZED AS NFKD
|
||||
FROM
|
||||
(VALUES (1, U&'\00E4bc'),
|
||||
(2, U&'\0061\0308bc'),
|
||||
(3, U&'\00E4\24D1c'),
|
||||
(4, U&'\0061\0308\24D1c')) vals (num, val)
|
||||
ORDER BY num;
|
||||
num | val | nfc | nfd | nfkc | nfkd
|
||||
-----+-----+-----+-----+------+------
|
||||
1 | äbc | t | f | t | f
|
||||
2 | äbc | f | t | f | t
|
||||
3 | äⓑc | t | f | f | f
|
||||
4 | äⓑc | f | t | f | f
|
||||
(4 rows)
|
||||
|
||||
SELECT is_normalized('abc', 'def'); -- run-time error
|
||||
ERROR: invalid normalization form: def
|
|
@ -0,0 +1,3 @@
|
|||
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
|
||||
\if :skip_test
|
||||
\quit
|
|
@ -27,7 +27,7 @@ test: strings numerology point lseg line box path polygon circle date time timet
|
|||
# geometry depends on point, lseg, box, path, polygon and circle
|
||||
# horology depends on interval, timetz, timestamp, timestamptz
|
||||
# ----------
|
||||
test: geometry horology regex oidjoins type_sanity opr_sanity misc_sanity comments expressions
|
||||
test: geometry horology regex oidjoins type_sanity opr_sanity misc_sanity comments expressions unicode
|
||||
|
||||
# ----------
|
||||
# These four each depend on the previous one
|
||||
|
|
|
@ -49,6 +49,7 @@ test: opr_sanity
|
|||
test: misc_sanity
|
||||
test: comments
|
||||
test: expressions
|
||||
test: unicode
|
||||
test: create_function_1
|
||||
test: create_type
|
||||
test: create_table
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
|
||||
\if :skip_test
|
||||
\quit
|
||||
\endif
|
||||
|
||||
SELECT U&'\0061\0308bc' <> U&'\00E4bc' COLLATE "C" AS sanity_check;
|
||||
|
||||
SELECT normalize(U&'\0061\0308\24D1c') = U&'\00E4\24D1c' COLLATE "C" AS test_default;
|
||||
SELECT normalize(U&'\0061\0308\24D1c', NFC) = U&'\00E4\24D1c' COLLATE "C" AS test_nfc;
|
||||
SELECT normalize(U&'\00E4bc', NFC) = U&'\00E4bc' COLLATE "C" AS test_nfc_idem;
|
||||
SELECT normalize(U&'\00E4\24D1c', NFD) = U&'\0061\0308\24D1c' COLLATE "C" AS test_nfd;
|
||||
SELECT normalize(U&'\0061\0308\24D1c', NFKC) = U&'\00E4bc' COLLATE "C" AS test_nfkc;
|
||||
SELECT normalize(U&'\00E4\24D1c', NFKD) = U&'\0061\0308bc' COLLATE "C" AS test_nfkd;
|
||||
|
||||
SELECT "normalize"('abc', 'def'); -- run-time error
|
||||
|
||||
SELECT U&'\00E4\24D1c' IS NORMALIZED AS test_default;
|
||||
SELECT U&'\00E4\24D1c' IS NFC NORMALIZED AS test_nfc;
|
||||
|
||||
SELECT num, val,
|
||||
val IS NFC NORMALIZED AS NFC,
|
||||
val IS NFD NORMALIZED AS NFD,
|
||||
val IS NFKC NORMALIZED AS NFKC,
|
||||
val IS NFKD NORMALIZED AS NFKD
|
||||
FROM
|
||||
(VALUES (1, U&'\00E4bc'),
|
||||
(2, U&'\0061\0308bc'),
|
||||
(3, U&'\00E4\24D1c'),
|
||||
(4, U&'\0061\0308\24D1c')) vals (num, val)
|
||||
ORDER BY num;
|
||||
|
||||
SELECT is_normalized('abc', 'def'); -- run-time error
|
Loading…
Reference in New Issue