Treat Unicode codepoints of category "Format" as non-spacing
Commit d8594d123 updated the list of non-spacing codepoints used for calculating display width, but in doing so inadvertently removed some, since the script used for that commit only considered combining characters. For complete coverage of zero-width characters, include codepoints in the category Cf (Format). To reflect the wider purpose, also rename files and update comments that referred specifically to combining characters. Some of these ranges have been missing since v12, but due to lack of field complaints it was determined not important enough to justify adding special-case logic to the backbranches. Kyotaro Horiguchi Report by Pavel Stehule Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRBE8yvpQ0FSkPCoe0Ny1jAAsAQ6j3qMgVwWvkqAoaaNmQ%40mail.gmail.com
This commit is contained in:
parent
bb629c294b
commit
0bd9c62973
@ -18,7 +18,7 @@ LIBS += $(PTHREAD_LIBS)
|
||||
# By default, do nothing.
|
||||
all:
|
||||
|
||||
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
|
||||
update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
|
||||
mv $^ $(top_srcdir)/src/include/common/
|
||||
$(MAKE) normalization-check
|
||||
|
||||
@ -35,7 +35,7 @@ unicode_norm_hashfunc.h: unicode_norm_table.h
|
||||
unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt
|
||||
$(PERL) $<
|
||||
|
||||
unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
|
||||
unicode_nonspacing_table.h: generate-unicode_nonspacing_table.pl UnicodeData.txt
|
||||
$(PERL) $^ >$@
|
||||
|
||||
unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt
|
||||
|
@ -15,9 +15,9 @@ my $prev_codepoint;
|
||||
my $count = 0;
|
||||
|
||||
print
|
||||
"/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */\n\n";
|
||||
"/* generated by src/common/unicode/generate-unicode_nonspacing_table.pl, do not edit */\n\n";
|
||||
|
||||
print "static const struct mbinterval combining[] = {\n";
|
||||
print "static const struct mbinterval nonspacing[] = {\n";
|
||||
|
||||
foreach my $line (<ARGV>)
|
||||
{
|
||||
@ -25,9 +25,11 @@ foreach my $line (<ARGV>)
|
||||
my @fields = split ';', $line;
|
||||
$codepoint = hex $fields[0];
|
||||
|
||||
if ($fields[2] eq 'Me' || $fields[2] eq 'Mn')
|
||||
# Me and Mn refer to combining characters
|
||||
# Cf refers to format characters
|
||||
if ($fields[2] eq 'Me' || $fields[2] eq 'Mn' || $fields[2] eq 'Cf')
|
||||
{
|
||||
# combining character, save for start of range
|
||||
# non-spacing character, save for start of range
|
||||
if (!defined($range_start))
|
||||
{
|
||||
$range_start = $codepoint;
|
||||
@ -35,7 +37,7 @@ foreach my $line (<ARGV>)
|
||||
}
|
||||
else
|
||||
{
|
||||
# not a combining character, print out previous range if any
|
||||
# not a non-spacing character, print out previous range if any
|
||||
if (defined($range_start))
|
||||
{
|
||||
printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_codepoint;
|
@ -620,7 +620,7 @@ mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
|
||||
* value of -1.
|
||||
*
|
||||
* - Non-spacing and enclosing combining characters (general
|
||||
* category code Mn or Me in the Unicode database) have a
|
||||
* category code Mn, Me or Cf in the Unicode database) have a
|
||||
* column width of 0.
|
||||
*
|
||||
* - Spacing characters in the East Asian Wide (W) or East Asian
|
||||
@ -638,7 +638,7 @@ mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
|
||||
static int
|
||||
ucs_wcwidth(pg_wchar ucs)
|
||||
{
|
||||
#include "common/unicode_combining_table.h"
|
||||
#include "common/unicode_nonspacing_table.h"
|
||||
#include "common/unicode_east_asian_fw_table.h"
|
||||
|
||||
/* test for 8-bit control characters */
|
||||
@ -657,8 +657,8 @@ ucs_wcwidth(pg_wchar ucs)
|
||||
* factor for display width leads to the correct behavior, so do that
|
||||
* search first.
|
||||
*/
|
||||
if (mbbisearch(ucs, combining,
|
||||
sizeof(combining) / sizeof(struct mbinterval) - 1))
|
||||
if (mbbisearch(ucs, nonspacing,
|
||||
sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
|
||||
return 0;
|
||||
|
||||
/* binary search in table of wide characters */
|
||||
|
@ -1,6 +1,7 @@
|
||||
/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */
|
||||
/* generated by src/common/unicode/generate-unicode_nonspacing_table.pl, do not edit */
|
||||
|
||||
static const struct mbinterval combining[] = {
|
||||
static const struct mbinterval nonspacing[] = {
|
||||
{0x00AD, 0x00AD},
|
||||
{0x0300, 0x036F},
|
||||
{0x0483, 0x0489},
|
||||
{0x0591, 0x05BD},
|
||||
@ -8,13 +9,16 @@ static const struct mbinterval combining[] = {
|
||||
{0x05C1, 0x05C2},
|
||||
{0x05C4, 0x05C5},
|
||||
{0x05C7, 0x05C7},
|
||||
{0x0600, 0x0605},
|
||||
{0x0610, 0x061A},
|
||||
{0x061C, 0x061C},
|
||||
{0x064B, 0x065F},
|
||||
{0x0670, 0x0670},
|
||||
{0x06D6, 0x06DC},
|
||||
{0x06D6, 0x06DD},
|
||||
{0x06DF, 0x06E4},
|
||||
{0x06E7, 0x06E8},
|
||||
{0x06EA, 0x06ED},
|
||||
{0x070F, 0x070F},
|
||||
{0x0711, 0x0711},
|
||||
{0x0730, 0x074A},
|
||||
{0x07A6, 0x07B0},
|
||||
@ -25,9 +29,8 @@ static const struct mbinterval combining[] = {
|
||||
{0x0825, 0x0827},
|
||||
{0x0829, 0x082D},
|
||||
{0x0859, 0x085B},
|
||||
{0x0898, 0x089F},
|
||||
{0x08CA, 0x08E1},
|
||||
{0x08E3, 0x0902},
|
||||
{0x0890, 0x089F},
|
||||
{0x08CA, 0x0902},
|
||||
{0x093A, 0x093A},
|
||||
{0x093C, 0x093C},
|
||||
{0x0941, 0x0948},
|
||||
@ -114,8 +117,7 @@ static const struct mbinterval combining[] = {
|
||||
{0x17C6, 0x17C6},
|
||||
{0x17C9, 0x17D3},
|
||||
{0x17DD, 0x17DD},
|
||||
{0x180B, 0x180D},
|
||||
{0x180F, 0x180F},
|
||||
{0x180B, 0x180F},
|
||||
{0x1885, 0x1886},
|
||||
{0x18A9, 0x18A9},
|
||||
{0x1920, 0x1922},
|
||||
@ -152,6 +154,9 @@ static const struct mbinterval combining[] = {
|
||||
{0x1CF4, 0x1CF4},
|
||||
{0x1CF8, 0x1CF9},
|
||||
{0x1DC0, 0x1DFF},
|
||||
{0x200B, 0x200F},
|
||||
{0x202A, 0x202E},
|
||||
{0x2060, 0x206F},
|
||||
{0x20D0, 0x20F0},
|
||||
{0x2CEF, 0x2CF1},
|
||||
{0x2D7F, 0x2D7F},
|
||||
@ -196,6 +201,8 @@ static const struct mbinterval combining[] = {
|
||||
{0xFB1E, 0xFB1E},
|
||||
{0xFE00, 0xFE0F},
|
||||
{0xFE20, 0xFE2F},
|
||||
{0xFEFF, 0xFEFF},
|
||||
{0xFFF9, 0xFFFB},
|
||||
{0x101FD, 0x101FD},
|
||||
{0x102E0, 0x102E0},
|
||||
{0x10376, 0x1037A},
|
||||
@ -213,7 +220,8 @@ static const struct mbinterval combining[] = {
|
||||
{0x1107F, 0x11081},
|
||||
{0x110B3, 0x110B6},
|
||||
{0x110B9, 0x110BA},
|
||||
{0x110C2, 0x110C2},
|
||||
{0x110BD, 0x110BD},
|
||||
{0x110C2, 0x110CD},
|
||||
{0x11100, 0x11102},
|
||||
{0x11127, 0x1112B},
|
||||
{0x1112D, 0x11134},
|
||||
@ -281,15 +289,16 @@ static const struct mbinterval combining[] = {
|
||||
{0x11D95, 0x11D95},
|
||||
{0x11D97, 0x11D97},
|
||||
{0x11EF3, 0x11EF4},
|
||||
{0x13430, 0x13438},
|
||||
{0x16AF0, 0x16AF4},
|
||||
{0x16B30, 0x16B36},
|
||||
{0x16F4F, 0x16F4F},
|
||||
{0x16F8F, 0x16F92},
|
||||
{0x16FE4, 0x16FE4},
|
||||
{0x1BC9D, 0x1BC9E},
|
||||
{0x1CF00, 0x1CF46},
|
||||
{0x1BCA0, 0x1CF46},
|
||||
{0x1D167, 0x1D169},
|
||||
{0x1D17B, 0x1D182},
|
||||
{0x1D173, 0x1D182},
|
||||
{0x1D185, 0x1D18B},
|
||||
{0x1D1AA, 0x1D1AD},
|
||||
{0x1D242, 0x1D244},
|
||||
@ -304,5 +313,5 @@ static const struct mbinterval combining[] = {
|
||||
{0x1E2EC, 0x1E2EF},
|
||||
{0x1E8D0, 0x1E8D6},
|
||||
{0x1E944, 0x1E94A},
|
||||
{0xE0100, 0xE01EF},
|
||||
{0xE0001, 0xE01EF},
|
||||
};
|
Loading…
x
Reference in New Issue
Block a user