Fix conversion table generator scripts.
convutils.pm relied on implicit conversion of undefined values to integer zero. Some of the conversion scripts were susceptible to regexp greediness. Fix both problems, avoiding whitespace changes in the output. Also update the ICU URLs that have moved. No need to back-patch, because the output of these scripts is also in the source tree, so we shouldn't need to rerun them on back-branches. Author: Kyotaro Horiguchi <horikyoga.ntt@gmail.com> Discussion: https://postgr.es/m/CA%2BhUKGJ7SEGLbj%3D%3DTQCcyKRA9aqj8%2B6L%3DexSq1y25TA%3DWxLziQ%40mail.gmail.com
This commit is contained in:
parent
e47c2602aa
commit
a5073871ea
@ -122,7 +122,7 @@ euc-jis-2004-std.txt sjis-0213-2004-std.txt:
|
||||
$(DOWNLOAD) http://x0213.org/codetable/$(@F)
|
||||
|
||||
gb-18030-2000.xml windows-949-2000.xml:
|
||||
$(DOWNLOAD) https://ssl.icu-project.org/repos/icu/data/trunk/charset/data/xml/$(@F)
|
||||
$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/master/charset/data/xml/$(@F)
|
||||
|
||||
GB2312.TXT:
|
||||
$(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt'
|
||||
|
@ -24,12 +24,13 @@ my @all;
|
||||
|
||||
while (my $line = <$in>)
|
||||
{
|
||||
if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/)
|
||||
if ($line =~ /^0x(\w+)\s*U\+(\w+)\+(\w+)\s*#\s*(\S.*)?\s*$/)
|
||||
{
|
||||
|
||||
# combined characters
|
||||
my ($c, $u1, $u2) = ($1, $2, $3);
|
||||
my $rest = "U+" . $u1 . "+" . $u2 . $4;
|
||||
# The "\t \t" below is just to avoid insubstantial diffs.
|
||||
my $rest = "U+" . $u1 . "+" . $u2 . "\t \t" . $4;
|
||||
my $code = hex($c);
|
||||
my $ucs1 = hex($u1);
|
||||
my $ucs2 = hex($u2);
|
||||
@ -45,7 +46,7 @@ while (my $line = <$in>)
|
||||
l => $.
|
||||
};
|
||||
}
|
||||
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
|
||||
elsif ($line =~ /^0x(\w+)\s*U\+(\w+)\s*#\s*(\S.*)?\s*$/)
|
||||
{
|
||||
|
||||
# non-combined characters
|
||||
|
@ -80,7 +80,8 @@ foreach my $i (@$ct932)
|
||||
}
|
||||
}
|
||||
|
||||
foreach my $i (@mapping)
|
||||
# extract only SJIS characters
|
||||
foreach my $i (grep defined $_->{sjis}, @mapping)
|
||||
{
|
||||
my $sjis = $i->{sjis};
|
||||
|
||||
|
@ -24,12 +24,13 @@ my @mapping;
|
||||
|
||||
while (my $line = <$in>)
|
||||
{
|
||||
if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/)
|
||||
if ($line =~ /^0x(\w+)\s*U\+(\w+)\+(\w+)\s*#\s*(\S.*)?\s*$/)
|
||||
{
|
||||
|
||||
# combined characters
|
||||
my ($c, $u1, $u2) = ($1, $2, $3);
|
||||
my $rest = "U+" . $u1 . "+" . $u2 . $4;
|
||||
# The "\t \t" below is just to avoid insubstantial diffs.
|
||||
my $rest = "U+" . $u1 . "+" . $u2 . "\t \t" . $4;
|
||||
my $code = hex($c);
|
||||
my $ucs1 = hex($u1);
|
||||
my $ucs2 = hex($u2);
|
||||
@ -45,7 +46,7 @@ while (my $line = <$in>)
|
||||
l => $.
|
||||
};
|
||||
}
|
||||
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
|
||||
elsif ($line =~ /^0x(\w+)\s*U\+(\w+)\s*#\s*(\S.*)?\s*$/)
|
||||
{
|
||||
|
||||
# non-combined characters
|
||||
|
@ -380,7 +380,8 @@ sub print_radix_table
|
||||
{
|
||||
header => "Dummy map, for invalid values",
|
||||
min_idx => 0,
|
||||
max_idx => $widest_range
|
||||
max_idx => $widest_range,
|
||||
label => "dummy map"
|
||||
};
|
||||
|
||||
###
|
||||
@ -471,35 +472,37 @@ sub print_radix_table
|
||||
}
|
||||
|
||||
# Also look up the positions of the roots in the table.
|
||||
my $b1root = $segmap{"1-byte"};
|
||||
my $b2root = $segmap{"2-byte"};
|
||||
my $b3root = $segmap{"3-byte"};
|
||||
my $b4root = $segmap{"4-byte"};
|
||||
# Missing map represents dummy mapping.
|
||||
my $b1root = $segmap{"1-byte"} || 0;
|
||||
my $b2root = $segmap{"2-byte"} || 0;
|
||||
my $b3root = $segmap{"3-byte"} || 0;
|
||||
my $b4root = $segmap{"4-byte"} || 0;
|
||||
|
||||
# And the lower-upper values of each level in each radix tree.
|
||||
my $b1_lower = $min_idx{1}{1};
|
||||
my $b1_upper = $max_idx{1}{1};
|
||||
# Missing values represent zero.
|
||||
my $b1_lower = $min_idx{1}{1} || 0;
|
||||
my $b1_upper = $max_idx{1}{1} || 0;
|
||||
|
||||
my $b2_1_lower = $min_idx{2}{1};
|
||||
my $b2_1_upper = $max_idx{2}{1};
|
||||
my $b2_2_lower = $min_idx{2}{2};
|
||||
my $b2_2_upper = $max_idx{2}{2};
|
||||
my $b2_1_lower = $min_idx{2}{1} || 0;
|
||||
my $b2_1_upper = $max_idx{2}{1} || 0;
|
||||
my $b2_2_lower = $min_idx{2}{2} || 0;
|
||||
my $b2_2_upper = $max_idx{2}{2} || 0;
|
||||
|
||||
my $b3_1_lower = $min_idx{3}{1};
|
||||
my $b3_1_upper = $max_idx{3}{1};
|
||||
my $b3_2_lower = $min_idx{3}{2};
|
||||
my $b3_2_upper = $max_idx{3}{2};
|
||||
my $b3_3_lower = $min_idx{3}{3};
|
||||
my $b3_3_upper = $max_idx{3}{3};
|
||||
my $b3_1_lower = $min_idx{3}{1} || 0;
|
||||
my $b3_1_upper = $max_idx{3}{1} || 0;
|
||||
my $b3_2_lower = $min_idx{3}{2} || 0;
|
||||
my $b3_2_upper = $max_idx{3}{2} || 0;
|
||||
my $b3_3_lower = $min_idx{3}{3} || 0;
|
||||
my $b3_3_upper = $max_idx{3}{3} || 0;
|
||||
|
||||
my $b4_1_lower = $min_idx{4}{1};
|
||||
my $b4_1_upper = $max_idx{4}{1};
|
||||
my $b4_2_lower = $min_idx{4}{2};
|
||||
my $b4_2_upper = $max_idx{4}{2};
|
||||
my $b4_3_lower = $min_idx{4}{3};
|
||||
my $b4_3_upper = $max_idx{4}{3};
|
||||
my $b4_4_lower = $min_idx{4}{4};
|
||||
my $b4_4_upper = $max_idx{4}{4};
|
||||
my $b4_1_lower = $min_idx{4}{1} || 0;
|
||||
my $b4_1_upper = $max_idx{4}{1} || 0;
|
||||
my $b4_2_lower = $min_idx{4}{2} || 0;
|
||||
my $b4_2_upper = $max_idx{4}{2} || 0;
|
||||
my $b4_3_lower = $min_idx{4}{3} || 0;
|
||||
my $b4_3_upper = $max_idx{4}{3} || 0;
|
||||
my $b4_4_lower = $min_idx{4}{4} || 0;
|
||||
my $b4_4_upper = $max_idx{4}{4} || 0;
|
||||
|
||||
###
|
||||
### Find the maximum value in the whole table, to determine if we can
|
||||
@ -607,7 +610,8 @@ sub print_radix_table
|
||||
for (my $j = 0;
|
||||
$j < $vals_per_line && $i <= $seg->{max_idx}; $j++)
|
||||
{
|
||||
my $val = $seg->{values}->{$i};
|
||||
# missing values represent zero.
|
||||
my $val = $seg->{values}->{$i} || 0;
|
||||
|
||||
printf $out " 0x%0*x", $colwidth, $val;
|
||||
$off++;
|
||||
|
Loading…
x
Reference in New Issue
Block a user