
Add missing semicolons in UCS_to_* perl scripts. For consistency, use "$hashref->{key}" style everywhere. Kyotaro Horiguchi Discussion: https://www.postgresql.org/message-id/20170130.153738.139030994.horiguchi.kyotaro@lab.ntt.co.jp
283 lines
7.3 KiB
Perl
283 lines
7.3 KiB
Perl
#
|
|
# Copyright (c) 2001-2017, PostgreSQL Global Development Group
|
|
#
|
|
# src/backend/utils/mb/Unicode/convutils.pm
|
|
|
|
use strict;
|
|
|
|
#######################################################################
|
|
# convert UCS-4 to UTF-8
|
|
#
|
|
sub ucs2utf
|
|
{
|
|
my ($ucs) = @_;
|
|
my $utf;
|
|
|
|
if ($ucs <= 0x007f)
|
|
{
|
|
$utf = $ucs;
|
|
}
|
|
elsif ($ucs > 0x007f && $ucs <= 0x07ff)
|
|
{
|
|
$utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8);
|
|
}
|
|
elsif ($ucs > 0x07ff && $ucs <= 0xffff)
|
|
{
|
|
$utf =
|
|
((($ucs >> 12) | 0xe0) << 16) |
|
|
(((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
|
|
}
|
|
else
|
|
{
|
|
$utf =
|
|
((($ucs >> 18) | 0xf0) << 24) |
|
|
(((($ucs & 0x3ffff) >> 12) | 0x80) << 16) |
|
|
(((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
|
|
}
|
|
return ($utf);
|
|
}
|
|
|
|
#######################################################################
|
|
# read_source - common routine to read source file
|
|
#
|
|
# fname ; input file name
|
|
sub read_source
|
|
{
|
|
my ($fname) = @_;
|
|
my @r;
|
|
|
|
open(my $in, '<', $fname) || die("cannot open $fname");
|
|
|
|
while (<$in>)
|
|
{
|
|
next if (/^#/);
|
|
chop;
|
|
|
|
next if (/^$/); # Ignore empty lines
|
|
|
|
next if (/^0x([0-9A-F]+)\s+(#.*)$/);
|
|
|
|
# Skip the first column for JIS0208.TXT
|
|
if (!/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(?:0x([0-9A-Fa-f]+)\s+)?(#.*)$/)
|
|
{
|
|
print STDERR "READ ERROR at line $. in $fname: $_\n";
|
|
exit;
|
|
}
|
|
my $out = {f => $fname, l => $.,
|
|
code => hex($1),
|
|
ucs => hex($2),
|
|
comment => $4,
|
|
direction => "both"
|
|
};
|
|
|
|
# Ignore pure ASCII mappings. PostgreSQL character conversion code
|
|
# never even passes these to the conversion code.
|
|
next if ($out->{code} < 0x80 || $out->{ucs} < 0x80);
|
|
|
|
push(@r, $out);
|
|
}
|
|
close($in);
|
|
|
|
return \@r;
|
|
}
|
|
|
|
##################################################################
|
|
# print_tables : output mapping tables
|
|
#
|
|
# Arguments:
|
|
# charset - string name of the character set.
|
|
# table - mapping table (see format below)
|
|
# verbose - if 1, output comment on each line,
|
|
# if 2, also output source file name and number
|
|
#
|
|
#
|
|
#
|
|
# Mapping table format:
|
|
#
|
|
# Mapping table is a list of hashes. Each hash has the following fields:
|
|
# direction - Direction: 'both', 'from_unicode' or 'to_unicode'
|
|
# ucs - Unicode code point
|
|
# ucs_second - Second Unicode code point, if this is a "combined" character.
|
|
# code - Byte sequence in the "other" character set, as an integer
|
|
# comment - Text representation of the character
|
|
# f - Source filename
|
|
# l - Line number in source file
|
|
#
|
|
#
|
|
sub print_tables
|
|
{
|
|
my ($charset, $table, $verbose) = @_;
|
|
|
|
# Build an array with only the to-UTF8 direction mappings
|
|
my @to_unicode;
|
|
my @to_unicode_combined;
|
|
my @from_unicode;
|
|
my @from_unicode_combined;
|
|
|
|
foreach my $i (@$table)
|
|
{
|
|
if (defined $i->{ucs_second})
|
|
{
|
|
my $entry = {utf8 => ucs2utf($i->{ucs}),
|
|
utf8_second => ucs2utf($i->{ucs_second}),
|
|
code => $i->{code},
|
|
comment => $i->{comment},
|
|
f => $i->{f}, l => $i->{l}};
|
|
if ($i->{direction} eq "both" || $i->{direction} eq "to_unicode")
|
|
{
|
|
push @to_unicode_combined, $entry;
|
|
}
|
|
if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode")
|
|
{
|
|
push @from_unicode_combined, $entry;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
my $entry = {utf8 => ucs2utf($i->{ucs}),
|
|
code => $i->{code},
|
|
comment => $i->{comment},
|
|
f => $i->{f}, l => $i->{l}};
|
|
if ($i->{direction} eq "both" || $i->{direction} eq "to_unicode")
|
|
{
|
|
push @to_unicode, $entry;
|
|
}
|
|
if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode")
|
|
{
|
|
push @from_unicode, $entry;
|
|
}
|
|
}
|
|
}
|
|
|
|
print_to_utf8_map($charset, \@to_unicode, $verbose);
|
|
print_to_utf8_combined_map($charset, \@to_unicode_combined, $verbose) if (scalar @to_unicode_combined > 0);
|
|
print_from_utf8_map($charset, \@from_unicode, $verbose);
|
|
print_from_utf8_combined_map($charset, \@from_unicode_combined, $verbose) if (scalar @from_unicode_combined > 0);
|
|
}
|
|
|
|
sub print_from_utf8_map
|
|
{
|
|
my ($charset, $table, $verbose) = @_;
|
|
|
|
my $last_comment = "";
|
|
|
|
my $fname = lc("utf8_to_${charset}.map");
|
|
print "- Writing UTF8=>${charset} conversion table: $fname\n";
|
|
open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
|
|
printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n".
|
|
"static const pg_utf_to_local ULmap${charset}[ %d ] = {",
|
|
scalar(@$table));
|
|
my $first = 1;
|
|
foreach my $i (sort {$a->{utf8} <=> $b->{utf8}} @$table)
|
|
{
|
|
print($out ",") if (!$first);
|
|
$first = 0;
|
|
print($out "\t/* $last_comment */") if ($verbose);
|
|
|
|
printf($out "\n {0x%04x, 0x%04x}", $i->{utf8}, $i->{code});
|
|
if ($verbose >= 2)
|
|
{
|
|
$last_comment = "$i->{f}:$i->{l} $i->{comment}";
|
|
}
|
|
else
|
|
{
|
|
$last_comment = $i->{comment};
|
|
}
|
|
}
|
|
print($out "\t/* $last_comment */") if ($verbose);
|
|
print $out "\n};\n";
|
|
close($out);
|
|
}
|
|
|
|
sub print_from_utf8_combined_map
|
|
{
|
|
my ($charset, $table, $verbose) = @_;
|
|
|
|
my $last_comment = "";
|
|
|
|
my $fname = lc("utf8_to_${charset}_combined.map");
|
|
print "- Writing UTF8=>${charset} conversion table: $fname\n";
|
|
open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
|
|
printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n".
|
|
"static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {",
|
|
scalar(@$table));
|
|
my $first = 1;
|
|
foreach my $i (sort {$a->{utf8} <=> $b->{utf8}} @$table)
|
|
{
|
|
print($out ",") if (!$first);
|
|
$first = 0;
|
|
print($out "\t/* $last_comment */") if ($verbose);
|
|
|
|
printf($out "\n {0x%08x, 0x%08x, 0x%04x}", $i->{utf8}, $i->{utf8_second}, $i->{code});
|
|
$last_comment = "$i->{comment}";
|
|
}
|
|
print($out "\t/* $last_comment */") if ($verbose);
|
|
print $out "\n};\n";
|
|
close($out);
|
|
}
|
|
|
|
sub print_to_utf8_map
|
|
{
|
|
my ($charset, $table, $verbose) = @_;
|
|
|
|
my $last_comment = "";
|
|
|
|
my $fname = lc("${charset}_to_utf8.map");
|
|
|
|
print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
|
|
open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
|
|
printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n".
|
|
"static const pg_local_to_utf LUmap${charset}[ %d ] = {",
|
|
scalar(@$table));
|
|
my $first = 1;
|
|
foreach my $i (sort {$a->{code} <=> $b->{code}} @$table)
|
|
{
|
|
print($out ",") if (!$first);
|
|
$first = 0;
|
|
print($out "\t/* $last_comment */") if ($verbose);
|
|
|
|
printf($out "\n {0x%04x, 0x%x}", $i->{code}, $i->{utf8});
|
|
if ($verbose >= 2)
|
|
{
|
|
$last_comment = "$i->{f}:$i->{l} $i->{comment}";
|
|
}
|
|
else
|
|
{
|
|
$last_comment = $i->{comment};
|
|
}
|
|
}
|
|
print($out "\t/* $last_comment */") if ($verbose);
|
|
print $out "\n};\n";
|
|
close($out);
|
|
}
|
|
|
|
sub print_to_utf8_combined_map
|
|
{
|
|
my ($charset, $table, $verbose) = @_;
|
|
|
|
my $last_comment = "";
|
|
|
|
my $fname = lc("${charset}_to_utf8_combined.map");
|
|
|
|
print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
|
|
open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
|
|
printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n".
|
|
"static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {",
|
|
scalar(@$table));
|
|
my $first = 1;
|
|
foreach my $i (sort {$a->{code} <=> $b->{code}} @$table)
|
|
{
|
|
print($out ",") if (!$first);
|
|
$first = 0;
|
|
print($out "\t/* $last_comment */") if ($verbose);
|
|
|
|
printf($out "\n {0x%04x, 0x%08x, 0x%08x}", $i->{code}, $i->{utf8}, $i->{utf8_second});
|
|
$last_comment = "$i->{comment}";
|
|
}
|
|
print($out "\t/* $last_comment */") if ($verbose);
|
|
print $out "\n};\n";
|
|
close($out);
|
|
}
|
|
|
|
1;
|