# postgres/src/backend/utils/mb/Unicode/convutils.pm

#
# Copyright (c) 2001-2017, PostgreSQL Global Development Group
#
# src/backend/utils/mb/Unicode/convutils.pm

use strict;

#######################################################################
# ucs2utf - encode a UCS-4 code point as UTF-8, packed into an integer
#
# ucs ; code point to encode
#
# The encoded bytes are packed big-endian into the returned integer: the
# lead byte ends up in the highest occupied byte and the last byte of the
# sequence in the low eight bits. Values up to 0x7f are returned as is.
#
sub ucs2utf
{
	my ($ucs) = @_;

	# One byte: the code point is its own encoding.
	return $ucs if ($ucs <= 0x007f);

	# Every longer form ends in a continuation byte holding the low 6 bits.
	my $last = 0x80 | ($ucs & 0x003f);

	# Two bytes.
	if ($ucs <= 0x07ff)
	{
		my $lead = 0xc0 | ($ucs >> 6);
		return ($lead << 8) | $last;
	}

	# The three- and four-byte forms share the continuation byte built
	# from bits 6..11.
	my $mid = 0x80 | (($ucs & 0x0fc0) >> 6);

	# Three bytes.
	if ($ucs <= 0xffff)
	{
		my $lead = 0xe0 | ($ucs >> 12);
		return ($lead << 16) | ($mid << 8) | $last;
	}

	# Four bytes: everything above 0xffff.
	my $lead   = 0xf0 | ($ucs >> 18);
	my $second = 0x80 | (($ucs & 0x3ffff) >> 12);
	return ($lead << 24) | ($second << 16) | ($mid << 8) | $last;
}

#######################################################################
# read_source - common routine to read source file
#
# fname ; input file name
#
# Returns a reference to a list of hashes, one per accepted line, with
# the keys f (file name), l (line number), code, ucs, comment and
# direction (always "both").
#
# An accepted line has the form
#     0xCODE  0xUCS  [0xEXTRA]  # comment
# Lines starting with '#', empty lines, lines holding a single 0x value
# followed by a comment, and mappings with code or ucs below 0x80 are
# skipped. Any other line is reported on STDERR and the program exits
# with status 1. Dies if the file cannot be opened.
#
sub read_source
{
	my ($fname) = @_;
	my @r;

	open(my $in, '<', $fname) || die("cannot open $fname: $!");

	# Read into a lexical rather than $_, so the caller's $_ is untouched.
	while (my $line = <$in>)
	{
		next if ($line =~ /^#/);

		# Remove only the line terminator (LF or CRLF). A last line that
		# has no terminator keeps its final character.
		$line =~ s/\r?\n\z//;

		next if ($line eq '');    # Ignore empty lines

		# Ignore lines that have a single value followed by a comment.
		next if ($line =~ /^0x[0-9A-F]+\s+#.*$/);

		# Two hex columns, an optional third hex column, then a comment.
		# The third column only has to be well-formed: columns one and two
		# are used as code and ucs, the third value is never read.
		if ($line !~ /^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(?:0x([0-9A-Fa-f]+)\s+)?(#.*)$/)
		{
			print STDERR "READ ERROR at line $. in $fname: $line\n";
			close($in);

			# Non-zero status, so that callers and make notice the failure.
			exit(1);
		}
		my $out = {f => $fname, l => $.,
				   code => hex($1),
				   ucs => hex($2),
				   comment => $4,
				   direction => "both"
				};

		# Ignore pure ASCII mappings. PostgreSQL character conversion code
		# never even passes these to the conversion code.
		next if ($out->{code} < 0x80 || $out->{ucs} < 0x80);

		push(@r, $out);
	}
	close($in);

	return \@r;
}

##################################################################
# print_tables : output mapping tables
#
# Arguments:
#  charset - string name of the character set.
#  table   - mapping table (see format below)
#  verbose - verbosity level, handed unchanged to every map writer:
#            1 asks for the comment on each line,
#            2 also asks for source file name and line number
#
# The table is split by direction and by plain/combined mapping. The two
# plain maps are always written; a combined map is written only when at
# least one combined entry exists for that direction.
#
# Mapping table format:
#
# Mapping table is a list of hashes. Each hash has the following fields:
#   direction  - Direction: 'both', 'from_unicode' or 'to_unicode'
#   ucs        - Unicode code point
#   ucs_second - Second Unicode code point, if this is a "combined" character.
#   code       - Byte sequence in the "other" character set, as an integer
#   comment    - Text representation of the character
#   f          - Source filename
#   l          - Line number in source file
#
sub print_tables
{
	my ($charset, $table, $verbose) = @_;

	# One list per output file.
	my (@plain_to, @plain_from, @combined_to, @combined_from);

	foreach my $map (@$table)
	{
		# A second code point marks a "combined" character.
		my $is_combined = defined($map->{ucs_second});

		# Fields every output entry carries, with code points in UTF-8 form.
		my %entry = (
			utf8    => ucs2utf($map->{ucs}),
			code    => $map->{code},
			comment => $map->{comment},
			f       => $map->{f},
			l       => $map->{l});
		$entry{utf8_second} = ucs2utf($map->{ucs_second}) if ($is_combined);

		my $dir      = $map->{direction};
		my $to_uni   = ($dir eq "both" || $dir eq "to_unicode");
		my $from_uni = ($dir eq "both" || $dir eq "from_unicode");

		# A mapping valid in both directions puts the very same hash on
		# both lists.
		push @{ $is_combined ? \@combined_to : \@plain_to }, \%entry
		  if ($to_uni);
		push @{ $is_combined ? \@combined_from : \@plain_from }, \%entry
		  if ($from_uni);
	}

	print_to_utf8_map($charset, \@plain_to, $verbose);
	if (scalar(@combined_to) > 0)
	{
		print_to_utf8_combined_map($charset, \@combined_to, $verbose);
	}

	print_from_utf8_map($charset, \@plain_from, $verbose);
	if (scalar(@combined_from) > 0)
	{
		print_from_utf8_combined_map($charset, \@combined_from, $verbose);
	}
}

#######################################################################
# print_from_utf8_map - write the UTF8 => charset map as a C array
#
# charset ; character set name; used as is in the array name and,
#           lower-cased, in the output file name utf8_to_<charset>.map
# table   ; reference to a list of hashes with the keys utf8, code,
#           comment, f and l
# verbose ; if true, append each entry's comment to its line;
#           if 2 or more, prefix that comment with "f:l "
#
# Entries are written in ascending utf8 order. Dies if the output file
# cannot be opened or closed.
#
sub print_from_utf8_map
{
	my ($charset, $table, $verbose) = @_;

	my $fname = lc("utf8_to_${charset}.map");
	print "- Writing UTF8=>${charset} conversion table: $fname\n";
	open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";

	# The names are passed as %s arguments instead of being interpolated
	# into the format, so a '%' in either of them is printed literally.
	printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
		   "static const pg_utf_to_local ULmap%s[ %d ] = {",
		   $fname, $charset, scalar(@$table));

	my @sorted = sort {$a->{utf8} <=> $b->{utf8}} @$table;
	foreach my $idx (0 .. $#sorted)
	{
		my $i = $sorted[$idx];

		printf($out "\n  {0x%04x, 0x%04x}", $i->{utf8}, $i->{code});

		# Every entry but the last is followed by a comma; the comment
		# goes after it so that it ends the line.
		print($out ",") if ($idx < $#sorted);

		next if (!$verbose);

		my $comment = $i->{comment};
		$comment = "$i->{f}:$i->{l} $comment" if ($verbose >= 2);
		print($out "\t/* $comment */");
	}
	print $out "\n};\n";

	# Buffered write errors only show up when the handle is closed.
	close($out) || die "cannot close output file : $fname: $!\n";
}

#######################################################################
# print_from_utf8_combined_map - write the UTF8 => charset map for
# combined characters as a C array
#
# charset ; character set name; used as is in the array name and,
#           lower-cased, in the output file name
#           utf8_to_<charset>_combined.map
# table   ; reference to a list of hashes with the keys utf8,
#           utf8_second, code, comment, f and l
# verbose ; if true, append each entry's comment to its line;
#           if 2 or more, prefix that comment with "f:l "
#
# Entries are written in ascending utf8 order. Dies if the output file
# cannot be opened or closed.
#
sub print_from_utf8_combined_map
{
	my ($charset, $table, $verbose) = @_;

	my $fname = lc("utf8_to_${charset}_combined.map");
	print "- Writing UTF8=>${charset} conversion table: $fname\n";
	open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";

	# The names are passed as %s arguments instead of being interpolated
	# into the format, so a '%' in either of them is printed literally.
	printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
		   "static const pg_utf_to_local_combined ULmap%s_combined[ %d ] = {",
		   $fname, $charset, scalar(@$table));

	my @sorted = sort {$a->{utf8} <=> $b->{utf8}} @$table;
	foreach my $idx (0 .. $#sorted)
	{
		my $i = $sorted[$idx];

		printf($out "\n  {0x%08x, 0x%08x, 0x%04x}",
			   $i->{utf8}, $i->{utf8_second}, $i->{code});

		# Every entry but the last is followed by a comma; the comment
		# goes after it so that it ends the line.
		print($out ",") if ($idx < $#sorted);

		next if (!$verbose);

		# Same verbosity levels as the non-combined map writers.
		my $comment = "$i->{comment}";
		$comment = "$i->{f}:$i->{l} $comment" if ($verbose >= 2);
		print($out "\t/* $comment */");
	}
	print $out "\n};\n";

	# Buffered write errors only show up when the handle is closed.
	close($out) || die "cannot close output file : $fname: $!\n";
}

#######################################################################
# print_to_utf8_map - write the charset => UTF8 map as a C array
#
# charset ; character set name; used as is in the array name and,
#           lower-cased, in the output file name <charset>_to_utf8.map
# table   ; reference to a list of hashes with the keys utf8, code,
#           comment, f and l
# verbose ; if true, append each entry's comment to its line;
#           if 2 or more, prefix that comment with "f:l "
#
# Entries are written in ascending code order. Dies if the output file
# cannot be opened or closed.
#
sub print_to_utf8_map
{
	my ($charset, $table, $verbose) = @_;

	my $fname = lc("${charset}_to_utf8.map");

	print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
	open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";

	# The names are passed as %s arguments instead of being interpolated
	# into the format, so a '%' in either of them is printed literally.
	printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
		   "static const pg_local_to_utf LUmap%s[ %d ] = {",
		   $fname, $charset, scalar(@$table));

	my @sorted = sort {$a->{code} <=> $b->{code}} @$table;
	foreach my $idx (0 .. $#sorted)
	{
		my $i = $sorted[$idx];

		printf($out "\n  {0x%04x, 0x%x}", $i->{code}, $i->{utf8});

		# Every entry but the last is followed by a comma; the comment
		# goes after it so that it ends the line.
		print($out ",") if ($idx < $#sorted);

		next if (!$verbose);

		my $comment = $i->{comment};
		$comment = "$i->{f}:$i->{l} $comment" if ($verbose >= 2);
		print($out "\t/* $comment */");
	}
	print $out "\n};\n";

	# Buffered write errors only show up when the handle is closed.
	close($out) || die "cannot close output file : $fname: $!\n";
}

#######################################################################
# print_to_utf8_combined_map - write the charset => UTF8 map for
# combined characters as a C array
#
# charset ; character set name; used as is in the array name and,
#           lower-cased, in the output file name
#           <charset>_to_utf8_combined.map
# table   ; reference to a list of hashes with the keys utf8,
#           utf8_second, code, comment, f and l
# verbose ; if true, append each entry's comment to its line;
#           if 2 or more, prefix that comment with "f:l "
#
# Entries are written in ascending code order. Dies if the output file
# cannot be opened or closed.
#
sub print_to_utf8_combined_map
{
	my ($charset, $table, $verbose) = @_;

	my $fname = lc("${charset}_to_utf8_combined.map");

	print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
	open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";

	# The names are passed as %s arguments instead of being interpolated
	# into the format, so a '%' in either of them is printed literally.
	printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
		   "static const pg_local_to_utf_combined LUmap%s_combined[ %d ] = {",
		   $fname, $charset, scalar(@$table));

	my @sorted = sort {$a->{code} <=> $b->{code}} @$table;
	foreach my $idx (0 .. $#sorted)
	{
		my $i = $sorted[$idx];

		printf($out "\n  {0x%04x, 0x%08x, 0x%08x}",
			   $i->{code}, $i->{utf8}, $i->{utf8_second});

		# Every entry but the last is followed by a comma; the comment
		# goes after it so that it ends the line.
		print($out ",") if ($idx < $#sorted);

		next if (!$verbose);

		# Same verbosity levels as the non-combined map writers.
		my $comment = "$i->{comment}";
		$comment = "$i->{f}:$i->{l} $comment" if ($verbose >= 2);
		print($out "\t/* $comment */");
	}
	print $out "\n};\n";

	# Buffered write errors only show up when the handle is closed.
	close($out) || die "cannot close output file : $fname: $!\n";
}

1;