[project @ 2003-12-29 00:38:59 by bursa]

Transliterate Unicode to Latin1 using Markus Kuhn's transtab.

svn path=/import/netsurf/; revision=465
This commit is contained in:
James Bursa 2003-12-29 00:38:59 +00:00
parent 3a8b8485ad
commit 4fcbc23c1c
5 changed files with 1762 additions and 7 deletions

View File

@ -9,7 +9,7 @@ CC_DEBUG = gcc
OBJECTS_COMMON = cache.o content.o fetch.o fetchcache.o other.o \
css.o css_enum.o parser.o ruleset.o scanner.o \
box.o form.o html.o layout.o textplain.o \
messages.o utils.o
messages.o utils.o translit.c
OBJECTS = $(OBJECTS_COMMON) \
browser.o loginlist.o netsurf.o \
htmlinstance.o htmlredraw.o \
@ -69,6 +69,8 @@ css/parser.c: css/parser.y
-cd css; lemon parser.y
css/scanner.c css/scanner.h: css/scanner.l
cd css; flex scanner.l
utils/translit.c: transtab
cd utils; ./tt2code < transtab > translit.c
# create documentation
$(DOCDIR)/%.html: documentation/%.xml

1688
utils/transtab Normal file

File diff suppressed because it is too large Load Diff

51
utils/tt2code Executable file
View File

@ -0,0 +1,51 @@
#!/usr/bin/perl -W
#
# tt2code - generate the C source of unicode_transliterate() from a
# transliteration table.
#
# Input  (stdin or files named on the command line): table lines of the form
#     <UXXXX> alt;alt;...
# where each alternative is a run of <UYYYY> tokens, optionally wrapped in
# double quotes; "" denotes an empty replacement. Lines starting with '%'
# and blank lines are ignored. Anything after the alternatives that the
# scanner does not recognise (e.g. a trailing comment) simply ends the scan.
#
# Output (stdout): a C function consisting of one big switch on the code
# point. For every source code point >= 0x100 the FIRST alternative whose
# code points are all <= 0xFF is emitted as a sequence of byte stores.
# Code points without a usable alternative get no case and therefore fall
# through to the default branch, which stores '?'.

use strict;
use warnings;

# Function prologue. Quoted heredoc: the C text must never be interpolated.
print <<'END';
void unicode_transliterate(unsigned int c, char **r)
{
char *s = *r;
switch (c) {
END

LINE: while (<>) {
    chomp;
    next if m/^%/;      # comment line
    next if m/^ *$/;    # blank line

    # The /g modifier records pos() just past the source code point so the
    # \G-anchored scan below resumes exactly there.
    m/^<U([0-9A-F]{4})> /g or die "invalid line '$_'";
    my $code = $1;

    # Code points below 0x100 never get a case of their own.
    next if hex($code) < 256;

    SUBST: while (m/\G"?((<U([0-9A-F]{4})>)*)"?;?/g) {
        # Every part of the pattern is optional, so it can match nothing at
        # all; such a match carries no alternative.
        next if $& eq '';
        my $seq = $1;

        if ($seq eq '') {
            # Explicit empty replacement: the code point is dropped.
            # Stop processing this line here -- continuing the scan could
            # emit a second "case" label for the same code point, which is
            # a compile error in the generated C.
            print "case 0x$code: break;\n";
            next LINE;
        }

        # Hex digits of each <UYYYY> token in this alternative. Matching on
        # $seq leaves pos($_) untouched, so the outer scan is unaffected.
        my @hex = $seq =~ m/<U([0-9A-F]{4})>/g;

        # Alternatives containing anything outside Latin-1 are unusable.
        next SUBST if grep { hex($_) > 255 } @hex;

        print "case 0x$code: ";
        print "*s++ = 0x$_; " foreach @hex;
        print "break;\n";
        next LINE;      # first usable alternative wins
    }
}

# Function epilogue: unknown code points become '?', then the caller's
# write pointer is advanced past whatever was stored.
print <<'END';
default: *s++ = '?'; break;
}
*r = s;
}
END

View File

@ -117,22 +117,35 @@ char * squash_whitespace(const char * s)
char * tolat1(xmlChar * s)
{
unsigned int length = strlen((char*) s);
char *d = xcalloc(length + 1, sizeof(char));
unsigned int space = length + 100;
char *d = xcalloc(space, sizeof(char));
char *d0 = d;
char *end = d0 + space - 10;
int u, chars;
while (*s != 0) {
chars = length;
u = xmlGetUTF8Char((unsigned char *) s, &chars);
if (chars <= 0) {
s += 1;
length -= 1;
LOG(("UTF-8 error"));
continue;
}
s += chars;
length -= chars;
if (u == 0x09 || u == 0x0a || u == 0x0d)
*d = ' ';
*d++ = ' ';
else if ((0x20 <= u && u <= 0x7f) || (0xa0 <= u && u <= 0xff))
*d = u;
else
*d = '?';
d++;
*d++ = u;
else {
unicode_transliterate((unsigned int) u, &d);
if (end < d) {
space += 100;
d0 = xrealloc(d0, space);
end = d0 + space - 10;
}
}
}
*d = 0;

View File

@ -31,5 +31,6 @@ char *get_host_from_url(char* url);
bool is_dir(const char *path);
void regcomp_wrapper(regex_t *preg, const char *regex, int cflags);
void clean_cookiejar(void);
/**
 * Append a Latin-1 transliteration of a Unicode code point to a buffer.
 *
 * The implementation is generated by utils/tt2code from utils/transtab.
 *
 * \param  c  Unicode code point to transliterate
 * \param  r  in/out: address of the write pointer; the replacement bytes are
 *            stored at *r and *r is advanced past them
 *
 * Depending on the table entry, zero, one or several bytes are written.
 * Code points with no entry in the generated switch (including everything
 * below 0x100) produce a single '?'. No terminator is written and no bounds
 * checking is performed, so the caller must ensure there is enough room
 * after *r.
 */
void unicode_transliterate(unsigned int c, char **r);
#endif