[project @ 2003-12-29 00:38:59 by bursa]
Transliterate Unicode to Latin1 using Markus Kuhn's transtab. svn path=/import/netsurf/; revision=465
This commit is contained in:
parent
3a8b8485ad
commit
4fcbc23c1c
4
makefile
4
makefile
|
@ -9,7 +9,7 @@ CC_DEBUG = gcc
|
|||
OBJECTS_COMMON = cache.o content.o fetch.o fetchcache.o other.o \
|
||||
css.o css_enum.o parser.o ruleset.o scanner.o \
|
||||
box.o form.o html.o layout.o textplain.o \
|
||||
messages.o utils.o
|
||||
messages.o utils.o translit.c
|
||||
OBJECTS = $(OBJECTS_COMMON) \
|
||||
browser.o loginlist.o netsurf.o \
|
||||
htmlinstance.o htmlredraw.o \
|
||||
|
@ -69,6 +69,8 @@ css/parser.c: css/parser.y
|
|||
-cd css; lemon parser.y
|
||||
css/scanner.c css/scanner.h: css/scanner.l
|
||||
cd css; flex scanner.l
|
||||
# Regenerate utils/translit.c by feeding transtab to ./tt2code inside utils/.
# NOTE(review): the recipe reads transtab from within utils/, but the
# prerequisite is written without that directory -- confirm how make finds it.
utils/translit.c: transtab
|
||||
cd utils; ./tt2code < transtab > translit.c
|
||||
|
||||
# create documentation
|
||||
$(DOCDIR)/%.html: documentation/%.xml
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,51 @@
|
|||
#!/usr/bin/perl -W
#
# Read a transliteration table on standard input and write the C source of
# unicode_transliterate() to standard output.  (Presumably this is the
# "tt2code" program run by the makefile -- confirm.)
#
# Input lines have the form
#     <UXXXX> alt1;alt2;...
# where each alternative is a run of <UXXXX> tokens, optionally wrapped in
# double quotes.  Lines starting with '%' and blank lines are ignored.
#
# For every source code point >= 0x100 the first alternative whose code
# points all fit in one byte (<= 0xFF) becomes a "case" that appends those
# bytes.  An empty alternative becomes a "case" that appends nothing.  Code
# points with no usable alternative get no case and fall through to the
# generated default, which appends '?'.

use strict;

print <<END;
void unicode_transliterate(unsigned int c, char **r)
{
char *s = *r;
switch (c) {

END

LINE: while (<>) {
	chomp;
	next if m/^%/;		# comment line
	next if m/^ *$/;	# blank line

	# The /g modifier leaves pos() just after the source code point so the
	# \G-anchored match below resumes at the list of alternatives.
	m/^<U([0-9A-F]{4})> /g or die "invalid line '$_'";
	my $z = $1;
	# Code points below 256 get no case of their own.
	next if (hex($z) < 256);

	SUBST: while (m/\G"?((<U([0-9A-F]{4})>)*)"?;?/g) {
		# The pattern can match the empty string (e.g. at a trailing
		# comment); that is not an alternative.
		next if $& eq '';
		my $m = $1;
		if ($m eq '') {
			# Explicit empty replacement: emit nothing for this
			# code point.  Stop processing the line here, otherwise
			# a later alternative would emit a second, duplicate
			# case label for the same value.
			print "case 0x$z: break;\n";
			next LINE;
		}
		# Strip the outer '<' and '>' and split on "><", leaving
		# "UXXXX" items; then drop the leading 'U' from each.
		chop $m;
		my @s = split /></, substr $m, 1;
		foreach my $s (@s) {
			$s = substr $s, 1;
			# Try the next alternative if any code point of this
			# one does not fit in a single byte.
			next SUBST if 255 < hex($s);
		}

		print "case 0x$z: ";
		foreach my $s (@s) {
			print "*s++ = 0x$s; ";
		}
		print "break;\n";
		next LINE;
	}
}

print <<END;

default: *s++ = '?'; break;
}

*r = s;
}
END

|
||||
|
|
@ -117,22 +117,35 @@ char * squash_whitespace(const char * s)
|
|||
char * tolat1(xmlChar * s)
|
||||
{
|
||||
unsigned int length = strlen((char*) s);
|
||||
char *d = xcalloc(length + 1, sizeof(char));
|
||||
unsigned int space = length + 100;
|
||||
char *d = xcalloc(space, sizeof(char));
|
||||
char *d0 = d;
|
||||
char *end = d0 + space - 10;
|
||||
int u, chars;
|
||||
|
||||
while (*s != 0) {
|
||||
chars = length;
|
||||
u = xmlGetUTF8Char((unsigned char *) s, &chars);
|
||||
if (chars <= 0) {
|
||||
s += 1;
|
||||
length -= 1;
|
||||
LOG(("UTF-8 error"));
|
||||
continue;
|
||||
}
|
||||
s += chars;
|
||||
length -= chars;
|
||||
if (u == 0x09 || u == 0x0a || u == 0x0d)
|
||||
*d = ' ';
|
||||
*d++ = ' ';
|
||||
else if ((0x20 <= u && u <= 0x7f) || (0xa0 <= u && u <= 0xff))
|
||||
*d = u;
|
||||
else
|
||||
*d = '?';
|
||||
d++;
|
||||
*d++ = u;
|
||||
else {
|
||||
unicode_transliterate((unsigned int) u, &d);
|
||||
if (end < d) {
|
||||
space += 100;
|
||||
d0 = xrealloc(d0, space);
|
||||
end = d0 + space - 10;
|
||||
}
|
||||
}
|
||||
}
|
||||
*d = 0;
|
||||
|
||||
|
|
|
@ -31,5 +31,6 @@ char *get_host_from_url(char* url);
|
|||
bool is_dir(const char *path);
|
||||
void regcomp_wrapper(regex_t *preg, const char *regex, int cflags);
|
||||
void clean_cookiejar(void);
|
||||
/* Append the single-byte transliteration of Unicode code point c at *r and
 * advance *r past the bytes written. Code points without a table entry
 * (including all values below 0x100) produce '?'; an entry may also write
 * nothing. No bounds check is made and no terminator is written, so the
 * caller must ensure the buffer has room. */
void unicode_transliterate(unsigned int c, char **r);
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue