Fix bugs in charset detection.

Strip BOM from parser input, as it confuses libxml.
Ignore non-ASCII-compatible charsets declared in meta tag (the parser 
defaults to 8 bit, so if it's managed to extract a meta charset, then it 
must be ASCII-compatible, so a non-ASCII-compatible meta charset is lies).

Fixes WightLink timetable and 1726341.

svn path=/trunk/netsurf/; revision=3304
This commit is contained in:
John Mark Bell 2007-05-29 18:03:07 +00:00
parent 15be8d1ade
commit 01316168fc

View File

@ -38,7 +38,7 @@
static bool html_set_parser_encoding(struct content *c, const char *encoding);
static const char *html_detect_encoding(const char *data, unsigned int size);
static const char *html_detect_encoding(const char **data, unsigned int *size);
static void html_convert_css_callback(content_msg msg, struct content *css,
intptr_t p1, intptr_t p2, union content_msg_data data);
static bool html_meta_refresh(struct content *c, xmlNode *head);
@ -157,7 +157,7 @@ bool html_process_data(struct content *c, char *data, unsigned int size)
* searches for a <meta http-equiv="content-type"
* content="text/html; charset=...">. */
const char *encoding;
encoding = html_detect_encoding(data, size);
encoding = html_detect_encoding((const char **) &data, &size);
if (encoding) {
if (!html_set_parser_encoding(c, encoding))
return false;
@ -168,6 +168,12 @@ bool html_process_data(struct content *c, char *data, unsigned int size)
ENCODING_SOURCE_DETECTED;
}
c->data.html.getenc = false;
/* The data we received may have solely consisted of a BOM.
* If so, it will have been stripped by html_detect_encoding.
* Therefore, we'll have nothing to do in that case. */
if (size == 0)
return true;
}
for (x = 0; x + CHUNK <= size; x += CHUNK) {
@ -180,8 +186,22 @@ bool html_process_data(struct content *c, char *data, unsigned int size)
/* The encoding was not in headers or detected,
* and the parser found a <meta http-equiv="content-type"
* content="text/html; charset=...">. */
c->data.html.encoding = talloc_strdup(c,
/* However, if that encoding is non-ASCII-compatible,
* ignore it, as it can't possibly be correct */
if (strncasecmp(c->data.html.parser->input->encoding,
"UTF-16", 6) == 0 || /* UTF-16(LE|BE)? */
strncasecmp(c->data.html.parser->input->encoding,
"UTF-32", 6) == 0) { /* UTF-32(LE|BE)? */
c->data.html.encoding = talloc_strdup(c, "ISO-8859-1");
c->data.html.encoding_source =
ENCODING_SOURCE_DETECTED;
} else {
c->data.html.encoding = talloc_strdup(c,
c->data.html.parser->input->encoding);
c->data.html.encoding_source = ENCODING_SOURCE_META;
}
if (!c->data.html.encoding) {
union content_msg_data msg_data;
@ -189,7 +209,6 @@ bool html_process_data(struct content *c, char *data, unsigned int size)
content_broadcast(c, CONTENT_MSG_ERROR, msg_data);
return false;
}
c->data.html.encoding_source = ENCODING_SOURCE_META;
/* have the encoding; don't attempt to detect it */
c->data.html.getenc = false;
@ -293,33 +312,60 @@ bool html_set_parser_encoding(struct content *c, const char *encoding)
/**
* Attempt to detect the encoding of some HTML data.
*
* \param data HTML source data
* \param size length of data
* \param data Pointer to HTML source data
* \param size Pointer to length of data
* \return a constant string giving the encoding, or 0 if the encoding
* appears to be some 8-bit encoding
*
* If a BOM is encountered, *data and *size will be modified to skip over it
*/
const char *html_detect_encoding(const char *data, unsigned int size)
const char *html_detect_encoding(const char **data, unsigned int *size)
{
const unsigned char *d = (const unsigned char *) *data;
/* this detection assumes that the first two characters are <= 0xff */
if (size < 4)
if (*size < 4)
return 0;
if (data[0] == 0xfe && data[1] == 0xff) /* BOM fe ff */
if (d[0] == 0x00 && d[1] == 0x00 &&
d[2] == 0xfe && d[3] == 0xff) { /* BOM 00 00 fe ff */
*data += 4;
*size -= 4;
return "UTF-32BE";
} else if (d[0] == 0xff && d[1] == 0xfe &&
d[2] == 0x00 && d[3] == 0x00) { /* BOM ff fe 00 00 */
*data += 4;
*size -= 4;
return "UTF-32LE";
}
else if (d[0] == 0x00 && d[1] != 0x00 &&
d[2] == 0x00 && d[3] != 0x00) /* 00 xx 00 xx */
return "UTF-16BE";
else if (data[0] == 0xfe && data[1] == 0xff) /* BOM ff fe */
else if (d[0] != 0x00 && d[1] == 0x00 &&
d[2] != 0x00 && d[3] == 0x00) /* xx 00 xx 00 */
return "UTF-16LE";
else if (data[0] == 0x00 && data[1] != 0x00 &&
data[2] == 0x00 && data[3] != 0x00) /* 00 xx 00 xx */
return "UTF-16BE";
else if (data[0] != 0x00 && data[1] == 0x00 &&
data[2] != 0x00 && data[3] == 0x00) /* xx 00 xx 00 */
return "UTF-16BE";
else if (data[0] == 0x00 && data[1] == 0x00 &&
data[2] == 0x00 && data[3] != 0x00) /* 00 00 00 xx */
else if (d[0] == 0x00 && d[1] == 0x00 &&
d[2] == 0x00 && d[3] != 0x00) /* 00 00 00 xx */
return "ISO-10646-UCS-4";
else if (data[0] != 0x00 && data[1] == 0x00 &&
data[2] == 0x00 && data[3] == 0x00) /* xx 00 00 00 */
else if (d[0] != 0x00 && d[1] == 0x00 &&
d[2] == 0x00 && d[3] == 0x00) /* xx 00 00 00 */
return "ISO-10646-UCS-4";
else if (d[0] == 0xfe && d[1] == 0xff) { /* BOM fe ff */
*data += 2;
*size -= 2;
return "UTF-16BE";
} else if (d[0] == 0xfe && d[1] == 0xff) { /* BOM ff fe */
*data += 2;
*size -= 2;
return "UTF-16LE";
} else if (d[0] == 0xef && d[1] == 0xbb &&
d[2] == 0xbf) { /* BOM ef bb bf */
*data += 3;
*size -= 3;
return "UTF-8";
}
return 0;
}