[project @ 2004-03-08 18:21:21 by jmb]

Attempt to detect document charset encoding if the server doesn't send it.

svn path=/import/netsurf/; revision=592
This commit is contained in:
John Mark Bell 2004-03-08 18:21:21 +00:00
parent 7d9bf053b4
commit 217eae922b
2 changed files with 21 additions and 1 deletions

View File

@ -13,6 +13,7 @@
#include <string.h>
#include <strings.h>
#include <stdlib.h>
#include "libxml/parserInternals.h"
#include "netsurf/utils/config.h"
#include "netsurf/content/content.h"
#include "netsurf/content/fetch.h"
@ -53,12 +54,16 @@ void html_create(struct content *c, const char *params[])
struct content_html_data *html = &c->data.html;
html->encoding = XML_CHAR_ENCODING_8859_1;
html->getenc = true;
for (i = 0; params[i]; i += 2) {
if (strcasecmp(params[i], "charset") == 0) {
html->encoding = xmlParseCharEncoding(params[i + 1]);
if (html->encoding == XML_CHAR_ENCODING_ERROR)
html->getenc = false; /* encoding specified - trust the server... */
if (html->encoding == XML_CHAR_ENCODING_ERROR) {
html->encoding = XML_CHAR_ENCODING_8859_1;
html->getenc = true;
}
break;
}
}
@ -97,6 +102,20 @@ void html_process_data(struct content *c, char *data, unsigned long size)
memcpy(c->data.html.source + c->data.html.length, data, size);
c->data.html.length += size;
c->size += size;
/* First time through, check if we need to get the encoding
* if so, get it and reset the parser instance with it.
* if it fails, assume Latin1
*/
if (c->data.html.getenc) {
c->data.html.encoding = xmlDetectCharEncoding(c->data.html.source, c->data.html.length);
if (c->data.html.encoding == XML_CHAR_ENCODING_ERROR ||
c->data.html.encoding == XML_CHAR_ENCODING_NONE) {
c->data.html.encoding = XML_CHAR_ENCODING_8859_1;
}
xmlSwitchEncoding((xmlParserCtxtPtr)c->data.html.parser, c->data.html.encoding);
c->data.html.getenc = false;
LOG(("Encoding: %s", xmlGetCharEncodingName(c->data.html.encoding)));
}
for (x = 0; x + CHUNK <= size; x += CHUNK) {
htmlParseChunk(c->data.html.parser, data + x, CHUNK, 0);
gui_multitask();

View File

@ -40,6 +40,7 @@ struct content_html_data {
char *source; /**< Source data. */
int length; /**< Length of source. */
xmlCharEncoding encoding; /**< Encoding of source. */
bool getenc; /**< Need to get the encoding from the document, as server is broken. */
char *base_url; /**< Base URL (may be a copy of content->url). */