diff --git a/render/html.c b/render/html.c
index d1c2cafa3..aaf0c4bf6 100644
--- a/render/html.c
+++ b/render/html.c
@@ -13,6 +13,7 @@
#include
#include
#include
+#include "libxml/parserInternals.h"
#include "netsurf/utils/config.h"
#include "netsurf/content/content.h"
#include "netsurf/content/fetch.h"
@@ -53,12 +54,16 @@ void html_create(struct content *c, const char *params[])
struct content_html_data *html = &c->data.html;
html->encoding = XML_CHAR_ENCODING_8859_1;
+ html->getenc = true;
for (i = 0; params[i]; i += 2) {
if (strcasecmp(params[i], "charset") == 0) {
html->encoding = xmlParseCharEncoding(params[i + 1]);
- if (html->encoding == XML_CHAR_ENCODING_ERROR)
+ html->getenc = false; /* encoding specified - trust the server... */
+ if (html->encoding == XML_CHAR_ENCODING_ERROR) {
html->encoding = XML_CHAR_ENCODING_8859_1;
+ html->getenc = true;
+ }
break;
}
}
@@ -97,6 +102,20 @@ void html_process_data(struct content *c, char *data, unsigned long size)
memcpy(c->data.html.source + c->data.html.length, data, size);
c->data.html.length += size;
c->size += size;
+ /* First time through, check whether the encoding still has to be
+ * detected from the document itself. If so, detect it and switch the
+ * parser instance over to it; if detection fails, assume Latin-1.
+ */
+ if (c->data.html.getenc) {
+ c->data.html.encoding = xmlDetectCharEncoding(c->data.html.source, c->data.html.length);
+ if (c->data.html.encoding == XML_CHAR_ENCODING_ERROR ||
+ c->data.html.encoding == XML_CHAR_ENCODING_NONE) {
+ c->data.html.encoding = XML_CHAR_ENCODING_8859_1;
+ }
+ xmlSwitchEncoding((xmlParserCtxtPtr)c->data.html.parser, c->data.html.encoding);
+ c->data.html.getenc = false;
+ LOG(("Encoding: %s", xmlGetCharEncodingName(c->data.html.encoding)));
+ }
for (x = 0; x + CHUNK <= size; x += CHUNK) {
htmlParseChunk(c->data.html.parser, data + x, CHUNK, 0);
gui_multitask();
diff --git a/render/html.h b/render/html.h
index 6eaa651df..b20040a2e 100644
--- a/render/html.h
+++ b/render/html.h
@@ -40,6 +40,7 @@ struct content_html_data {
char *source; /**< Source data. */
int length; /**< Length of source. */
xmlCharEncoding encoding; /**< Encoding of source. */
+ bool getenc; /**< Encoding must be detected from the document (server gave no usable charset). */
char *base_url; /**< Base URL (may be a copy of content->url). */