netsurf/render/libxml_binding.c

/*
 * Copyright 2007 James Bursa <bursa@users.sourceforge.net>
 * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
 *
 * This file is part of NetSurf, http://www.netsurf-browser.org/
 *
 * NetSurf is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * NetSurf is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef WITH_HUBBUB

#include <stdbool.h>
#include <string.h>

#include <libxml/HTMLparser.h>
#include <libxml/HTMLtree.h>
#include <libxml/parser.h>
#include <libxml/parserInternals.h>

#include "render/parser_binding.h"

#include "utils/log.h"
#include "utils/talloc.h"

/**
 * Per-document state for the libxml HTML parser binding.
 *
 * One of these is allocated by binding_create_tree() and handed back to the
 * caller as an opaque context pointer.
 */
typedef struct libxml_ctx {
	/** libxml push parser context, created in binding_create_tree() */
	htmlParserCtxt *parser;

	/** HTML parser encoding handler, as last looked up by
	 * set_parser_encoding() (NULL if none was found). */
	xmlCharEncodingHandler *encoding_handler;

	/** Name of the document encoding, or NULL while it is unknown.
	 * The string is stored by pointer only; it is never copied. */
	const char *encoding;
	/** Records where the current value of encoding came from */
	binding_encoding_source encoding_source;

	/** True while encoding detection should still be attempted on
	 * the next chunk of data passed to binding_parse_chunk() */
	bool getenc;
} libxml_ctx;

/* File-local helpers; their definitions (and documentation) follow the
 * public binding functions. */
static bool set_parser_encoding(libxml_ctx *c, const char *encoding);
static const char *detect_encoding(const char **data, size_t *size);

/**
 * Create a parser binding context.
 *
 * \param arena    Not used by this binding
 * \param charset  Name of the document encoding, or NULL if unknown.
 *                 The pointer is stored in the context, not copied.
 * \param ctx      Updated to point at the new context on success
 * \return BINDING_OK on success,
 *         BINDING_NOMEM if the context or parser could not be allocated,
 *         BINDING_BADENCODING if the parser rejected the given charset
 */
binding_error binding_create_tree(void *arena, const char *charset, void **ctx)
{
	libxml_ctx *binding = malloc(sizeof(libxml_ctx));

	if (binding == NULL)
		return BINDING_NOMEM;

	binding->encoding_handler = NULL;
	binding->encoding = charset;
	binding->encoding_source = ENCODING_SOURCE_HEADER;
	binding->getenc = true;

	/* Start with an empty push parser; data arrives later, chunk by
	 * chunk, through binding_parse_chunk(). */
	binding->parser = htmlCreatePushParserCtxt(0, 0, "", 0, 0,
			XML_CHAR_ENCODING_NONE);
	if (binding->parser == NULL) {
		free(binding);
		return BINDING_NOMEM;
	}

	/* Nothing more to configure unless the caller named an encoding */
	if (charset != NULL && !set_parser_encoding(binding, charset)) {
		/* Unwind everything created so far */
		if (binding->parser->myDoc != NULL)
			xmlFreeDoc(binding->parser->myDoc);
		htmlFreeParserCtxt(binding->parser);
		free(binding);
		return BINDING_BADENCODING;
	}

	*ctx = (void *) binding;

	return BINDING_OK;
}

/**
 * Destroy a parser binding context, along with anything it still holds.
 *
 * \param ctx  Context to destroy, or NULL (in which case nothing is done)
 * \return BINDING_OK always
 *
 * If the parser still references a document (i.e. it has not been claimed
 * through binding_get_document()), that document is freed too.
 */
binding_error binding_destroy_tree(void *ctx)
{
	libxml_ctx *c = (libxml_ctx *) ctx;

	if (c == NULL)
		return BINDING_OK;

	if (c->parser != NULL) {
		/* The document hangs off the parser context, so the parser
		 * must be known to exist before myDoc is inspected, and the
		 * document must go before the parser context itself. */
		if (c->parser->myDoc != NULL)
			xmlFreeDoc(c->parser->myDoc);

		htmlFreeParserCtxt(c->parser);
		c->parser = NULL;
	}

	c->encoding = NULL;

	free(c);

	return BINDING_OK;
}

/**
 * Feed a chunk of document source data to the parser.
 *
 * \param ctx   Binding context
 * \param data  Chunk of source data
 * \param len   Length of chunk, in bytes
 * \return BINDING_OK on success,
 *         BINDING_NOMEM if the parser could not be switched to a detected
 *                       encoding,
 *         BINDING_ENCODINGCHANGE if no encoding was known beforehand and
 *                       the parser now reports one for the document
 */
binding_error binding_parse_chunk(void *ctx, const uint8_t *data, size_t len)
{
	libxml_ctx *c = (libxml_ctx *) ctx;

	if (c->getenc) {
		/* Encoding detection is still pending.
		 * Attempt to detect if the encoding is not 8-bit. If the
		 * encoding is 8-bit, leave the parser unchanged, so that it
		 * searches for a <meta http-equiv="content-type"
		 * content="text/html; charset=...">. */
		const char *chunk = (const char *) data;
		const char *encoding = detect_encoding(&chunk, &len);

		/* detect_encoding() advances its char pointer past any BOM.
		 * Go through a correctly typed temporary rather than casting
		 * the address of data, then carry the new position back. */
		data = (const uint8_t *) chunk;

		if (encoding) {
			if (!set_parser_encoding(c, encoding))
				return BINDING_NOMEM;
			c->encoding = encoding;
			c->encoding_source = ENCODING_SOURCE_DETECTED;
		}

		/* Detection is only ever attempted once */
		c->getenc = false;

		/* The data we received may have solely consisted of a BOM.
		 * If so, it will have been stripped by detect_encoding.
		 * Therefore, we'll have nothing to do in that case. */
		if (len == 0)
			return BINDING_OK;
	}

	htmlParseChunk(c->parser, (const char *) data, len, 0);
	/** \todo error handling */

	if (!c->encoding && c->parser->input->encoding) {
		/* The encoding was not in headers or detected,
		 * and the parser found a <meta http-equiv="content-type"
		 * content="text/html; charset=...">. */
		const char *found = (const char *) c->parser->input->encoding;

		/* However, if that encoding is non-ASCII-compatible,
		 * ignore it, as it can't possibly be correct */
		if (strncasecmp(found, "UTF-16", 6) == 0 || /* UTF-16(LE|BE)? */
				strncasecmp(found, "UTF-32", 6) == 0) {
						/* UTF-32(LE|BE)? */
			c->encoding = "ISO-8859-1";
			c->encoding_source = ENCODING_SOURCE_DETECTED;
		} else {
			c->encoding = found;
			c->encoding_source = ENCODING_SOURCE_META;
		}

		/* c->encoding is necessarily non-NULL here: it is either the
		 * literal above or the parser string tested by the enclosing
		 * condition, so no further check is needed. */

		/* have the encoding; don't attempt to detect it */
		c->getenc = false;

		return BINDING_ENCODINGCHANGE;
	}

	return BINDING_OK;
}

/**
 * Tell the parser that no more document data will arrive.
 *
 * \param ctx  Binding context
 * \return BINDING_OK always
 */
binding_error binding_parse_completed(void *ctx)
{
	libxml_ctx *binding = ctx;

	/* An empty chunk with the final argument set to 1 marks the end of
	 * the input for the push parser. */
	htmlParseChunk(binding->parser, "", 0, 1);
	/** \todo error handling */

	return BINDING_OK;
}

/**
 * Retrieve the encoding currently recorded for the document.
 *
 * \param ctx     Binding context
 * \param source  Updated to indicate where the encoding came from
 * \return Name of the encoding, or NULL if none has been recorded
 */
const char *binding_get_encoding(void *ctx, binding_encoding_source *source)
{
	const libxml_ctx *binding = ctx;

	*source = binding->encoding_source;

	return binding->encoding;
}

xmlDocPtr binding_get_document(void *ctx)
{
	libxml_ctx *c = (libxml_ctx *) ctx;
	xmlDocPtr doc = c->parser->myDoc;

	c->parser->myDoc = NULL;

	return doc;
}

/******************************************************************************/

/**
 * Switch the HTML parser over to a named character encoding.
 *
 * \param  c         binding context whose parser is to be reconfigured
 * \param  encoding  name of the encoding to switch to
 * \return  false if libxml failed to switch encoding (the failure is
 *          logged), true otherwise
 *
 * If no handler exists for the encoding, this is logged and true is still
 * returned; the parser and c->getenc are left as they were in that case.
 */
bool set_parser_encoding(libxml_ctx *c, const char *encoding)
{
	xmlCharEncodingHandler *handler = xmlFindCharEncodingHandler(encoding);

	c->encoding_handler = handler;
	if (handler == NULL) {
		/* Could be memory exhaustion, could be a missing handler.
		 * Treat it as the latter, which is not fatal. */
		LOG(("no encoding handler for \"%s\"", encoding));
		/* \todo  warn user and ask them to install iconv? */
		return true;
	}

	xmlCtxtResetLastError(c->parser);
	if (xmlSwitchToEncoding(c->parser, handler) != 0) {
		xmlError *err = xmlCtxtGetLastError(c->parser);

		LOG(("xmlSwitchToEncoding(): %s",
				err ? err->message : "failed"));
		return false;
	}

	/* Workaround for some awkward libxml behaviour:
	 * 1) A freshly created push parser context has no encoding string
	 *    on its input flow, regardless of any encoding requested.
	 * 2) Switching encoding (as done above) does not update that
	 *    string either.
	 * 3) On meeting a meta charset, libxml looks at that string to
	 *    decide whether an encoding is already in force, and only
	 *    ignores the meta charset if the string is set.
	 *
	 * So, unless the string is filled in by hand here, a meta charset
	 * found in the document would take precedence over the encoding
	 * just selected, which is the wrong way round.
	 *
	 * The proper fix belongs in libxml itself, but needs a better
	 * understanding of its internals than is available here.
	 */
	if (c->parser->input->encoding == NULL) {
		c->parser->input->encoding =
				xmlStrdup((const xmlChar *) encoding);
	}

	/* The encoding is settled: suppress any later detection attempt */
	c->getenc = false;

	return true;
}

/**
 * Attempt to detect the encoding of some HTML data.
 *
 * \param  data  Pointer to HTML source data
 * \param  size  Pointer to length of data
 * \return  a constant string giving the encoding, or NULL if the encoding
 *          appears to be some 8-bit encoding (or cannot be determined)
 *
 * If a BOM is encountered, *data and *size will be modified to skip over it;
 * otherwise they are left untouched.
 *
 * With four or more bytes available, both BOMs and the zero-byte patterns
 * of BOM-less UTF-16 / UCS-4 text are recognised. With only two or three
 * bytes available, just those BOMs that cannot be confused with the start
 * of a longer one are recognised: "ff fe" on its own (or followed by 00)
 * might equally be the start of the UTF-32LE BOM "ff fe 00 00", so it is
 * not reported.
 */

const char *detect_encoding(const char **data, size_t *size)
{
	const unsigned char *d = (const unsigned char *) *data;
	const size_t len = *size;
	const char *name = NULL;
	size_t bom = 0;	/* number of BOM bytes to skip */

	if (len >= 4) {
		/* The BOM-less patterns below assume that the first two
		 * characters of the document are <= 0xff. Order matters:
		 * the four-byte BOMs must be tried before anything else. */
		if (d[0] == 0x00 && d[1] == 0x00 &&
				d[2] == 0xfe && d[3] == 0xff) {
			name = "UTF-32BE";		/* BOM 00 00 fe ff */
			bom = 4;
		} else if (d[0] == 0xff && d[1] == 0xfe &&
				d[2] == 0x00 && d[3] == 0x00) {
			name = "UTF-32LE";		/* BOM ff fe 00 00 */
			bom = 4;
		} else if (d[0] == 0x00 && d[1] != 0x00 &&
				d[2] == 0x00 && d[3] != 0x00) {
			name = "UTF-16BE";		/* 00 xx 00 xx */
		} else if (d[0] != 0x00 && d[1] == 0x00 &&
				d[2] != 0x00 && d[3] == 0x00) {
			name = "UTF-16LE";		/* xx 00 xx 00 */
		} else if (d[0] == 0x00 && d[1] == 0x00 &&
				d[2] == 0x00 && d[3] != 0x00) {
			name = "ISO-10646-UCS-4";	/* 00 00 00 xx */
		} else if (d[0] != 0x00 && d[1] == 0x00 &&
				d[2] == 0x00 && d[3] == 0x00) {
			name = "ISO-10646-UCS-4";	/* xx 00 00 00 */
		} else if (d[0] == 0xfe && d[1] == 0xff) {
			name = "UTF-16BE";		/* BOM fe ff */
			bom = 2;
		} else if (d[0] == 0xff && d[1] == 0xfe) {
			name = "UTF-16LE";		/* BOM ff fe */
			bom = 2;
		} else if (d[0] == 0xef && d[1] == 0xbb && d[2] == 0xbf) {
			name = "UTF-8";			/* BOM ef bb bf */
			bom = 3;
		}
	} else if (len >= 2) {
		/* Too short for pattern matching, but a chunk may still
		 * consist of little more than a BOM. */
		if (len == 3 && d[0] == 0xef && d[1] == 0xbb &&
				d[2] == 0xbf) {
			name = "UTF-8";			/* BOM ef bb bf */
			bom = 3;
		} else if (d[0] == 0xfe && d[1] == 0xff) {
			name = "UTF-16BE";		/* BOM fe ff */
			bom = 2;
		} else if (len == 3 && d[0] == 0xff && d[1] == 0xfe &&
				d[2] != 0x00) {
			/* Third byte rules out the UTF-32LE BOM */
			name = "UTF-16LE";		/* BOM ff fe */
			bom = 2;
		}
	}

	if (bom != 0) {
		*data += bom;
		*size -= bom;
	}

	return name;
}

#endif
Rework html parser bindings to have a common API and reside in separate files for ease of reading. Add error handling to hubbub binding. svn path=/trunk/netsurf/; revision=5404 2008-09-23 06:19:50 +04:00			`/*`
			`* Copyright 2007 James Bursa <bursa@users.sourceforge.net>`
			`* Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>`
			`*`
			`* This file is part of NetSurf, http://www.netsurf-browser.org/`
			`*`
			`* NetSurf is free software; you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation; version 2 of the License.`
			`*`
			`* NetSurf is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`*/`

			`#ifndef WITH_HUBBUB`

			`#include <stdbool.h>`
			`#include <string.h>`

			`#include <libxml/HTMLparser.h>`
			`#include <libxml/HTMLtree.h>`
			`#include <libxml/parser.h>`
			`#include <libxml/parserInternals.h>`

			`#include "render/parser_binding.h"`

			`#include "utils/log.h"`
			`#include "utils/talloc.h"`

			`typedef struct libxml_ctx {`
			`htmlParserCtxt *parser;`

			`/** HTML parser encoding handler. */`
			`xmlCharEncodingHandler *encoding_handler;`

			`const char *encoding;`
			`binding_encoding_source encoding_source;`

			`bool getenc;`
			`} libxml_ctx;`

			`static bool set_parser_encoding(libxml_ctx c, const char encoding);`
			`static const char detect_encoding(const char data, size_t size);`

Port to new hubbub parser API svn path=/trunk/netsurf/; revision=5669 2008-11-09 22:04:30 +03:00			`binding_error binding_create_tree(void arena, const char charset, void **ctx)`
Rework html parser bindings to have a common API and reside in separate files for ease of reading. Add error handling to hubbub binding. svn path=/trunk/netsurf/; revision=5404 2008-09-23 06:19:50 +04:00			`{`
Port to new hubbub parser API svn path=/trunk/netsurf/; revision=5669 2008-11-09 22:04:30 +03:00			`libxml_ctx *c;`
Rework html parser bindings to have a common API and reside in separate files for ease of reading. Add error handling to hubbub binding. svn path=/trunk/netsurf/; revision=5404 2008-09-23 06:19:50 +04:00
Port to new hubbub parser API svn path=/trunk/netsurf/; revision=5669 2008-11-09 22:04:30 +03:00			`c = malloc(sizeof(libxml_ctx));`
			`if (c == NULL)`
			`return BINDING_NOMEM;`
Rework html parser bindings to have a common API and reside in separate files for ease of reading. Add error handling to hubbub binding. svn path=/trunk/netsurf/; revision=5404 2008-09-23 06:19:50 +04:00
Port to new hubbub parser API svn path=/trunk/netsurf/; revision=5669 2008-11-09 22:04:30 +03:00			`c->parser = NULL;`
			`c->encoding_handler = NULL;`
			`c->encoding = charset;`
			`c->encoding_source = ENCODING_SOURCE_HEADER;`
			`c->getenc = true;`
Rework html parser bindings to have a common API and reside in separate files for ease of reading. Add error handling to hubbub binding. svn path=/trunk/netsurf/; revision=5404 2008-09-23 06:19:50 +04:00
Port to new hubbub parser API svn path=/trunk/netsurf/; revision=5669 2008-11-09 22:04:30 +03:00			`c->parser = htmlCreatePushParserCtxt(0, 0, "", 0, 0,`
Rework html parser bindings to have a common API and reside in separate files for ease of reading. Add error handling to hubbub binding. svn path=/trunk/netsurf/; revision=5404 2008-09-23 06:19:50 +04:00			`XML_CHAR_ENCODING_NONE);`
Port to new hubbub parser API svn path=/trunk/netsurf/; revision=5669 2008-11-09 22:04:30 +03:00			`if (c->parser == NULL) {`
			`free(c);`
			`return BINDING_NOMEM;`
Rework html parser bindings to have a common API and reside in separate files for ease of reading. Add error handling to hubbub binding. svn path=/trunk/netsurf/; revision=5404 2008-09-23 06:19:50 +04:00			`}`

Port to new hubbub parser API svn path=/trunk/netsurf/; revision=5669 2008-11-09 22:04:30 +03:00			`if (c->encoding != NULL && !set_parser_encoding(c, charset)) {`
			`if (c->parser->myDoc != NULL)`
			`xmlFreeDoc(c->parser->myDoc);`
			`htmlFreeParserCtxt(c->parser);`
			`free(c);`
			`return BINDING_BADENCODING;`
Rework html parser bindings to have a common API and reside in separate files for ease of reading. Add error handling to hubbub binding. svn path=/trunk/netsurf/; revision=5404 2008-09-23 06:19:50 +04:00			`}`

Port to new hubbub parser API svn path=/trunk/netsurf/; revision=5669 2008-11-09 22:04:30 +03:00			`ctx = (void ) c;`

			`return BINDING_OK;`
Rework html parser bindings to have a common API and reside in separate files for ease of reading. Add error handling to hubbub binding. svn path=/trunk/netsurf/; revision=5404 2008-09-23 06:19:50 +04:00			`}`

Port to new hubbub parser API svn path=/trunk/netsurf/; revision=5669 2008-11-09 22:04:30 +03:00			`binding_error binding_destroy_tree(void *ctx)`
Rework html parser bindings to have a common API and reside in separate files for ease of reading. Add error handling to hubbub binding. svn path=/trunk/netsurf/; revision=5404 2008-09-23 06:19:50 +04:00			`{`
			`libxml_ctx c = (libxml_ctx ) ctx;`

			`if (ctx == NULL)`
Port to new hubbub parser API svn path=/trunk/netsurf/; revision=5669 2008-11-09 22:04:30 +03:00			`return BINDING_OK;`
Rework html parser bindings to have a common API and reside in separate files for ease of reading. Add error handling to hubbub binding. svn path=/trunk/netsurf/; revision=5404 2008-09-23 06:19:50 +04:00
			`if (c->parser->myDoc != NULL)`
			`xmlFreeDoc(c->parser->myDoc);`

			`if (c->parser != NULL)`
			`htmlFreeParserCtxt(c->parser);`

			`c->parser = NULL;`
			`c->encoding = NULL;`

			`free(c);`
Port to new hubbub parser API svn path=/trunk/netsurf/; revision=5669 2008-11-09 22:04:30 +03:00
			`return BINDING_OK;`
Rework html parser bindings to have a common API and reside in separate files for ease of reading. Add error handling to hubbub binding. svn path=/trunk/netsurf/; revision=5404 2008-09-23 06:19:50 +04:00			`}`

			`binding_error binding_parse_chunk(void ctx, const uint8_t data, size_t len)`
			`{`
			`libxml_ctx c = (libxml_ctx ) ctx;`

			`if (c->getenc) {`
			`/* No encoding was specified in the Content-Type header.`
			`* Attempt to detect if the encoding is not 8-bit. If the`
			`* encoding is 8-bit, leave the parser unchanged, so that it`
			`* searches for a <meta http-equiv="content-type"`
			`* content="text/html; charset=...">. */`
			`const char *encoding;`
			`encoding = detect_encoding((const char *) (void ) &data,`
			`&len);`
			`if (encoding) {`
			`if (!set_parser_encoding(c, encoding))`
			`return BINDING_NOMEM;`
			`c->encoding = encoding;`
			`c->encoding_source = ENCODING_SOURCE_DETECTED;`
			`}`
			`c->getenc = false;`

			`/* The data we received may have solely consisted of a BOM.`
			`* If so, it will have been stripped by html_detect_encoding.`
			`* Therefore, we'll have nothing to do in that case. */`
			`if (len == 0)`
			`return BINDING_OK;`
			`}`

			`htmlParseChunk(c->parser, (const char *) data, len, 0);`
			`/** \todo error handling */`

			`if (!c->encoding && c->parser->input->encoding) {`
			`/* The encoding was not in headers or detected,`
			`* and the parser found a <meta http-equiv="content-type"`
			`* content="text/html; charset=...">. */`

			`/* However, if that encoding is non-ASCII-compatible,`
			`* ignore it, as it can't possibly be correct */`
			`if (strncasecmp((const char *) c->parser->input->encoding,`
			`"UTF-16", 6) == 0 \|\| /* UTF-16(LE\|BE)? */`
			`strncasecmp((const char *) c->parser->input->encoding,`
			`"UTF-32", 6) == 0) { /* UTF-32(LE\|BE)? */`
			`c->encoding = "ISO-8859-1";`
			`c->encoding_source = ENCODING_SOURCE_DETECTED;`
			`} else {`
			`c->encoding = (const char *) c->parser->input->encoding;`
			`c->encoding_source = ENCODING_SOURCE_META;`
			`}`

			`if (!c->encoding)`
			`return BINDING_NOMEM;`

			`/* have the encoding; don't attempt to detect it */`
			`c->getenc = false;`

			`return BINDING_ENCODINGCHANGE;`
			`}`

			`return BINDING_OK;`
			`}`

			`binding_error binding_parse_completed(void *ctx)`
			`{`
			`libxml_ctx c = (libxml_ctx ) ctx;`

			`htmlParseChunk(c->parser, "", 0, 1);`
			`/** \todo error handling */`

			`return BINDING_OK;`
			`}`

			`const char binding_get_encoding(void ctx, binding_encoding_source *source)`
			`{`
			`libxml_ctx c = (libxml_ctx ) ctx;`

			`*source = c->encoding_source;`

			`return c->encoding;`
			`}`

			`xmlDocPtr binding_get_document(void *ctx)`
			`{`
			`libxml_ctx c = (libxml_ctx ) ctx;`
			`xmlDocPtr doc = c->parser->myDoc;`

			`c->parser->myDoc = NULL;`

			`return doc;`
			`}`

			`/******************************************************************************/`

			`/**`
			`* Set the HTML parser character encoding.`
			`*`
			`* \param c context`
			`* \param encoding name of encoding`
			`* \return true on success, false on error and error reported`
			`*/`
			`bool set_parser_encoding(libxml_ctx c, const char encoding)`
			`{`
			`xmlError *error;`

			`c->encoding_handler = xmlFindCharEncodingHandler(encoding);`
			`if (!c->encoding_handler) {`
			`/* either out of memory, or no handler available */`
			`/* assume no handler available, which is not a fatal error */`
			`LOG(("no encoding handler for \"%s\"", encoding));`
			`/* \todo warn user and ask them to install iconv? */`
			`return true;`
			`}`

			`xmlCtxtResetLastError(c->parser);`
			`if (xmlSwitchToEncoding(c->parser, c->encoding_handler)) {`
			`error = xmlCtxtGetLastError(c->parser);`
			`LOG(("xmlSwitchToEncoding(): %s",`
			`error ? error->message : "failed"));`
			`return false;`
			`}`

			`/* Dirty hack to get around libxml oddness:`
			`* 1) When creating a push parser context, the input flow's encoding`
			`* string is not set (whether an encoding is specified or not)`
			`* 2) When switching encoding (as above), the input flow's encoding`
			`* string is never changed`
			`* 3) When handling a meta charset, the input flow's encoding string`
			`* is checked to determine if an encoding has already been set.`
			`* If it has been set, then the meta charset is ignored.`
			`*`
			`* The upshot of this is that, if we don't explicitly set the input`
			`* flow's encoding string here, any meta charset in the document`
			`* will override our setting, which is incorrect behaviour.`
			`*`
			`* Ideally, this would be fixed in libxml, but that requires rather`
			`* more knowledge than I currently have of what libxml is doing.`
			`*/`
			`if (!c->parser->input->encoding)`
			`c->parser->input->encoding =`
			`xmlStrdup((const xmlChar *) encoding);`

			`/* Ensure noone else attempts to reset the encoding */`
			`c->getenc = false;`

			`return true;`
			`}`

			`/**`
			`* Attempt to detect the encoding of some HTML data.`
			`*`
			`* \param data Pointer to HTML source data`
			`* \param size Pointer to length of data`
			`* \return a constant string giving the encoding, or 0 if the encoding`
			`* appears to be some 8-bit encoding`
			`*`
			`* If a BOM is encountered, data and size will be modified to skip over it`
			`*/`

			`const char detect_encoding(const char data, size_t size)`
			`{`
			`const unsigned char d = (const unsigned char ) *data;`

			`/* this detection assumes that the first two characters are <= 0xff */`
			`if (*size < 4)`
			`return 0;`

			`if (d[0] == 0x00 && d[1] == 0x00 &&`
			`d[2] == 0xfe && d[3] == 0xff) { /* BOM 00 00 fe ff */`
			`*data += 4;`
			`*size -= 4;`
			`return "UTF-32BE";`
			`} else if (d[0] == 0xff && d[1] == 0xfe &&`
			`d[2] == 0x00 && d[3] == 0x00) { /* BOM ff fe 00 00 */`
			`*data += 4;`
			`*size -= 4;`
			`return "UTF-32LE";`
			`}`
			`else if (d[0] == 0x00 && d[1] != 0x00 &&`
			`d[2] == 0x00 && d[3] != 0x00) /* 00 xx 00 xx */`
			`return "UTF-16BE";`
			`else if (d[0] != 0x00 && d[1] == 0x00 &&`
			`d[2] != 0x00 && d[3] == 0x00) /* xx 00 xx 00 */`
			`return "UTF-16LE";`
			`else if (d[0] == 0x00 && d[1] == 0x00 &&`
			`d[2] == 0x00 && d[3] != 0x00) /* 00 00 00 xx */`
			`return "ISO-10646-UCS-4";`
			`else if (d[0] != 0x00 && d[1] == 0x00 &&`
			`d[2] == 0x00 && d[3] == 0x00) /* xx 00 00 00 */`
			`return "ISO-10646-UCS-4";`
			`else if (d[0] == 0xfe && d[1] == 0xff) { /* BOM fe ff */`
			`*data += 2;`
			`*size -= 2;`
			`return "UTF-16BE";`
			`} else if (d[0] == 0xff && d[1] == 0xfe) { /* BOM ff fe */`
			`*data += 2;`
			`*size -= 2;`
			`return "UTF-16LE";`
			`} else if (d[0] == 0xef && d[1] == 0xbb &&`
			`d[2] == 0xbf) { /* BOM ef bb bf */`
			`*data += 3;`
			`*size -= 3;`
			`return "UTF-8";`
			`}`

			`return 0;`
			`}`

			`#endif`