/* netsurf/render/libxml_binding.c */

/*
 * Copyright 2007 James Bursa <bursa@users.sourceforge.net>
 * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
 *
 * This file is part of NetSurf, http://www.netsurf-browser.org/
 *
 * NetSurf is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * NetSurf is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef WITH_HUBBUB

#include <stdbool.h>
#include <string.h>

#include <libxml/HTMLparser.h>
#include <libxml/HTMLtree.h>
#include <libxml/parser.h>
#include <libxml/parserInternals.h>

#include "render/parser_binding.h"

#include "utils/log.h"
#include "utils/talloc.h"

/**
 * Per-document state for the libxml2 HTML parser binding.
 *
 * Allocated by binding_create_tree() and released by binding_destroy_tree().
 */
typedef struct libxml_ctx {
	/** libxml2 push parser context fed by binding_parse_chunk(). */
	htmlParserCtxt *parser;

	/** HTML parser encoding handler. */
	xmlCharEncodingHandler *encoding_handler;

	/** Name of the encoding in use, or NULL if none is known yet.
	 * Never freed through this struct: it points at the caller's charset
	 * string, a string constant, or the parser input's encoding string. */
	const char *encoding;
	/** Where the value in \c encoding came from. */
	binding_encoding_source encoding_source;

	/** True while encoding detection should still be attempted on the
	 * next chunk passed to binding_parse_chunk(). */
	bool getenc;
} libxml_ctx;

/* Switch the context's parser to the named character encoding. */
static bool set_parser_encoding(libxml_ctx *c, const char *encoding);
/* Inspect the leading bytes of the data for a BOM or a multi-byte pattern. */
static const char *detect_encoding(const char **data, size_t *size);

/**
 * Create a parser binding context backed by a libxml2 HTML push parser.
 *
 * \param  arena    not used by this implementation
 * \param  charset  encoding name to start with, or NULL if none is known
 * \param  ctx      updated to the new context on success
 * \return  BINDING_OK on success,
 *          BINDING_NOMEM if the context or the parser cannot be allocated,
 *          BINDING_BADENCODING if the parser cannot be switched to charset
 */
binding_error binding_create_tree(void *arena, const char *charset, void **ctx)
{
	libxml_ctx *binding = malloc(sizeof *binding);

	if (binding == NULL)
		return BINDING_NOMEM;

	binding->encoding_handler = NULL;
	binding->encoding = charset;
	binding->encoding_source = ENCODING_SOURCE_HEADER;
	binding->getenc = true;

	/* No SAX handler, no user data, no initial content, no filename. */
	binding->parser = htmlCreatePushParserCtxt(NULL, NULL, "", 0, NULL,
			XML_CHAR_ENCODING_NONE);
	if (binding->parser == NULL) {
		free(binding);
		return BINDING_NOMEM;
	}

	/* With no charset there is nothing to configure; otherwise the
	 * parser must accept the requested encoding. */
	if (charset == NULL || set_parser_encoding(binding, charset)) {
		*ctx = binding;
		return BINDING_OK;
	}

	/* Encoding switch failed: release everything created above. */
	if (binding->parser->myDoc != NULL)
		xmlFreeDoc(binding->parser->myDoc);
	htmlFreeParserCtxt(binding->parser);
	free(binding);

	return BINDING_BADENCODING;
}

/**
 * Destroy a parser binding context and everything it still owns.
 *
 * \param  ctx  context to destroy; NULL is accepted and ignored
 * \return  BINDING_OK always
 *
 * Any document still attached to the parser is freed as well.
 */
binding_error binding_destroy_tree(void *ctx)
{
	libxml_ctx *c = (libxml_ctx *) ctx;

	if (c == NULL)
		return BINDING_OK;

	/* The parser must be validated before its document is inspected;
	 * checking it afterwards would not guard the dereference. */
	if (c->parser != NULL) {
		if (c->parser->myDoc != NULL)
			xmlFreeDoc(c->parser->myDoc);

		htmlFreeParserCtxt(c->parser);
		c->parser = NULL;
	}

	/* The encoding string is not owned by the context; just drop it. */
	c->encoding = NULL;

	free(c);

	return BINDING_OK;
}

/**
 * Feed a chunk of source data to the parser.
 *
 * \param  ctx   binding context
 * \param  data  source data
 * \param  len   length of data, in bytes
 * \return  BINDING_OK on success,
 *          BINDING_NOMEM if a detected encoding could not be applied,
 *          BINDING_ENCODINGCHANGE if no encoding was known before this
 *          chunk and the parser now reports one for its input
 */
binding_error binding_parse_chunk(void *ctx, const uint8_t *data, size_t len)
{
	libxml_ctx *c = (libxml_ctx *) ctx;

	if (c->getenc) {
		/* No encoding was specified in the Content-Type header.
		 * Attempt to detect if the encoding is not 8-bit. If the
		 * encoding is 8-bit, leave the parser unchanged, so that it
		 * searches for a <meta http-equiv="content-type"
		 * content="text/html; charset=...">. */
		const char *cursor = (const char *) data;
		const char *encoding;

		/* Use a correctly typed local so that detect_encoding() can
		 * advance the pointer without aliasing `data` through an
		 * incompatible pointer type. */
		encoding = detect_encoding(&cursor, &len);
		data = (const uint8_t *) cursor;

		if (encoding != NULL) {
			if (!set_parser_encoding(c, encoding))
				return BINDING_NOMEM;
			c->encoding = encoding;
			c->encoding_source = ENCODING_SOURCE_DETECTED;
		}
		c->getenc = false;

		/* The data we received may have solely consisted of a BOM.
		 * If so, it will have been stripped by detect_encoding.
		 * Therefore, we'll have nothing to do in that case. */
		if (len == 0)
			return BINDING_OK;
	}

	htmlParseChunk(c->parser, (const char *) data, len, 0);
	/** \todo error handling */

	if (c->encoding == NULL && c->parser->input->encoding != NULL) {
		/* The encoding was not in headers or detected,
		 * and the parser found a <meta http-equiv="content-type"
		 * content="text/html; charset=...">. */
		const char *found = (const char *) c->parser->input->encoding;

		/* However, if that encoding is non-ASCII-compatible,
		 * ignore it, as it can't possibly be correct */
		if (strncasecmp(found, "UTF-16", 6) == 0 || /* UTF-16(LE|BE)? */
				strncasecmp(found, "UTF-32", 6) == 0) {
						/* UTF-32(LE|BE)? */
			c->encoding = "ISO-8859-1";
			c->encoding_source = ENCODING_SOURCE_DETECTED;
		} else {
			/* Points at the parser input's own string. */
			c->encoding = found;
			c->encoding_source = ENCODING_SOURCE_META;
		}

		/* have the encoding; don't attempt to detect it */
		c->getenc = false;

		return BINDING_ENCODINGCHANGE;
	}

	return BINDING_OK;
}

/**
 * Tell the parser that no more source data will arrive.
 *
 * \param  ctx  binding context
 * \return  BINDING_OK always
 */
binding_error binding_parse_completed(void *ctx)
{
	libxml_ctx *binding = ctx;

	/* An empty chunk with the terminate flag set ends the parse. */
	htmlParseChunk(binding->parser, "", 0, 1);
	/** \todo error handling */

	return BINDING_OK;
}

/**
 * Report the encoding currently recorded in the context.
 *
 * \param  ctx     binding context
 * \param  source  updated to the origin of the encoding
 * \return  the recorded encoding name, which may be NULL
 */
const char *binding_get_encoding(void *ctx, binding_encoding_source *source)
{
	const libxml_ctx *binding = ctx;

	*source = binding->encoding_source;
	return binding->encoding;
}

xmlDocPtr binding_get_document(void *ctx)
{
	libxml_ctx *c = (libxml_ctx *) ctx;
	xmlDocPtr doc = c->parser->myDoc;

	c->parser->myDoc = NULL;

	return doc;
}

/******************************************************************************/

/**
 * Set the HTML parser character encoding.
 *
 * \param  c         context
 * \param  encoding  name of encoding
 * \return  true on success, false on error and error reported
 *
 * A missing encoding handler is logged but treated as success; in that
 * case the parser is left untouched and encoding detection stays enabled.
 * Failure to switch the parser, or to record the encoding name on the
 * parser input, is logged and reported as an error.
 */
static bool set_parser_encoding(libxml_ctx *c, const char *encoding)
{
	/* const-qualified so that either a const or a non-const return
	 * from xmlCtxtGetLastError() can be assigned without a cast */
	const xmlError *error;

	c->encoding_handler = xmlFindCharEncodingHandler(encoding);
	if (c->encoding_handler == NULL) {
		/* either out of memory, or no handler available */
		/* assume no handler available, which is not a fatal error */
		LOG(("no encoding handler for \"%s\"", encoding));
		/* \todo  warn user and ask them to install iconv? */
		return true;
	}

	xmlCtxtResetLastError(c->parser);
	if (xmlSwitchToEncoding(c->parser, c->encoding_handler)) {
		error = xmlCtxtGetLastError(c->parser);
		LOG(("xmlSwitchToEncoding(): %s",
				error ? error->message : "failed"));
		return false;
	}

	/* Dirty hack to get around libxml oddness:
	 * 1) When creating a push parser context, the input flow's encoding
	 *    string is not set (whether an encoding is specified or not)
	 * 2) When switching encoding (as above), the input flow's encoding
	 *    string is never changed
	 * 3) When handling a meta charset, the input flow's encoding string
	 *    is checked to determine if an encoding has already been set.
	 *    If it has been set, then the meta charset is ignored.
	 *
	 * The upshot of this is that, if we don't explicitly set the input
	 * flow's encoding string here, any meta charset in the document
	 * will override our setting, which is incorrect behaviour.
	 *
	 * Ideally, this would be fixed in libxml, but that requires rather
	 * more knowledge than I currently have of what libxml is doing.
	 */
	if (c->parser->input->encoding == NULL) {
		c->parser->input->encoding =
				xmlStrdup((const xmlChar *) encoding);
		if (c->parser->input->encoding == NULL) {
			/* Without the encoding string the hack above has
			 * not taken effect, so do not claim success. */
			LOG(("xmlStrdup(): out of memory"));
			return false;
		}
	}

	/* Ensure no one else attempts to reset the encoding */
	c->getenc = false;

	return true;
}

/**
 * Attempt to detect the encoding of some HTML data.
 *
 * \param  data  Pointer to HTML source data
 * \param  size  Pointer to length of data
 * \return  a constant string giving the encoding, or NULL if the encoding
 *          appears to be some 8-bit encoding or cannot be determined
 *
 * If a BOM is encountered, *data and *size will be modified to skip over it.
 *
 * The UTF-8 BOM (ef bb bf) and the UTF-16BE BOM (fe ff) are recognised
 * even when fewer than four bytes are available, because neither is a
 * prefix of any other pattern tested here. Every other pattern needs at
 * least four bytes; in particular a short "ff fe" is left undetected,
 * since it may be the start of the UTF-32LE BOM (ff fe 00 00).
 */
static const char *detect_encoding(const char **data, size_t *size)
{
	const unsigned char *d = (const unsigned char *) *data;
	const size_t avail = *size;

	/* Unambiguous BOMs first: none of the four-byte patterns below can
	 * match data starting with these bytes, so testing them early does
	 * not change the outcome for longer input. */
	if (avail >= 3 && d[0] == 0xef && d[1] == 0xbb &&
			d[2] == 0xbf) {                 /* BOM ef bb bf */
		*data += 3;
		*size -= 3;
		return "UTF-8";
	}
	if (avail >= 2 && d[0] == 0xfe && d[1] == 0xff) { /* BOM fe ff */
		*data += 2;
		*size -= 2;
		return "UTF-16BE";
	}

	/* The remaining tests all inspect four bytes. The BOM-less ones
	 * assume that the first two characters are <= 0xff. */
	if (avail < 4)
		return NULL;

	if (d[0] == 0x00 && d[1] == 0x00 &&
			d[2] == 0xfe && d[3] == 0xff) { /* BOM 00 00 fe ff */
		*data += 4;
		*size -= 4;
		return "UTF-32BE";
	}
	if (d[0] == 0xff && d[1] == 0xfe &&
			d[2] == 0x00 && d[3] == 0x00) { /* BOM ff fe 00 00 */
		*data += 4;
		*size -= 4;
		return "UTF-32LE";
	}
	if (d[0] == 0x00 && d[1] != 0x00 &&
			d[2] == 0x00 && d[3] != 0x00)   /* 00 xx 00 xx */
		return "UTF-16BE";
	if (d[0] != 0x00 && d[1] == 0x00 &&
			d[2] != 0x00 && d[3] == 0x00)   /* xx 00 xx 00 */
		return "UTF-16LE";
	if (d[0] == 0x00 && d[1] == 0x00 &&
			d[2] == 0x00 && d[3] != 0x00)   /* 00 00 00 xx */
		return "ISO-10646-UCS-4";
	if (d[0] != 0x00 && d[1] == 0x00 &&
			d[2] == 0x00 && d[3] == 0x00)   /* xx 00 00 00 */
		return "ISO-10646-UCS-4";
	if (d[0] == 0xff && d[1] == 0xfe) {             /* BOM ff fe */
		*data += 2;
		*size -= 2;
		return "UTF-16LE";
	}

	return NULL;
}

#endif