netsurf/riscos/save_complete.c
John Mark Bell b577562953 We don't need to reparse the document when saving complete -- the document persists for the lifetime of its content.
Better still would be to perform the serialisation manually, so that we don't need to copy the document at all.

svn path=/trunk/netsurf/; revision=6774
2009-03-11 17:22:46 +00:00

760 lines
18 KiB
C

/*
* Copyright 2004 John M Bell <jmb202@ecs.soton.ac.uk>
* Copyright 2004-2007 James Bursa <bursa@users.sourceforge.net>
*
* This file is part of NetSurf, http://www.netsurf-browser.org/
*
* NetSurf is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; version 2 of the License.
*
* NetSurf is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/** \file
* Save HTML document with dependencies (implementation).
*/
#include "utils/config.h"
#ifdef WITH_SAVE_COMPLETE
#define _GNU_SOURCE /* for strndup */
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <regex.h>
#include <libxml/HTMLtree.h>
#include <libxml/parserInternals.h>
#include "oslib/osfile.h"
#include "utils/config.h"
#include "content/content.h"
#include "css/css.h"
#include "render/box.h"
#include "riscos/gui.h"
#include "riscos/save_complete.h"
#include "utils/log.h"
#include "utils/url.h"
#include "utils/utils.h"
/** Compiled pattern matching a CSS \@import rule. It is compiled by
 * save_complete_init() and used by rewrite_stylesheet_urls(). */
regex_t save_complete_import_re;
/** An entry in save_complete_list. */
struct save_complete_entry {
/** Content recorded by this entry; the entry stores the pointer only. */
struct content *content;
struct save_complete_entry *next; /**< Next entry in list */
};
/** List of urls seen and saved so far.
 *
 * Singly linked; new entries are pushed on the head by
 * save_complete_list_add() and the whole list is freed at the end of
 * save_complete(). */
static struct save_complete_entry *save_complete_list = 0;
/* Forward declarations of the file-local helpers defined below. */
static bool save_complete_html(struct content *c, const char *path,
bool index);
static bool save_imported_sheets(struct content *c, const char *path);
static char * rewrite_stylesheet_urls(const char *source, unsigned int size,
int *osize, const char *base);
static bool rewrite_document_urls(xmlDoc *doc, const char *base);
static bool rewrite_urls(xmlNode *n, const char *base);
static bool rewrite_url(xmlNode *n, const char *attr, const char *base);
static bool save_complete_list_add(struct content *content);
static struct content * save_complete_list_find(const char *url);
static bool save_complete_list_check(struct content *content);
/* The matching definition of save_complete_list_dump() is compiled out
 * with #if 0, so its declaration is kept commented out as well. */
/* static void save_complete_list_dump(void); */
static bool save_complete_inventory(const char *path);
/**
* Save an HTML page with all dependencies.
*
* \param c CONTENT_HTML to save
* \param path directory to save to (must exist)
* \return true on success, false on error and error reported
*/
bool save_complete(struct content *c, const char *path)
{
bool result;
result = save_complete_html(c, path, true);
if (result)
result = save_complete_inventory(path);
/* free save_complete_list */
while (save_complete_list) {
struct save_complete_entry *next = save_complete_list->next;
free(save_complete_list);
save_complete_list = next;
}
return result;
}
/**
* Save an HTML page with all dependencies, recursing through imported pages.
*
* \param c CONTENT_HTML to save
* \param path directory to save to (must exist)
* \param index true to save as "index"
* \return true on success, false on error and error reported
*/
bool save_complete_html(struct content *c, const char *path, bool index)
{
char spath[256];
unsigned int i;
xmlDocPtr doc;
os_error *error;
if (c->type != CONTENT_HTML)
return false;
if (save_complete_list_check(c))
return true;
/* save stylesheets, ignoring the base and adblocking sheets */
for (i = STYLESHEET_STYLE; i != c->data.html.stylesheet_count; i++) {
struct content *css = c->data.html.stylesheet_content[i];
char *source;
int source_len;
if (!css)
continue;
if (save_complete_list_check(css))
continue;
if (i != STYLESHEET_STYLE) {
if (!save_complete_list_add(css)) {
warn_user("NoMemory", 0);
return false;
}
}
if (!save_imported_sheets(css, path))
return false;
if (i == STYLESHEET_STYLE)
continue; /* don't save <style> elements */
snprintf(spath, sizeof spath, "%s.%x", path,
(unsigned int) css);
source = rewrite_stylesheet_urls(css->source_data,
css->source_size, &source_len, css->url);
if (!source) {
warn_user("NoMemory", 0);
return false;
}
error = xosfile_save_stamped(spath, 0xf79, source,
source + source_len);
free(source);
if (error) {
LOG(("xosfile_save_stamped: 0x%x: %s",
error->errnum, error->errmess));
warn_user("SaveError", error->errmess);
return false;
}
}
/* save objects */
for (i = 0; i != c->data.html.object_count; i++) {
struct content *obj = c->data.html.object[i].content;
/* skip difficult content types */
if (!obj || obj->type >= CONTENT_OTHER || !obj->source_data)
continue;
if (save_complete_list_check(obj))
continue;
if (!save_complete_list_add(obj)) {
warn_user("NoMemory", 0);
return false;
}
if (obj->type == CONTENT_HTML) {
if (!save_complete_html(obj, path, false))
return false;
continue;
}
snprintf(spath, sizeof spath, "%s.%x", path,
(unsigned int) obj);
error = xosfile_save_stamped(spath,
ro_content_filetype(obj),
obj->source_data,
obj->source_data + obj->source_size);
if (error) {
LOG(("xosfile_save_stamped: 0x%x: %s",
error->errnum, error->errmess));
warn_user("SaveError", error->errmess);
return false;
}
}
/*save_complete_list_dump();*/
/* copy document */
doc = xmlCopyDoc(c->data.html.document, 1);
if (doc == NULL) {
warn_user("NoMemory", 0);
return false;
}
/* rewrite all urls we know about */
if (!rewrite_document_urls(doc, c->data.html.base_url)) {
xmlFreeDoc(doc);
warn_user("NoMemory", 0);
return false;
}
/* save the html file out last of all */
if (index)
snprintf(spath, sizeof spath, "%s.index", path);
else
snprintf(spath, sizeof spath, "%s.%x", path, (unsigned int)c);
errno = 0;
if (htmlSaveFileFormat(spath, doc, 0, 0) == -1) {
if (errno)
warn_user("SaveError", strerror(errno));
else
warn_user("SaveError", "htmlSaveFileFormat failed");
xmlFreeDoc(doc);
return false;
}
xmlFreeDoc(doc);
error = xosfile_set_type(spath, 0xfaf);
if (error) {
LOG(("xosfile_set_type: 0x%x: %s",
error->errnum, error->errmess));
warn_user("SaveError", error->errmess);
return false;
}
return true;
}
/**
 * Save every stylesheet imported by a CONTENT_CSS, depth first.
 *
 * Each imported sheet not yet in save_complete_list is added to it, has its
 * own imports saved, and is then written out with its \@import rules
 * rewritten.
 *
 * \param  c     a CONTENT_CSS
 * \param  path  path to save to
 * \return true on success, false on error and error reported
 */
bool save_imported_sheets(struct content *c, const char *path)
{
	unsigned int n;

	for (n = 0; n < c->data.css.import_count; n++) {
		struct content *sheet = c->data.css.import_content[n];
		char leaf[256];
		char *rewritten;
		int rewritten_len;
		os_error *err;

		/* nothing to do for holes or sheets seen before */
		if (!sheet || save_complete_list_check(sheet))
			continue;

		if (!save_complete_list_add(sheet)) {
			warn_user("NoMemory", 0);
			return false;
		}

		/* descend into this sheet's own imports first */
		if (!save_imported_sheets(sheet, path))
			return false;

		snprintf(leaf, sizeof leaf, "%s.%x", path,
				(unsigned int) sheet);

		rewritten = rewrite_stylesheet_urls(sheet->source_data,
				sheet->source_size, &rewritten_len,
				sheet->url);
		if (!rewritten) {
			warn_user("NoMemory", 0);
			return false;
		}

		err = xosfile_save_stamped(leaf, 0xf79, rewritten,
				rewritten + rewritten_len);
		free(rewritten);
		if (err) {
			LOG(("xosfile_save_stamped: 0x%x: %s",
					err->errnum, err->errmess));
			warn_user("SaveError", err->errmess);
			return false;
		}
	}

	return true;
}
/**
 * Initialise the save_complete module.
 *
 * Compiles the \@import matching pattern into save_complete_import_re.
 */
void save_complete_init(void)
{
	/* Match an @import rule - see CSS 2.1 G.1.
	 *
	 * The numbers in the comments are the capture groups opened on the
	 * following line; groups 2, 4, 6, 8 and 10 hold the URL text. */
	static const char import_pattern[] =
		"@import"			/* IMPORT_SYM */
		"[ \t\r\n\f]*"			/* S* */
		/* 1 */
		"("				/* [ */
		/* 2 3 */
		"\"(([^\"]|[\\]\")*)\""		/* STRING (approximated) */
		"|"
		/* 4 5 */
		"'(([^']|[\\]')*)'"
		"|"				/* | */
		"url\\([ \t\r\n\f]*"		/* URI (approximated) */
		/* 6 7 */
		"\"(([^\"]|[\\]\")*)\""
		"[ \t\r\n\f]*\\)"
		"|"
		"url\\([ \t\r\n\f]*"
		/* 8 9 */
		"'(([^']|[\\]')*)'"
		"[ \t\r\n\f]*\\)"
		"|"
		"url\\([ \t\r\n\f]*"
		/* 10 */
		"([^) \t\r\n\f]*)"
		"[ \t\r\n\f]*\\)"
		")";				/* ] */

	regcomp_wrapper(&save_complete_import_re, import_pattern,
			REG_EXTENDED | REG_ICASE);
}
/**
 * Rewrite stylesheet \@import rules for save complete.
 *
 * Every \@import whose URL, once joined with \a base, belongs to a content
 * in save_complete_list is replaced by an import of that content's saved
 * name; all other text is copied unchanged.
 *
 * \param  source  stylesheet source (need not be 0-terminated)
 * \param  size    size of source
 * \param  osize   updated with the size of the result
 * \param  base    url of stylesheet
 * \return converted source (to be released with free()), or 0 on out of
 *         memory
 */
char * rewrite_stylesheet_urls(const char *source, unsigned int size,
		int *osize, const char *base)
{
	/* capture groups of save_complete_import_re that hold the URL */
	static const int url_group[] = { 2, 4, 6, 8, 10 };
	char *res;
	char *text;
	char buf[20];
	unsigned int offset = 0;
	unsigned int imports = 0;
	unsigned int i;
	regmatch_t match[11];

	/* count occurrences of @import to (over)estimate result size */
	/* can't use strstr because source is not 0-terminated string */
	for (i = 0; i + 7 <= size; i++) {
		/* <ctype.h> functions need values in unsigned char range */
		if (source[i] == '@' &&
				tolower((unsigned char) source[i + 1]) == 'i' &&
				tolower((unsigned char) source[i + 2]) == 'm' &&
				tolower((unsigned char) source[i + 3]) == 'p' &&
				tolower((unsigned char) source[i + 4]) == 'o' &&
				tolower((unsigned char) source[i + 5]) == 'r' &&
				tolower((unsigned char) source[i + 6]) == 't')
			imports++;
	}

	/* regexec() needs a 0-terminated string, so match against a
	 * terminated copy rather than reading beyond the end of source */
	text = malloc(size + 1);
	if (!text)
		return 0;
	memcpy(text, source, size);
	text[size] = '\0';

	/* the extra byte keeps the request non-zero for an empty sheet, as
	 * malloc(0) may return 0 without being out of memory */
	res = malloc(size + imports * sizeof buf + 1);
	if (!res) {
		free(text);
		return 0;
	}

	*osize = 0;

	while (offset < size) {
		const char *url = 0;
		int url_len = 0;
		char *url2;
		char *joined = 0;
		struct content *content = 0;
		url_func_result result;

		if (regexec(&save_complete_import_re, text + offset,
				11, match, 0))
			break;

		/* find which alternative of the pattern matched */
		for (i = 0; i != sizeof url_group / sizeof url_group[0];
				i++) {
			const regmatch_t *m = &match[url_group[i]];

			if (m->rm_so != -1) {
				url = text + offset + m->rm_so;
				url_len = m->rm_eo - m->rm_so;
				break;
			}
		}
		assert(url);

		url2 = strndup(url, url_len);
		if (!url2) {
			free(text);
			free(res);
			return 0;
		}

		result = url_join(url2, base, &joined);
		free(url2);
		if (result == URL_FUNC_NOMEM) {
			free(text);
			free(res);
			return 0;
		}

		/* copy data before match */
		memcpy(res + *osize, text + offset, match[0].rm_so);
		*osize += match[0].rm_so;

		if (result == URL_FUNC_OK) {
			content = save_complete_list_find(joined);
			/* the joined url was only needed for the lookup */
			free(joined);
		}

		if (content) {
			/* replace import */
			snprintf(buf, sizeof buf, "@import '%x'",
					(unsigned int) content);
			memcpy(res + *osize, buf, strlen(buf));
			*osize += strlen(buf);
		} else {
			/* copy import */
			memcpy(res + *osize, text + offset + match[0].rm_so,
					match[0].rm_eo - match[0].rm_so);
			*osize += match[0].rm_eo - match[0].rm_so;
		}

		assert(0 < match[0].rm_eo);
		offset += match[0].rm_eo;
	}

	/* copy rest of source */
	if (offset < size) {
		memcpy(res + *osize, source + offset, size - offset);
		*osize += size - offset;
	}

	free(text);
	return res;
}
/**
 * Rewrite URLs in a HTML document to be relative.
 *
 * Every element at the top level of \a doc is handed to rewrite_urls().
 *
 * \param  doc   root of the document tree
 * \param  base  base url of document
 * \return true on success, false on out of memory
 */
bool rewrite_document_urls(xmlDoc *doc, const char *base)
{
	xmlNode *node;
	xmlNode *next;

	for (node = doc->children; node; node = next) {
		/* rewrite_urls() unlinks and frees <base> elements, so the
		 * sibling has to be fetched before the node is processed */
		next = node->next;

		if (node->type == XML_ELEMENT_NODE &&
				!rewrite_urls(node, base))
			return false;
	}

	return true;
}
/**
 * Traverse tree, rewriting URLs as we go.
 *
 * \param  n     xmlNode of type XML_ELEMENT_NODE to rewrite
 * \param  base  base url of document
 * \return true on success, false on out of memory
 *
 * URLs in the tree rooted at element n are rewritten. If \a n is a <base>
 * element it is unlinked from the tree and freed, so the caller must not
 * touch \a n after this function returns.
 */
bool rewrite_urls(xmlNode *n, const char *base)
{
	const char *name;
	xmlNode *child;

	assert(n->type == XML_ELEMENT_NODE);

	/* xmlChar is unsigned; view the name as a plain C string once */
	name = (const char *) n->name;

	/**
	 * We only need to consider the following cases:
	 *
	 * Attribute:      Elements:
	 *
	 * 1) data         <object>
	 * 2) href         <a> <area> <link>
	 * 3) src          <script> <input> <frame> <iframe> <img>
	 * 4) n/a          <style>
	 * 5) n/a          any <base> tag
	 * 6) background   any (except those above)
	 */
	if (!name) {
		/* ignore */
	}
	/* 1 */
	else if (strcmp(name, "object") == 0) {
		if (!rewrite_url(n, "data", base))
			return false;
	}
	/* 2 */
	else if (strcmp(name, "a") == 0 ||
			strcmp(name, "area") == 0 ||
			strcmp(name, "link") == 0) {
		if (!rewrite_url(n, "href", base))
			return false;
	}
	/* 3 */
	else if (strcmp(name, "frame") == 0 ||
			strcmp(name, "iframe") == 0 ||
			strcmp(name, "input") == 0 ||
			strcmp(name, "img") == 0 ||
			strcmp(name, "script") == 0) {
		if (!rewrite_url(n, "src", base))
			return false;
	}
	/* 4 */
	else if (strcmp(name, "style") == 0) {
		for (child = n->children; child != 0; child = child->next) {
			xmlChar *content;
			char *rewritten;
			int len;

			/* Get current content */
			content = xmlNodeGetContent(child);
			if (!content)
				/* unfortunately we don't know if this is
				 * due to memory exhaustion, or because
				 * there is no content for this node */
				continue;

			/* Rewrite @import rules */
			rewritten = rewrite_stylesheet_urls(
					(const char *) content,
					strlen((const char *) content),
					&len, base);
			xmlFree(content);
			if (!rewritten)
				return false;

			/* set new content */
			xmlNodeSetContentLen(child,
					(const xmlChar *) rewritten,
					len);

			/* the node keeps its own copy of the text, so the
			 * rewritten buffer is still ours to release */
			free(rewritten);
		}

		return true;
	}
	/* 5 */
	else if (strcmp(name, "base") == 0) {
		/* simply remove any <base> tags from the document */
		xmlUnlinkNode(n);
		xmlFreeNode(n);
		/* base tags have no content, so there's no point recursing
		 * additionally, we've just destroyed this node, so trying
		 * to recurse would result in bad things happening */
		return true;
	}
	/* 6 */
	else {
		if (!rewrite_url(n, "background", base))
			return false;
	}

	/* now recurse */
	for (child = n->children; child;) {
		/* we must extract the next child now, as if the current
		 * child is a <base> element, it will be removed from the
		 * tree (see 5, above), thus preventing extraction of the
		 * next child */
		xmlNode *next = child->next;

		if (child->type == XML_ELEMENT_NODE) {
			if (!rewrite_urls(child, base))
				return false;
		}

		child = next;
	}

	return true;
}
/**
 * Rewrite one URL-carrying attribute of an element.
 *
 * The attribute value is joined with \a base. If the resulting URL belongs
 * to a content in save_complete_list, the attribute is set to that
 * content's saved name; otherwise it is set to the joined URL. An element
 * without the attribute, or a URL that cannot be joined for a reason other
 * than memory exhaustion, is left untouched.
 *
 * \param  n     The node to modify
 * \param  attr  The html attribute to modify
 * \param  base  base url of document
 * \return true on success, false on out of memory
 */
bool rewrite_url(xmlNode *n, const char *attr, const char *base)
{
	const xmlChar *prop = (const xmlChar *) attr;
	struct content *saved;
	url_func_result status;
	char *value, *absolute;
	char leaf[20];
	bool ok;

	if (!xmlHasProp(n, prop))
		return true;

	value = xmlGetProp(n, prop);
	if (!value)
		return false;

	status = url_join(value, base, &absolute);
	xmlFree(value);

	if (status == URL_FUNC_NOMEM)
		return false;
	if (status != URL_FUNC_OK)
		return true;

	saved = save_complete_list_find(absolute);
	if (saved) {
		/* found a match: refer to the saved file instead */
		snprintf(leaf, sizeof leaf, "%x", (unsigned int) saved);
		ok = xmlSetProp(n, prop, (xmlChar *) leaf) != 0;
	} else {
		/* no match found: keep the link, but make it absolute */
		ok = xmlSetProp(n, prop, (xmlChar *) absolute) != 0;
	}

	free(absolute);
	return ok;
}
/**
 * Record a content in save_complete_list.
 *
 * The new entry becomes the head of the list. No check is made for the
 * content already being present.
 *
 * \param  content  content to add
 * \return true on success, false on out of memory
 */
bool save_complete_list_add(struct content *content)
{
	struct save_complete_entry *node = malloc(sizeof *node);

	if (node == 0)
		return false;

	/* push onto the front of the list */
	node->next = save_complete_list;
	node->content = content;
	save_complete_list = node;

	return true;
}
/**
 * Find the content in save_complete_list that has a given url.
 *
 * URLs are compared as exact strings; the first matching entry wins.
 *
 * \param  url  url to find
 * \return content if found, 0 otherwise
 */
struct content * save_complete_list_find(const char *url)
{
	struct save_complete_entry *cur = save_complete_list;

	while (cur) {
		if (!strcmp(url, cur->content->url))
			return cur->content;
		cur = cur->next;
	}

	return 0;
}
/**
 * Test whether a content is already in save_complete_list.
 *
 * Entries are matched by pointer identity, not by url.
 *
 * \param  content  pointer to content
 * \return true if the content is in the save_complete_list
 */
bool save_complete_list_check(struct content *content)
{
	struct save_complete_entry *cur = save_complete_list;

	while (cur && cur->content != content)
		cur = cur->next;

	/* cur is non-null exactly when the walk stopped on a match */
	return cur != 0;
}
#if 0
/**
* Dump save complete list to stderr
*
* Debugging aid: prints one line per list entry, giving the content pointer
* followed by the content's url. It is compiled out by the enclosing #if 0.
*/
void save_complete_list_dump(void)
{
struct save_complete_entry *entry;
for (entry = save_complete_list; entry; entry = entry->next)
fprintf(stderr, "%p : %s\n", entry->content,
entry->content->url);
}
#endif
/**
 * Create the inventory file listing original URLs.
 *
 * Writes "<path>.Inventory" with one line per entry of save_complete_list:
 * the saved name of the content (its pointer in hex) followed by its url.
 *
 * \param  path  directory being saved to
 * \return true on success, false on error and error reported
 */
bool save_complete_inventory(const char *path)
{
	char spath[256];
	struct save_complete_entry *entry;
	FILE *fp;
	int err = 0;
	bool ok = true;

	snprintf(spath, sizeof spath, "%s.Inventory", path);

	fp = fopen(spath, "w");
	if (!fp) {
		/* take a copy first, in case logging changes errno */
		err = errno;
		LOG(("fopen(): errno = %i", err));
		warn_user("SaveError", strerror(err));
		return false;
	}

	for (entry = save_complete_list; entry; entry = entry->next) {
		if (fprintf(fp, "%x %s\n",
				(unsigned int) entry->content,
				entry->content->url) < 0) {
			err = errno;
			ok = false;
			break;
		}
	}

	/* buffered data is only flushed here, so a failed close means the
	 * inventory is incomplete; always close, but keep the first error */
	if (fclose(fp) == EOF && ok) {
		err = errno;
		ok = false;
	}

	if (!ok) {
		LOG(("writing inventory: errno = %i", err));
		warn_user("SaveError", strerror(err));
		return false;
	}

	return true;
}
#endif