1349 lines
29 KiB
C
1349 lines
29 KiB
C
/*
|
|
* Copyright 2012 John-Mark Bell <jmb@netsurf-browser.org>
|
|
* Copyright 2004-2007 James Bursa <bursa@users.sourceforge.net>
|
|
*
|
|
* This file is part of NetSurf, http://www.netsurf-browser.org/
|
|
*
|
|
* NetSurf is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; version 2 of the License.
|
|
*
|
|
* NetSurf is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
/**
|
|
* \file
|
|
* Save HTML document with dependencies implementation.
|
|
*/
|
|
|
|
#include <assert.h>
|
|
#include <errno.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <strings.h>
|
|
#include <sys/types.h>
|
|
#include <dom/dom.h>
|
|
|
|
#include "utils/config.h"
|
|
#include "utils/regex.h"
|
|
#include "utils/corestrings.h"
|
|
#include "utils/log.h"
|
|
#include "utils/nsurl.h"
|
|
#include "utils/utf8.h"
|
|
#include "utils/utils.h"
|
|
#include "utils/file.h"
|
|
#include "utils/messages.h"
|
|
#include "utils/ascii.h"
|
|
#include "netsurf/content.h"
|
|
#include "content/hlcache.h"
|
|
#include "css/css.h"
|
|
#include "html/box.h"
|
|
#include "html/html_save.h"
|
|
#include "html/html.h"
|
|
|
|
#include "netsurf/misc.h"
|
|
#include "desktop/gui_internal.h"
|
|
#include "desktop/save_complete.h"
|
|
|
|
regex_t save_complete_import_re;
|
|
|
|
/** An entry in save_complete_list. */
|
|
typedef struct save_complete_entry {
|
|
struct hlcache_handle *content;
|
|
struct save_complete_entry *next; /**< Next entry in list */
|
|
} save_complete_entry;
|
|
|
|
typedef struct save_complete_ctx {
|
|
const char *path;
|
|
save_complete_entry *list;
|
|
save_complete_set_type_cb set_type;
|
|
|
|
nsurl *base;
|
|
FILE *fp;
|
|
enum { STATE_NORMAL, STATE_IN_STYLE } iter_state;
|
|
} save_complete_ctx;
|
|
|
|
typedef enum {
|
|
EVENT_ENTER,
|
|
EVENT_LEAVE
|
|
} save_complete_event_type;
|
|
|
|
|
|
static nserror save_complete_save_html(save_complete_ctx *ctx, struct hlcache_handle *c, bool index);
|
|
static nserror save_complete_save_imported_sheets(save_complete_ctx *ctx,
|
|
struct nscss_import *imports, uint32_t import_count);
|
|
|
|
|
|
static void save_complete_ctx_initialise(save_complete_ctx *ctx,
|
|
const char *path, save_complete_set_type_cb set_type)
|
|
{
|
|
ctx->path = path;
|
|
ctx->list = NULL;
|
|
ctx->set_type = set_type;
|
|
}
|
|
|
|
static void save_complete_ctx_finalise(save_complete_ctx *ctx)
|
|
{
|
|
save_complete_entry *list = ctx->list;
|
|
|
|
while (list != NULL) {
|
|
save_complete_entry *next = list->next;
|
|
free(list);
|
|
list = next;
|
|
}
|
|
}
|
|
|
|
static nserror
|
|
save_complete_ctx_add_content(save_complete_ctx *ctx,
|
|
struct hlcache_handle *content)
|
|
{
|
|
save_complete_entry *entry;
|
|
|
|
entry = malloc(sizeof (*entry));
|
|
if (entry == NULL) {
|
|
return NSERROR_NOMEM;
|
|
}
|
|
|
|
entry->content = content;
|
|
entry->next = ctx->list;
|
|
ctx->list = entry;
|
|
|
|
return NSERROR_OK;
|
|
}
|
|
|
|
/**
|
|
* find handle to content for url
|
|
*
|
|
* \param ctx The save context
|
|
* \param url The url to find content handle for
|
|
* \return The content handle or NULL if not found.
|
|
*/
|
|
static struct hlcache_handle *
|
|
save_complete_ctx_find_content(save_complete_ctx *ctx, const nsurl *url)
|
|
{
|
|
save_complete_entry *entry;
|
|
|
|
for (entry = ctx->list; entry != NULL; entry = entry->next) {
|
|
if (nsurl_compare(url,
|
|
hlcache_handle_get_url(entry->content),
|
|
NSURL_COMPLETE)) {
|
|
return entry->content;
|
|
}
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
|
|
static bool
|
|
save_complete_ctx_has_content(save_complete_ctx *ctx,
|
|
struct hlcache_handle *content)
|
|
{
|
|
save_complete_entry *entry;
|
|
|
|
for (entry = ctx->list; entry != NULL; entry = entry->next) {
|
|
if (hlcache_handle_get_content(entry->content) ==
|
|
hlcache_handle_get_content(content))
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
static nserror
|
|
save_complete_save_buffer(save_complete_ctx *ctx,
|
|
const char *leafname,
|
|
const uint8_t *data,
|
|
size_t data_len,
|
|
lwc_string *mime_type)
|
|
{
|
|
nserror ret;
|
|
FILE *fp;
|
|
char *fname = NULL;
|
|
|
|
ret = netsurf_mkpath(&fname, NULL, 2, ctx->path, leafname);
|
|
if (ret != NSERROR_OK) {
|
|
return ret;
|
|
}
|
|
|
|
fp = fopen(fname, "wb");
|
|
if (fp == NULL) {
|
|
free(fname);
|
|
NSLOG(netsurf, INFO, "fopen(): %s", strerror(errno));
|
|
return NSERROR_SAVE_FAILED;
|
|
}
|
|
|
|
fwrite(data, sizeof(*data), data_len, fp);
|
|
|
|
fclose(fp);
|
|
|
|
if (ctx->set_type != NULL) {
|
|
ctx->set_type(fname, mime_type);
|
|
}
|
|
free(fname);
|
|
|
|
return NSERROR_OK;
|
|
}
|
|
|
|
|
|
/**
|
|
* perform a posix regexec on a string without a null terminator
|
|
*/
|
|
static int
|
|
snregexec(const regex_t *preg,
|
|
const char *string,
|
|
size_t stringlen,
|
|
size_t nmatch,
|
|
regmatch_t pmatch[],
|
|
int eflags)
|
|
{
|
|
char *strbuf;
|
|
int matches;
|
|
|
|
strbuf = calloc(1, stringlen + 1);
|
|
if (strbuf == NULL) {
|
|
return -1;
|
|
}
|
|
memcpy(strbuf, string, stringlen);
|
|
|
|
matches = regexec(preg, strbuf, nmatch, pmatch, eflags);
|
|
|
|
free(strbuf);
|
|
|
|
return matches;
|
|
}
|
|
|
|
|
|
/**
|
|
* Rewrite stylesheet \@import rules for save complete.
|
|
*
|
|
* \param ctx Save complete context.
|
|
* \param source stylesheet source.
|
|
* \param size size of source.
|
|
* \param base url of stylesheet.
|
|
* \param osize updated with the size of the result.
|
|
* \return converted source, or NULL on out of memory.
|
|
*/
|
|
static uint8_t *
|
|
save_complete_rewrite_stylesheet_urls(save_complete_ctx *ctx,
|
|
const uint8_t *source,
|
|
size_t size,
|
|
const nsurl *base,
|
|
size_t *osize)
|
|
{
|
|
uint8_t *rewritten;
|
|
unsigned long offset = 0;
|
|
unsigned int imports = 0;
|
|
nserror error;
|
|
|
|
/* count number occurrences of @import to (over)estimate result size */
|
|
/* can't use strstr because source is not 0-terminated string */
|
|
for (offset = 0;
|
|
(SLEN("@import") < size) && (offset <= (size - SLEN("@import")));
|
|
offset++) {
|
|
if (source[offset] == '@' &&
|
|
ascii_to_lower(source[offset + 1]) == 'i' &&
|
|
ascii_to_lower(source[offset + 2]) == 'm' &&
|
|
ascii_to_lower(source[offset + 3]) == 'p' &&
|
|
ascii_to_lower(source[offset + 4]) == 'o' &&
|
|
ascii_to_lower(source[offset + 5]) == 'r' &&
|
|
ascii_to_lower(source[offset + 6]) == 't') {
|
|
imports++;
|
|
}
|
|
}
|
|
|
|
rewritten = malloc(size + imports * 20);
|
|
if (rewritten == NULL)
|
|
return NULL;
|
|
*osize = 0;
|
|
|
|
offset = 0;
|
|
while (offset < size) {
|
|
const uint8_t *import_url = NULL;
|
|
char *import_url_copy;
|
|
int import_url_len = 0;
|
|
nsurl *url = NULL;
|
|
regmatch_t match[11];
|
|
int m;
|
|
|
|
m = snregexec(&save_complete_import_re,
|
|
(const char *)source + offset,
|
|
size - offset,
|
|
11,
|
|
match,
|
|
0);
|
|
if (m)
|
|
break;
|
|
|
|
if (match[2].rm_so != -1) {
|
|
import_url = source + offset + match[2].rm_so;
|
|
import_url_len = match[2].rm_eo - match[2].rm_so;
|
|
} else if (match[4].rm_so != -1) {
|
|
import_url = source + offset + match[4].rm_so;
|
|
import_url_len = match[4].rm_eo - match[4].rm_so;
|
|
} else if (match[6].rm_so != -1) {
|
|
import_url = source + offset + match[6].rm_so;
|
|
import_url_len = match[6].rm_eo - match[6].rm_so;
|
|
} else if (match[8].rm_so != -1) {
|
|
import_url = source + offset + match[8].rm_so;
|
|
import_url_len = match[8].rm_eo - match[8].rm_so;
|
|
} else if (match[10].rm_so != -1) {
|
|
import_url = source + offset + match[10].rm_so;
|
|
import_url_len = match[10].rm_eo - match[10].rm_so;
|
|
}
|
|
assert(import_url != NULL);
|
|
|
|
import_url_copy = strndup((const char *)import_url,
|
|
import_url_len);
|
|
if (import_url_copy == NULL) {
|
|
free(rewritten);
|
|
return NULL;
|
|
}
|
|
|
|
error = nsurl_join(base, import_url_copy, &url);
|
|
free(import_url_copy);
|
|
if (error == NSERROR_NOMEM) {
|
|
free(rewritten);
|
|
return NULL;
|
|
}
|
|
|
|
/* copy data before match */
|
|
memcpy(rewritten + *osize, source + offset, match[0].rm_so);
|
|
*osize += match[0].rm_so;
|
|
|
|
if (url != NULL) {
|
|
hlcache_handle *content;
|
|
content = save_complete_ctx_find_content(ctx, url);
|
|
if (content != NULL) {
|
|
/* replace import */
|
|
char buf[64];
|
|
snprintf(buf, sizeof buf, "@import '%p'",
|
|
content);
|
|
memcpy(rewritten + *osize, buf, strlen(buf));
|
|
*osize += strlen(buf);
|
|
} else {
|
|
/* copy import */
|
|
memcpy(rewritten + *osize,
|
|
source + offset + match[0].rm_so,
|
|
match[0].rm_eo - match[0].rm_so);
|
|
*osize += match[0].rm_eo - match[0].rm_so;
|
|
}
|
|
nsurl_unref(url);
|
|
} else {
|
|
/* copy import */
|
|
memcpy(rewritten + *osize,
|
|
source + offset + match[0].rm_so,
|
|
match[0].rm_eo - match[0].rm_so);
|
|
*osize += match[0].rm_eo - match[0].rm_so;
|
|
}
|
|
|
|
assert(0 < match[0].rm_eo);
|
|
offset += match[0].rm_eo;
|
|
}
|
|
|
|
/* copy rest of source */
|
|
if (offset < size) {
|
|
memcpy(rewritten + *osize, source + offset, size - offset);
|
|
*osize += size - offset;
|
|
}
|
|
|
|
return rewritten;
|
|
}
|
|
|
|
static nserror
|
|
save_complete_save_stylesheet(save_complete_ctx *ctx, hlcache_handle *css)
|
|
{
|
|
const uint8_t *css_data;
|
|
size_t css_size;
|
|
uint8_t *source;
|
|
size_t source_len;
|
|
struct nscss_import *imports;
|
|
uint32_t import_count;
|
|
lwc_string *type;
|
|
char filename[32];
|
|
nserror result;
|
|
|
|
if (save_complete_ctx_find_content(ctx,
|
|
hlcache_handle_get_url(css)) != NULL) {
|
|
return NSERROR_OK;
|
|
}
|
|
|
|
result = save_complete_ctx_add_content(ctx, css);
|
|
if (result != NSERROR_OK) {
|
|
return result;
|
|
}
|
|
|
|
imports = nscss_get_imports(css, &import_count);
|
|
result = save_complete_save_imported_sheets(ctx,
|
|
imports,
|
|
import_count);
|
|
if (result != NSERROR_OK) {
|
|
return result;
|
|
}
|
|
|
|
css_data = content_get_source_data(css, &css_size);
|
|
source = save_complete_rewrite_stylesheet_urls(
|
|
ctx,
|
|
css_data,
|
|
css_size,
|
|
hlcache_handle_get_url(css),
|
|
&source_len);
|
|
if (source == NULL) {
|
|
return NSERROR_NOMEM;
|
|
}
|
|
|
|
type = content_get_mime_type(css);
|
|
if (type == NULL) {
|
|
free(source);
|
|
return NSERROR_NOMEM;
|
|
}
|
|
|
|
snprintf(filename, sizeof filename, "%p", css);
|
|
|
|
result = save_complete_save_buffer(ctx, filename,
|
|
source, source_len, type);
|
|
|
|
lwc_string_unref(type);
|
|
free(source);
|
|
|
|
return result;
|
|
}
|
|
|
|
static nserror
|
|
save_complete_save_imported_sheets(save_complete_ctx *ctx,
|
|
struct nscss_import *imports,
|
|
uint32_t import_count)
|
|
{
|
|
nserror res = NSERROR_OK;
|
|
uint32_t i;
|
|
|
|
for (i = 0; i < import_count; i++) {
|
|
/* treat a valid content as a stylesheet to save */
|
|
if (imports[i].c != NULL) {
|
|
res = save_complete_save_stylesheet(ctx, imports[i].c);
|
|
if (res != NSERROR_OK) {
|
|
return res;
|
|
}
|
|
}
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
static nserror
|
|
save_complete_save_html_stylesheet(save_complete_ctx *ctx,
|
|
struct html_stylesheet *sheet)
|
|
{
|
|
if (sheet->sheet == NULL) {
|
|
return NSERROR_OK;
|
|
}
|
|
|
|
return save_complete_save_stylesheet(ctx, sheet->sheet);
|
|
}
|
|
|
|
static nserror
|
|
save_complete_save_html_stylesheets(save_complete_ctx *ctx,
|
|
hlcache_handle *c)
|
|
{
|
|
struct html_stylesheet *sheets;
|
|
unsigned int i, count;
|
|
nserror res;
|
|
|
|
sheets = html_get_stylesheets(c, &count);
|
|
|
|
for (i = STYLESHEET_START; i != count; i++) {
|
|
res = save_complete_save_html_stylesheet(ctx, &sheets[i]);
|
|
if (res != NSERROR_OK) {
|
|
return res;
|
|
}
|
|
}
|
|
|
|
return NSERROR_OK;
|
|
}
|
|
|
|
static nserror
|
|
save_complete_save_html_object(save_complete_ctx *ctx, hlcache_handle *obj)
|
|
{
|
|
const uint8_t *obj_data;
|
|
size_t obj_size;
|
|
lwc_string *type;
|
|
nserror result;
|
|
char filename[32];
|
|
|
|
if (content_get_type(obj) == CONTENT_NONE) {
|
|
return NSERROR_OK;
|
|
}
|
|
|
|
obj_data = content_get_source_data(obj, &obj_size);
|
|
if (obj_data == NULL) {
|
|
return NSERROR_OK;
|
|
}
|
|
|
|
if (save_complete_ctx_find_content(ctx,
|
|
hlcache_handle_get_url(obj)) != NULL) {
|
|
return NSERROR_OK;
|
|
}
|
|
|
|
result = save_complete_ctx_add_content(ctx, obj);
|
|
if (result != NSERROR_OK) {
|
|
return result;
|
|
}
|
|
|
|
if (content_get_type(obj) == CONTENT_HTML) {
|
|
return save_complete_save_html(ctx, obj, false);
|
|
}
|
|
|
|
snprintf(filename, sizeof filename, "%p", obj);
|
|
|
|
type = content_get_mime_type(obj);
|
|
if (type == NULL) {
|
|
return NSERROR_NOMEM;
|
|
}
|
|
|
|
result = save_complete_save_buffer(ctx, filename, obj_data, obj_size, type);
|
|
|
|
lwc_string_unref(type);
|
|
|
|
return result;
|
|
}
|
|
|
|
static nserror
|
|
save_complete_save_html_objects(save_complete_ctx *ctx,
|
|
hlcache_handle *c)
|
|
{
|
|
struct content_html_object *object;
|
|
unsigned int count;
|
|
nserror res;
|
|
|
|
object = html_get_objects(c, &count);
|
|
|
|
for (; object != NULL; object = object->next) {
|
|
if ((object->content != NULL) &&
|
|
(object->box != NULL)) {
|
|
res = save_complete_save_html_object(ctx, object->content);
|
|
if (res != NSERROR_OK) {
|
|
return res;
|
|
}
|
|
}
|
|
}
|
|
|
|
return NSERROR_OK;
|
|
}
|
|
|
|
static bool
|
|
save_complete_libdom_treewalk(dom_node *root,
|
|
bool (*callback)(dom_node *node,
|
|
save_complete_event_type event_type,
|
|
void *ctx),
|
|
void *ctx)
|
|
{
|
|
dom_node *node;
|
|
|
|
node = dom_node_ref(root); /* tree root */
|
|
|
|
while (node != NULL) {
|
|
dom_node *next = NULL;
|
|
dom_exception exc;
|
|
|
|
exc = dom_node_get_first_child(node, &next);
|
|
if (exc != DOM_NO_ERR) {
|
|
dom_node_unref(node);
|
|
break;
|
|
}
|
|
|
|
if (next != NULL) { /* 1. children */
|
|
dom_node_unref(node);
|
|
node = next;
|
|
} else {
|
|
exc = dom_node_get_next_sibling(node, &next);
|
|
if (exc != DOM_NO_ERR) {
|
|
dom_node_unref(node);
|
|
break;
|
|
}
|
|
|
|
if (next != NULL) { /* 2. siblings */
|
|
if (callback(node, EVENT_LEAVE, ctx) == false) {
|
|
return false;
|
|
}
|
|
dom_node_unref(node);
|
|
node = next;
|
|
} else { /* 3. ancestor siblings */
|
|
while (node != NULL) {
|
|
exc = dom_node_get_next_sibling(node,
|
|
&next);
|
|
if (exc != DOM_NO_ERR) {
|
|
dom_node_unref(node);
|
|
node = NULL;
|
|
break;
|
|
}
|
|
|
|
if (next != NULL) {
|
|
dom_node_unref(next);
|
|
break;
|
|
}
|
|
|
|
exc = dom_node_get_parent_node(node,
|
|
&next);
|
|
if (exc != DOM_NO_ERR) {
|
|
dom_node_unref(node);
|
|
node = NULL;
|
|
break;
|
|
}
|
|
|
|
if (callback(node, EVENT_LEAVE,
|
|
ctx) == false) {
|
|
return false;
|
|
}
|
|
dom_node_unref(node);
|
|
node = next;
|
|
}
|
|
|
|
if (node == NULL)
|
|
break;
|
|
|
|
exc = dom_node_get_next_sibling(node, &next);
|
|
if (exc != DOM_NO_ERR) {
|
|
dom_node_unref(node);
|
|
break;
|
|
}
|
|
|
|
if (callback(node, EVENT_LEAVE, ctx) == false) {
|
|
return false;
|
|
}
|
|
dom_node_unref(node);
|
|
node = next;
|
|
}
|
|
}
|
|
|
|
assert(node != NULL);
|
|
|
|
if (callback(node, EVENT_ENTER, ctx) == false) {
|
|
return false; /* callback caused early termination */
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
static bool save_complete_rewrite_url_value(save_complete_ctx *ctx,
|
|
const char *value, size_t value_len)
|
|
{
|
|
nsurl *url;
|
|
hlcache_handle *content;
|
|
char *escaped;
|
|
nserror error;
|
|
|
|
error = nsurl_join(ctx->base, value, &url);
|
|
if (error == NSERROR_NOMEM)
|
|
return false;
|
|
|
|
if (url != NULL) {
|
|
content = save_complete_ctx_find_content(ctx, url);
|
|
if (content != NULL) {
|
|
/* found a match */
|
|
nsurl_unref(url);
|
|
|
|
fprintf(ctx->fp, "\"%p\"", content);
|
|
} else {
|
|
/* no match found */
|
|
error = utf8_to_html(nsurl_access(url), "UTF-8",
|
|
nsurl_length(url), &escaped);
|
|
nsurl_unref(url);
|
|
|
|
if (error != NSERROR_OK)
|
|
return false;
|
|
|
|
fprintf(ctx->fp, "\"%s\"", escaped);
|
|
|
|
free(escaped);
|
|
}
|
|
} else {
|
|
error = utf8_to_html(value, "UTF-8", value_len, &escaped);
|
|
if (error != NSERROR_OK)
|
|
return false;
|
|
|
|
fprintf(ctx->fp, "\"%s\"", escaped);
|
|
|
|
free(escaped);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
static bool save_complete_write_value(save_complete_ctx *ctx,
|
|
const char *value, size_t value_len)
|
|
{
|
|
char *escaped;
|
|
nserror ret;
|
|
|
|
ret = utf8_to_html(value, "UTF-8", value_len, &escaped);
|
|
if (ret != NSERROR_OK)
|
|
return false;
|
|
|
|
fprintf(ctx->fp, "\"%s\"", escaped);
|
|
|
|
free(escaped);
|
|
|
|
return true;
|
|
}
|
|
|
|
static bool save_complete_handle_attr_value(save_complete_ctx *ctx,
|
|
dom_string *node_name, dom_string *attr_name,
|
|
dom_string *attr_value)
|
|
{
|
|
const char *node_data = dom_string_data(node_name);
|
|
size_t node_len = dom_string_byte_length(node_name);
|
|
const char *name_data = dom_string_data(attr_name);
|
|
size_t name_len = dom_string_byte_length(attr_name);
|
|
const char *value_data = dom_string_data(attr_value);
|
|
size_t value_len = dom_string_byte_length(attr_value);
|
|
|
|
/**
|
|
* We only need to consider the following cases:
|
|
*
|
|
* Attribute: Elements:
|
|
*
|
|
* 1) data object
|
|
* 2) href a, area, link
|
|
* 3) src script, input, frame, iframe, img
|
|
* 4) background any (except those above)
|
|
*/
|
|
/* 1 */
|
|
if (name_len == SLEN("data") &&
|
|
strncasecmp(name_data, "data", name_len) == 0) {
|
|
if (node_len == SLEN("object") &&
|
|
strncasecmp(node_data,
|
|
"object", node_len) == 0) {
|
|
return save_complete_rewrite_url_value(ctx,
|
|
value_data, value_len);
|
|
} else {
|
|
return save_complete_write_value(ctx,
|
|
value_data, value_len);
|
|
}
|
|
}
|
|
/* 2 */
|
|
else if (name_len == SLEN("href") &&
|
|
strncasecmp(name_data, "href", name_len) == 0) {
|
|
if ((node_len == SLEN("a") &&
|
|
strncasecmp(node_data, "a", node_len) == 0) ||
|
|
(node_len == SLEN("area") &&
|
|
strncasecmp(node_data, "area",
|
|
node_len) == 0) ||
|
|
(node_len == SLEN("link") &&
|
|
strncasecmp(node_data, "link",
|
|
node_len) == 0)) {
|
|
return save_complete_rewrite_url_value(ctx,
|
|
value_data, value_len);
|
|
} else {
|
|
return save_complete_write_value(ctx,
|
|
value_data, value_len);
|
|
}
|
|
}
|
|
/* 3 */
|
|
else if (name_len == SLEN("src") &&
|
|
strncasecmp(name_data, "src", name_len) == 0) {
|
|
if ((node_len == SLEN("frame") &&
|
|
strncasecmp(node_data, "frame",
|
|
node_len) == 0) ||
|
|
(node_len == SLEN("iframe") &&
|
|
strncasecmp(node_data, "iframe",
|
|
node_len) == 0) ||
|
|
(node_len == SLEN("input") &&
|
|
strncasecmp(node_data, "input",
|
|
node_len) == 0) ||
|
|
(node_len == SLEN("img") &&
|
|
strncasecmp(node_data, "img",
|
|
node_len) == 0) ||
|
|
(node_len == SLEN("script") &&
|
|
strncasecmp(node_data, "script",
|
|
node_len) == 0)) {
|
|
return save_complete_rewrite_url_value(ctx,
|
|
value_data, value_len);
|
|
} else {
|
|
return save_complete_write_value(ctx,
|
|
value_data, value_len);
|
|
}
|
|
}
|
|
/* 4 */
|
|
else if (name_len == SLEN("background") &&
|
|
strncasecmp(name_data, "background", name_len) == 0) {
|
|
return save_complete_rewrite_url_value(ctx,
|
|
value_data, value_len);
|
|
} else {
|
|
return save_complete_write_value(ctx,
|
|
value_data, value_len);
|
|
}
|
|
}
|
|
|
|
static bool
|
|
save_complete_handle_attr(save_complete_ctx *ctx,
|
|
dom_string *node_name,
|
|
dom_attr *attr)
|
|
{
|
|
dom_string *name;
|
|
const char *name_data;
|
|
size_t name_len;
|
|
dom_string *value;
|
|
dom_exception error;
|
|
|
|
error = dom_attr_get_name(attr, &name);
|
|
if (error != DOM_NO_ERR)
|
|
return false;
|
|
|
|
if (name == NULL)
|
|
return true;
|
|
|
|
error = dom_attr_get_value(attr, &value);
|
|
if (error != DOM_NO_ERR) {
|
|
dom_string_unref(name);
|
|
return false;
|
|
}
|
|
|
|
name_data = dom_string_data(name);
|
|
name_len = dom_string_byte_length(name);
|
|
|
|
fputc(' ', ctx->fp);
|
|
fwrite(name_data, sizeof(*name_data), name_len, ctx->fp);
|
|
|
|
if (value != NULL) {
|
|
fputc('=', ctx->fp);
|
|
if (save_complete_handle_attr_value(ctx, node_name,
|
|
name, value) == false) {
|
|
dom_string_unref(value);
|
|
dom_string_unref(name);
|
|
return false;
|
|
}
|
|
dom_string_unref(value);
|
|
}
|
|
|
|
dom_string_unref(name);
|
|
|
|
return true;
|
|
}
|
|
|
|
static bool
|
|
save_complete_handle_attrs(save_complete_ctx *ctx,
|
|
dom_string *node_name,
|
|
dom_namednodemap *attrs)
|
|
{
|
|
uint32_t length, i;
|
|
dom_exception error;
|
|
|
|
error = dom_namednodemap_get_length(attrs, &length);
|
|
if (error != DOM_NO_ERR)
|
|
return false;
|
|
|
|
for (i = 0; i < length; i++) {
|
|
dom_attr *attr;
|
|
|
|
error = dom_namednodemap_item(attrs, i, (void *) &attr);
|
|
if (error != DOM_NO_ERR)
|
|
return false;
|
|
|
|
if (attr == NULL)
|
|
continue;
|
|
|
|
if (save_complete_handle_attr(ctx, node_name, attr) == false) {
|
|
dom_node_unref(attr);
|
|
return false;
|
|
}
|
|
|
|
dom_node_unref(attr);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
static bool
|
|
save_complete_handle_element(save_complete_ctx *ctx,
|
|
dom_node *node,
|
|
save_complete_event_type event_type)
|
|
{
|
|
dom_string *name;
|
|
dom_namednodemap *attrs;
|
|
const char *name_data;
|
|
size_t name_len;
|
|
bool process = true;
|
|
dom_exception error;
|
|
|
|
ctx->iter_state = STATE_NORMAL;
|
|
|
|
error = dom_node_get_node_name(node, &name);
|
|
if (error != DOM_NO_ERR)
|
|
return false;
|
|
|
|
if (name == NULL)
|
|
return true;
|
|
|
|
name_data = dom_string_data(name);
|
|
name_len = dom_string_byte_length(name);
|
|
|
|
if ((name_len == SLEN("base")) &&
|
|
(strncasecmp(name_data, "base", name_len) == 0)) {
|
|
/* Elide BASE elements from the output */
|
|
process = false;
|
|
} else if ((name_len == SLEN("meta")) &&
|
|
(strncasecmp(name_data, "meta", name_len) == 0)) {
|
|
/* Don't emit close tags for META elements */
|
|
if (event_type == EVENT_LEAVE) {
|
|
process = false;
|
|
} else {
|
|
/* Elide meta charsets */
|
|
dom_string *value;
|
|
error = dom_element_get_attribute(node,
|
|
corestring_dom_http_equiv,
|
|
&value);
|
|
if (error != DOM_NO_ERR) {
|
|
dom_string_unref(name);
|
|
return false;
|
|
}
|
|
|
|
if (value != NULL) {
|
|
if (dom_string_length(value) ==
|
|
SLEN("Content-Type") &&
|
|
strncasecmp(dom_string_data(value),
|
|
"Content-Type",
|
|
SLEN("Content-Type")) == 0)
|
|
process = false;
|
|
|
|
dom_string_unref(value);
|
|
} else {
|
|
bool yes;
|
|
|
|
error = dom_element_has_attribute(node,
|
|
corestring_dom_charset, &yes);
|
|
if (error != DOM_NO_ERR) {
|
|
dom_string_unref(name);
|
|
return false;
|
|
}
|
|
|
|
if (yes)
|
|
process = false;
|
|
}
|
|
}
|
|
} else if (event_type == EVENT_LEAVE &&
|
|
((name_len == SLEN("link") &&
|
|
strncasecmp(name_data, "link", name_len) == 0))) {
|
|
/* Don't emit close tags for void elements */
|
|
process = false;
|
|
}
|
|
|
|
if (process == false) {
|
|
dom_string_unref(name);
|
|
return true;
|
|
}
|
|
|
|
fputc('<', ctx->fp);
|
|
if (event_type == EVENT_LEAVE) {
|
|
fputc('/', ctx->fp);
|
|
}
|
|
fwrite(name_data, sizeof(*name_data), name_len, ctx->fp);
|
|
|
|
if (event_type == EVENT_ENTER) {
|
|
error = dom_node_get_attributes(node, &attrs);
|
|
if (error != DOM_NO_ERR) {
|
|
dom_string_unref(name);
|
|
return false;
|
|
}
|
|
|
|
if (save_complete_handle_attrs(ctx, name, attrs) == false) {
|
|
dom_namednodemap_unref(attrs);
|
|
dom_string_unref(name);
|
|
return false;
|
|
}
|
|
|
|
dom_namednodemap_unref(attrs);
|
|
}
|
|
|
|
fputc('>', ctx->fp);
|
|
|
|
/* Rewrite contents of style elements */
|
|
if (event_type == EVENT_ENTER && name_len == SLEN("style") &&
|
|
strncasecmp(name_data, "style", name_len) == 0) {
|
|
dom_string *content;
|
|
|
|
error = dom_node_get_text_content(node, &content);
|
|
if (error != DOM_NO_ERR) {
|
|
dom_string_unref(name);
|
|
return false;
|
|
}
|
|
|
|
if (content != NULL) {
|
|
uint8_t *rewritten;
|
|
size_t len;
|
|
|
|
/* Rewrite @import rules */
|
|
rewritten = save_complete_rewrite_stylesheet_urls(
|
|
ctx,
|
|
(const uint8_t *)dom_string_data(content),
|
|
dom_string_byte_length(content),
|
|
ctx->base,
|
|
&len);
|
|
if (rewritten == NULL) {
|
|
dom_string_unref(content);
|
|
dom_string_unref(name);
|
|
return false;
|
|
}
|
|
|
|
dom_string_unref(content);
|
|
|
|
fwrite(rewritten, sizeof(*rewritten), len, ctx->fp);
|
|
|
|
free(rewritten);
|
|
}
|
|
|
|
ctx->iter_state = STATE_IN_STYLE;
|
|
} else if (event_type == EVENT_ENTER && name_len == SLEN("head") &&
|
|
strncasecmp(name_data, "head", name_len) == 0) {
|
|
/* If this is a HEAD element, insert a meta charset */
|
|
fputs("<META http-equiv=\"Content-Type\" "
|
|
"content=\"text/html; charset=utf-8\">",
|
|
ctx->fp);
|
|
}
|
|
|
|
dom_string_unref(name);
|
|
|
|
return true;
|
|
}
|
|
|
|
static bool
|
|
save_complete_node_handler(dom_node *node,
|
|
save_complete_event_type event_type,
|
|
void *ctxin)
|
|
{
|
|
save_complete_ctx *ctx = ctxin;
|
|
dom_node_type type;
|
|
dom_exception error;
|
|
nserror ret;
|
|
|
|
error = dom_node_get_node_type(node, &type);
|
|
if (error != DOM_NO_ERR)
|
|
return false;
|
|
|
|
if (type == DOM_ELEMENT_NODE) {
|
|
return save_complete_handle_element(ctx, node, event_type);
|
|
} else if (type == DOM_TEXT_NODE || type == DOM_COMMENT_NODE) {
|
|
if (event_type != EVENT_ENTER)
|
|
return true;
|
|
|
|
if (ctx->iter_state != STATE_IN_STYLE) {
|
|
/* Emit text content */
|
|
dom_string *text;
|
|
const char *text_data;
|
|
size_t text_len;
|
|
|
|
error = dom_characterdata_get_data(node, &text);
|
|
if (error != DOM_NO_ERR) {
|
|
return false;
|
|
}
|
|
|
|
if (type == DOM_COMMENT_NODE)
|
|
fwrite("<!--", 1, sizeof("<!--") - 1, ctx->fp);
|
|
|
|
if (text != NULL) {
|
|
char *escaped;
|
|
|
|
text_data = dom_string_data(text);
|
|
text_len = dom_string_byte_length(text);
|
|
|
|
ret = utf8_to_html(text_data, "UTF-8",
|
|
text_len, &escaped);
|
|
if (ret != NSERROR_OK)
|
|
return false;
|
|
|
|
fwrite(escaped, sizeof(*escaped),
|
|
strlen(escaped), ctx->fp);
|
|
|
|
free(escaped);
|
|
|
|
dom_string_unref(text);
|
|
}
|
|
|
|
if (type == DOM_COMMENT_NODE) {
|
|
fwrite("-->", 1, sizeof("-->") - 1, ctx->fp);
|
|
}
|
|
}
|
|
|
|
} else if (type == DOM_DOCUMENT_TYPE_NODE) {
|
|
dom_string *name;
|
|
const char *name_data;
|
|
size_t name_len;
|
|
|
|
if (event_type != EVENT_ENTER)
|
|
return true;
|
|
|
|
error = dom_document_type_get_name(node, &name);
|
|
if (error != DOM_NO_ERR)
|
|
return false;
|
|
|
|
if (name == NULL)
|
|
return true;
|
|
|
|
name_data = dom_string_data(name);
|
|
name_len = dom_string_byte_length(name);
|
|
|
|
fputs("<!DOCTYPE ", ctx->fp);
|
|
fwrite(name_data, sizeof(*name_data), name_len, ctx->fp);
|
|
|
|
dom_string_unref(name);
|
|
|
|
error = dom_document_type_get_public_id(node, &name);
|
|
if (error != DOM_NO_ERR)
|
|
return false;
|
|
|
|
if (name != NULL) {
|
|
name_data = dom_string_data(name);
|
|
name_len = dom_string_byte_length(name);
|
|
|
|
if (name_len > 0)
|
|
fprintf(ctx->fp, " PUBLIC \"%.*s\"",
|
|
(int) name_len, name_data);
|
|
|
|
dom_string_unref(name);
|
|
}
|
|
|
|
error = dom_document_type_get_system_id(node, &name);
|
|
if (error != DOM_NO_ERR)
|
|
return false;
|
|
|
|
if (name != NULL) {
|
|
name_data = dom_string_data(name);
|
|
name_len = dom_string_byte_length(name);
|
|
|
|
if (name_len > 0)
|
|
fprintf(ctx->fp, " \"%.*s\"",
|
|
(int) name_len, name_data);
|
|
|
|
dom_string_unref(name);
|
|
}
|
|
|
|
fputc('>', ctx->fp);
|
|
} else if (type == DOM_DOCUMENT_NODE) {
|
|
/* Do nothing */
|
|
} else {
|
|
NSLOG(netsurf, INFO, "Unhandled node type: %d", type);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
static nserror
|
|
save_complete_save_html_document(save_complete_ctx *ctx,
|
|
hlcache_handle *c,
|
|
bool index)
|
|
{
|
|
nserror ret;
|
|
FILE *fp;
|
|
char *fname = NULL;
|
|
dom_document *doc;
|
|
lwc_string *mime_type;
|
|
char filename[32];
|
|
|
|
if (index) {
|
|
snprintf(filename, sizeof filename, "index");
|
|
} else {
|
|
snprintf(filename, sizeof filename, "%p", c);
|
|
}
|
|
|
|
ret = netsurf_mkpath(&fname, NULL, 2, ctx->path, filename);
|
|
if (ret != NSERROR_OK) {
|
|
return ret;
|
|
}
|
|
|
|
fp = fopen(fname, "wb");
|
|
if (fp == NULL) {
|
|
free(fname);
|
|
NSLOG(netsurf, INFO, "fopen(): %s", strerror(errno));
|
|
return NSERROR_SAVE_FAILED;
|
|
}
|
|
|
|
ctx->base = html_get_base_url(c);
|
|
ctx->fp = fp;
|
|
ctx->iter_state = STATE_NORMAL;
|
|
|
|
doc = html_get_document(c);
|
|
|
|
if (save_complete_libdom_treewalk((dom_node *)doc,
|
|
save_complete_node_handler,
|
|
ctx) == false) {
|
|
free(fname);
|
|
fclose(fp);
|
|
return NSERROR_NOMEM;
|
|
}
|
|
|
|
fclose(fp);
|
|
|
|
mime_type = content_get_mime_type(c);
|
|
if (mime_type != NULL) {
|
|
if (ctx->set_type != NULL) {
|
|
ctx->set_type(fname, mime_type);
|
|
}
|
|
|
|
lwc_string_unref(mime_type);
|
|
}
|
|
free(fname);
|
|
|
|
return NSERROR_OK;
|
|
}
|
|
|
|
/**
|
|
* Save an HTML page with all dependencies, recursing through imported pages.
|
|
*
|
|
* \param ctx Save complete context
|
|
* \param c Content to save
|
|
* \param index true to save as "index"
|
|
* \return true on success, false on error and error reported
|
|
*/
|
|
static nserror
|
|
save_complete_save_html(save_complete_ctx *ctx,
|
|
hlcache_handle *c,
|
|
bool index)
|
|
{
|
|
nserror res;
|
|
|
|
if (content_get_type(c) != CONTENT_HTML) {
|
|
return NSERROR_INVALID;
|
|
}
|
|
|
|
if (save_complete_ctx_has_content(ctx, c)) {
|
|
return NSERROR_OK;
|
|
}
|
|
|
|
res = save_complete_save_html_stylesheets(ctx, c);
|
|
if (res != NSERROR_OK) {
|
|
return res;
|
|
}
|
|
|
|
res = save_complete_save_html_objects(ctx, c);
|
|
if (res != NSERROR_OK) {
|
|
return res;
|
|
}
|
|
|
|
return save_complete_save_html_document(ctx, c, index);
|
|
}
|
|
|
|
|
|
/**
|
|
* Create the inventory file listing original URLs.
|
|
*/
|
|
|
|
static nserror save_complete_inventory(save_complete_ctx *ctx)
|
|
{
|
|
nserror ret;
|
|
FILE *fp;
|
|
char *fname = NULL;
|
|
save_complete_entry *entry;
|
|
|
|
ret = netsurf_mkpath(&fname, NULL, 2, ctx->path, "Inventory");
|
|
if (ret != NSERROR_OK) {
|
|
return ret;
|
|
}
|
|
|
|
fp = fopen(fname, "w");
|
|
free(fname);
|
|
if (fp == NULL) {
|
|
NSLOG(netsurf, INFO, "fopen(): %s", strerror(errno));
|
|
return NSERROR_SAVE_FAILED;
|
|
}
|
|
|
|
for (entry = ctx->list; entry != NULL; entry = entry->next) {
|
|
fprintf(fp, "%p %s\n",
|
|
entry->content,
|
|
nsurl_access(hlcache_handle_get_url(
|
|
entry->content)));
|
|
}
|
|
|
|
fclose(fp);
|
|
|
|
return NSERROR_OK;
|
|
}
|
|
|
|
/**
|
|
* Compile a regular expression, handling errors.
|
|
*
|
|
* Parameters as for regcomp(), see man regex.
|
|
*/
|
|
static nserror regcomp_wrapper(regex_t *preg, const char *regex, int cflags)
|
|
{
|
|
int r;
|
|
r = regcomp(preg, regex, cflags);
|
|
if (r) {
|
|
char errbuf[200];
|
|
regerror(r, preg, errbuf, sizeof errbuf);
|
|
NSLOG(netsurf, INFO, "Failed to compile regexp '%s': %s\n",
|
|
regex, errbuf);
|
|
return NSERROR_INIT_FAILED;
|
|
}
|
|
return NSERROR_OK;
|
|
}
|
|
|
|
|
|
/* Documented in save_complete.h */
|
|
void save_complete_init(void)
|
|
{
|
|
/* Match an @import rule - see CSS 2.1 G.1. */
|
|
regcomp_wrapper(&save_complete_import_re,
|
|
"@import" /* IMPORT_SYM */
|
|
"[ \t\r\n\f]*" /* S* */
|
|
/* 1 */
|
|
"(" /* [ */
|
|
/* 2 3 */
|
|
"\"(([^\"]|[\\]\")*)\"" /* STRING (approximated) */
|
|
"|"
|
|
/* 4 5 */
|
|
"'(([^']|[\\]')*)'"
|
|
"|" /* | */
|
|
"url\\([ \t\r\n\f]*" /* URI (approximated) */
|
|
/* 6 7 */
|
|
"\"(([^\"]|[\\]\")*)\""
|
|
"[ \t\r\n\f]*\\)"
|
|
"|"
|
|
"url\\([ \t\r\n\f]*"
|
|
/* 8 9 */
|
|
"'(([^']|[\\]')*)'"
|
|
"[ \t\r\n\f]*\\)"
|
|
"|"
|
|
"url\\([ \t\r\n\f]*"
|
|
/* 10 */
|
|
"([^) \t\r\n\f]*)"
|
|
"[ \t\r\n\f]*\\)"
|
|
")", /* ] */
|
|
REG_EXTENDED | REG_ICASE);
|
|
}
|
|
|
|
/* Documented in save_complete.h */
|
|
nserror save_complete_finalise(void)
|
|
{
|
|
regfree(&save_complete_import_re);
|
|
return NSERROR_OK;
|
|
}
|
|
|
|
/* Documented in save_complete.h */
|
|
nserror
|
|
save_complete(hlcache_handle *c,
|
|
const char *path,
|
|
save_complete_set_type_cb set_type)
|
|
{
|
|
nserror result;
|
|
save_complete_ctx ctx;
|
|
|
|
save_complete_ctx_initialise(&ctx, path, set_type);
|
|
|
|
result = save_complete_save_html(&ctx, c, true);
|
|
|
|
if (result == NSERROR_OK) {
|
|
result = save_complete_inventory(&ctx);
|
|
}
|
|
|
|
save_complete_ctx_finalise(&ctx);
|
|
|
|
return result;
|
|
}
|